diff --git a/.gitignore b/.gitignore index dec8915..e6214f3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,9 @@ __pycache__ *.pth test.py test.ipynb -experiment \ No newline at end of file +experiment +analysis +output +rebuttal/ +*quant_cuda_kernel_* +demo* \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." 
- - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. 
- - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. 
We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/README.md b/README.md index efe2188..fbb5885 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,23 @@ -# OWQ: Lessons learned from activation outliers for weight quantization in large language models +# [AAAI 2024 (Oral)]   OWQ: Outlier-Aware Weight Quantization for Efficient Fine-Tuning and Inference of Large Language Models -This is the code for the paper [OWQ: Lessons learned from activation outliers for weight quantization in large language models](https://arxiv.org/abs/2306.02272). OWQ preserves few weak columns as FP16, while quantizing other weights to 3/4-bits. OWQ achieves substantial quality improvements with only negligible storage and computation overhead, effectively preserving the benefits of low-precision acceleration. +

+<img src="./images/owq_figure.png">
+
+This is the code for the paper [OWQ: Outlier-Aware Weight Quantization for Efficient Fine-Tuning and Inference of Large Language Models](https://arxiv.org/abs/2306.02272). OWQ preserves a few weak columns as FP16 while quantizing the other weights to 3/4 bits. OWQ achieves substantial quality improvements with only negligible storage and computation overhead, effectively preserving the benefits of low-precision acceleration.
+
+<img src="./images/owq_llama.png">
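To make the mechanism concrete, here is a minimal sketch of the outlier-aware idea: keep a few sensitive ("weak") columns of each weight matrix in FP16 and fake-quantize the rest to low-bit values. This is an illustration under simplified assumptions, not the repository's implementation; the actual reconstruction lives in `owq/recon.py` and `main.py`, and the weak columns are selected by Hessian-based sensitivity rather than the arbitrary indices used here.

```python
# Minimal sketch of outlier-aware weight quantization (illustration only).
# A few "weak" columns stay in FP16; the rest are fake-quantized to wbits
# with per-row asymmetric min-max parameters.
import torch

def owq_fake_quant(W: torch.Tensor, weak_cols: torch.Tensor, wbits: int = 3) -> torch.Tensor:
    """W: (out_features, in_features) weight; weak_cols: column indices kept in FP16."""
    maxq = 2 ** wbits - 1
    mask = torch.ones(W.shape[1], dtype=torch.bool, device=W.device)
    mask[weak_cols] = False                      # columns not in weak_cols get quantized
    Wq = W.clone().float()
    sub = Wq[:, mask]
    wmin = sub.min(dim=1, keepdim=True).values.clamp(max=0)
    wmax = sub.max(dim=1, keepdim=True).values.clamp(min=0)
    scale = (wmax - wmin).clamp(min=1e-8) / maxq
    zero = torch.round(-wmin / scale)
    q = torch.clamp(torch.round(sub / scale) + zero, 0, maxq)
    Wq[:, mask] = (q - zero) * scale             # dequantized low-bit weights
    return Wq.to(W.dtype)                        # weak columns untouched (full precision)

# Example: keep 4 columns of a 4096x4096 layer in FP16, quantize the rest to 3 bits.
W = torch.randn(4096, 4096, dtype=torch.float16)
Wq = owq_fake_quant(W, torch.tensor([7, 123, 2048, 4000]), wbits=3)
```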

-## Updates (2024-01-22)
+## Updates (2024-01-29)
* Integrated all models (OPT, LLaMA, BLOOM, Falcon) into `main.py` file. You can easily add custom or open-accessible huggingface models to `model_config.json` if you want.
* Support 4bit matrix - FP16 vector product CUDA kernel.
* Support BFloat16.

## Features
* Implementation of the OWQ algorithm: `owq/recon.py`, `main.py`
-* 3/4-bit weight quantization of LLMs (OPT, LLaMA1,2 families and etc..): `main.py`
+* 3/4-bit weight quantization of LLMs (OPT, LLaMA-1/2 families, etc.): `main.py`
* Evaluating the perplexity of quantized models: `main.py`
* Evaluating the zero-shot accuracy of quantized models: `zeroshot.py`
* Supports 3/4-bit packed weight save / load (~1/5, ~1/4 file size of FP16 checkpoint, respectively.)
@@ -21,7 +28,7 @@ This is the code for the paper [OWQ: Lessons learned from activation outliers fo
* [Install](#install)
* [Usage](#usage)
* [Zero-shot](#zero-shot)
-* [3-bit CUDA kernel](#3-bit-cuda-kernels)
+* [3/4-bit CUDA kernels](#34-bit-cuda-kernels)

## Install
We highly recommend to use docker image that supports CUDA. If you use anaconda instead, you need to setup CUDA for kernel use.
@@ -67,43 +74,43 @@ We have tested 3/4-bit CUDA kernel on the NVIDIA A100, A6000 and RTX3090 GPU.

### Running OWQ & measuring the perplexity (PPL)
-Here we use OPT-1.3b model as an example. You can replace the model argument `opt-1.3b` among `opt-125m`, `opt-350m`, `opt-2.7b`, `opt-6.7b`, `opt-13b`, `opt-66b` or other models (e.g. `meta-llama/Llama-2-7b-hf`).
+Here we use the llama-7b model (huggyllama/llama-7b) as an example. You can replace the model argument `llama-7b` with `llama-13b`, `llama-30b`, or `llama-65b`, or with other model families (e.g. `meta-llama/Llama-2-7b-hf`, `facebook/opt-6.7b`, `lmsys/vicuna-33b-v1.3`, etc.).
* OWQ using 3.01-bit (3-bit quantization + few FP16 weight columns)
```
-python main.py facebook/opt-1.3b c4 --wbits 3 --target_bit 3.01
+python main.py huggyllama/llama-7b c4 --wbits 3 --target_bit 3.01
```
* OWQ using 4.01-bit (4-bit quantization + few FP16 weight columns)
```
-python main.py facebook/opt-1.3b c4 --wbits 4 --target_bit 4.01
+python main.py huggyllama/llama-7b c4 --wbits 4 --target_bit 4.01
```
Below are the example for the other options (FP16, RTN, GPTQ).
```
# Measuring the ppl of the full precision (FP16) model
-python main.py facebook/opt-1.3b c4 --wbits 16
+python main.py huggyllama/llama-7b c4 --wbits 16
# 4-bit Round-to-Nearest (RTN) quantization
-python main.py facebook/opt-1.3b c4 --wbits 4 --nearest
+python main.py huggyllama/llama-7b c4 --wbits 4 --nearest
# GPTQ with 3-bit quantization
-python main.py facebook/opt-1.3b c4 --wbits 3 --tuning minmax
+python main.py huggyllama/llama-7b c4 --wbits 3 --tuning minmax
```

### Zero-shot
-Here we give an example of measuring zero-shot accuracy on `lambada_openai` and `piqa` tasks using opt-125m model.
+Here we give an example of measuring zero-shot accuracy on the `hellaswag` task using the llama-7b model.
You need to generate quantized model checkpoint before measuring the zero-shot accuracy. 
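Before the checkpoint commands below, a rough note on what `--target_bit` and `--packing` imply for storage. This is back-of-the-envelope arithmetic under a simple averaging assumption, not the per-layer allocation the code actually performs:

```python
# Back-of-the-envelope numbers for OWQ checkpoints (illustrative assumptions only;
# the code distributes the FP16 "weak" columns per layer with its own bookkeeping).
wbits, target_bit, fp16_bits = 3, 3.01, 16

# If a fraction f of weight columns stays in FP16 and the rest uses wbits,
# the average bits per weight is roughly wbits + f * (fp16_bits - wbits).
f = (target_bit - wbits) / (fp16_bits - wbits)
print(f"approx. fraction of FP16 columns: {f:.4%}")        # ~0.08%

# Packed low-bit weights relative to an FP16 checkpoint (weights only),
# consistent with the README's "~1/5, ~1/4 file size" figures.
print(f"3-bit packed vs FP16: ~1/{fp16_bits / 3:.1f}")     # ~1/5.3
print(f"4-bit packed vs FP16:  1/{fp16_bits / 4:.0f}")     # 1/4
```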
``` # making checkpoint file of OWQ reconstruction -python main.py facebook/opt-125m c4 --wbits 3 --target_bit 3.05 --no-eval --save opt-125m_3_05.pth --packing +python main.py huggyllama/llama-7b c4 --wbits 3 --target_bit 3.01 --no-eval --save llama-7b_3_01.pth --packing -# measuring zero-shot accuracy (single-gpu) -CUDA_VISIBLE_DEVICES=0 python zeroshot.py --model hf-causal-owq --model_args pretrained=facebook/opt-125m,load=opt-125m_3_05.pth --batch_size 4 --tasks lambada_openai --no_cache +# measuring zero-shot accuracy (using single-gpu) +CUDA_VISIBLE_DEVICES=0 python zeroshot.py --model hf-causal-owq --model_args pretrained=huggyllama/llama-7b,load=llama-7b_3_01.pth --batch_size 4 --tasks hellaswag --no_cache # multi-gpu -CUDA_VISIBLE_DEVICES=0,1 python zeroshot.py --model hf-causal-owq --model_args pretrained=facebook/opt-125m,load=opt-125m_3_05.pth,use_accelerate=True --batch_size 4 --tasks lambada_openai --no_cache +CUDA_VISIBLE_DEVICES=0,1 python zeroshot.py --model hf-causal-owq --model_args pretrained=huggyllama/llama-7b,load=llama-7b_3_01.pth,use_accelerate=True --batch_size 4 --tasks hellaswag --no_cache ``` -### Easy OWQ + Measuring PPL, Zeroshot sample +### Easy OPT OWQ + Measuring PPL, Zeroshot sample ``` bash scripts/opt_end_to_end_evaluation.sh 0 opt-1.3b ``` @@ -111,7 +118,7 @@ bash scripts/opt_end_to_end_evaluation.sh 0 opt-1.3b ## Demo Please refer to the README in the `demo` directory. -## 3-bit CUDA Kernels +## 3/4-bit CUDA Kernels ### Benchmark kernel performance ``` @@ -120,9 +127,9 @@ cd owq/kernel/ python test_kernel.py ``` -### Benchmark language generation with 3/4-bit packed model (opt, llama) +### Benchmark language generation with 3/4-bit packed model (opt, llama, etc...) ``` -# Example of OPT-65b language generation (single token) +# Example of OPT-66b language generation (single token) # Save compressed model python main.py facebook/opt-66b c4 --wbits 3 --target_bit 3.01 --no-eval --save opt-66b_3_01.pth --packing @@ -157,4 +164,4 @@ If you find our code or OWQ useful for your research, please consider citing: journal={arXiv preprint arXiv:2306.02272}, year={2023} } -``` \ No newline at end of file +``` diff --git a/bloom.py b/bloom.py deleted file mode 100644 index c0b3d42..0000000 --- a/bloom.py +++ /dev/null @@ -1,445 +0,0 @@ -import time - -import torch -import torch.nn as nn - -import transformers - -from owq.recon import GPTQ_OWQ -from owq.quant import * -from owq.utils.misc import find_layers, check_arguments -from owq.utils.datautils import * - -import argparse -import random -import os -import numpy as np -from tqdm import tqdm - -layer_list = ['qkv','dense','fc1','fc2'] -n_out_dict = {'self_attention.query_key_value':0, - 'self_attention.dense':0, - 'mlp.dense_h_to_4h':0, - 'mlp.dense_4h_to_h':0} - -def get_bloom(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import BloomForCausalLM - model = BloomForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = 2048 - return model - -@torch.no_grad() -def bloom_sequential(model, dataloader, dev, means=None, stds=None): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.transformer.h - - model.transformer.word_embeddings = model.transformer.word_embeddings.to(dev) - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.to(dev) - layers[0] = layers[0].to(dev) - - 
dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None, 'alibi': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['alibi'] = kwargs['alibi'] - raise ValueError - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.transformer.word_embeddings = model.transformer.word_embeddings.cpu() - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - alibi = cache['alibi'] - - print('Ready.') - - if args.target_bit is not None: - args.layers = layer_list if args.layers is None else args.layers - n_mp_layers = len(args.layers) - if 'qkv' in args.layers: - n_mp_layers += 2 # q k v - - r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits) - # r = (args.target_bit - args.wbits) * 16 / 12 - r /= n_mp_layers - - layer = find_layers(layers[0]) - - for i in range(len(args.layers)): - if args.layers[i] == 'qkv': - name = 'self_attention.query_key_value' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) * 3 - elif args.layers[i] == 'dense': - name = 'self_attention.dense' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) - elif args.layers[i] == 'fc1': - name = 'mlp.dense_h_to_4h' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4) - elif args.layers[i] == 'fc2': - name = 'mlp.dense_4h_to_h' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4) - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - block_layers = find_layers(layer) - - if args.true_sequential: - sequential = [ - ['self_attention.query_key_value'], ['self_attention.dense'], - ['mlp.dense_h_to_4h'], ['mlp.dense_4h_to_h'] - ] - else: - sequential = [list(block_layers.keys())] - - for names in sequential: - subset = {n: block_layers[n] for n in names} - - gptq = {} - for name in subset: - gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name].quantizer.n_out = n_out_dict[name] - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi) - for h in handles: - h.remove() - - for name in names: - if name.endswith('query_key_value') and args.target_bit is not None: - name = 'self_attention.query_key_value' - layer_qkv = subset[name] - W_q, W_k, W_v = torch.chunk(layer_qkv.weight.data, 3, dim=0) - W_attn_dict = {'self_attention.query':W_q, 'self_attention.key':W_k, 'self_attention.value':W_v} - for name1 in W_attn_dict: - W = W_attn_dict[name1] - subset[name1] = nn.Linear(W.shape[1], W.shape[0], device=W.device, dtype=W.dtype) - subset[name1].weight.data = W.clone() - gptq[name1] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - 
gptq[name1].quantizer = Quantizer() - gptq[name1].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name1].quantizer.n_out = n_out_dict[name] // 3 - gptq[name1].H = gptq[name].H.clone() - - del subset[name] - del W_q, W_k, W_v - del gptq[name] - torch.cuda.empty_cache() - break - - for name in subset: - if not args.no_frob_norm: - W = subset[name].weight.data.clone().to(torch.float) - temp_quantizer = Quantizer() - temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')) - temp_quantizer.find_params(W, weight=True, num=40) - W_quant = temp_quantizer.quantize(W) - frob_norm_error = (W - W_quant).pow(2).sum(dim=0) - else: - frob_norm_error = None - out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error) - gptq[name].quantizer.out_ids = out_ids.cpu() - - if not args.no_frob_norm: - del W - del W_quant - del temp_quantizer - torch.cuda.empty_cache() - - for name in subset: - print(f"Quantizing model.decoder.layers.{i}.{name}") - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - gptq[name].free() - - for name in names: - if name.endswith('query_key_value') and args.target_bit is not None: - W_qkv = [subset[n].weight.data.clone() for n in W_attn_dict] - layer_qkv.weight.data = torch.concat(W_qkv,dim=0) - del W_qkv - break - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - -@torch.no_grad() -def bloom_eval(model, testenc, dev): - print('Evaluation...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.transformer.h - - model.transformer.word_embeddings = model.transformer.word_embeddings.to(dev) - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None, 'alibi': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['alibi'] = kwargs['alibi'] - raise ValueError - layers[0] = Catcher(layers[0]) - for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) - try: - model(batch) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.transformer.word_embeddings = model.transformer.word_embeddings.cpu() - model.transformer.word_embeddings_layernorm = model.transformer.word_embeddings_layernorm.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - alibi = cache['alibi'] - - for i in tqdm(range(len(layers))): - layer = layers[i].to(dev) - - if args.nearest: - subset = find_layers(layer) - for name in subset: - quantizer = Quantizer() - quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=False - ) - W = subset[name].weight.data - quantizer.find_params(W, weight=True) - subset[name].weight.data = 
quantize( - W, quantizer.scale, quantizer.zero, quantizer.maxq - ).to(next(iter(layer.parameters())).dtype) - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, alibi=alibi)[0] - layers[i] = layer.cpu() - del layer - torch.cuda.empty_cache() - inps, outs = outs, inps - - model.transformer.ln_f = model.transformer.ln_f.to(dev) - model.lm_head = model.lm_head.to(dev) - - testenc = testenc.to(dev) - nlls = [] - for i in range(nsamples): - hidden_states = inps[i].unsqueeze(0) - hidden_states = model.transformer.ln_f(hidden_states) - lm_logits = model.lm_head(hidden_states) - shift_logits = lm_logits[:, :-1, :].contiguous() - shift_labels = testenc[ - :, (i * model.seqlen):((i + 1) * model.seqlen) - ][:, 1:] - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - neg_log_likelihood = loss.float() * model.seqlen - nlls.append(neg_log_likelihood) - ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) - print(ppl.item()) - - model.config.use_cache = use_cache - return ppl.item() - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - - parser.add_argument( - 'model', type=str, - help='BLOOM model to load; pass `bigscience/bloom-X`.' - ) - parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], - help='Where to extract calibration data from.' - ) - parser.add_argument( - '--nsamples', type=int, default=128, - help='Number of calibration data samples.' - ) - parser.add_argument( - '--wbits', type=int, default=16, choices=[2, 3, 4, 16], - help='The number of bits to use for weight quantization; use 16 for evaluating base model.' - ) - parser.add_argument( - '--target_bit', type=float, default=None, - help='Effctive target bits for OWQ.' - ) - parser.add_argument( - '--tuning', type=str, default='mse', choices=['mse', 'minmax'], - help='Method for quantization parameter tuning.' - ) - parser.add_argument( - '--no_frob_norm', action='store_true', - help='Whether to use Frobenius norm for OWQ.' - ) - parser.add_argument( - '--percdamp', type=float, default=.01, - help='Percent of the average Hessian diagonal to use for dampening.' - ) - parser.add_argument( - '--layers', nargs='+', type=str, default=None, choices=layer_list, - help='Layers to apply OWQ.' - ) - parser.add_argument( - '--seed', type=int, default=0, - help='Seed for sampling the calibration data.' - ) - parser.add_argument( - '--nearest', action='store_true', - help='Whether to run the round-to-nearest quantization.' - ) - parser.add_argument( - '--groupsize', type=int, default=-1, - help='Groupsize for fine-grained quantization; default uses full row.' - ) - - parser.add_argument( - '--no-eval', action='store_true', - help='Whether to evaluate model on WikiText-2, PTB and C4' - ) - parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' - ) - parser.add_argument( - '--load', type=str, default='', - help='Load fake or 3bit quantized checkpoint.' - ) - parser.add_argument( - '--logfile', type=str, default='', - help='Logging file name' - ) - - parser.add_argument( - '--old-eval', action='store_true', - help='Whether to use the old version of PTB and C4 evaluation.' - ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' - ) - parser.add_argument( - '--true-sequential', action='store_true', - help='Whether to run in true sequential model.' 
- ) - - args = parser.parse_args() - check_arguments(args) - device = torch.device('cuda:0') - - def seed_all(seed): - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - np.random.seed(seed) - torch.manual_seed(seed) - seed_all(args.seed) - - model = get_bloom(args.model) - model.eval() - t = 0 - if args.load: - print(f"Loading {args.load} ....") - model.load_state_dict(torch.load(args.load)) - print("Done.") - else: - dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True - ) - if args.wbits < 16 and not args.nearest: - tick = time.time() - quantizers = bloom_sequential(model, dataloader, device) - t = round((time.time() - tick),1) - print(f"Running Time : {t}") - - t1 = time.time() - ppl_scores = [] - if not args.no_eval: - if args.old_eval: - ppl_tasks = ['wikitext2', 'ptb', 'c4'] - else: - ppl_tasks = ['wikitext2','ptb-new', 'c4-new'] - for dataset in ppl_tasks: - testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False - ) - print(dataset) - ppl_score = bloom_eval(model, testloader, device) - ppl_scores.append((dataset,ppl_score)) - t2 = time.time() - t1 - - if args.logfile: - with open(f'{args.logfile}','a') as fp: - add_str = f"\nlayers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n' - fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}") - for i in range(len(ppl_scores)): - fp.write(f"{ppl_scores[i][1]} ") - fp.write(f"\n\n") - - if args.save: - torch.save(model.state_dict(), args.save) diff --git a/demo/README.md b/demo/README.md index c80cb45..f91c21a 100644 --- a/demo/README.md +++ b/demo/README.md @@ -26,7 +26,7 @@ python demo_2model.py lmsys/vicuna-7b-v1.3 lmsys/vicuna-33b-v1.3 --load2 {quanti ``` Then you can get accessible Link to the demo page. Please enjoy! -Note that **Quantized Vicuna-33B model using our OWQ method gives comparable or better chat quality, with similar memory usage comparing to FP vicuna-7B model.** +Note that **Quantized Vicuna-33B model using our OWQ method gives comparable or better chat quality, with similar memory usage compared to the FP vicuna-7B model.** ### LLaMA-2 70B + OWQ 3.01 bit @@ -40,7 +40,7 @@ python demo_llama2_70b.py meta-llama/Llama-2-70b-chat-hf --load {quantized-llama python demo_llama2_70b.py meta-llama/Llama-2-70b-chat-hf --load {quantized-llama-2-70b-weight-location} --gpus 0,1 ``` -Please Note that we can run powerful chatbot model based on **LLaMA-2 70B** model just using **2x consumer GPUs (RTX 3090)**. +Please note that we can run powerful chatbot model based on **LLaMA-2 70B** model just using **2x consumer GPUs (RTX 3090)**. diff --git a/demo/demo_2model.py b/demo/demo_2model.py index d4f1619..ace16bf 100644 --- a/demo/demo_2model.py +++ b/demo/demo_2model.py @@ -17,9 +17,6 @@ def main(args): assert len(args.gpus.split(',')) == 2, "Two GPU devices are required. 
Please enter them separated by commas" - os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"]=args.gpus - global id1, id2 id1, id2 = args.gpus.split(',') fmodel_name = args.fmodel.split('/')[-1].upper() diff --git a/demo/demo_llama2_70b.py b/demo/demo_llama2_70b.py index 63f56dc..dfc0df6 100644 --- a/demo/demo_llama2_70b.py +++ b/demo/demo_llama2_70b.py @@ -85,8 +85,6 @@ def main(args): multigpu = True if len(gpus_list) > 1 else False if multigpu: - os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"]=args.gpus id1, id2 = gpus_list dev1 = torch.device(f'cuda:{id1}') diff --git a/images/owq_figure.png b/images/owq_figure.png new file mode 100644 index 0000000..6ced283 Binary files /dev/null and b/images/owq_figure.png differ diff --git a/images/owq_llama.png b/images/owq_llama.png new file mode 100644 index 0000000..d989829 Binary files /dev/null and b/images/owq_llama.png differ diff --git a/llama.py b/llama.py deleted file mode 100644 index 3fd5bf0..0000000 --- a/llama.py +++ /dev/null @@ -1,573 +0,0 @@ -import time - -import torch -import torch.nn as nn - -import transformers - -from owq.recon import GPTQ_OWQ -from owq.quant import * -from owq.utils.misc import find_layers, check_arguments -from owq.utils.datautils import * - -import argparse -import random -import os -import numpy as np -from tqdm import tqdm - -layer_list = ['k','v','q','o','up','gate','down'] -n_out_dict = {'self_attn.k_proj':0, - 'self_attn.v_proj':0, - 'self_attn.q_proj':0, - 'self_attn.o_proj':0, - 'mlp.up_proj':0, - 'mlp.gate_proj':0, - 'mlp.down_proj':0 } - -def get_llama(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import LlamaForCausalLM - model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = 2048 - return model - -@torch.no_grad() -def llama_sequential(model, dataloader, dev): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.layers - - model.model.embed_tokens = model.model.embed_tokens.to(dev) - model.model.norm = model.model.norm.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['position_ids'] = kwargs['position_ids'] - raise ValueError - - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - layers[0] = layers[0].cpu() - model.model.embed_tokens = model.model.embed_tokens.cpu() - model.model.norm = model.model.norm.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - position_ids = cache['position_ids'] - - print('Ready.') - - if args.target_bit is not None: - args.layers = layer_list if args.layers is None else args.layers - n_mp_layers = len(args.layers) - - r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits) - # r = (args.target_bit - args.wbits) * 16 / 12 - r /= n_mp_layers - - layer = find_layers(layers[0]) - - for i 
in range(len(args.layers)): - if args.layers[i] in ('k','v','q','o'): - name = 'self_attn.' + args.layers[i] + '_proj' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) - else: - name = 'mlp.' + args.layers[i] + '_proj' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r * 3 / 8) - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - block_layers = find_layers(layer) - - if args.true_sequential: - sequential = [ - ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], - ['self_attn.o_proj'], - ['mlp.up_proj', 'mlp.gate_proj'], - ['mlp.down_proj'] - ] - else: - sequential = [list(block_layers.keys())] - - for names in sequential: - subset = {n: block_layers[n] for n in names} - - gptq = {} - for name in subset: - gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name].quantizer.n_out = n_out_dict[name] - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] - for h in handles: - h.remove() - - for name in subset: - if not args.no_frob_norm: - W = subset[name].weight.data.clone().to(torch.float) - temp_quantizer = Quantizer() - temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')) - temp_quantizer.find_params(W, weight=True, num=40) - W_quant = temp_quantizer.quantize(W) - frob_norm_error = (W - W_quant).pow(2).sum(dim=0) - else: - frob_norm_error = None - out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error) - gptq[name].quantizer.out_ids = out_ids.cpu() - - if not args.no_frob_norm: - del W - del W_quant - del temp_quantizer - torch.cuda.empty_cache() - - for name in subset: - print(f"Quantizing model.decoder.layers.{i}.{name}") - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.layers.%d.%s' % (i, name)] = gptq[name].quantizer.cpu() - gptq[name].free() - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] - outs = torch.nan_to_num(outs) - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - - return quantizers - -@torch.no_grad() -def llama_eval(model, testenc, dev): - print('Evaluating ...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.layers - - model.model.embed_tokens = model.model.embed_tokens.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - cache['position_ids'] = kwargs['position_ids'] - raise ValueError - layers[0] = 
Catcher(layers[0]) - for i in range(nsamples): - batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) - try: - model(batch) - except ValueError: - pass - layers[0] = layers[0].module - - layers[0] = layers[0].cpu() - model.model.embed_tokens = model.model.embed_tokens.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - position_ids = cache['position_ids'] - - for i in tqdm(range(len(layers))): - layer = layers[i].to(dev) - - if args.nearest: - subset = find_layers(layer) - for name in subset: - quantizer = Quantizer() - quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=False - ) - W = subset[name].weight.data - quantizer.find_params(W, weight=True) - subset[name].weight.data = quantize( - W, quantizer.scale, quantizer.zero, quantizer.maxq - ).to(next(iter(layer.parameters())).dtype) - - for j in range(nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0] - outs = torch.nan_to_num(outs) - - layers[i] = layer.cpu() - del layer - torch.cuda.empty_cache() - inps, outs = outs, inps - - if model.model.norm is not None: - model.model.norm = model.model.norm.to(dev) - model.lm_head = model.lm_head.to(dev) - - testenc = testenc.to(dev) - nlls = [] - for i in range(nsamples): - hidden_states = inps[i].unsqueeze(0) - if model.model.norm is not None: - hidden_states = model.model.norm(hidden_states) - lm_logits = model.lm_head(hidden_states) - shift_logits = lm_logits[:, :-1, :].contiguous() - shift_labels = testenc[ - :, (i * model.seqlen):((i + 1) * model.seqlen) - ][:, 1:] - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - neg_log_likelihood = loss.float() * model.seqlen - nlls.append(neg_log_likelihood) - ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) - print(ppl.item()) - - model.config.use_cache = use_cache - return ppl.item() - -def load_quant3(model, checkpoint, faster=False): - from transformers import LlamaConfig, LlamaForCausalLM - config = LlamaConfig.from_pretrained(model) - def noop(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = noop - torch.nn.init.uniform_ = noop - torch.nn.init.normal_ = noop - - torch.set_default_dtype(torch.half) - transformers.modeling_utils._init_weights = False - torch.set_default_dtype(torch.half) - model = LlamaForCausalLM(config) - torch.set_default_dtype(torch.float) - model = model.eval() - layers = find_layers(model) - for name in ['lm_head']: - if name in layers: - del layers[name] - - ckpt = torch.load(checkpoint) - n_out_dict = ckpt['n_out_dict'] - - make_quant3(model, n_out_dict, faster=faster) - - model.load_state_dict(ckpt['model_state_dict']) - model.seqlen = model.config.max_position_embeddings - - return model - -def llama_multigpu(model, gpus): - model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) - model.model.norm = model.model.norm.to(gpus[-1]) - import copy - import math - model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) - - cache = {'mask': None, 'pos_ids': None} - - class MoveModule(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - self.dev = next(iter(self.module.parameters())).device - def forward(self, *inp, **kwargs): - inp = list(inp) - if inp[0].device != self.dev: - inp[0] = inp[0].to(self.dev) - if cache['mask'] is None or cache['mask'].device != self.dev: - cache['mask'] = kwargs['attention_mask'].to(self.dev) - if 
cache['pos_ids'] is None or cache['pos_ids'].device != self.dev: - cache['pos_ids'] = kwargs['position_ids'].to(self.dev) - kwargs['attention_mask'] = cache['mask'] - kwargs['position_ids'] = cache['pos_ids'] - tmp = self.module(*inp, **kwargs) - return tmp - - layers = model.model.layers - pergpu = math.ceil(len(layers) / len(gpus)) - for i in range(len(layers)): - layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) - - model.gpus = gpus - -def benchmark(model, input_ids): - dev = torch.device('cuda:0') - input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else dev) - torch.cuda.synchronize() - - cache = {'past': None} - def clear_past(i): - def tmp(layer, inp, out): - if cache['past']: - cache['past'][i] = None - return tmp - for i, layer in enumerate(model.model.layers): - layer.register_forward_hook(clear_past(i)) - - print('Benchmarking ...') - - loss = nn.CrossEntropyLoss() - tot = 0. - - def sync(): - if hasattr(model, 'gpus'): - for gpu in model.gpus: - torch.cuda.synchronize(gpu) - else: - torch.cuda.synchronize() - with torch.no_grad(): - attention_mask = torch.ones((1, input_ids.numel()), device=dev) - position_ids = torch.arange(0,input_ids.numel(), device=dev) - times = [] - for i in range(input_ids.numel()): - print(i) - tick = time.time() - out = model(input_ids[:, i].reshape(1,-1),past_key_values=cache['past'], - attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)), - position_ids=position_ids[i]) - sync() - times.append(time.time() - tick) - if i != input_ids.numel() - 1: - tot += loss(out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev)).float() - cache['past'] = list(out.past_key_values) - del out - sync() - - print('Median:', np.median(times)) - print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - - parser.add_argument( - 'model', type=str, - help='LlaMa model to load; /path/to/llama_hf' - ) - parser.add_argument( - 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], - help='Where to extract calibration data from.' - ) - parser.add_argument( - '--nsamples', type=int, default=128, - help='Number of calibration data samples.' - ) - parser.add_argument( - '--wbits', type=int, default=16, choices=[2, 3, 4, 16], - help='The number of bits to use for weight quantization; use 16 for evaluating base model.' - ) - parser.add_argument( - '--target_bit', type=float, default=None, - help='Effctive target bits for OWQ.' - ) - parser.add_argument( - '--tuning', type=str, default='mse', choices=['mse', 'minmax'], - help='Method for quantization parameter tuning.' - ) - parser.add_argument( - '--no_frob_norm', action='store_true', - help='Whether to use Frobenius norm for OWQ.' - ) - parser.add_argument( - '--percdamp', type=float, default=.01, - help='Percent of the average Hessian diagonal to use for dampening.' - ) - parser.add_argument( - '--layers', nargs='+', type=str, default=None, choices=layer_list, - help='Layers to apply OWQ.' - ) - parser.add_argument( - '--seed', type=int, default=0, - help='Seed for sampling the calibration data.' - ) - parser.add_argument( - '--nearest', action='store_true', - help='Whether to run the round-to-nearest quantization.' - ) - parser.add_argument( - '--groupsize', type=int, default=-1, - help='Groupsize for fine-grained quantization; default uses full row.' 
- ) - - parser.add_argument( - '--no-eval', action='store_true', - help='Whether to evaluate model on WikiText-2, PTB and C4' - ) - parser.add_argument( - '--save', type=str, default='', - help='Save quantized checkpoint under this name.' - ) - parser.add_argument( - '--load', type=str, default='', - help='Load fake or 3bit quantized checkpoint.' - ) - parser.add_argument( - '--logfile', type=str, default='', - help='Logging file name' - ) - parser.add_argument( - '--packing', action='store_true', - help='Whether to save 3bit quantized model.' - ) - parser.add_argument( - '--faster-kernel', action='store_true', - help='Whether to save and load 3bit quantized model using the faster kernel for benchmarking.' - ) - parser.add_argument( - '--benchmark', type=int, default=0, - help='Number of tokens to use for benchmarking.' - ) - - parser.add_argument( - '--old-eval', action='store_true', - help='Whether to use the old version of PTB and C4 evaluation.' - ) - parser.add_argument( - '--act-order', action='store_true', - help='Whether to apply the activation order GPTQ heuristic' - ) - parser.add_argument( - '--true-sequential', action='store_true', - help='Whether to run in true sequential model.' - ) - - args = parser.parse_args() - check_arguments(args) - device = torch.device('cuda:0') - - def seed_all(seed): - random.seed(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - np.random.seed(seed) - torch.manual_seed(seed) - seed_all(args.seed) - - t = 0 - if args.load: - print(f"Loading {args.load} ....") - if args.packing: - model = load_quant3(args.model, args.load, args.faster_kernel) - else: - model = get_llama(args.model) - model.load_state_dict(torch.load(args.load)) - model.eval() - print("Done.") - else: - model = get_llama(args.model) - model.eval() - - if args.wbits < 16 and not args.nearest: - dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True - ) - tick = time.time() - quantizers = llama_sequential(model, dataloader, device) - t = round((time.time() - tick),1) - print(f"Running Time : {t}") - - if args.benchmark: - dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False - ) - gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] - if len(gpus) > 1: - llama_multigpu(model, gpus) - else: - model = model.to(device) - if args.benchmark: - input_ids = dataloader.input_ids[:, :args.benchmark] - benchmark(model, input_ids) - exit() - - t1 = time.time() - ppl_scores = [] - if not args.no_eval: - if args.old_eval: - ppl_tasks = ['wikitext2', 'ptb', 'c4'] - else: - ppl_tasks = ['wikitext2','ptb-new', 'c4-new'] - for dataset in ppl_tasks: - testloader = get_loaders( - dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False - ) - print(dataset) - ppl_score = llama_eval(model, testloader, device) - ppl_scores.append((dataset,ppl_score)) - t2 = time.time() - t1 - - if args.logfile: - with open(f'{args.logfile}','a') as fp: - add_str = f"| layers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n' - fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}") - for i in range(len(ppl_scores)): - fp.write(f"{ppl_scores[i][1]} ") - fp.write(f"\n") - - if args.save: - torch.save(model.state_dict(), args.save) - print(f"fake quantized model is saved to {args.save}") - if args.packing and 
args.wbits == 3: - temp = args.save.split('/') - temp[-1] = 'pack3_' + f"{'faster_' if args.faster_kernel else ''}" + temp[-1] - ckpt_path = '/'.join(temp) - n_out_dict = {n: n_out_saver(quantizers[n].n_out) for n in quantizers} - lm_pack3(model, quantizers, faster=args.faster_kernel) - torch.save({ - 'model_state_dict' : model.state_dict(), - 'n_out_dict' : n_out_dict}, ckpt_path) - print(f"3bit quantized model is saved to {ckpt_path}") - else: - print("Only 3bits quantized model is supported") \ No newline at end of file diff --git a/main.py b/main.py index cb4bd84..034bf4e 100644 --- a/main.py +++ b/main.py @@ -15,7 +15,6 @@ @torch.no_grad() def layerwise_quantize(model, dataloader, dev, args): - # assert args.no_frob_norm == True meta = args.meta print('Starting ...') @@ -78,7 +77,7 @@ def forward(self, inp, **kwargs): # r = (args.target_bit - args.wbits) * 16 / 12 r /= n_owq_layers - layer = find_layers(layers[0], layers=[nn.Linear]) + layer = find_layers(layers[0]) for l in owq_layers: # for even number of n_out @@ -92,7 +91,7 @@ def forward(self, inp, **kwargs): quantizers = {} for i in range(len(layers)): layer = layers[i].to(dev) - block_layers = find_layers(layer, layers=[nn.Linear]) + block_layers = find_layers(layer) if args.true_sequential: sequential = meta['sequential'] @@ -226,7 +225,7 @@ def forward(self, inp, **kwargs): layer = layers[i].to(dev) if args.nearest: - subset = find_layers(layer, layers=args.meta['linears']) + subset = find_layers(layer) for name in subset: quantizer = Quantizer(args.wbits, perchannel=True, sym=args.sym, mse=False) W = subset[name].weight.data @@ -290,7 +289,7 @@ def forward(self, *inp, **kwargs): if inp[0].device != self.dev: inp[0] = inp[0].to(self.dev) for key in meta['inp_kwargs']: - if kwargs[key].device != self.dev: + if kwargs[key] != None and kwargs[key].device != self.dev: kwargs[key] = kwargs[key].to(self.dev) tmp = self.module(*inp, **kwargs) return tmp @@ -495,7 +494,7 @@ def sync(): # benchmark if args.benchmark: dataloader = get_loaders( - args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=args.seqlen, train=True + args.dataset, nsamples=1, seed=args.seed, model=args.model, seqlen=args.seqlen, train=True ) gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] if len(gpus) > 1: @@ -514,7 +513,7 @@ def sync(): t1 = time.time() ppl_scores = [] if not args.no_eval: - ppl_tasks = ['wikitext2','ptb-new', 'c4-new'] + ppl_tasks = ['wikitext2','ptb', 'c4'] for dataset in ppl_tasks: testloader = get_loaders( dataset, seed=args.seed, model=args.model, seqlen=args.seqlen, train=False diff --git a/model_config.json b/model_config.json index 35ec950..260854b 100644 --- a/model_config.json +++ b/model_config.json @@ -74,7 +74,7 @@ }, "falcon":{ "map_layer":{"qkv":"self_attention.query_key_value","dense":"self_attention.dense","fc1":"mlp.dense_h_to_4h","fc2":"mlp.dense_4h_to_h"}, - "ratios":{"self_attention.query_key_value":3,"self_attention.dense":1,"mlp.dense_h_to_4h":0.25,"mlp.dense_4h_to_h":0.25}, + "ratios":{"self_attention.query_key_value":1,"self_attention.dense":1,"mlp.dense_h_to_4h":0.25,"mlp.dense_4h_to_h":0.25}, "sequential":[ ["self_attention.query_key_value"], ["self_attention.dense"], diff --git a/opt.py b/opt.py deleted file mode 100644 index 396e1e9..0000000 --- a/opt.py +++ /dev/null @@ -1,586 +0,0 @@ -import time - -import torch -import torch.nn as nn - -import transformers - -from owq.recon import GPTQ_OWQ -from owq.quant import * -from owq.utils.misc import find_layers, 
check_arguments -from owq.utils.datautils import * - -import argparse -import random -import os -import numpy as np -from tqdm import tqdm - -layer_list = ['k','v','q','out','fc1','fc2'] -n_out_dict = {'self_attn.k_proj':0, - 'self_attn.v_proj':0, - 'self_attn.q_proj':0, - 'self_attn.out_proj':0, - 'fc1':0, 'fc2':0 } - -def get_opt(model): - import torch - def skip(*args, **kwargs): - pass - torch.nn.init.kaiming_uniform_ = skip - torch.nn.init.uniform_ = skip - torch.nn.init.normal_ = skip - from transformers import OPTForCausalLM - model = OPTForCausalLM.from_pretrained(model, torch_dtype='auto') - model.seqlen = model.config.max_position_embeddings - return model - -@torch.no_grad() -def opt_sequential(model, dataloader, dev): - print('Starting ...') - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, **kwargs): - inps[cache['i']] = inp - cache['i'] += 1 - cache['attention_mask'] = kwargs['attention_mask'] - raise ValueError - - layers[0] = Catcher(layers[0]) - for batch in dataloader: - try: - model(batch[0].to(dev)) - except ValueError: - pass - layers[0] = layers[0].module - layers[0] = layers[0].cpu() - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() - model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.cpu() - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.cpu() - torch.cuda.empty_cache() - - outs = torch.zeros_like(inps) - attention_mask = cache['attention_mask'] - - print('Ready.') - - if args.target_bit is not None: - args.layers = layer_list if args.layers is None else args.layers - n_mp_layers = len(args.layers) - - r = (12 / (16 - args.wbits)) * (args.target_bit - args.wbits) - # r = (args.target_bit - args.wbits) * 16 / 12 - r /= n_mp_layers - - layer = find_layers(layers[0]) - - for i in range(len(args.layers)): - if args.layers[i] in ('k','v','q','out'): - name = 'self_attn.' 
+ args.layers[i] + '_proj' - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r) - else: - name = args.layers[i] - n_out_dict[name] = round(layer[name].weight.data.shape[1] * r / 4) - - quantizers = {} - for i in range(len(layers)): - layer = layers[i].to(dev) - block_layers = find_layers(layer) - - if args.true_sequential: - sequential = [ - ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], - ['self_attn.out_proj'], - ['fc1'], - ['fc2'] - ] - else: - sequential = [list(block_layers.keys())] - - for names in sequential: - subset = {n: block_layers[n] for n in names} - - gptq = {} - for name in subset: - gptq[name] = GPTQ_OWQ(subset[name], n_out=n_out_dict[name]) - gptq[name].quantizer = Quantizer() - gptq[name].quantizer.configure( - args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse') - ) - gptq[name].quantizer.n_out = n_out_dict[name] - - def add_batch(name): - def tmp(_, inp, out): - gptq[name].add_batch(inp[0].data, out.data) - return tmp - handles = [] - for name in subset: - handles.append(subset[name].register_forward_hook(add_batch(name))) - for j in range(args.nsamples): - layer(inps[j].unsqueeze(0), attention_mask=attention_mask) - for h in handles: - h.remove() - - for name in subset: - if not args.no_frob_norm: - W = subset[name].weight.data.clone().to(torch.float) - temp_quantizer = Quantizer() - temp_quantizer.configure(args.wbits, perchannel=True, sym=False, mse=(args.tuning == 'mse')) - temp_quantizer.find_params(W, weight=True, num=40) - W_quant = temp_quantizer.quantize(W) - frob_norm_error = (W - W_quant).pow(2).sum(dim=0) - else: - frob_norm_error = None - out_ids = gptq[name].hessian_sorting(actorder=args.act_order, frob_norm=frob_norm_error) - gptq[name].quantizer.out_ids = out_ids.cpu() - - if not args.no_frob_norm: - del W - del W_quant - del temp_quantizer - torch.cuda.empty_cache() - - for name in subset: - print(f"Quantizing model.decoder.layers.{i}.{name}") - gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) - quantizers['model.decoder.layers.%d.%s' % (i, name)] = gptq[name].quantizer.cpu() - gptq[name].free() - - for j in range(args.nsamples): - outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] - - layers[i] = layer.cpu() - del layer - del gptq - torch.cuda.empty_cache() - - inps, outs = outs, inps - - model.config.use_cache = use_cache - - return quantizers - -@torch.no_grad() -def opt_eval(model, testenc, dev): - print('Evaluating ...') - - testenc = testenc.input_ids - nsamples = testenc.numel() // model.seqlen - - use_cache = model.config.use_cache - model.config.use_cache = False - layers = model.model.decoder.layers - - model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) - model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) - if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: - model.model.decoder.project_out = model.model.decoder.project_out.to(dev) - if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: - model.model.decoder.project_in = model.model.decoder.project_in.to(dev) - layers[0] = layers[0].to(dev) - - dtype = next(iter(model.parameters())).dtype - inps = torch.zeros( - (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev - ) - cache = {'i': 0, 'attention_mask': None} - - class Catcher(nn.Module): - def __init__(self, module): - super().__init__() - self.module = module - def forward(self, inp, 
-            inps[cache['i']] = inp
-            cache['i'] += 1
-            cache['attention_mask'] = kwargs['attention_mask']
-            raise ValueError
-    layers[0] = Catcher(layers[0])
-    for i in range(nsamples):
-        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
-        try:
-            model(batch)
-        except ValueError:
-            pass
-    layers[0] = layers[0].module
-
-    layers[0] = layers[0].cpu()
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
-    torch.cuda.empty_cache()
-
-    outs = torch.zeros_like(inps)
-    attention_mask = cache['attention_mask']
-
-    for i in tqdm(range(len(layers))):
-        layer = layers[i].to(dev)
-
-        if args.nearest:
-            subset = find_layers(layer)
-            for name in subset:
-                quantizer = Quantizer()
-                quantizer.configure(
-                    args.wbits, perchannel=True, sym=False, mse=False
-                )
-                W = subset[name].weight.data
-                quantizer.find_params(W, weight=True)
-                subset[name].weight.data = quantize(
-                    W, quantizer.scale, quantizer.zero, quantizer.maxq
-                ).to(next(iter(layer.parameters())).dtype)
-
-        for j in range(nsamples):
-            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
-        layers[i] = layer.cpu()
-        del layer
-        torch.cuda.empty_cache()
-        inps, outs = outs, inps
-
-    if model.model.decoder.final_layer_norm is not None:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
-    if model.model.decoder.project_out is not None:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
-    model.lm_head = model.lm_head.to(dev)
-
-    testenc = testenc.to(dev)
-    nlls = []
-    for i in range(nsamples):
-        hidden_states = inps[i].unsqueeze(0)
-        if model.model.decoder.final_layer_norm is not None:
-            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
-        if model.model.decoder.project_out is not None:
-            hidden_states = model.model.decoder.project_out(hidden_states)
-        lm_logits = model.lm_head(hidden_states)
-        shift_logits = lm_logits[:, :-1, :].contiguous()
-        shift_labels = testenc[
-            :, (i * model.seqlen):((i + 1) * model.seqlen)
-        ][:, 1:]
-        loss_fct = nn.CrossEntropyLoss()
-        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-        neg_log_likelihood = loss.float() * model.seqlen
-        nlls.append(neg_log_likelihood)
-    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
-    print(ppl.item())
-
-    model.config.use_cache = use_cache
-    return ppl.item()
-
-def load_quant3(model, checkpoint, faster=False):
-    from transformers import OPTConfig, OPTForCausalLM
-    config = OPTConfig.from_pretrained(model)
-    def noop(*args, **kwargs):
-        pass
-    torch.nn.init.kaiming_uniform_ = noop
-    torch.nn.init.uniform_ = noop
-    torch.nn.init.normal_ = noop
-
-    torch.set_default_dtype(torch.half)
-    transformers.modeling_utils._init_weights = False
-    torch.set_default_dtype(torch.half)
-    model = OPTForCausalLM(config)
-    torch.set_default_dtype(torch.float)
-    model = model.eval()
-    layers = find_layers(model)
-    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
-        if name in layers:
-            del layers[name]
-
-    ckpt = torch.load(checkpoint)
-    n_out_dict = ckpt['n_out_dict']
-
-    make_quant3(model, n_out_dict, faster=faster)
-
-    model.load_state_dict(ckpt['model_state_dict'])
-    model.seqlen = model.config.max_position_embeddings
-
-    return model
-
-def opt_multigpu(model, gpus):
-    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
-    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
-        model.model.decoder.project_in = model.model.decoder.project_in.to(gpus[0])
-    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
-        model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1])
-    if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm:
-        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1])
-    import copy
-    import math
-    model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1])
-
-    cache = {'mask': None}
-
-    class MoveModule(nn.Module):
-        def __init__(self, module):
-            super().__init__()
-            self.module = module
-            self.dev = next(iter(self.module.parameters())).device
-        def forward(self, *inp, **kwargs):
-            inp = list(inp)
-            if inp[0].device != self.dev:
-                inp[0] = inp[0].to(self.dev)
-            if cache['mask'] is None or cache['mask'].device != self.dev:
-                cache['mask'] = kwargs['attention_mask'].to(self.dev)
-            kwargs['attention_mask'] = cache['mask']
-            tmp = self.module(*inp, **kwargs)
-            return tmp
-
-    layers = model.model.decoder.layers
-    pergpu = math.ceil(len(layers) / len(gpus))
-    for i in range(len(layers)):
-        layers[i] = MoveModule(layers[i].to(gpus[i // pergpu]))
-
-    model.gpus = gpus
-
-def benchmark(model, input_ids):
-    dev = torch.device('cuda:0')
-    input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else dev)
-    torch.cuda.synchronize()
-
-    cache = {'past': None}
-    def clear_past(i):
-        def tmp(layer, inp, out):
-            if cache['past']:
-                cache['past'][i] = None
-        return tmp
-    for i, layer in enumerate(model.model.decoder.layers):
-        layer.register_forward_hook(clear_past(i))
-
-    print('Benchmarking ...')
-
-    loss = nn.CrossEntropyLoss()
-    tot = 0.
-
-    def sync():
-        if hasattr(model, 'gpus'):
-            for gpu in model.gpus:
-                torch.cuda.synchronize(gpu)
-        else:
-            torch.cuda.synchronize()
-    with torch.no_grad():
-        attention_mask = torch.ones((1, input_ids.numel()), device=dev)
-        times = []
-        for i in range(input_ids.numel()):
-            tick = time.time()
-            out = model(input_ids[:, i].reshape(1,-1),past_key_values=cache['past'],attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)))
-            sync()
-            times.append(time.time() - tick)
-            if i != input_ids.numel() - 1:
-                tot += loss(out.logits[0].to(dev), input_ids[:, (i + 1)].to(dev)).float()
-            cache['past'] = list(out.past_key_values)
-            del out
-        sync()
-
-        print('Median:', np.median(times))
-        print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item())
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        'model', type=str,
-        help='OPT model to load; pass `facebook/opt-X`.'
-    )
-    parser.add_argument(
-        'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
-        help='Where to extract calibration data from.'
-    )
-    parser.add_argument(
-        '--nsamples', type=int, default=128,
-        help='Number of calibration data samples.'
-    )
-    parser.add_argument(
-        '--wbits', type=int, default=16, choices=[2, 3, 4, 16],
-        help='The number of bits to use for weight quantization; use 16 for evaluating base model.'
-    )
-    parser.add_argument(
-        '--target_bit', type=float, default=None,
-        help='Effctive target bits for OWQ.'
-    )
-    parser.add_argument(
-        '--tuning', type=str, default='mse', choices=['mse', 'minmax'],
-        help='Method for quantization parameter tuning.'
-    )
-    parser.add_argument(
-        '--no_frob_norm', action='store_true',
-        help='Whether to use Frobenius norm for OWQ.'
-    )
-    parser.add_argument(
-        '--percdamp', type=float, default=.01,
-        help='Percent of the average Hessian diagonal to use for dampening.'
-    )
-    parser.add_argument(
-        '--layers', nargs='+', type=str, default=None, choices=layer_list,
-        help='Layers to apply OWQ.'
-    )
-    parser.add_argument(
-        '--seed', type=int, default=0,
-        help='Seed for sampling the calibration data.'
-    )
-    parser.add_argument(
-        '--nearest', action='store_true',
-        help='Whether to run the round-to-nearest quantization.'
-    )
-    parser.add_argument(
-        '--groupsize', type=int, default=-1,
-        help='Groupsize for fine-grained quantization; default uses full row.'
-    )
-
-    parser.add_argument(
-        '--no-eval', action='store_true',
-        help='Whether to evaluate model on WikiText-2, PTB and C4'
-    )
-    parser.add_argument(
-        '--save', type=str, default='',
-        help='Save quantized checkpoint under this name.'
-    )
-    parser.add_argument(
-        '--load', type=str, default='',
-        help='Load fake or 3bit quantized checkpoint.'
-    )
-    parser.add_argument(
-        '--logfile', type=str, default='',
-        help='Logging file name'
-    )
-    parser.add_argument(
-        '--packing', action='store_true',
-        help='Whether to save 3bit quantized model.'
-    )
-    parser.add_argument(
-        '--faster-kernel', action='store_true',
-        help='Whether to save and load 3bit quantized model using the faster kernel for benchmarking.'
-    )
-    parser.add_argument(
-        '--benchmark', type=int, default=0,
-        help='Number of tokens to use for benchmarking.'
-    )
-
-    parser.add_argument(
-        '--old-eval', action='store_true',
-        help='Whether to use the old version of PTB and C4 evaluation.'
-    )
-    parser.add_argument(
-        '--act-order', action='store_true',
-        help='Whether to apply the activation order GPTQ heuristic'
-    )
-    parser.add_argument(
-        '--true-sequential', action='store_true',
-        help='Whether to run in true sequential model.'
-    )
-
-    args = parser.parse_args()
-    check_arguments(args)
-
-    device = torch.device('cuda:0')
-
-    def seed_all(seed):
-        random.seed(seed)
-        os.environ['PYTHONHASHSEED'] = str(seed)
-        np.random.seed(seed)
-        torch.manual_seed(seed)
-    seed_all(args.seed)
-
-    t = 0
-    if args.load:
-        print(f"Loading {args.load} ....")
-        if args.packing:
-            model = load_quant3(args.model, args.load, args.faster_kernel)
-        else:
-            model = get_opt(args.model)
-            model.load_state_dict(torch.load(args.load))
-            model.eval()
-        print("Done.")
-    else:
-        model = get_opt(args.model)
-        model.eval()
-
-        if args.wbits < 16 and not args.nearest:
-            dataloader = get_loaders(
-                args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=True
-            )
-            tick = time.time()
-            quantizers = opt_sequential(model, dataloader, device)
-            t = round((time.time() - tick),1)
-            print(f"Running Time : {t}")
-
-    if args.benchmark:
-        dataloader = get_loaders(
-            args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
-        )
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            opt_multigpu(model, gpus)
-        else:
-            model = model.to(device)
-        if args.benchmark:
-            input_ids = dataloader.input_ids[:, :args.benchmark]
-            benchmark(model, input_ids)
-            exit()
-
-    t1 = time.time()
-    ppl_scores = []
-    if not args.no_eval:
-        if args.old_eval:
-            ppl_tasks = ['wikitext2', 'ptb', 'c4']
-        else:
-            ppl_tasks = ['wikitext2','ptb-new', 'c4-new']
-        for dataset in ppl_tasks:
-            testloader = get_loaders(
-                dataset, seed=args.seed, model=args.model, seqlen=model.seqlen, train=False
-            )
-            print(dataset)
-            ppl_score = opt_eval(model, testloader, device)
-            ppl_scores.append((dataset,ppl_score))
-    t2 = time.time() - t1
-
-    if args.logfile:
-        with open(f'{args.logfile}','a') as fp:
-            add_str = f"\nlayers : {args.layers}" + f"| target_bit : {args.target_bit}\n" if args.target_bit is not None else '\n'
-            fp.write(f"model : {args.model} | owq time : {round(t/60,1)}m / eval time : {round(t2/60,1)}m | seed : {args.seed} {add_str}")
-            for i in range(len(ppl_scores)):
-                fp.write(f"{ppl_scores[i][1]} ")
-            fp.write(f"\n\n")
-
-    if args.save:
-        torch.save(model.state_dict(), args.save)
-        print(f"fake quantized model is saved to {args.save}")
-        if args.packing and args.wbits == 3:
-            temp = args.save.split('/')
-            temp[-1] = 'pack3_' + f"{'faster_' if args.faster_kernel else ''}" + temp[-1]
-            ckpt_path = '/'.join(temp)
-            n_out_dict = {n: n_out_saver(quantizers[n].n_out) for n in quantizers}
-            lm_pack3(model, quantizers, faster=args.faster_kernel)
-            torch.save({
-                'model_state_dict' : model.state_dict(),
-                'n_out_dict' : n_out_dict}, ckpt_path)
-            print(f"3bit quantized model is saved to {ckpt_path}")
-        else:
-            print("Only 3bits quantized model is supported")
diff --git a/owq/kernel/setup_cuda.py b/owq/kernel/setup_cuda.py
index 98d3263..eaef4fd 100644
--- a/owq/kernel/setup_cuda.py
+++ b/owq/kernel/setup_cuda.py
@@ -1,10 +1,31 @@
 from setuptools import setup, Extension
 from torch.utils import cpp_extension
 
+extra_compile_args = {
+    "cxx": [
+        "-g",
+        "-O3",
+        "-fopenmp",
+        "-lgomp",
+        "-std=c++17",
+    ],
+    "nvcc": [
+        "-O3",
+        "-std=c++17",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+        "--use_fast_math",
+        "--threads=8"
+    ],
+}
+
 setup(
     name='owq_cuda',
     ext_modules=[cpp_extension.CUDAExtension(
-        'owq_cuda', ['owq_cuda.cpp', 'gemv.cu', 'dequant.cu']
+        name = 'owq_cuda',
+        sources = ['owq_cuda.cpp', 'gemv.cu', 'dequant.cu'],
+        extra_compile_args=extra_compile_args,
     )],
-    cmdclass={'build_ext': cpp_extension.BuildExtension}
-)
+    cmdclass={'build_ext': cpp_extension.BuildExtension},
+    install_requires = ["torch"],
+)
\ No newline at end of file
diff --git a/owq/kernel/test_kernel.py b/owq/kernel/test_kernel.py
index f2c1805..0ca6b5d 100644
--- a/owq/kernel/test_kernel.py
+++ b/owq/kernel/test_kernel.py
@@ -132,8 +132,8 @@ def correctness(M=4*12288, N=12288, bits=3, outlieridx=[], faster=False):
 
 if __name__=="__main__":
     bits=3
-    for d, model in zip([4096],['opt-6.7b']): # opt
-    # for d, model in zip([12288],['opt-175b']): # opt
+    for d, model in zip([4096],['opt-6.7b']): # opt-6.7b
+    # for d, model in zip([12288],['opt-175b']): # opt-175b
         n = 6
         print(f'Benchmarking {model.upper()} matvec with outlier ...')
         for M,N in [[d,d],[d,d*4],[d*4,d]]:
diff --git a/owq/quant.py b/owq/quant.py
index 1ee2e2f..83e226e 100644
--- a/owq/quant.py
+++ b/owq/quant.py
@@ -1,6 +1,7 @@
 import numpy as np
 import torch
 import torch.nn as nn
+from transformers.models.falcon.modeling_falcon import FalconLinear
 
 try:
     import owq_cuda
@@ -200,7 +201,7 @@ def make_quant(module, n_out_infos, wbits, name=''):
     for name1, child in module.named_children():
         make_quant(child, n_out_infos, wbits, name + '.' + name1 if name != '' else name1)
 
-def lm_pack(model, quantinfos, wbits, linears=[nn.Linear]):
+def lm_pack(model, quantinfos, wbits, linears=[nn.Linear, FalconLinear]):
     from owq.utils.misc import find_layers
     layers = find_layers(model, linears)
     layers = {n: layers[n] for n in quantinfos}
diff --git a/owq/utils/misc.py b/owq/utils/misc.py
index 3e2f6b5..287e96c 100644
--- a/owq/utils/misc.py
+++ b/owq/utils/misc.py
@@ -1,10 +1,11 @@
 import torch
 import torch.nn as nn
 import math
+from transformers.models.falcon.modeling_falcon import FalconLinear
 
 layer_list = ['q','k','v','qkv','o','out','dense','fc1','fc2','up','gate','down']
 
-def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+def find_layers(module, layers=[nn.Linear, FalconLinear], name=''):
     if type(module) in layers:
         return {name: module}
     res = {}
diff --git a/requirements.txt b/requirements.txt
index 2dad1e3..d16d45a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,8 @@
 # torch==2.0.0
 transformers
 datasets
+accelerate
+peft
 sacrebleu
 sqlitedict
 scikit-learn