add tests
SunMarc committed Aug 31, 2023
1 parent 2344645 commit cd066ca
Showing 2 changed files with 55 additions and 11 deletions.
optimum/gptq/quantizer.py (5 changes: 4 additions & 1 deletion)
@@ -483,9 +483,12 @@ def post_init_model(self, model):
                 "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. "
                 "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
             )
         class StoreAttr(object):
             pass
         model.quantize_config = StoreAttr()
         model.quantize_config.desc_act = self.desc_act
         model = autogptq_post_init(model, use_act_order=self.desc_act)
-        if self.desc_act and not self.disable_exllama:
+        if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
+            model = exllama_set_max_input_length(model, self.max_input_length)
         return model
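With this change, the exllama buffer is only resized when all three conditions hold: the checkpoint was quantized with act_order (desc_act=True), the exllama backend is enabled, and a max_input_length was supplied at load time. A minimal sketch of the load path this gates, using only the keyword arguments visible in this diff (the import path and checkpoint directory are assumptions, not part of the commit):

import torch
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

from optimum.gptq import load_quantized_model  # import path assumed

save_folder = "path/to/gptq-checkpoint"  # hypothetical local checkpoint directory

# Instantiate the architecture on the meta device so no weights are
# allocated before the quantized ones are loaded.
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(
        AutoConfig.from_pretrained(save_folder), torch_dtype=torch.float16
    )
empty_model.tie_weights()

# max_input_length sizes the exllama temp_state buffer; prompts longer than
# this raise "temp_state buffer is too small" instead of overflowing it.
model = load_quantized_model(
    empty_model,
    save_folder=save_folder,
    device_map={"": 0},
    disable_exllama=False,
    max_input_length=4096,
)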

tests/gptq/test_quantization.py (61 changes: 51 additions & 10 deletions)
@@ -36,9 +36,8 @@ class GPTQTest(unittest.TestCase):
     EXPECTED_OUTPUTS = set()
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
     EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
-    EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.")
-    EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
+    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")

     # this seems a little small considering that we are doing 4-bit quant, but we have a small model and we don't quantize the embeddings
     EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
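To see why a ratio well under 4 is plausible even for 4-bit weights, here is an illustrative back-of-the-envelope calculation (the byte counts and the 10% packing overhead are made up; only the shape of the argument matters): the quantized linear layers shrink about 4x from fp16, but the fp16 embeddings, a large share of a small model, are untouched.

# Illustrative numbers only: a small model whose embeddings are a large share.
fp16_linear_bytes = 400e6   # fp16 linear layers (the part GPTQ quantizes)
fp16_embed_bytes = 200e6    # embeddings stay in fp16
overhead = 0.10             # assumed scale/zero-point overhead after packing

quant_linear_bytes = fp16_linear_bytes / 4 + overhead * fp16_linear_bytes
ratio = (fp16_linear_bytes + fp16_embed_bytes) / (quant_linear_bytes + fp16_embed_bytes)
print(round(ratio, 2))  # ~1.76, the same ballpark as EXPECTED_RELATIVE_DIFFERENCE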
@@ -131,32 +130,74 @@ def test_serialization(self):
             with init_empty_weights():
                 empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16)
             empty_model.tie_weights()
-            quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0})
+            quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama)
             self.check_inference_correctness(quantized_model_from_saved)


+class GPTQTestExllama(GPTQTest):
+    disable_exllama = False
+    EXPECTED_OUTPUTS = set()
+    EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
+    EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.")
+    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")


 class GPTQTestActOrder(GPTQTest):
+    EXPECTED_OUTPUTS = set()
+    EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.")
+    EXPECTED_OUTPUTS.add("Hello my name is jessie and i am a very sweet and")
+    EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from")

     disable_exllama = True
     desc_act = True

     def test_generate_quality(self):
         # act_order doesn't work with the qlinear_cuda kernel
         pass

     def test_serialization(self):
         # act_order doesn't work with the qlinear_cuda kernel
         pass

+    def test_exllama_serialization(self):
+        """
+        Test the serialization of the model and the loading of the quantized weights with the exllama kernel
+        """
+        from accelerate import init_empty_weights
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            self.quantizer.save(self.quantized_model, tmpdirname)
+            self.quantized_model.config.save_pretrained(tmpdirname)
+            with init_empty_weights():
+                empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16)
+            empty_model.tie_weights()
+            quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False)
+            self.check_inference_correctness(quantized_model_from_saved)

     def test_exllama_max_input_length(self):
         """
         Test that max_input_length works with exllama + act_order
         """
         from accelerate import init_empty_weights

+        max_input_length = 4028
+
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantizer.save(self.quantized_model, tmpdirname)
             self.quantized_model.config.save_pretrained(tmpdirname)
             with init_empty_weights():
                 empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16)
             empty_model.tie_weights()
-            quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0})
+            quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=max_input_length)

-            prompt = "I am in Paris and" * 450
+            prompt = "I am in Paris and" * 1000
             inp = self.tokenizer(prompt, return_tensors="pt").to(0)
-            self.assertTrue(inp["input_ids"].shape[1] > 2048)
+            self.assertTrue(inp["input_ids"].shape[1] > 4028)
             with self.assertRaises(RuntimeError) as cm:
                 res = quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
-                self.assertTrue("temp_state buffer is too small" in str(cm.exception))
+            self.assertTrue("temp_state buffer is too small" in str(cm.exception))

+            prompt = "I am in Paris and" * 500
+            inp = self.tokenizer(prompt, return_tensors="pt").to(0)
+            self.assertTrue(inp["input_ids"].shape[1] < 4028)
+            res = quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)

 class GPTQUtilsTest(unittest.TestCase):
     """
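The failing assertion in the test above comes from exllama's fixed-size temp_state buffer. For a model that is already loaded, the same limit can be raised after the fact with the auto-gptq helper that post_init_model calls; a sketch, continuing from the load sketch shown after the quantizer.py hunk above (the import path is an assumption):

from auto_gptq import exllama_set_max_input_length  # import path assumed

# `model` is the exllama-backed model from load_quantized_model above;
# this reallocates its temp_state buffer to hold up to 8192 input tokens.
model = exllama_set_max_input_length(model, 8192)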
