diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 3c6d9c0dfc..b1138a59af 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -483,9 +483,12 @@ def post_init_model(self, model):
                 "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU."
                 "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object"
             )
+        class StoreAttr(object):
+            pass
+
+        model.quantize_config = StoreAttr()
         model.quantize_config.desc_act = self.desc_act
         model = autogptq_post_init(model, use_act_order=self.desc_act)
-        if self.desc_act and not self.disable_exllama:
+        if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
             model = exllama_set_max_input_length(model, self.max_input_length)
         return model
diff --git a/tests/gptq/test_quantization.py b/tests/gptq/test_quantization.py
index a6c22ccb48..609e0281a2 100644
--- a/tests/gptq/test_quantization.py
+++ b/tests/gptq/test_quantization.py
@@ -36,9 +36,8 @@ class GPTQTest(unittest.TestCase):
     EXPECTED_OUTPUTS = set()
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer. I")
     EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
-    EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.")
-    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
     EXPECTED_OUTPUTS.add("Hello my name is John and I am a very good looking man.")
+    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
 
     # this seems a little small considering that we are doing 4-bit quant, but we have a small model and we don't quantize the embeddings
     EXPECTED_RELATIVE_DIFFERENCE = 1.664253062
@@ -131,15 +130,54 @@ def test_serialization(self):
         with init_empty_weights():
             empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16)
         empty_model.tie_weights()
-        quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0})
+        quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=self.disable_exllama)
         self.check_inference_correctness(quantized_model_from_saved)
 
 
 class GPTQTestExllama(GPTQTest):
     disable_exllama = False
+    EXPECTED_OUTPUTS = set()
+    EXPECTED_OUTPUTS.add("Hello my name is John, I am a professional photographer and I")
+    EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.")
+    EXPECTED_OUTPUTS.add("Hello my name is John, I am a student in the University of")
+
+
+class GPTQTestActOrder(GPTQTest):
+    EXPECTED_OUTPUTS = set()
+    EXPECTED_OUTPUTS.add("Hello my name is jay and i am a student at university.")
+    EXPECTED_OUTPUTS.add("Hello my name is jessie and i am a very sweet and")
+    EXPECTED_OUTPUTS.add("Hello my name is nathalie, I am a young girl from")
+
+    disable_exllama = True
     desc_act=True
+
+    def test_generate_quality(self):
+        # act_order doesn't work with the qlinear_cuda kernel
+        pass
+
+    def test_serialization(self):
+        # act_order doesn't work with the qlinear_cuda kernel
+        pass
+
+    def test_exllama_serialization(self):
+        """
+        Test the serialization of the model and the loading of the quantized weights with the exllama kernel.
+        """
+        from accelerate import init_empty_weights
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            self.quantizer.save(self.quantized_model, tmpdirname)
+            self.quantized_model.config.save_pretrained(tmpdirname)
+            with init_empty_weights():
+                empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16)
+            empty_model.tie_weights()
+            quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False)
+            self.check_inference_correctness(quantized_model_from_saved)
+
+    def test_exllama_max_input_length(self):
+        """
+        Test that max_input_length works with exllama + act_order.
+        """
         from accelerate import init_empty_weights
-        max_input_length = 4028
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             self.quantizer.save(self.quantized_model, tmpdirname)
@@ -147,16 +185,19 @@ def test_exllama_max_input_length(self):
         with init_empty_weights():
             empty_model = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(self.model_name), torch_dtype=torch.float16)
         empty_model.tie_weights()
-        quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0})
+        quantized_model_from_saved = load_quantized_model(empty_model, save_folder=tmpdirname, device_map={"": 0}, disable_exllama=False, max_input_length=4028)
 
-        prompt = "I am in Paris and" * 450
-
+        prompt = "I am in Paris and" * 1000
         inp = self.tokenizer(prompt, return_tensors="pt").to(0)
-        self.assertTrue(inp["input_ids"].shape[1] > 2048)
-
+        self.assertTrue(inp["input_ids"].shape[1] > 4028)
         with self.assertRaises(RuntimeError) as cm:
             res = quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
         self.assertTrue("temp_state buffer is too small" in str(cm.exception))
+
+        prompt = "I am in Paris and" * 500
+        inp = self.tokenizer(prompt, return_tensors="pt").to(0)
+        self.assertTrue(inp["input_ids"].shape[1] < 4028)
+        res = quantized_model_from_saved.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
 
 
 class GPTQUtilsTest(unittest.TestCase):
     """
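
For reviewers, a minimal usage sketch of the behavior this patch changes: with `desc_act=True` and exllama enabled, the buffers are now resized only when `max_input_length` is explicitly provided. The model name and checkpoint folder below are placeholders; the `load_quantized_model` keyword arguments mirror the updated tests.

```python
import torch
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

from optimum.gptq import load_quantized_model

# Placeholders: point these at a model and a folder containing GPTQ weights
# quantized with desc_act=True.
model_name = "facebook/opt-125m"
save_folder = "/path/to/gptq-checkpoint"

with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(
        AutoConfig.from_pretrained(model_name), torch_dtype=torch.float16
    )
empty_model.tie_weights()

# With the quantizer.py fix above, omitting max_input_length no longer calls
# exllama_set_max_input_length(model, None); the exllama buffers are resized
# only when a value is actually passed.
model = load_quantized_model(
    empty_model,
    save_folder=save_folder,
    device_map={"": 0},
    disable_exllama=False,
    max_input_length=4028,  # prompts longer than this raise "temp_state buffer is too small"
)
```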