Commit 4fe4e8f

Typo fix (Streamer mute)
1 parent 2220a05 commit 4fe4e8f

7 files changed: +41 -35 lines changed

README.md

Lines changed: 16 additions & 12 deletions

@@ -56,17 +56,7 @@ generate(prompts, max_tokens=100)
 generate(prompts, max_tokens=100, blind_model=True)
 ```
 
-### Model and Cache Quantization
-
-```python
-# Model quantization
-generate("Describe the water cycle.", quantize_model=True)
-
-# Cache quantization
-generate("Explain quantum computing.", quantize_cache=True)
-```
-
-### Constrained Decoding (WIP)
+### Constrained (Beam Search) Decoding
 
 The `constrain` function allows for structured generation, which can be useful for tasks like code generation, function calling, chain-of-thought prompting, or multiple-choice question answering.
 
@@ -93,7 +83,11 @@ prompts = [
     "A 20-year-old woman presents with menorrhagia for the past several years. She says that her menses “have always been heavy”, and she has experienced easy bruising for as long as she can remember. Family history is significant for her mother, who had similar problems with bruising easily. The patient's vital signs include: heart rate 98/min, respiratory rate 14/min, temperature 36.1°C (96.9°F), and blood pressure 110/87 mm Hg. Physical examination is unremarkable. Laboratory tests show the following: platelet count 200,000/mm3, PT 12 seconds, and PTT 43 seconds. Which of the following is the most likely cause of this patient’s symptoms? A: Factor V Leiden B: Hemophilia A C: Lupus anticoagulant D: Protein C deficiency E: Von Willebrand disease",
     "A 25-year-old primigravida presents to her physician for a routine prenatal visit. She is at 34 weeks gestation, as confirmed by an ultrasound examination. She has no complaints, but notes that the new shoes she bought 2 weeks ago do not fit anymore. The course of her pregnancy has been uneventful and she has been compliant with the recommended prenatal care. Her medical history is unremarkable. She has a 15-pound weight gain since the last visit 3 weeks ago. Her vital signs are as follows: blood pressure, 148/90 mm Hg; heart rate, 88/min; respiratory rate, 16/min; and temperature, 36.6℃ (97.9℉). The blood pressure on repeat assessment 4 hours later is 151/90 mm Hg. The fetal heart rate is 151/min. The physical examination is significant for 2+ pitting edema of the lower extremity. Which of the following tests o should confirm the probable condition of this patient? A: Bilirubin assessment B: Coagulation studies C: Hematocrit assessment D: Leukocyte count with differential E: 24-hour urine protein"]
 
-constrain(prompts, constraints=[(30, ' The correct answer is'), (10, 'X.')], blind_model=True, quantize_model=True)
+# Apply vanilla constrained decoding
+constrain(prompts, constraints=[(30, ' The correct answer is'), (10, 'X.')], blind_model=True, quantize_model=True, use_beam=False)
+
+# Apply constrained beam decoding (ACB)
+constrain(prompts, constraints=[(30, ' The correct answer is'), (10, 'X.')], blind_model=True, quantize_model=True, use_beam=True)
 ```
 
 The constraints encourage a structured response that includes the thought process, making the output more informative and transparent:
@@ -131,6 +125,16 @@ batch_results = choose(prompts)
 print(batch_results) # Output: ['C', 'B']
 ```
 
+### Model and Cache Quantization
+
+```python
+# Model quantization
+generate("Describe the water cycle.", quantize_model=True)
+
+# Cache quantization
+generate("Explain quantum computing.", quantize_cache=True)
+```
+
 ### (Q)LoRA Fine-tuning
 
 Training a LoRA Adapter
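For a quick end-to-end check of the renamed section, the two decoding modes can be exercised as below. This is a condensed sketch: it assumes the module is imported under the `pv` alias used in examples.py, and the comments simply mirror the labels in the README hunk above.

```python
import phi_3_vision_mlx as pv  # assumed alias; examples.py refers to the module as `pv`

prompt = "Write a Python function to calculate the Fibonacci sequence up to a given number n."
constraints = [(100, "\n```python\n"), (100, " return "), (200, "\n```")]

# Apply vanilla constrained decoding
pv.constrain(prompt, constraints, use_beam=False)

# Apply constrained beam decoding (ACB)
pv.constrain(prompt, constraints, use_beam=True)
```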

assets/ACB.pdf

106 Bytes
Binary file not shown.

assets/agent_toolchain.pdf

536 Bytes
Binary file not shown.

examples.py

Lines changed: 14 additions & 11 deletions

@@ -4,28 +4,31 @@
 
 # Decoding Strategies
 
+## Code Generation
+
+### Greedy Decoding
+pv.generate("Write a Python function to calculate the Fibonacci sequence up to a given number n.", blind_model=True, quantize_model=True)
+
+### Constrained Decoding
+pv.constrain("Write a Python function to calculate the Fibonacci sequence up to a given number n.", [(100, "\n```python\n"), (100, " return "), (200, "\n```")], use_beam=False)
+
+### Constrained Beam Search
+pv.constrain("Write a Python function to calculate the Fibonacci sequence up to a given number n.", [(100, "\n```python\n"), (100, " return "), (200, "\n```")], use_beam=True)
+
 ## Multiple Choice Question Answering
 prompts = [
     "A 20-year-old woman presents with menorrhagia for the past several years. She says that her menses “have always been heavy”, and she has experienced easy bruising for as long as she can remember. Family history is significant for her mother, who had similar problems with bruising easily. The patient's vital signs include: heart rate 98/min, respiratory rate 14/min, temperature 36.1°C (96.9°F), and blood pressure 110/87 mm Hg. Physical examination is unremarkable. Laboratory tests show the following: platelet count 200,000/mm3, PT 12 seconds, and PTT 43 seconds. Which of the following is the most likely cause of this patient’s symptoms? A: Factor V Leiden B: Hemophilia A C: Lupus anticoagulant D: Protein C deficiency E: Von Willebrand disease",
     "A 25-year-old primigravida presents to her physician for a routine prenatal visit. She is at 34 weeks gestation, as confirmed by an ultrasound examination. She has no complaints, but notes that the new shoes she bought 2 weeks ago do not fit anymore. The course of her pregnancy has been uneventful and she has been compliant with the recommended prenatal care. Her medical history is unremarkable. She has a 15-pound weight gain since the last visit 3 weeks ago. Her vital signs are as follows: blood pressure, 148/90 mm Hg; heart rate, 88/min; respiratory rate, 16/min; and temperature, 36.6℃ (97.9℉). The blood pressure on repeat assessment 4 hours later is 151/90 mm Hg. The fetal heart rate is 151/min. The physical examination is significant for 2+ pitting edema of the lower extremity. Which of the following tests o should confirm the probable condition of this patient? A: Bilirubin assessment B: Coagulation studies C: Hematocrit assessment D: Leukocyte count with differential E: 24-hour urine protein"
 ]
 
-### Multiple Choice Selection
-pv.choose(prompts, choices='ABCDE')
-
 ### Constrained Decoding
 pv.constrain(prompts, constraints=[(100, ' The correct answer is'), (1, 'X.')], blind_model=True, quantize_model=True, use_beam=False)
 
-### Constrained Beam Search (ACB)
+### Constrained Beam Search
 pv.constrain(prompts, constraints=[(100, ' The correct answer is'), (1, 'X.')], blind_model=True, quantize_model=True, use_beam=True)
 
-## Code Generation
-
-### Constrained Decoding
-pv.constrain("Write a Python function to calculate the Fibonacci sequence up to a given number n.", [(100, "\n```python\n"), (100, " return "), (200, "\n```")], use_beam=False)
-
-### Constrained Beam Search
-pv.constrain("Write a Python function to calculate the Fibonacci sequence up to a given number n.", [(100, "\n```python\n"), (100, " return "), (200, "\n```")], use_beam=True)
+### Multiple Choice Selection
+pv.choose(prompts, choices='ABCDE')
 
 # Train
 pv.train_lora(

phi.py

Lines changed: 2 additions & 2 deletions

@@ -535,8 +535,8 @@ def __call__(self, keys, values, n_beam):
             self.offset = new_offset
             return keys, values
         else:
-            self.kv[0,:,:,self.offset:new_offset,:] = keys
-            self.kv[1,:,:,self.offset:new_offset,:] = values
+            self.kv[0,:,:,self.offset:new_offset,:] = keys.astype(mx.float32)
+            self.kv[1,:,:,self.offset:new_offset,:] = values.astype(mx.float32)
             self.offset = new_offset
             return self.kv[0,:,:,:new_offset,:], self.kv[1,:,:,:new_offset,:]
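The two changed lines store freshly computed keys and values into a pre-allocated cache buffer, and the added `astype(mx.float32)` indicates the buffer stays in float32 even when the incoming arrays are lower precision. A minimal sketch of that write path, with shapes and dtypes chosen purely for illustration:

```python
import mlx.core as mx

# Illustrative layout only: K and V stacked on dim 0 -> (2, batch, n_heads, max_len, head_dim)
kv = mx.zeros((2, 1, 4, 128, 64), dtype=mx.float32)

# Incoming keys/values, e.g. float16 when the model runs in half precision
keys = mx.random.normal((1, 4, 8, 64)).astype(mx.float16)
values = mx.random.normal((1, 4, 8, 64)).astype(mx.float16)

offset, new_offset = 0, 8
# Cast to the buffer's dtype before the slice assignment, mirroring the change above
kv[0, :, :, offset:new_offset, :] = keys.astype(mx.float32)
kv[1, :, :, offset:new_offset, :] = values.astype(mx.float32)

# The cache handed back to attention is the filled prefix of the buffer
k_cache = kv[0, :, :, :new_offset, :]
v_cache = kv[1, :, :, :new_offset, :]
```

The explicit cast keeps the buffer's dtype handling independent of whatever precision the model itself runs in.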

phi_3_vision_mlx.py

Lines changed: 8 additions & 9 deletions

@@ -46,7 +46,7 @@ class Streamer:
     def __init__(self, processor, stream, mute):
         self.tokenizer = processor.tokenizer
         self.mute = mute
-        self.stream = stream and mute
+        self.stream = stream and (not mute)
         self.list_tokens = []
         self.idx_sofar = 0
     def __call__(self, token):
@@ -71,7 +71,7 @@ def end(self):
         else:
             arr_tokens = mx.concatenate(self.list_tokens, axis=1)
             list_txt = self.tokenizer.batch_decode([(i[:i.index(ID_EOS)+1] if ID_EOS in i else i) for i in arr_tokens.tolist()])
-            if self.mute is False:
+            if not self.mute:
                 for i, gen in enumerate(list_txt):
                     print(f'\n< Generated text for prompt #{i} >\n{gen}')
             return list_txt, arr_tokens.size
@@ -372,7 +372,7 @@ def _get_wt(model_path, model_cfg):
         return [(k, v) for wf in glob.glob(f"{model_path}/*.safetensors") for k, v in mx.load(wf).items()]
     return [(k, v.transpose(0, 2, 3, 1) if "patch_embedding.weight" in k else v) for wf in glob.glob(f"{model_path}/*.safetensors") for k, v in mx.load(wf).items()]
 
-def _generate(model, processor, prompt, images=None, max_tokens=1000, verbose=True, return_tps=False, early_stop=False, stream=True, mute=False):
+def _generate(model, processor, prompt, images=None, max_tokens=512, verbose=True, return_tps=False, early_stop=False, stream=True, mute=False):
     if images is not None and isinstance(prompt, list):
         raise ValueError('Images cannot be provided when prompt is a list')
     logit_stopper = LogitStopper(max_tokens, early_stop)
@@ -383,13 +383,13 @@ def _generate(model, processor, prompt, images=None, max_tokens=1000, verbose=Tr
     tic = Tic()
     logits, cache = model(**dict_input, max_tokens=max_tokens)
     token = mx.argmax(logits[:, -1, :], axis=-1)[:,None]
-    mx.eval(token, logits, cache)
+    mx.eval(token, logits)#, cache)
     streamer(token)
     prompt_time = tic()
     for i in range(max_tokens-1):
         logits, cache = model(input_ids=token, cache=cache, mask=mask, pids=pids)
         token = mx.argmax(logits[:, -1, :], axis=-1)[:,None]
-        mx.eval(token, logits, cache)
+        mx.eval(token, logits)#, cache)
         streamer(token)
         if logit_stopper(logits):
             break
@@ -529,7 +529,7 @@ def _get_beam(logits, cache, id_constraint, beam_idx=0, n_beam=3):
     dict_input = processor(prompt)
     logits, cache = model(**dict_input, max_tokens=constraint[0] + id_constraint.shape[0]+10)
     logits = nn.log_softmax(logits, axis=-1)
-    mx.eval(logits, cache)
+    mx.eval(logits)
     _score_0 = logits[:, -1, id_constraint[0]]
     tiled_id_constraint = mx.tile(id_constraint, (logits.shape[0], 1))
     logits_rest, _ = model(input_ids=tiled_id_constraint, cache=cache, advance_offset=0)
@@ -559,7 +559,7 @@ def _get_beam(logits, cache, id_constraint, beam_idx=0, n_beam=3):
     token_plus = mx.concatenate([token, tiled_id_constraint], axis=1)
     logits, cache = model(input_ids=token_plus, cache=cache, advance_offset=1)
     logits = nn.log_softmax(logits)
-    mx.eval(logits, cache)
+    mx.eval(logits)
     pre_beam_score = mx.concatenate([running_score, logits[mx.arange(logits.shape[0])[:,None], mx.arange(logits.shape[1]-1)[None,:], token_plus[:,1:]]], axis=1).mean(axis=1)
     pre_beam_synth = mx.concatenate(tokens + [tiled_id_constraint, synth_pad], axis=1)
     if use_beam:
@@ -1308,7 +1308,7 @@ def load(blind_model=False, quantize_model=False, quantize_cache=False, use_adap
     _setup()
     return _load(model_path=model_path, use_quantized_cache=quantize_cache, adapter_path=adapter_path)
 
-def generate(prompt, images=None, preload=None, blind_model=False, quantize_model=False, quantize_cache=False, use_adapter=False, max_tokens=1000, verbose=True, return_tps=False, early_stop=False, stream=True, apply_chat_template=True):
+def generate(prompt, images=None, preload=None, blind_model=False, quantize_model=False, quantize_cache=False, use_adapter=False, max_tokens=512, verbose=True, return_tps=False, early_stop=False, stream=True, apply_chat_template=True):
     """
     Generate text based on a given prompt, optionally with image input.
 
@@ -1464,7 +1464,6 @@ def constrain(prompt, constraints=[(30, ' The correct answer is'), (1, 'X.')], i
     prompt = _apply_chat_template(prompt, None, verbose)[0]
     return _constrain(*preload, prompt=prompt, constraints=constraints, use_beam=use_beam, verbose=verbose)
 
-
 def execute(code_strings, file_prefix=0, verbose=True):
     """
     Execute one or more Python code strings and capture the results.
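The `Streamer` change above is the typo the commit title refers to: `stream and mute` enabled token streaming only when output was muted, while the corrected `stream and (not mute)` streams only when output is not muted. A small self-contained sketch of the gating (a hypothetical stand-in class, not the repo's `Streamer`):

```python
class StreamGate:
    """Hypothetical stand-in illustrating Streamer's stream/mute gating."""
    def __init__(self, stream, mute):
        self.mute = mute
        # Corrected logic: stream tokens only if streaming is requested AND output is not muted
        self.stream = stream and (not mute)

print(StreamGate(stream=True, mute=False).stream)   # True  -> tokens printed as generated
print(StreamGate(stream=True, mute=True).stream)    # False -> muted, nothing printed
print(StreamGate(stream=False, mute=False).stream)  # False -> streaming disabled
```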

setup.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
     url='https://github.com/JosefAlbers/Phi-3-Vision-MLX',
     py_modules=['phi_3_vision_mlx', 'gte', 'phi', 'api'],
     packages=find_packages(),
-    version='0.1.0-alpha',
+    version='0.1.1-alpha',
     readme="README.md",
     author_email="[email protected]",
     description="Phi-3-Vision on Apple silicon with MLX",
