Commit a54ee13

Fixes to run quantized bumblebee models

Parent: ee8f855

File tree: 4 files changed, +39 -14 lines

  lib/axon.ex                        (+13, -3)
  lib/axon/quantization.ex           (+7, -4)
  lib/axon/quantization/layers.ex    (+18, -6)
  lib/axon/quantization/q_tensor.ex  (+1, -1)

lib/axon.ex (+13, -3)
@@ -3849,9 +3849,19 @@ defmodule Axon do
       {_node, model} = Axon.pop_node(model)
   """
   @doc type: :graph
-  def pop_node(%Axon{nodes: nodes, output: id} = axon) do
-    {%{parent: [parent_id]} = popped, nodes} = Map.pop!(nodes, id)
-    {popped, %{axon | nodes: nodes, output: parent_id}}
+  def pop_node(%Axon{nodes: nodes, output: id}) do
+    {popped, nodes} = Map.pop!(nodes, id)
+
+    case popped do
+      %{op_name: :container, parent: parents, op: fun} = popped ->
+        {popped, apply(fun, Enum.map(parents, &%Axon{nodes: nodes, output: &1}) ++ [[]])}
+
+      %{parent: [_ | _] = parents} = popped ->
+        {popped, Enum.map(parents, &%Axon{nodes: nodes, output: &1})}
+
+      %{parent: [parent_id]} = popped ->
+        {popped, %Axon{nodes: nodes, output: parent_id}}
+    end
   end
 
   @doc """

lib/axon/quantization.ex (+7, -4)
@@ -41,13 +41,16 @@ defmodule Axon.Quantization do
   All `:dense` layers in the model are replaced with `Axon.Quantization.weight_only_quantized_dense/3`.
   """
   def quantize_model(%Axon{} = model) do
-    quantized_dense_rewriter = fn [%Axon{} = x], _output, units, use_bias ->
-      weight_only_quantized_dense(x, units, use_bias: use_bias)
+    quantized_dense_rewriter = fn [%Axon{} = x], _output, name_fn, units, use_bias ->
+      weight_only_quantized_dense(x, units,
+        use_bias: use_bias,
+        name: name_fn
+      )
     end
 
     Axon.rewrite_nodes(model, fn
-      %Axon.Node{op: :dense, meta: meta} ->
-        &quantized_dense_rewriter.(&1, &2, meta[:units], meta[:use_bias])
+      %Axon.Node{op: :dense, meta: meta, name: name_fn} ->
+        &quantized_dense_rewriter.(&1, &2, name_fn, meta[:units], meta[:use_bias])
 
       _ ->
         :skip
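Note: the rewriter now threads the original node's `name` into the replacement layer. A hedged sketch of why that matters for checkpoints keyed by layer name (the layer names below are hypothetical):

    model =
      Axon.input("features")
      |> Axon.dense(16, name: "hidden")
      |> Axon.dense(1, name: "output")

    # The replacement weight-only dense layers should keep the names
    # "hidden" and "output" rather than receiving freshly generated ones,
    # so parameters from a named checkpoint (e.g. a Bumblebee model)
    # still line up after quantization.
    quantized = Axon.Quantization.quantize_model(model)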

lib/axon/quantization/layers.ex (+18, -6)
@@ -30,14 +30,26 @@ defmodule Axon.Quantization.Layers do
   end
 
   defnp weight_only_quantized_dense_impl(
-          input,
-          %QTensor{value: kernel, scale: scale},
+          x,
+          %QTensor{value: w_int8, scale: scales},
           bias,
           _opts
         ) do
-    input
-    |> Nx.dot([Nx.rank(input) - 1], Nx.as_type(kernel, Nx.type(input)), [0])
-    |> Nx.multiply(scale)
-    |> Nx.add(bias)
+    x_shape = Nx.shape(x)
+    last_dim = Nx.axis_size(x, -1)
+
+    x_view = Nx.reshape(x, {:auto, last_dim})
+
+    y = Nx.dot(x_view, Nx.as_type(Nx.transpose(w_int8), Nx.type(x)))
+    y = Nx.multiply(y, scales)
+    y = reshape_output(y, x_shape)
+
+    Nx.add(y, bias)
+  end
+
+  deftransformp reshape_output(output, x_shape) do
+    all_but_last = Tuple.delete_at(x_shape, tuple_size(x_shape) - 1)
+    new_shape = Tuple.append(all_but_last, :auto)
+    Nx.reshape(output, new_shape)
   end
 end
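Note: the rewritten kernel flattens the input to rank 2, multiplies by the transposed int8 weights, applies the per-output-channel scales, and restores the leading dimensions. A plain-Nx sketch of the same computation, with assumed shapes (input {2, 3, 4}, weights {5, 4}, scales {5}) chosen for illustration:

    x = Nx.iota({2, 3, 4}, type: {:f, 32})
    w_int8 = Nx.iota({5, 4}, type: {:s, 8})
    scales = Nx.broadcast(Nx.tensor(0.01), {5})

    x_2d = Nx.reshape(x, {:auto, 4})                              # {6, 4}
    y = Nx.dot(x_2d, Nx.as_type(Nx.transpose(w_int8), {:f, 32}))  # {6, 5}
    y = Nx.multiply(y, scales)                                    # dequantize per output channel
    y = Nx.reshape(y, {2, 3, :auto})                              # back to {2, 3, 5}

The `deftransformp` helper exists because the output shape is rebuilt from plain tuples, which is ordinary Elixir code rather than something expressible inside `defn`.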

lib/axon/quantization/q_tensor.ex (+1, -1)
@@ -22,7 +22,7 @@ defmodule Axon.Quantization.QTensor do
 
     case opts[:type] do
       {:s, 8} ->
-        dynamically_quantize_per_channel(x, min: -128, max: 127, type: {:s, 8})
+        dynamically_quantize_per_channel(Nx.transpose(x), min: -128, max: 127, type: {:s, 8})
 
       other ->
         raise "unsupported quantization type #{inspect(other)}"
