Qwen3-0.6B quantized generation on Emily

Mix.install(
  [
    {:emily, "~> 0.3"},
    # Bumblebee 0.6.3 (the latest Hex release) doesn't yet include
    # Qwen3 support, so pin the `main` ref that does. `override: true`
    # because Emily's mix.exs declares `{:bumblebee, "~> 0.6", optional: true}`
    # — Mix would otherwise refuse a github-ref dep as a child conflict.
    {:bumblebee,
     github: "elixir-nx/bumblebee",
     ref: "273805e95507dc7866b958d90e0012a3abad1761",
     override: true},
    {:tokenizers, "~> 0.5"},
    {:nx, "~> 0.10"},
    {:axon, "~> 0.7"},
    {:kino, "~> 0.14"}
  ],
  config: [
    nx: [default_backend: Emily.Backend]
  ]
)

Overview

This notebook loads Qwen/Qwen3-0.6B through Bumblebee, quantizes every dense layer's kernel to int4 via MLX affine group-wise quantization, and greedy-decodes a completion. It also demonstrates Emily.Stream for concurrent serving on a shared model.

The checkpoint is ~1.5 GB on first fetch. Budget several minutes for the cold run.
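
Before loading anything, it helps to see the arithmetic behind the quantizer. The cell below is a minimal, illustrative Nx sketch of one common affine group-wise scheme, using a group size of 4 for readability rather than the real default of 128. It is not Emily's implementation (that lives behind Emily.QuantizedWeight.from_dense/2), and the exact MLX formulation may store the offset differently.

# Illustrative only: affine quantization of a single group of weights.
# Each group gets its own scale and zero-point; with bits: 4 the codes
# live in [0, 15]. Emily applies this per group along the reduction axis.
group = Nx.tensor([0.12, -0.45, 0.31, 0.08])

bits = 4
q_max = 2 ** bits - 1

w_min = Nx.reduce_min(group)
w_max = Nx.reduce_max(group)

scale = Nx.divide(Nx.subtract(w_max, w_min), q_max)
zero_point = Nx.round(Nx.divide(Nx.negate(w_min), scale))

quantized =
  group
  |> Nx.divide(scale)
  |> Nx.add(zero_point)
  |> Nx.round()
  |> Nx.clip(0, q_max)

# Dequantize to see the reconstruction error the model will carry.
dequantized =
  quantized
  |> Nx.subtract(zero_point)
  |> Nx.multiply(scale)

{quantized, dequantized}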

Dense baseline

{:ok, %{model: model, params: params, spec: spec}} =
  Bumblebee.load_model({:hf, "Qwen/Qwen3-0.6B"})

{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-0.6B"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-0.6B"})

model_info = %{model: model, params: params, spec: spec}

config =
  Bumblebee.configure(generation_config,
    max_new_tokens: 32,
    strategy: %{type: :greedy_search}
  )

serving =
  Bumblebee.Text.generation(model_info, tokenizer, config,
    defn_options: [compiler: Emily.Compiler]
  )

%{results: [%{text: dense_text}]} =
  Nx.Serving.run(serving, "The quick brown fox jumps over the lazy dog.")

dense_text

Quantization transform

Rewriting the Axon graph so every :dense node runs through Emily.Quantization.Layers.quantized_dense/4 needs Axon, which is a test-only dep of the Emily package — Emily itself stays Axon-free so library consumers don't pull it in transitively. The transform below is the recommended starting point: copy it into your own project alongside an Axon dep, and extend it as you need (a per-layer :except filter, sketched after the notes below; stricter opts validation; different group sizes per layer).

Two steps:

  1. Graph rewrite — Axon.rewrite_nodes/2 replaces each :dense node with a :quantized_dense sub-graph built with Axon.layer/3.
  2. Parameter quantization — after init (or Bumblebee load), walk the Axon.ModelState and swap each dense kernel for a %QuantizedWeight{}.

defmodule DenseTransform do
  alias Emily.Quantization.Layers
  alias Emily.QuantizedWeight

  @default_opts [bits: 4, group_size: 128, transpose: true]

  def quantize(model, model_state, opts \\ []) do
    opts = Keyword.merge(@default_opts, opts)
    {rewrite_graph(model), quantize_state(model, model_state, opts)}
  end

  # Replace every :dense node with a :quantized_dense layer whose
  # forward pass dispatches through Emily.Quantization.Layers.quantized_dense/4.
  defp rewrite_graph(model) do
    Axon.rewrite_nodes(model, fn
      %Axon.Node{op: :dense, meta: meta, name: name_fn} ->
        fn [x], _output ->
          quantized_dense_layer(x, meta[:units],
            use_bias: meta[:use_bias],
            name: name_fn
          )
        end

      _ ->
        :skip
    end)
  end

  defp quantized_dense_layer(x, units, opts) do
    kernel =
      Axon.param("kernel", &Axon.Shape.dense_kernel(&1, units),
        initializer: :glorot_uniform
      )

    {inputs, op} =
      if opts[:use_bias] do
        bias =
          Axon.param("bias", &Axon.Shape.dense_bias(&1, units),
            initializer: :zeros
          )

        {[x, kernel, bias], &Layers.quantized_dense/4}
      else
        {[x, kernel], &Layers.quantized_dense/3}
      end

    Axon.layer(op, inputs,
      name: opts[:name],
      meta: %{units: units, use_bias: opts[:use_bias]},
      op_name: :quantized_dense
    )
  end

  # Walk the ModelState, replacing each dense kernel tensor with a
  # %QuantizedWeight{}. For the default transpose: true (AWQ / MLX
  # convention, groups along the reduction axis) we pre-transpose the
  # [in, out] kernel to [out, in] before calling from_dense/2.
  defp quantize_state(model, state, opts) do
    transpose = opts[:transpose]

    dense_names =
      model
      |> Axon.properties()
      |> Enum.filter(fn {_name, op} -> op == :dense end)
      |> Enum.map(fn {name, _} -> name end)

    Enum.reduce(dense_names, state, fn name, acc ->
      update_in(acc, [Access.key!(:data), name, "kernel"], fn kernel ->
        source = if transpose, do: Nx.transpose(kernel), else: kernel

        QuantizedWeight.from_dense(source,
          group_size: opts[:group_size],
          bits: opts[:bits],
          transpose: transpose
        )
      end)
    end)
  end
end

Notes. The transform only rewrites top-level :dense nodes; a model with dense layers nested inside other Axon ops needs a recursive rewriter. The transpose: true default stores weights as [out, in] (MLX / AWQ convention, groups along the reduction axis); set false if you're feeding a checkpoint that's already laid out the other way. Bits must be 2, 4, or 8 — the defn-native dequantize_defn/1 path doesn't cover 3 or 6.
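
As a sketch of the per-layer :except filter mentioned earlier, rewrite_graph/1 can take a list of layer names to leave dense. The except argument and the name resolution below are illustrative additions for your own copy, not part of DenseTransform as shipped; Axon stores layer names as functions of (op_name, op_counts), so resolve them before comparing.

# Illustrative sketch: skip quantization for named layers (e.g. an LM head).
# `except` is a hypothetical argument; Bumblebee names layers explicitly,
# so the name function ignores the op counts passed here.
defp rewrite_graph(model, except \\ []) do
  Axon.rewrite_nodes(model, fn
    %Axon.Node{op: :dense, meta: meta, name: name_fn} ->
      if name_fn.(:dense, %{}) in except do
        :skip
      else
        fn [x], _output ->
          quantized_dense_layer(x, meta[:units],
            use_bias: meta[:use_bias],
            name: name_fn
          )
        end
      end

    _ ->
      :skip
  end)
end

quantize_state/3 needs the same filter so the skipped kernels stay plain tensors.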

Quantized inference

{qmodel, qparams} =
  DenseTransform.quantize(model, params,
    bits: 4,
    group_size: 128,
    transpose: true
  )

qmodel_info = %{model: qmodel, params: qparams, spec: spec}

qserving =
  Bumblebee.Text.generation(qmodel_info, tokenizer, config,
    defn_options: [compiler: Emily.Compiler]
  )

%{results: [%{text: quant_text}]} =
  Nx.Serving.run(qserving, "The quick brown fox jumps over the lazy dog.")

quant_text

The quantized output will drift from the dense baseline — int4 noise across every linear is expected. The test at test/emily/conformance/qwen3_quant_full_test.exs pins a deterministic reference string for regression-testing the quantization stack.
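
To eyeball the drift, print the two completions together:

IO.puts("dense:     " <> dense_text)
IO.puts("quantized: " <> quant_text)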

Concurrent serving via Emily.Stream

For concurrent inference on a shared model, each serving worker should own its own MLX command queue. Emily.Stream.with_stream/2 does that per-process:

stream1 = Emily.Stream.new(:gpu)
stream2 = Emily.Stream.new(:gpu)

task1 =
  Task.async(fn ->
    Emily.Stream.with_stream(stream1, fn ->
      Nx.Serving.run(qserving, "Question 1?")
    end)
  end)

task2 =
  Task.async(fn ->
    Emily.Stream.with_stream(stream2, fn ->
      Nx.Serving.run(qserving, "Question 2?")
    end)
  end)

{Task.await(task1, :infinity), Task.await(task2, :infinity)}

Each Emily.Stream maps to its own Metal command queue. Weights are shared across streams — no duplication — so the memory cost of adding a stream is the Metal command buffer, not the model.

Create streams once at worker init, not per-request.
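
One way to follow that advice is a worker that creates its stream in init/1 and reuses it for every call. The module below is a sketch, not an Emily API; StreamWorker and its function names are illustrative.

defmodule StreamWorker do
  # Illustrative sketch (not part of Emily): one long-lived worker per
  # desired level of concurrency, each owning one Emily.Stream.
  use GenServer

  def start_link(serving), do: GenServer.start_link(__MODULE__, serving)

  def generate(pid, prompt), do: GenServer.call(pid, {:generate, prompt}, :infinity)

  @impl true
  def init(serving) do
    # The stream is created once here, not per request.
    {:ok, %{serving: serving, stream: Emily.Stream.new(:gpu)}}
  end

  @impl true
  def handle_call({:generate, prompt}, _from, state) do
    result =
      Emily.Stream.with_stream(state.stream, fn ->
        Nx.Serving.run(state.serving, prompt)
      end)

    {:reply, result, state}
  end
end

{:ok, worker} = StreamWorker.start_link(qserving)
StreamWorker.generate(worker, "Question 3?")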