# Multi-backend bench: per-op + end-to-end + robustness. Iteration
# counts and sizes are PER-BACKEND-PER-WORKLOAD so BinaryBackend
# doesn't eat hours of wall clock on a 1024×1024 matmul.

defmodule FullBench do
  # Short hostnames on which the EXLA backend is benchmarked.
  # `available_backends/1` additionally requires the EXLA module to be loadable.
  @hosts_with_exla ["super-io"]

  # Per-backend matmul schedule, keyed by the backend label used in
  # `available_backends/1`. Each entry is `{size, reps}`: a size×size matmul
  # timed over `reps` iterations. Labels missing from this map fall back to
  # `[{16, 100}]` in `bench_a/1`.
  # BinaryBackend caps at 256 because larger sizes take hours of CPU.
  # GPU backends go up to 1024 with low rep counts at the top.
  @matmul_sched %{
    "BinaryBackend" => [{16, 100}, {64, 50}, {128, 20}, {256, 8}],
    "VulkanoBackend" => [{16, 200}, {64, 200}, {256, 100}, {1024, 30}],
    "spirit" => [{16, 200}, {64, 200}, {256, 100}, {1024, 30}],
    "EXLA" => [{16, 200}, {64, 200}, {256, 100}, {1024, 50}]
  }

  # Script entry point. Prints a host/date banner, lists the backends that
  # will be exercised on this host, then runs benches A, B and C in order.
  #
  # Raises MatchError when `hostname -s` exits with a non-zero status.
  def main do
    {raw_hostname, 0} = System.cmd("hostname", ["-s"])
    host = String.trim(raw_hostname)
    timestamp = DateTime.to_iso8601(DateTime.utc_now())

    IO.puts("\n========================================")
    IO.puts("HOST: #{host}")
    IO.puts("DATE: #{timestamp}")
    IO.puts("========================================\n")

    backends = available_backends(host)
    labels = Enum.map_join(backends, ", ", fn {label, _module} -> label end)
    IO.puts("backends: #{labels}\n")

    bench_a(backends)
    bench_b(backends)
    bench_c(backends)
  end

  # Returns the `{label, backend_module}` pairs to benchmark on `host`.
  #
  # BinaryBackend and VulkanoBackend are always listed. "spirit" is added when
  # Nx.Vulkan.Backend can be loaded; "EXLA" is added only on hosts named in
  # @hosts_with_exla where the EXLA module can be loaded.
  defp available_backends(host) do
    always = [
      {"BinaryBackend", Nx.BinaryBackend},
      {"VulkanoBackend", Nx.Vulkan.VulkanoBackend}
    ]

    spirit =
      if Code.ensure_loaded?(Nx.Vulkan.Backend),
        do: [{"spirit", Nx.Vulkan.Backend}],
        else: []

    exla =
      if host in @hosts_with_exla and Code.ensure_loaded?(EXLA),
        do: [{"EXLA", EXLA.Backend}],
        else: []

    always ++ spirit ++ exla
  end

  # ---- Bench A: per-op latency curves ----

  # Bench A: per-op latency for matmul, add, sigmoid and sum on every backend.
  #
  # `backends` is a list of `{label, backend_module}` tuples. Matmul sizes and
  # rep counts come from @matmul_sched (falling back to a single 16×16 run for
  # unknown labels); the element-wise and reduction sizes are smaller for
  # BinaryBackend than for the other backends.
  #
  # Operand tensors are built ONCE, outside the timed closure. Building them
  # inside would fold make_tensor/2 (a BinaryBackend iota + divide + reshape
  # plus a backend transfer) into every timed iteration and swamp the cost of
  # the op actually being measured.
  defp bench_a(backends) do
    IO.puts("=== BENCH A: per-op latency ===")

    for {name, mod} <- backends do
      IO.puts("\n[#{name}]")

      sched = Map.get(@matmul_sched, name, [{16, 100}])

      for {m, reps} <- sched do
        a = make_tensor({m, m}, mod)
        b = make_tensor({m, m}, mod)
        time_op("matmul #{m}", reps, fn -> Nx.dot(a, b) end)
      end

      # add and sigmoid share the same vector length per backend.
      vec_size = if name == "BinaryBackend", do: 4096, else: 16384
      vec_a = make_tensor({vec_size}, mod)
      vec_b = make_tensor({vec_size}, mod)

      time_op("add #{vec_size}", 100, fn -> Nx.add(vec_a, vec_b) end)
      time_op("sigmoid #{vec_size}", 100, fn -> Nx.sigmoid(vec_a) end)

      sum_size = if name == "BinaryBackend", do: 256, else: 1024
      square = make_tensor({sum_size, sum_size}, mod)

      time_op("sum #{sum_size}×#{sum_size}", 50, fn -> Nx.sum(square) end)
    end
  end

  # ---- Bench B: end-to-end workloads ----

  # Bench B: runs the end-to-end workloads (Axon training step, Exmc regime
  # log-probability) once per `{label, backend_module}` entry in `backends`.
  defp bench_b(backends) do
    IO.puts("\n\n=== BENCH B: end-to-end ===")

    Enum.map(backends, fn {label, backend} ->
      IO.puts("\n[#{label}]")
      bench_axon_training_step(backend)
      bench_regime_log_p(backend, label)
    end)
  end

  # Times one value-and-gradient evaluation of a small two-layer Axon model
  # (8 inputs -> 16 sigmoid units -> 2 outputs) with a mean-squared-error style
  # loss over a batch of 32, using tensors placed on `backend_mod`.
  #
  # Does nothing (returns nil) when the Axon module cannot be loaded.
  defp bench_axon_training_step(backend_mod) do
    if Code.ensure_loaded?(Axon) do
      input = Axon.input("x", shape: {nil, 8})
      hidden = Axon.dense(input, 16, activation: :sigmoid)
      model = Axon.dense(hidden, 2)

      {init_fn, predict_fn} = Axon.build(model, mode: :train)

      template = %{"x" => Nx.template({32, 8}, :f32)}
      initial_state = init_fn.(template, Axon.ModelState.empty())
      # Move the freshly initialised parameters onto the backend under test.
      state = transfer_state(initial_state, backend_mod)

      features = make_tensor({32, 8}, backend_mod)
      targets = make_tensor({32, 2}, backend_mod)

      # Sum of squared residuals divided by 32; value_and_grad differentiates
      # it with respect to the model state.
      loss_and_grad = fn current_state, batch_x, batch_y ->
        Nx.Defn.value_and_grad(current_state, fn candidate ->
          prediction = predict_fn.(candidate, %{"x" => batch_x}).prediction
          residual = Nx.subtract(prediction, batch_y)
          squared_error = Nx.sum(Nx.multiply(residual, residual))
          Nx.divide(squared_error, Nx.tensor(32.0))
        end)
      end

      time_op("Axon training step", 30, fn ->
        Nx.Defn.jit_apply(loss_and_grad, [state, features, targets], compiler: Nx.Defn.Evaluator)
      end)
    end
  end

  # Times evaluation of the composed log-probability function of the Exmc
  # regime model on 200 random returns drawn uniformly from [-0.01, 0.01),
  # with the parameter vector and observations placed on `backend_mod`.
  #
  # The second argument (backend label) is unused. Does nothing (returns nil)
  # when Exmc.Trading.RegimeModel cannot be loaded.
  defp bench_regime_log_p(backend_mod, _name) do
    if Code.ensure_loaded?(Exmc.Trading.RegimeModel) do
      returns = Enum.map(1..200, fn _ -> :rand.uniform() * 0.02 - 0.01 end)

      build_opts = [num_samples: 1, num_warmup: 1, ncp: false]
      {ir, _} = Exmc.Trading.RegimeModel.build(returns, build_opts)
      {:ok, components} = Exmc.NUTS.CustomSynth.extract_components(ir)
      logp_fun = Exmc.NUTS.CustomSynth.MultiRvCustomSpec.compose_logp_defn(components)

      tensor_opts = [type: :f32, backend: backend_mod]
      position = Nx.tensor([0.01, 0.05, 0.02, 0.05, 0.02, 0.05, 0.05, 0.05], tensor_opts)
      observations = Nx.tensor(returns, tensor_opts)

      time_op("exmc regime log_p", 20, fn ->
        Nx.Defn.jit_apply(logp_fun, [position, observations], compiler: Nx.Defn.Evaluator)
      end)
    end
  end

  # ---- Bench C: robustness ----

  # Bench C: robustness. For each backend, chains `n` rounds of
  # dot -> sigmoid -> divide-by-2 starting from one square tensor and reports
  # total and per-iteration wall time. Exceptions, throws and exits raised by
  # the loop are printed instead of aborting the remaining backends.
  defp bench_c(backends) do
    IO.puts("\n\n=== BENCH C: robustness (5000 mixed dispatches) ===")

    Enum.map(backends, fn {label, backend} ->
      IO.puts("\n[#{label}]")

      # BinaryBackend would take forever for 128×128 matmul; size down.
      {size, n} =
        case label do
          "BinaryBackend" -> {32, 500}
          _other -> {128, 5000}
        end

      seed = make_tensor({size, size}, backend)

      try do
        {micros, _final} =
          :timer.tc(fn ->
            for _ <- 1..n, reduce: seed do
              acc -> acc |> Nx.dot(seed) |> Nx.sigmoid() |> Nx.divide(Nx.tensor(2.0))
            end
          end)

        total_s = Float.round(micros / 1_000_000, 1)
        per_ms = Float.round(micros / n / 1000.0, 3)
        IO.puts("#{n} iter, size #{size}×#{size}: #{total_s}s total, #{per_ms} ms/iter — OK")
      rescue
        error -> IO.puts("CRASHED: #{Exception.message(error)}")
      catch
        kind, reason -> IO.puts("CAUGHT #{kind}: #{inspect(reason)}")
      end
    end)
  end

  # ---- helpers ----

  # Builds a deterministic f32 tensor of `shape` holding 0/n, 1/n, ..., (n-1)/n
  # (n = number of elements). The values are generated on Nx.BinaryBackend and
  # the result is then transferred to `backend`.
  defp make_tensor(shape, backend) do
    count = Tuple.product(shape)
    ramp = Nx.iota({count}, type: :f32, backend: Nx.BinaryBackend)
    scaled = Nx.divide(ramp, Nx.tensor(count * 1.0))

    scaled
    |> Nx.reshape(shape)
    |> Nx.backend_transfer(backend)
  end

  # Returns `model_state` with every tensor in its `data` map transferred to
  # `backend`. `data` is walked as layer => (param name => tensor), i.e. exactly
  # two levels deep; all other fields of the struct are left untouched.
  defp transfer_state(model_state, backend) do
    moved =
      for {layer, params} <- model_state.data, into: %{} do
        layer_params =
          for {key, tensor} <- params, into: %{} do
            {key, Nx.backend_transfer(tensor, backend)}
          end

        {layer, layer_params}
      end

    %{model_state | data: moved}
  end

  # Times `fun` and prints the mean latency in milliseconds per iteration.
  #
  #   label  - text printed in front of the result
  #   n_iter - number of timed iterations; must be a positive integer
  #   fun    - zero-arity function to benchmark
  #
  # `fun` is called twice untimed as warm-up, then `n_iter` times inside
  # :timer.tc/1. Returns :ok (the result of IO.puts/1).
  #
  # The guard rejects non-positive counts up front: `1..0` would iterate
  # backwards and the average would then divide by zero.
  defp time_op(label, n_iter, fun)
       when is_integer(n_iter) and n_iter > 0 and is_function(fun, 0) do
    # Warm-up runs, excluded from the measurement.
    fun.()
    fun.()

    # Enum.each/2 discards each result immediately. Collecting them in a list
    # would keep every output tensor alive until the timing window closes.
    {micros, :ok} = :timer.tc(fn -> Enum.each(1..n_iter, fn _ -> fun.() end) end)

    per = micros / n_iter / 1000.0
    IO.puts("  #{label}: #{Float.round(per, 3)} ms/iter")
  end
end

# Run the full benchmark suite as soon as this file is evaluated.
FullBench.main()