erllama_nif (erllama v0.2.0)

View Source

Summary

Types

adapter_ref()

-type adapter_ref() :: reference().

context_ref()

-type context_ref() :: reference().

model_ref()

-type model_ref() :: reference().

sampler_ref()

-type sampler_ref() :: reference().

token_id()

-type token_id() :: integer().

Functions

adapter_free(Adapter)

-spec adapter_free(adapter_ref()) -> ok | {error, atom()}.

adapter_load(Model, Path)

-spec adapter_load(model_ref(), iodata()) -> {ok, adapter_ref()} | {error, atom()}.

apply_chat_template(Model, Request)

-spec apply_chat_template(model_ref(), map()) -> {ok, [token_id()]} | {error, atom()}.

clear_sampler(Ctx)

-spec clear_sampler(context_ref()) -> ok.

configure_sampler(Ctx, Cfg)

-spec configure_sampler(context_ref(), map()) -> ok | {error, atom()}.

crc32c(Data)

-spec crc32c(iodata()) -> non_neg_integer().

decode_one(Ctx)

-spec decode_one(context_ref()) -> {ok, token_id()} | {eog, token_id()} | {error, term()}.

detokenize(Model, Tokens)

-spec detokenize(model_ref(), [token_id()]) -> binary() | {error, atom()}.

embed(Ctx, Tokens)

-spec embed(context_ref(), [token_id()]) -> {ok, [float()]} | {error, atom()}.

forward_with_argmax(Ctx, Tokens)

-spec forward_with_argmax(context_ref(), [token_id()]) -> {ok, [token_id() | eos]} | {error, atom()}.

free_context(Ctx)

-spec free_context(context_ref()) -> ok.

free_model(Model)

-spec free_model(model_ref()) -> ok.

fsync_dir(Path)

-spec fsync_dir(iodata()) -> ok | {error, atom()}.

kv_pack(Ctx, Tokens, NTokens)

-spec kv_pack(context_ref(), [token_id()], non_neg_integer()) -> binary() | {error, atom()}.

kv_pack(Ctx, Tokens, NTokens, SeqId)

-spec kv_pack(context_ref(), [token_id()], non_neg_integer(), non_neg_integer()) ->
                 binary() | {error, atom()}.

kv_seq_rm(Ctx, SeqId, P0, P1)

-spec kv_seq_rm(context_ref(), integer(), integer(), integer()) -> ok | {error, atom()}.

kv_unpack(Ctx, Bin, SeqId)

-spec kv_unpack(context_ref(), binary(), non_neg_integer()) -> ok | {error, atom()}.

load_model(Path, Opts)

-spec load_model(iodata(), map()) -> {ok, model_ref()} | {error, atom()}.

Load a GGUF model.

Recognised keys in Opts (all optional; defaults come from llama_model_default_params()):

  • n_gpu_layers :: integer() — number of layers offloaded to GPU.
  • main_gpu :: non_neg_integer() — GPU index when split_mode = none.
  • split_mode :: none | layer | row — how to split a model across multiple GPUs. Atom mapping: none -> LLAMA_SPLIT_MODE_NONE, layer -> LLAMA_SPLIT_MODE_LAYER, row -> LLAMA_SPLIT_MODE_ROW.

  • tensor_split :: [float()] — per-device proportions when splitting. Up to llama_max_devices() entries (16 in the vendored llama.cpp); shorter lists zero-fill the tail.
  • use_mmap, use_mlock, vocab_only :: boolean().

A bad atom for split_mode, or a non-numeric entry in tensor_split, raises badarg.

model_n_layer(Model)

-spec model_n_layer(model_ref()) -> non_neg_integer() | {error, atom()}.

model_size(Model)

-spec model_size(model_ref()) -> non_neg_integer() | {error, atom()}.

new_context(Model, Opts)

-spec new_context(model_ref(), map()) -> {ok, context_ref()} | {error, atom()}.

Build a new inference context against a loaded model.

Recognised keys in Opts (all optional; defaults come from llama_context_default_params()):

  • n_ctx, n_batch, n_ubatch, n_seq_max :: pos_integer().
  • n_threads, n_threads_batch :: pos_integer().
  • embeddings, offload_kqv :: boolean().
  • flash_attn :: boolean() | auto — true enables, false disables, auto lets llama.cpp decide based on the build and model. Maps to enum llama_flash_attn_type.

  • type_k, type_v :: f16 | f32 | bf16 | q4_0 | q5_0 | q5_1 | q8_0 — KV cache element type for keys and values. Maps to GGML_TYPE_*.

A bad atom for any of flash_attn, type_k, or type_v raises badarg.

prefill(Ctx, Tokens)

-spec prefill(context_ref(), [token_id()]) -> ok | {error, term()}.

sampler_free(Sampler)

-spec sampler_free(sampler_ref()) -> ok | {error, atom()}.

sampler_new(Ctx, Cfg)

-spec sampler_new(context_ref(), map()) -> {ok, sampler_ref()} | {error, atom()}.

set_adapters(Ctx, Adapters)

-spec set_adapters(context_ref(), [{adapter_ref(), float()}]) -> ok | {error, atom()}.

set_grammar(Ctx, Grammar)

-spec set_grammar(context_ref(), binary()) -> ok | {error, atom()}.

step(Ctx, Ops)

-spec step(context_ref(), [{non_neg_integer(), {prefill, [token_id()]} | {decode, sampler_ref()}}]) ->
              {ok, [{non_neg_integer(), prefilled | {token, token_id(), 0 | 1}}]} | {error, atom()}.

Multi-sequence batched decode.

Each tick is exactly one llama_decode call that mixes prefill and decode rows freely (SARATHI-style co-batching). The order matters: prefill rows decode their slice without sampling and leave logits on the last slice token for the NEXT tick to sample from. Decode rows sample from the previous tick's logits BEFORE the new batch is built, so the token returned to the caller is the same token that lands in KV by the time the call returns.

Each SeqId must satisfy 0 <= SeqId < 256 (a compile-time cap on the context's per_seq[] array; raise the cap in the NIF if it becomes a constraint). Each SamplerRef must have been built via sampler_new/2 against this same CtxRef.

Errors:

  • {error, no_logits} — a decode row's seq has last_logits_idx = -1, i.e. no prefill has run for that seq since the context was built or since the last kv_unpack / kv_seq_rm. The caller must issue a prefill row for that seq first.
  • {error, batch_overflow} — the total slice length exceeds the context's n_batch. A budget-aware scheduler should shrink the prefill slices and retry.
  • {error, released} — the context or one of the samplers has been explicitly freed.
  • {error, exception} — llama_decode or llama_sampler_sample threw across the C ABI; the context's decode_ready flag is cleared and the gen_statem owning it is expected to stop.

tokenize(Model, Text, Opts)

-spec tokenize(model_ref(), iodata(), map()) -> [token_id()] | {error, atom()}.

vram_info()

-spec vram_info() ->
                   {ok,
                    #{total_b := non_neg_integer(),
                      free_b := non_neg_integer(),
                      used_b := non_neg_integer()}} |
                   {error, atom()}.