FLAMESlurmBackend (flame_slurm_backend v0.0.2)

A FLAME backend implementation for Slurm clusters.

Usage

Configure the FLAME backend in your configuration or application setup:

Write a Slurm job script:

```bash
#!/bin/bash
#SBATCH -o flame.%j.out
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --time=01:00:00
#SBATCH --mem=20G

export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)
```
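
The `SLURM_FLAME_HOST` export publishes the address of the node's `ib0` (InfiniBand) interface; adjust the interface name if your cluster uses a different network. It supplies the host part of the Erlang long names used for distribution (as in the Livebook example below). As a rough, hypothetical sketch of that idea only, not necessarily what FLAMESlurmBackend does internally:

```elixir
# Hypothetical sketch: derive an Erlang long name from SLURM_FLAME_HOST.
host = System.fetch_env!("SLURM_FLAME_HOST")
runner_name = :"flame_#{System.get_env("SLURM_JOB_ID", "0")}@#{host}"
# e.g. :"flame_12345@10.0.0.7", reachable from other nodes on the cluster network.
```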

To schedule onto a partition with GPU instances and request one GPU per job, add:

```bash
#SBATCH --partition=gpu
#SBATCH --gpus-per-node=1
```

Then add a FLAME pool to your application's supervision tree and point the backend at the job script:

```elixir
# application.ex
children = [
  {FLAME.Pool,
   name: MyApp.SamplePool,
   code_sync: [
     start_apps: true,
     sync_beams: Kino.beam_paths(),
     compress: false,
     # extract_dir is given as an MFA; SlurmClient.path_job_id/1 is expected to
     # build a job-specific extract path beneath the given base directory.
     extract_dir: {
       FLAMESlurmBackend.SlurmClient,
       :path_job_id,
       [Path.absname("extract_dir") <> "/"]
     }
   ],
   min: 0,
   max: 1,
   max_concurrency: 1,
   idle_shutdown_after: :timer.minutes(10),
   timeout: :infinity,
   boot_timeout: :timer.minutes(6),
   track_resources: true,
   backend: {
     FLAMESlurmBackend,
     slurm_job: <jobscript>
   }}
]
```
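
Once the pool is supervised, work is dispatched through the regular FLAME API. The snippet below is a minimal illustration that boots a Slurm job on demand (if no runner is warm) and executes the function there:

```elixir
# Runs the function on a FLAME runner scheduled through Slurm.
# node/0 is used only to show that execution happens on the remote node.
FLAME.call(MyApp.SamplePool, fn -> node() end)
```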

When running inside a Livebook:

Start Livebook with an Erlang long name whose host part matches the cluster network; the SLURM_FLAME_HOST export below takes care of this. Using LIVEBOOK_IP=0.0.0.0 is helpful if you set up port forwarding from the cluster's login host.

```bash
#!/bin/bash
export SLURM_FLAME_HOST=$(ip -f inet addr show ib0 | awk '/inet/ {print $2}' | cut -d/ -f1)
epmd -daemon
LIVEBOOK_IP=0.0.0.0 livebook server --name livebook@$SLURM_FLAME_HOST
```

Then start the FLAME pool from a Livebook cell:

```elixir
Kino.start_child(
  {FLAME.Pool,
   name: :runner,
   code_sync: [start_apps: true, sync_beams: Kino.beam_paths(), compress: false],
   min: 0,
   max: 1,
   max_concurrency: 10,
   idle_shutdown_after: :timer.minutes(1),
   timeout: :infinity,
   track_resources: true,
   backend: {FLAMESlurmBackend, slurm_job: <jobscript>}}
)
```
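
As with the application pool, a quick check (purely illustrative) is to call into the runner and confirm which node the code ran on:

```elixir
# Should return the name of the Slurm-allocated runner node, not the Livebook node.
FLAME.call(:runner, fn -> node() end)
```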