Merge two or more video streams into a single output using GPU-accelerated compositing. This guide covers side-by-side, vertical split, and Picture-in-Picture (PiP) layouts.
Overview
┌──────────────┐
│ Stream A │──┐
│ (main) │ │ ┌─────────────┐ ┌──────────────┐
└──────────────┘ ├───▶│ GPU Compose │───▶│ Encoder │──▶ output.mp4
┌──────────────┐ │ │ (overlay) │ │ (FFmpeg) │
│ Stream B │──┘ └─────────────┘ └──────────────┘
│ (PiP) │
└──────────────┘

All compositing happens on the GPU via the overlay_alpha kernel. No CPU
readback is needed until you optionally save a snapshot.
Opening Multiple Sources
# Open two media sources (files, RTMP streams, cameras, etc.)
{:ok, src_a} = ExCubecl.Media.open("main_speaker.mp4")
{:ok, src_b} = ExCubecl.Media.open("remote_guest.mp4")
# Inspect stream metadata
{:ok, streams_a} = ExCubecl.Media.streams(src_a)
{:ok, streams_b} = ExCubecl.Media.streams(src_b)
# => [
# %{index: 0, type: :video, codec: :h264, fps: 30,
# width: 1920, height: 1080},
# %{index: 1, type: :audio, codec: :aac,
# sample_rate: 48000, channels: 2}
# ]

Picture-in-Picture (PiP)
The most common video merge pattern: a full-screen main video with a smaller overlay in the corner.
Basic PiP
defmodule PiPMerger do
  @moduledoc """
  Basic picture-in-picture merger.

  Every frame pushed into the pipeline is treated as the full-screen main
  frame. One frame is read from the overlay source, scaled to the PiP size and
  composited into the bottom-right corner before the result is encoded.
  """
  use ExCubecl.MediaPipeline

  @pip_width 320
  @pip_height 240
  # Gap between the PiP window and the right/bottom edges of the main frame.
  @pip_margin 20

  @doc """
  Opens both inputs, starts the encoder and launches the pipeline.

  `opts` is read for `:main`, `:overlay` and `:output`. The pipeline is
  registered under the module name so frames can be pushed to `PiPMerger`.
  """
  def start_link(opts) do
    {:ok, src_a} = ExCubecl.Media.open(opts[:main])
    {:ok, src_b} = ExCubecl.Media.open(opts[:overlay])

    {:ok, enc} =
      ExCubecl.Transcode.start(opts[:output],
        video: [codec: :h264, width: 1920, height: 1080, bitrate: "6M"],
        audio: [codec: :aac, bitrate: "192k", sample_rate: 48000]
      )

    state = %{
      source_a: src_a,
      source_b: src_b,
      encoder: enc,
      main_width: 1920,
      main_height: 1080
    }

    ExCubecl.MediaPipeline.start_link(__MODULE__, state, name: __MODULE__)
  end

  @doc """
  Composites the next overlay frame onto `frame` and encodes the result.

  When the overlay source reports `{:error, :eof}` the main frame is encoded
  unchanged, so the merge continues with the primary stream only. Any other
  read error still crashes the pipeline.
  """
  def handle_frame(frame, state) do
    output =
      case ExCubecl.Media.read_frame(state.source_b, :video) do
        {:ok, overlay_frame} -> compose_pip(frame, overlay_frame, state)
        {:error, :eof} -> frame
      end

    :ok = ExCubecl.Transcode.write_frame(state.encoder, output)
    {:ok, state}
  end

  # Scales the overlay to PiP size and places it in the bottom-right corner.
  defp compose_pip(frame, overlay_frame, state) do
    {:ok, scaled_overlay} =
      ExCubecl.Video.scale(overlay_frame, width: @pip_width, height: @pip_height)

    # Derived from the main dimensions kept in state; with 1920x1080 this
    # yields x = 1580 and y = 820.
    pip_x = state.main_width - @pip_width - @pip_margin
    pip_y = state.main_height - @pip_height - @pip_margin

    {:ok, composited} =
      ExCubecl.Video.overlay(frame, scaled_overlay, x: pip_x, y: pip_y, alpha: 1.0)

    composited
  end
end
# Start the PiP merger
{:ok, _pid} = PiPMerger.start_link(
main: "speaker.mp4",
overlay: "guest.mp4",
output: "merged_pip.mp4"
)
# Push frames from the main source
{:ok, src_a} = ExCubecl.Media.open("speaker.mp4")
case ExCubecl.Media.read_frame(src_a, :video) do
{:ok, frame} ->
ExCubecl.MediaPipeline.push_frame(PiPMerger, frame)
{:error, :eof} ->
IO.puts("All frames processed")
end

PiP with Border and Shadow
For a more polished look, add a subtle border around the PiP window:
defmodule StyledPiP do
use ExCubecl.MediaPipeline
@pip_width 400
@pip_height 300
@pip_x 1480
@pip_y 760
@border_size 3
@doc """
Composites a bordered PiP window onto `frame` and encodes the result.

When the overlay source reports `{:error, :eof}` the main frame is encoded
unchanged so the merge continues with the primary stream only.
"""
def handle_frame(frame, state) do
  output =
    case ExCubecl.Media.read_frame(state.source_b, :video) do
      {:ok, overlay_frame} -> compose_styled_pip(frame, overlay_frame)
      {:error, :eof} -> frame
    end

  :ok = ExCubecl.Transcode.write_frame(state.encoder, output)
  {:ok, state}
end

# Draws a darkened border region first, then the scaled PiP on top of it.
defp compose_styled_pip(frame, overlay_frame) do
  # Scale overlay to PiP size
  {:ok, scaled} =
    ExCubecl.Video.scale(overlay_frame, width: @pip_width, height: @pip_height)

  # The border rectangle extends @border_size pixels past the PiP on every side.
  border_x = @pip_x - @border_size
  border_y = @pip_y - @border_size
  border_w = @pip_width + @border_size * 2
  border_h = @pip_height + @border_size * 2

  # Step 1: crop that rectangle out of the main frame and darken it, so the
  # border is a dimmed copy of the underlying picture rather than a flat colour.
  {:ok, border_bg} =
    ExCubecl.Video.crop(frame, x: border_x, y: border_y, width: border_w, height: border_h)

  {:ok, darkened} =
    ExCubecl.Filter.apply(border_bg, :brightness_contrast, brightness: -0.4, contrast: 1.0)

  {:ok, with_border} =
    ExCubecl.Video.overlay(frame, darkened, x: border_x, y: border_y, alpha: 0.8)

  # Step 2: overlay the scaled PiP on top, fully opaque.
  {:ok, composited} =
    ExCubecl.Video.overlay(with_border, scaled, x: @pip_x, y: @pip_y, alpha: 1.0)

  composited
end
end

Side-by-Side Merge
Place two videos next to each other horizontally. Useful for comparison views or interview formats.
defmodule SideBySide do
use ExCubecl.MediaPipeline
@output_width 1920
@output_height 540 # half of 1080p
@half_width 960
@doc """
Opens the left and right inputs, starts the encoder and launches the pipeline.

`opts` is read for `:left`, `:right` and `:output`. The encoder is configured
for the combined `@output_width` x `@output_height` picture.
"""
def start_link(opts) do
  encoder_opts = [
    video: [codec: :h264, width: @output_width, height: @output_height, bitrate: "8M"],
    audio: [codec: :aac, bitrate: "192k"]
  ]

  {:ok, left_src} = ExCubecl.Media.open(opts[:left])
  {:ok, right_src} = ExCubecl.Media.open(opts[:right])
  {:ok, encoder} = ExCubecl.Transcode.start(opts[:output], encoder_opts)

  ExCubecl.MediaPipeline.start_link(
    __MODULE__,
    %{source_a: left_src, source_b: right_src, encoder: encoder}
  )
end
@doc """
Places `frame` (left) and the next frame of source B (right) next to each
other and encodes the combined picture.
"""
def handle_frame(frame, state) do
  # Read the matching frame from source B
  {:ok, frame_b} = ExCubecl.Media.read_frame(state.source_b, :video)

  # Scale both frames to half width
  {:ok, left} =
    ExCubecl.Video.scale(frame, width: @half_width, height: @output_height)

  {:ok, right} =
    ExCubecl.Video.scale(frame_b, width: @half_width, height: @output_height)

  # Build a full-width canvas to composite onto. Overlaying `right` at
  # x = @half_width directly onto `left` would place it beyond the right edge
  # of a base that is only @half_width wide.
  # NOTE(review): assumes overlay output keeps the base frame's dimensions, as
  # in the PiP examples — confirm against the ExCubecl.Video.overlay docs.
  {:ok, canvas} =
    ExCubecl.Video.scale(frame, width: @output_width, height: @output_height)

  # Both halves together cover the whole canvas, so none of it stays visible.
  {:ok, with_left} = ExCubecl.Video.overlay(canvas, left, x: 0, y: 0, alpha: 1.0)

  {:ok, merged} =
    ExCubecl.Video.overlay(with_left, right, x: @half_width, y: 0, alpha: 1.0)

  :ok = ExCubecl.Transcode.write_frame(state.encoder, merged)
  {:ok, state}
end
end

Vertical Split (Top/Bottom)
Stack two videos vertically. Useful for showing different camera angles of the same scene.
defmodule VerticalSplit do
use ExCubecl.MediaPipeline
@output_width 1920
@output_height 1080
@half_height 540
@doc """
Stacks `frame` (top) above the next frame of source B (bottom) and encodes
the combined picture.
"""
def handle_frame(frame, state) do
  {:ok, frame_b} = ExCubecl.Media.read_frame(state.source_b, :video)

  # Scale both to full width, half height
  {:ok, top} =
    ExCubecl.Video.scale(frame, width: @output_width, height: @half_height)

  {:ok, bottom} =
    ExCubecl.Video.scale(frame_b, width: @output_width, height: @half_height)

  # Build a full-height canvas to composite onto. Overlaying `bottom` at
  # y = @half_height directly onto `top` would place it below the bottom edge
  # of a base that is only @half_height tall.
  # NOTE(review): assumes overlay output keeps the base frame's dimensions, as
  # in the PiP examples — confirm against the ExCubecl.Video.overlay docs.
  {:ok, canvas} =
    ExCubecl.Video.scale(frame, width: @output_width, height: @output_height)

  # Both halves together cover the whole canvas, so none of it stays visible.
  {:ok, with_top} = ExCubecl.Video.overlay(canvas, top, x: 0, y: 0, alpha: 1.0)

  {:ok, merged} =
    ExCubecl.Video.overlay(with_top, bottom, x: 0, y: @half_height, alpha: 1.0)

  :ok = ExCubecl.Transcode.write_frame(state.encoder, merged)
  {:ok, state}
end
end

2×2 Grid (Four Streams)
Merge four video streams into a 2×2 grid layout.
defmodule GridMerge do
use ExCubecl.MediaPipeline
@cell_width 960
@cell_height 540
@doc """
Opens every input in `opts[:inputs]`, starts the encoder and launches the
pipeline.

`opts` is read for `:inputs` (a list) and `:output`.
"""
def start_link(opts) do
  # Media.open/1 returns {:ok, source}; unwrap each result so state.sources
  # holds the sources themselves rather than the tagged tuples.
  sources =
    Enum.map(opts[:inputs], fn input ->
      {:ok, src} = ExCubecl.Media.open(input)
      src
    end)

  {:ok, enc} =
    ExCubecl.Transcode.start(opts[:output],
      video: [codec: :h264, width: 1920, height: 1080, bitrate: "10M"],
      audio: [codec: :aac, bitrate: "192k"]
    )

  state = %{
    sources: sources,
    encoder: enc
  }

  ExCubecl.MediaPipeline.start_link(__MODULE__, state)
end
@doc """
Reads one frame from every source, arranges the available frames in a 2×2
grid and encodes the result.

The pushed frame itself is ignored; it only acts as the tick that drives the
grid. Sources that report `{:error, :eof}` are skipped, so the remaining
frames move up to fill the earlier cells. Raises when no source yields a
frame.
"""
def handle_frame(_frame, state) do
  cells =
    state.sources
    |> Enum.map(&read_video_frame/1)
    |> Enum.reject(&is_nil/1)
    # Only four cells exist; any further sources are not shown.
    |> Enum.take(4)
    |> Enum.map(&scale_to_cell/1)

  merged = compose_grid(cells)

  :ok = ExCubecl.Transcode.write_frame(state.encoder, merged)
  {:ok, state}
end

# Returns the next video frame of `src`, or nil once the source is exhausted.
defp read_video_frame(src) do
  case ExCubecl.Media.read_frame(src, :video) do
    {:ok, frame} -> frame
    {:error, :eof} -> nil
  end
end

# Scales a frame to the size of one grid cell.
defp scale_to_cell(frame) do
  {:ok, scaled} = ExCubecl.Video.scale(frame, width: @cell_width, height: @cell_height)
  scaled
end

# Lays the cells out row-major on a canvas of 2×2 cells:
#   [0] [1]
#   [2] [3]
defp compose_grid([]), do: raise("No frames available from any source")

defp compose_grid([single]) do
  # A lone remaining source simply fills the whole output.
  {:ok, full} =
    ExCubecl.Video.scale(single, width: @cell_width * 2, height: @cell_height * 2)

  full
end

defp compose_grid([first | _] = cells) do
  # Composite onto a full-size canvas. Overlaying one cell-sized frame onto
  # another at x = @cell_width would place it outside the base frame.
  # With fewer than four cells, the uncovered cells show this stretched
  # backdrop.
  # NOTE(review): assumes overlay output keeps the base frame's dimensions —
  # confirm against the ExCubecl.Video.overlay docs.
  {:ok, canvas} =
    ExCubecl.Video.scale(first, width: @cell_width * 2, height: @cell_height * 2)

  cells
  |> Enum.with_index()
  |> Enum.reduce(canvas, fn {cell, index}, acc ->
    {:ok, next} =
      ExCubecl.Video.overlay(acc, cell,
        x: rem(index, 2) * @cell_width,
        y: div(index, 2) * @cell_height,
        alpha: 1.0
      )

    next
  end)
end
end

Handling Different Resolutions
When input streams have different resolutions, normalize them before compositing:
defmodule ResolutionNormalizer do
@target_width 1920
@target_height 1080
@doc """
Returns `frame` at exactly `target_width` x `target_height`.

A frame that already has the target dimensions is returned untouched;
anything else is passed through `ExCubecl.Video.scale/2`, which does not
preserve the aspect ratio.
"""
def normalize(frame, target_width, target_height) do
  already_sized? = frame.width == target_width and frame.height == target_height

  if already_sized? do
    frame
  else
    {:ok, resized} =
      ExCubecl.Video.scale(frame, width: target_width, height: target_height)

    resized
  end
end
# Scale while preserving aspect ratio (letterbox)
@doc """
Scales `frame` to fit inside `target_width` x `target_height` while keeping
its aspect ratio (letterbox / pillarbox).

Returns `{scaled_frame, x_offset, y_offset}`. The offsets are where the
scaled frame must be overlaid to sit centred on a canvas of the target size;
the canvas itself is not created here.
"""
def normalize_preserve_aspect(frame, target_width, target_height) do
  source_ratio = frame.width / frame.height
  canvas_ratio = target_width / target_height

  {fit_w, fit_h} =
    if source_ratio > canvas_ratio do
      # Source is wider than the canvas: use the full width, shrink the height.
      {target_width, round(target_width / source_ratio)}
    else
      # Source is as wide or narrower: use the full height, shrink the width.
      {round(target_height * source_ratio), target_height}
    end

  {:ok, fitted} = ExCubecl.Video.scale(frame, width: fit_w, height: fit_h)

  # Split the leftover space evenly on both sides to centre the frame.
  {fitted, div(target_width - fit_w, 2), div(target_height - fit_h, 2)}
end
end

Handling Different Frame Rates
When streams have different frame rates, decide which drives the output:
defmodule FrameRateSync do
# Read frames from the secondary source at the pace of the primary.
# If the secondary is slower, repeat its last frame.
# If the secondary is faster, skip frames.
@doc """
Composites the secondary stream onto `primary_frame` at the primary's pace.

A new secondary frame is read only when the primary PTS has moved at least
one secondary frame interval past the last read. Otherwise, or once the
secondary reports `{:error, :eof}`, the most recent secondary frame (or
`state.fallback_frame` if none was read yet) is reused.

`state` must contain `:source_b`, `:encoder`, `:last_pts_b` and
`:fallback_frame`.
"""
def handle_frame(primary_frame, state) do
  {frame_b, state} = next_secondary_frame(primary_frame.pts, state)

  {:ok, composited} =
    ExCubecl.Video.overlay(primary_frame, frame_b, x: 1580, y: 820, alpha: 1.0)

  :ok = ExCubecl.Transcode.write_frame(state.encoder, composited)
  {:ok, state}
end

# Picks the secondary frame for this tick and returns it with the new state.
#
# last_pts_b is moved forward only when a read is attempted. Resetting it on
# every call would keep the measured gap at one primary frame interval, so a
# primary faster than the threshold would never advance the secondary.
defp next_secondary_frame(primary_pts, state) do
  if should_advance_secondary?(primary_pts, state.last_pts_b) do
    state = %{state | last_pts_b: primary_pts}

    case ExCubecl.Media.read_frame(state.source_b, :video) do
      {:ok, frame} ->
        # Keep the frame in pipeline state rather than the process dictionary.
        {frame, Map.put(state, :last_frame_b, frame)}

      {:error, :eof} ->
        # Source exhausted — repeat last frame
        {last_secondary_frame(state), state}
    end
  else
    {last_secondary_frame(state), state}
  end
end

# Most recently read secondary frame, or the configured fallback.
defp last_secondary_frame(state) do
  Map.get(state, :last_frame_b, state.fallback_frame)
end

# True when the primary has moved forward by at least one frame duration
# (assuming 30fps ≈ 33333µs per frame; the threshold is slightly lower to
# tolerate timestamp jitter).
defp should_advance_secondary?(primary_pts, last_pts_b) do
  primary_pts - last_pts_b >= 33_000
end
end

Audio Merging
Merge audio from both streams simultaneously:
defmodule AudioVideoMerge do
use ExCubecl.MediaPipeline
@doc """
Builds a PiP video frame and a mixed audio chunk for one tick and sends both
to the encoder.

NOTE(review): every read is matched assertively on `{:ok, _}`, so reaching
the end of either source raises a `MatchError`.
"""
def handle_frame(video_frame, state) do
  picture = composite_pip(video_frame, state.source_b)
  sound = mix_tracks(state.source_a, state.source_b)

  :ok = ExCubecl.Transcode.write_frame(state.encoder, picture)
  :ok = ExCubecl.Transcode.write_samples(state.encoder, sound)
  {:ok, state}
end

# Video: reads the next overlay frame, shrinks it to 320x240 and places it at
# (1580, 820) on the main frame.
defp composite_pip(main_frame, overlay_source) do
  {:ok, overlay_frame} = ExCubecl.Media.read_frame(overlay_source, :video)
  {:ok, pip} = ExCubecl.Video.scale(overlay_frame, width: 320, height: 240)
  {:ok, picture} = ExCubecl.Video.overlay(main_frame, pip, x: 1580, y: 820, alpha: 1.0)
  picture
end

# Audio: reads one chunk from each source and mixes them, with the overlay
# track at a slightly lower gain than the main track.
defp mix_tracks(main_source, overlay_source) do
  {:ok, main_audio} = ExCubecl.Media.read_frame(main_source, :audio)
  {:ok, overlay_audio} = ExCubecl.Media.read_frame(overlay_source, :audio)
  {:ok, mixed} = ExCubecl.Audio.mix([main_audio, overlay_audio], gains: [1.0, 0.7])
  mixed
end
end

Complete End-to-End Example
Merge two RTMP streams into a single PiP output:
defmodule ConferencePiP do
  @moduledoc """
  End-to-end example: merges a speaker stream and a guest stream into one
  picture-in-picture output with mixed audio.

  Speaker frames are pushed into the pipeline; for each one, a guest frame is
  read and composited into the bottom-right corner. When the guest stream
  ends or fails, the speaker frame is encoded on its own.
  """
  use ExCubecl.MediaPipeline

  @pip_width 480
  @pip_height 270
  @pip_margin 24

  @doc """
  Opens both streams, starts the encoder and launches the pipeline.

  `opts` is read for `:speaker_url`, `:guest_url` and `:output`. The pipeline
  is registered under the module name.
  """
  def start_link(opts) do
    encoder_opts = [
      video: [codec: :h264, width: 1920, height: 1080, bitrate: "6M", fps: 30],
      audio: [codec: :aac, bitrate: "192k", sample_rate: 48000]
    ]

    {:ok, speaker} = ExCubecl.Media.open(opts[:speaker_url])
    {:ok, guest} = ExCubecl.Media.open(opts[:guest_url])
    {:ok, enc} = ExCubecl.Transcode.start(opts[:output], encoder_opts)

    initial_state = %{
      speaker_src: speaker,
      guest_src: guest,
      encoder: enc,
      frame_count: 0,
      errors: []
    }

    ExCubecl.MediaPipeline.start_link(__MODULE__, initial_state, name: __MODULE__)
  end

  @doc """
  Encodes one output frame for the given speaker `frame`.

  Always returns `{:ok, state}` with `frame_count` incremented by one.
  """
  def handle_frame(frame, state) do
    case ExCubecl.Media.read_frame(state.guest_src, :video) do
      {:ok, guest_frame} ->
        :ok = ExCubecl.Transcode.write_frame(state.encoder, compose_pip(frame, guest_frame))
        mix_audio(state)
        {:ok, count_frame(state)}

      {:error, :eof} ->
        # Guest stream ended — continue with speaker only
        write_speaker_only(frame, state)

      {:error, reason} ->
        IO.puts("Guest read error: #{inspect(reason)}")
        write_speaker_only(frame, state)
    end
  end

  @doc """
  Entry point for pushed frames: delegates to `handle_frame/2` and translates
  its result into a `:noreply` / `:stop` tuple.
  """
  def handle_info({:frame, frame}, state) do
    case handle_frame(frame, state) do
      {:ok, new_state} -> {:noreply, new_state}
      {:error, reason} -> {:stop, reason, state}
    end
  end

  # Reads one audio chunk from each stream, mixes them and writes the result.
  # A failed read or mix is only logged; the video frame has already been
  # written at this point.
  defp mix_audio(state) do
    with {:ok, spk_audio} <- ExCubecl.Media.read_frame(state.speaker_src, :audio),
         {:ok, gst_audio} <- ExCubecl.Media.read_frame(state.guest_src, :audio),
         {:ok, mixed} <- ExCubecl.Audio.mix([spk_audio, gst_audio], gains: [1.0, 0.8]) do
      :ok = ExCubecl.Transcode.write_samples(state.encoder, mixed)
    else
      {:error, reason} ->
        IO.puts("Audio merge error: #{inspect(reason)}")
    end
  end

  # Encodes the speaker frame without a PiP window.
  defp write_speaker_only(frame, state) do
    :ok = ExCubecl.Transcode.write_frame(state.encoder, frame)
    {:ok, count_frame(state)}
  end

  defp count_frame(state), do: %{state | frame_count: state.frame_count + 1}

  # Scales the guest frame to PiP size and overlays it in the bottom-right
  # corner of the main frame, @pip_margin pixels away from both edges.
  defp compose_pip(main_frame, guest_frame) do
    {:ok, scaled} =
      ExCubecl.Video.scale(guest_frame, width: @pip_width, height: @pip_height)

    pip_x = main_frame.width - @pip_width - @pip_margin
    pip_y = main_frame.height - @pip_height - @pip_margin

    # alpha 0.95 is applied to the whole PiP window, leaving it slightly
    # translucent; no corner mask is involved.
    {:ok, composited} =
      ExCubecl.Video.overlay(main_frame, scaled, x: pip_x, y: pip_y, alpha: 0.95)

    composited
  end
end
# Usage:
# {:ok, _pid} = ConferencePiP.start_link(
# speaker_url: "rtmp://server/speaker",
# guest_url: "rtmp://server/guest",
# output: "conference_output.mp4"
# )

Saving Snapshots
Capture a still frame from the merged output at any point:
# Save a thumbnail of the current merged frame
:ok = ExCubecl.Video.snapshot(composited_frame, "thumbnail.png")
# Save periodically (e.g., every 300 frames)
if rem(state.frame_count, 300) == 0 do
filename = "snapshots/frame_#{state.frame_count}.png"
File.mkdir_p!("snapshots")
ExCubecl.Video.snapshot(composited_frame, filename)
end

Note:
Video.snapshot/2 triggers a GPU→CPU readback. Use it sparingly in performance-critical paths. For real-time previews, consider reducing the resolution before snapshotting.
Performance Tips
Scale before overlay: Always scale the overlay to its final size before compositing. Scaling a small region is cheaper than scaling the entire frame.
Match resolutions early: If both sources are the same resolution, skip the scale step entirely.
Use filter chains: Combine scale + color correction in a single
Filter.chain/2 call to minimize GPU kernel launches.
Avoid snapshots in the hot path: GPU→CPU readbacks are expensive. Save thumbnails asynchronously or at low frequency.
Pipeline mode for fixed layouts: If your merge layout doesn't change, use
ExCubecl.pipeline() directly instead of the GenServer for lower overhead:
# Build a two-step pipeline: scale the guest frame, then overlay it on the
# main frame.
# NOTE(review): pip_scaled and output are not defined in this snippet —
# presumably pre-allocated buffer handles; confirm against the pipeline API.
{:ok, pipeline} = ExCubecl.pipeline()
# Step 1: "bicubic_scale" reads the guest frame and writes into pip_scaled.
:ok = ExCubecl.pipeline_add(pipeline, "bicubic_scale",
[guest_frame.handle], pip_scaled, %{width: 320, height: 240})
# Step 2: "overlay_alpha" reads the main frame plus pip_scaled and writes into output.
:ok = ExCubecl.pipeline_add(pipeline, "overlay_alpha",
[main_frame.handle, pip_scaled], output, %{x: 1580, y: 820, alpha: 1.0})
# Run the queued steps.
{:ok, _cmd_ids} = ExCubecl.pipeline_run(pipeline)
:ok = ExCubecl.pipeline_free(pipeline)

- Handle stream endings gracefully: Always match on
{:error, :eof} when reading from secondary sources. Decide whether to continue with the primary only or stop the merge.