View Source Bit level parsing // MP3 header parser

One token module per header element

One way to cope with the problem is to create one specific token type for each part of the header.

This allows errors to be caught at the lowest level, relieving callbacks of error handling.

defmodule FrameSync do
  @moduledoc """
  Token for the frame sync marker that opens the header: one `0xFF` byte
  followed by three set bits (11 bits in total).
  """

  # The marker never varies, so it is stored as the field's default value.
  defstruct frame_sync: <<0xFF, 0b111::size(3)>>

  @doc "Returns the frame sync token prototype."
  def new do
    %__MODULE__{}
  end
end

defimpl Grammar.Tokenizer.TokenExtractor, for: FrameSync do
  # Yields a FrameSync token and a length of 11 bits when the input starts with
  # the sync marker; any other input yields nil.
  def try_read(_prototype, input) do
    case input do
      <<0xFF, 0b111::size(3), _tail::bitstring>> -> {%FrameSync{}, 11}
      _other -> nil
    end
  end

  # Two tokens match only when both are FrameSync structs carrying the marker.
  def match?(prototype, token) do
    case {prototype, token} do
      {%FrameSync{frame_sync: <<0xFF, 0b111::size(3)>>},
       %FrameSync{frame_sync: <<0xFF, 0b111::size(3)>>}} ->
        true

      _other ->
        false
    end
  end
end
defmodule AudioVersionID do
  @moduledoc """
  Token for the 2-bit audio version ID field.

  A prototype with a `nil` id captures whatever value is read; a prototype
  with a concrete id only accepts that value.
  """

  defstruct id: nil

  @doc "Returns a capturing prototype (id left as `nil`)."
  def new, do: struct(__MODULE__)

  @doc """
  Builds a token carrying `id`.

  The value `1` is refused: no clause matches it, so the call raises
  `FunctionClauseError`.
  """
  def new(id) when id != 1, do: struct(__MODULE__, id: id)
end

defimpl Grammar.Tokenizer.TokenExtractor, for: AudioVersionID do
  # Reads the next two bits. The pattern 0b01 is always refused; otherwise the
  # value is accepted when the prototype is a capture (nil id) or when it
  # equals the prototype's id.
  def try_read(%AudioVersionID{id: wanted}, <<read::size(2), _tail::bitstring>>) do
    cond do
      read == 0b01 -> nil
      is_nil(wanted) or wanted === read -> {AudioVersionID.new(read), 2}
      true -> nil
    end
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any AudioVersionID token; otherwise both
  # terms must be identical.
  def match?(prototype, token) do
    case {prototype, token} do
      {%AudioVersionID{id: nil}, %AudioVersionID{}} -> true
      {same, same} -> true
      _other -> false
    end
  end
end
defmodule LayerDescription do
  @moduledoc """
  Token for the 2-bit layer description field.

  A prototype with a `nil` id captures whatever value is read; a prototype
  with a concrete id only accepts that value.
  """

  defstruct id: nil

  @doc "Returns a capturing prototype (id left as `nil`)."
  def new, do: struct(__MODULE__)

  @doc """
  Builds a token carrying `id`.

  The value `0` is refused: no clause matches it, so the call raises
  `FunctionClauseError`.
  """
  def new(id) when id != 0, do: struct(__MODULE__, id: id)
end

defimpl Grammar.Tokenizer.TokenExtractor, for: LayerDescription do
  # Reads the next two bits. Zero is always refused; otherwise the value is
  # accepted when the prototype is a capture (nil id) or when it equals the
  # prototype's id.
  def try_read(%LayerDescription{id: wanted}, <<read::size(2), _tail::bitstring>>)
      when read != 0 and (wanted == nil or wanted === read) do
    {LayerDescription.new(read), 2}
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any LayerDescription token; otherwise both
  # terms must be identical.
  def match?(%LayerDescription{id: nil}, %LayerDescription{}), do: true
  def match?(prototype, token), do: prototype === token
end
alias Grammar.Tokenizer.TokenExtractor

capture_any_layer = LayerDescription.new()

# Forbidden: a zero layer field never produces a token.
nil = TokenExtractor.try_read(capture_any_layer, <<0::2>>)

# Capture: a prototype without an id takes the value found in the input.
{%{id: 2}, 2} = TokenExtractor.try_read(capture_any_layer, <<2::2>>)

# Match: the input carries exactly the id the prototype asks for.
{%{id: 3}, 2} = TokenExtractor.try_read(LayerDescription.new(3), <<3::2>>)

# Match error: the input carries a different id than the prototype.
nil = TokenExtractor.try_read(LayerDescription.new(1), <<3::2>>)
defmodule Protection do
  @moduledoc """
  Token for the 1-bit protection flag.

  `protected?` is `nil` on a capturing prototype, otherwise a boolean.
  """

  defstruct protected?: nil

  @doc "Returns a capturing prototype (flag left as `nil`)."
  def new, do: struct(__MODULE__)

  @doc "Builds a token carrying the given boolean flag; non-booleans are refused."
  def new(protected?) when protected? in [true, false] do
    struct(__MODULE__, protected?: protected?)
  end
end

defimpl Grammar.Tokenizer.TokenExtractor, for: Protection do
  # Reads one bit and turns it into a boolean (set bit -> true). A capturing
  # prototype accepts either value; a concrete prototype accepts only its own.
  # NOTE(review): MPEG audio headers conventionally use a protection bit of 0
  # to mean "CRC protected" — confirm that mapping a set bit to `true` is the
  # intended polarity.
  def try_read(%Protection{protected?: expected}, <<bit::size(1), _tail::bitstring>>) do
    observed = bit === 1

    if is_nil(expected) or expected === observed do
      {Protection.new(observed), 1}
    else
      nil
    end
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any Protection token; a concrete prototype
  # matches only an identical token.
  def match?(%Protection{protected?: nil}, %Protection{}), do: true
  def match?(%Protection{} = prototype, token), do: prototype === token
  def match?(_prototype, _token), do: false
end
defmodule BitrateIndex do
  @moduledoc """
  Token for the 4-bit bitrate index field.

  A prototype with a `nil` id captures whatever value is read; a prototype
  with a concrete id only accepts that value.
  """

  defstruct [:id]

  @doc "Returns a capturing prototype (id left as `nil`)."
  def new, do: %__MODULE__{}

  @doc """
  Builds a token carrying `id`.

  The value `0xF` is refused, mirroring the pattern the token extractor
  rejects; passing it raises `FunctionClauseError`. Every other index,
  including `1`, is accepted.
  """
  # The guard used to exclude 1, which made reading a header whose bitrate
  # index is 1 crash, while the value actually rejected on input is 0xF.
  def new(id) when id != 0xF do
    %__MODULE__{id: id}
  end
end

defimpl Grammar.Tokenizer.TokenExtractor, for: BitrateIndex do
  # Reads the next four bits. 0b1111 is always refused; otherwise the value is
  # accepted when the prototype is a capture (nil id) or when it equals the
  # prototype's id.
  def try_read(%BitrateIndex{id: wanted}, <<read::size(4), _tail::bitstring>>)
      when read != 0b1111 and (wanted == nil or wanted === read) do
    {BitrateIndex.new(read), 4}
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any BitrateIndex token; otherwise both terms
  # must be identical.
  def match?(prototype, token) do
    case {prototype, token} do
      {%BitrateIndex{id: nil}, %BitrateIndex{}} -> true
      {same, same} -> true
      _other -> false
    end
  end
end
defmodule SamplingRateFreqIdx do
  @moduledoc """
  Token for the 2-bit sampling rate frequency index field.

  A prototype with a `nil` id captures whatever value is read; a prototype
  with a concrete id only accepts that value.
  """

  defstruct id: nil

  @doc "Returns a capturing prototype (id left as `nil`)."
  def new, do: struct(__MODULE__)

  @doc """
  Builds a token carrying `id`.

  The value `0b11` is refused: no clause matches it, so the call raises
  `FunctionClauseError`.
  """
  def new(id) when id != 3, do: struct(__MODULE__, id: id)
end

defimpl Grammar.Tokenizer.TokenExtractor, for: SamplingRateFreqIdx do
  # Reads the next two bits. 0b11 is always refused; otherwise the value is
  # accepted when the prototype is a capture (nil id) or when it equals the
  # prototype's id.
  def try_read(%SamplingRateFreqIdx{id: wanted}, <<read::size(2), _tail::bitstring>>) do
    cond do
      read == 0b11 -> nil
      is_nil(wanted) or wanted === read -> {SamplingRateFreqIdx.new(read), 2}
      true -> nil
    end
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any SamplingRateFreqIdx token; otherwise both
  # terms must be identical.
  def match?(%SamplingRateFreqIdx{id: nil}, %SamplingRateFreqIdx{}), do: true
  def match?(prototype, token), do: prototype === token
end
defmodule Padding do
  @moduledoc """
  Token for the 1-bit padding flag.

  `padded?` is `nil` on a capturing prototype, otherwise a boolean.
  """

  defstruct padded?: nil

  @doc "Returns a capturing prototype (flag left as `nil`)."
  def new, do: struct(__MODULE__)

  @doc "Builds a token carrying the given boolean flag; non-booleans are refused."
  def new(padded?) when padded? in [true, false] do
    struct(__MODULE__, padded?: padded?)
  end
end

defimpl Grammar.Tokenizer.TokenExtractor, for: Padding do
  # Reads one bit and turns it into a boolean (set bit -> true). A capturing
  # prototype accepts either value; a concrete prototype accepts only its own.
  def try_read(%Padding{padded?: expected}, <<bit::size(1), _tail::bitstring>>) do
    observed = bit === 1

    if is_nil(expected) or expected === observed do
      {Padding.new(observed), 1}
    else
      nil
    end
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any Padding token; a concrete prototype
  # matches only an identical token.
  def match?(%Padding{padded?: nil}, %Padding{}), do: true
  def match?(%Padding{} = prototype, token), do: prototype === token
  def match?(_prototype, _token), do: false
end
defmodule Private do
  @moduledoc """
  Token for the 1-bit private flag.

  `private?` is `nil` on a capturing prototype, otherwise a boolean.
  """

  defstruct private?: nil

  @doc "Returns a capturing prototype (flag left as `nil`)."
  def new, do: struct(__MODULE__)

  @doc "Builds a token carrying the given boolean flag; non-booleans are refused."
  def new(private?) when private? in [true, false] do
    struct(__MODULE__, private?: private?)
  end
end

defimpl Grammar.Tokenizer.TokenExtractor, for: Private do
  # Reads one bit. A capturing prototype (nil flag) converts it to a boolean;
  # a concrete prototype only accepts the bit that corresponds to its flag.
  def try_read(%Private{private?: expected}, <<bit::size(1), _tail::bitstring>>) do
    case {expected, bit} do
      {nil, _any} -> {Private.new(bit === 1), 1}
      {true, 1} -> {Private.new(true), 1}
      {false, 0} -> {Private.new(false), 1}
      _mismatch -> nil
    end
  end

  def try_read(_prototype, _input), do: nil

  # A capturing prototype matches any Private token; a concrete prototype
  # matches only an identical token.
  def match?(%Private{private?: nil}, %Private{}), do: true
  def match?(%Private{} = prototype, token), do: prototype === token
  def match?(_prototype, _token), do: false
end
# Sample 24-bit header used by the examples, spelled out field by field.
test_header = <<
  # Frame sync: eleven set bits, split as one byte plus three bits
  0b11111111::size(8),
  0b111::size(3),
  # MPEG Audio version ID
  0b10::size(2),
  # Layer description
  0b01::size(2),
  # Protection bit
  0b1::size(1),
  # Bitrate index
  0b0011::size(4),
  # Sampling rate frequency index
  0b01::size(2),
  # Padding bit
  0b0::size(1),
  # Private bit
  0b1::size(1)
>>

With all those token types, the grammar is basically a one-liner, i.e. a single call to add_clause.

# One token prototype per header field, in the order the fields are read.
header_tokens = [
  FrameSync.new(),
  AudioVersionID.new(),
  LayerDescription.new(),
  Protection.new(),
  BitrateIndex.new(),
  SamplingRateFreqIdx.new(),
  Padding.new(),
  Private.new()
]

# Flattens the matched tokens into a plain map; the frame sync token carries
# no information and is dropped.
to_header_map = fn [_sync, version, layer, protection, bitrate, sampling, padding, private] ->
  %{
    audio_id: version.id,
    layer_desc: layer.id,
    protection?: protection.protected?,
    bitrate_idx: bitrate.id,
    sampling_rf_idx: sampling.id,
    padding?: padding.padded?,
    private: private.private?
  }
end

Grammar.new()
|> Grammar.add_clause(:mp3_header, header_tokens, to_header_map)
|> Grammar.prepare!()
|> Grammar.start(:mp3_header)
|> Grammar.loop(Grammar.Tokenizer.new(test_header, false, true))

A finer approach with generic tokens and rules

Let's define a generic token that matches a span of bits with a specific length.

When using a BitSpan entity as a constant value in a rule definition, the user is responsible for providing a value that accounts for endianness, which is big-endian (Elixir's default) in the following implementation.

The same applies when interpreting the value extracted by the token in callback functions.

defmodule BitSpan do
  @moduledoc """
  Generic token describing a run of `size` bits.

  With a `nil` value the token captures whatever integer the bits encode; with
  a concrete value it only accepts that integer.
  """

  defstruct size: nil, value: nil

  @doc """
  Builds a bit span of `size` bits, optionally pinned to `value`.

  `size` must be an integer, otherwise no clause matches.
  """
  def new(size, value \\ nil) when is_integer(size) do
    struct(__MODULE__, size: size, value: value)
  end
end

defimpl Grammar.Tokenizer.TokenExtractor, for: BitSpan do
  # Capture mode: take the next `size` bits as an unsigned integer, whatever
  # they are. Input shorter than `size` bits yields nil.
  def try_read(%BitSpan{size: size, value: nil}, input) do
    with <<read::size(size), _rest::bitstring>> <- input do
      {BitSpan.new(size, read), size}
    else
      _ -> nil
    end
  end

  # Constant mode: the next `size` bits must encode exactly `value`.
  def try_read(%BitSpan{size: size, value: value}, input) do
    with <<^value::size(^size), _rest::bitstring>> <- input do
      {BitSpan.new(size, value), size}
    else
      _ -> nil
    end
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any BitSpan token; otherwise both terms must
  # be identical.
  def match?(prototype, token) do
    case {prototype, token} do
      {%BitSpan{value: nil}, %BitSpan{}} -> true
      {same, same} -> true
      _other -> false
    end
  end
end
alias Grammar.Tokenizer.TokenExtractor

# Capture: an unpinned two-bit span takes the value found in the input.
{%{value: 2, size: 2}, 2} = TokenExtractor.try_read(BitSpan.new(2), <<2::2>>)

# Match: the input carries exactly the pinned value.
{%{value: 3, size: 2}, 2} = TokenExtractor.try_read(BitSpan.new(2, 3), <<3::2>>)

# Match error: the input carries a different value than the pinned one.
nil = TokenExtractor.try_read(BitSpan.new(2, 1), <<3::2>>)

# Sub-byte capture: the first 11 bits of 0xFF followed by 0b0001 read as 2040.
{%{value: 2040, size: 11}, 11} = TokenExtractor.try_read(BitSpan.new(11), <<0xFF::8, 0x1::4>>)

Using generic bit span matching requires handling errors in callback functions.

The generic token cannot detect invalid patterns (e.g. 0x01 for audio version ID field), and so the callback must either handle / propagate errors upstream, or throw an exception.

Let's now define the MP3Header struct that will gather the extracted information.

defmodule MP3Header do
  @moduledoc """
  Collects the header fields extracted by the grammar.

  Each field holds the value produced by a grammar callback, or `:error` when
  the callback flagged the value as invalid. Fields start out as `nil`.
  """

  defstruct [:audio, :layer]

  # Both struct fields are described; `nil` is the default before a field is set.
  @type t :: %__MODULE__{
          audio: integer() | :error | nil,
          layer: integer() | :error | nil
        }

  @doc """
  Stores a callback result in `field`.

  `:error` is stored as-is; `{:ok, value}` is unwrapped and `value` is stored.
  Any other result shape matches no clause and raises `FunctionClauseError`.
  """
  @spec set_field(t(), atom(), :error | {:ok, term()}) :: t()
  def set_field(%__MODULE__{} = header, field, :error), do: struct(header, [{field, :error}])
  def set_field(%__MODULE__{} = header, field, {:ok, value}), do: struct(header, [{field, value}])
end

# Audio version 2 and layer 1: both fields hold accepted values.
valid_header = <<0xFF, 0b111::3, 0b10::2, 0b01::2, 0b1::1, 0b0011::4, 0b01::2, 0b0::1, 0b1::1>>
# Audio version field set to 1, the value the grammar flags as an error.
wrong_audio_header = <<0xFF, 0b111::3, 0b01::2, 0b01::2, 0b1::1, 0b0011::4, 0b01::2, 0b0::1, 0b1::1>>
# Layer field set to 0, the value the grammar flags as an error.
wrong_layer_header = <<0xFF, 0b111::3, 0b00::2, 0b00::2, 0b1::1, 0b0011::4, 0b01::2, 0b0::1, 0b1::1>>

Now we expose a Grammar that uses the MP3Header struct and only BitSpan tokens to extract data from the input bitstring.

Note how we must manually handle erroneous values for the header sub-parts.

# Builds a callback for a single-token clause: the reserved value becomes
# :error, any other value is wrapped in {:ok, value}.
reject_value = fn reserved ->
  fn
    [%{value: ^reserved}] -> :error
    [%{value: value}] -> {:ok, value}
  end
end

# Assembles the header struct from the sub-rule results; the frame sync
# result is ignored.
build_header = fn [_sync, audio, layer] ->
  %MP3Header{}
  |> MP3Header.set_field(:audio, audio)
  |> MP3Header.set_field(:layer, layer)
end

Grammar.new()
|> Grammar.add_clause(:mp3_header, [:frame_sync, :audio_version, :layer_desc], build_header)
|> Grammar.add_clause(:frame_sync, [BitSpan.new(11, 2047)], fn [%{value: sync}] -> sync end)
|> Grammar.add_clause(:audio_version, [BitSpan.new(2)], reject_value.(1))
|> Grammar.add_clause(:layer_desc, [BitSpan.new(2)], reject_value.(0))
# Remaining header fields are not wired into the grammar yet:
# |> Grammar.add_clause(:protection, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:bitrate, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:sampling, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:padding, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:private, ["foo"], fn _ -> :ok end)
|> Grammar.prepare!()
|> Grammar.start(:mp3_header)
|> Grammar.loop(Grammar.Tokenizer.new(wrong_layer_header, false, true))

Using the DSL

The DSL can be used to express the same grammar. Just add the option sub_byte_matching: true when using Grammar to enable sub byte token extraction.

# DSL version of the BitSpan-based grammar: each `rule` declares the sequence
# of sub-rules or tokens it expects, and its body turns the matched items
# (available as `params`) into the rule's result.
defmodule MP3HeaderParser do
  # sub_byte_matching: true enables sub-byte token extraction.
  use Grammar, sub_byte_matching: true

  # Entry rule: combines the sub-rule results into an MP3Header struct.
  rule mp3_header(:frame_sync, :audio_version, :layer_desc) do
    # The frame sync result carries no information and is ignored.
    [_, audio, layer] = params

    %MP3Header{}
    |> MP3Header.set_field(:audio, audio)
    |> MP3Header.set_field(:layer, layer)
  end

  # 11 bits pinned to 2047, i.e. eleven set bits.
  rule frame_sync(BitSpan.new(11, 2047)) do
    [%{value: value}] = params
    value
  end

  # Two captured bits; the value 1 is reported as :error.
  rule audio_version(BitSpan.new(2)) do
    case params do
      [%{value: 1}] -> :error
      [%{value: value}] -> {:ok, value}
    end
  end

  # Two captured bits; the value 0 is reported as :error.
  rule layer_desc(BitSpan.new(2)) do
    case params do
      [%{value: 0}] -> :error
      [%{value: value}] -> {:ok, value}
    end
  end
end

# Runs the DSL parser on the sample header defined earlier in the document.
MP3HeaderParser.parse(test_header)