View Source Bit level parsing // MP3 header parser
One token module per header element
One way to cope with the problem is to create one specific token type for each part of the header.
This allows errors to be caught at the lowest level, relieving callbacks of error handling.
defmodule FrameSync do
  @moduledoc """
  Token for the frame sync marker: 11 bits, all set to 1.
  """

  defstruct frame_sync: <<0xFF, 7::3>>

  @doc "Builds the frame sync token prototype."
  def new do
    struct(__MODULE__)
  end
end
defimpl Grammar.Tokenizer.TokenExtractor, for: FrameSync do
  # Consumes 11 bits when the input starts with the frame sync marker; nil otherwise.
  def try_read(_token_prototype, input) do
    case input do
      <<0xFF, 7::3, _rest::bitstring>> -> {%FrameSync{}, 11}
      _ -> nil
    end
  end

  # Two frame sync tokens match only when both carry the expected marker.
  def match?(%FrameSync{frame_sync: marker}, %FrameSync{frame_sync: marker})
      when marker === <<0xFF, 7::3>>,
      do: true

  def match?(_token_prototype, _token), do: false
end
defmodule AudioVersionID do
  @moduledoc """
  Token for the 2-bit audio version ID field of an MP3 header.
  """

  defstruct id: nil

  @doc "Builds a capturing prototype (no expected value)."
  def new do
    %__MODULE__{}
  end

  @doc """
  Builds a token carrying `id`.

  No clause matches when `id` equals 1, so a `FunctionClauseError` is raised.
  """
  def new(id) when id != 1, do: %__MODULE__{id: id}
end
defimpl Grammar.Tokenizer.TokenExtractor, for: AudioVersionID do
  # Reads the 2-bit audio version ID. The value 1 is never accepted. A prototype
  # with a nil id captures any other value; a prototype with an id accepts only
  # that exact value.
  def try_read(%AudioVersionID{id: expected}, <<read::size(2), _rest::bitstring>>)
      when read != 1 and (is_nil(expected) or expected === read) do
    {AudioVersionID.new(read), 2}
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any AudioVersionID token; otherwise the two
  # terms must be strictly identical.
  def match?(%AudioVersionID{id: nil}, %AudioVersionID{}), do: true
  def match?(token_prototype, token), do: token_prototype === token
end
defmodule LayerDescription do
  @moduledoc """
  Token for the 2-bit layer description field of an MP3 header.
  """

  defstruct id: nil

  @doc "Builds a capturing prototype (no expected value)."
  def new do
    %__MODULE__{}
  end

  @doc """
  Builds a token carrying `id`.

  No clause matches when `id` equals 0, so a `FunctionClauseError` is raised.
  """
  def new(id) when id != 0, do: %__MODULE__{id: id}
end
defimpl Grammar.Tokenizer.TokenExtractor, for: LayerDescription do
  # Reads the 2-bit layer description. The value 0 is never accepted. A prototype
  # with a nil id captures any other value; a prototype with an id accepts only
  # that exact value.
  def try_read(%LayerDescription{id: expected}, <<read::size(2), _rest::bitstring>>)
      when read != 0 and (is_nil(expected) or expected === read) do
    {LayerDescription.new(read), 2}
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any LayerDescription token; otherwise the two
  # terms must be strictly identical.
  def match?(%LayerDescription{id: nil}, %LayerDescription{}), do: true
  def match?(token_prototype, token), do: token_prototype === token
end
alias Grammar.Tokenizer.TokenExtractor

# Forbidden: the layer value 0 is rejected even by a capturing prototype.
nil = TokenExtractor.try_read(LayerDescription.new(), <<0::2>>)
# Capture: a prototype without an id returns whatever valid value it reads.
{%{id: 2}, 2} = TokenExtractor.try_read(LayerDescription.new(), <<2::2>>)
# Match: a prototype with an id accepts that exact value.
{%{id: 3}, 2} = TokenExtractor.try_read(LayerDescription.new(3), <<3::2>>)
# Match error: the input holds 3 while the prototype expects 1.
nil = TokenExtractor.try_read(LayerDescription.new(1), <<3::2>>)
defmodule Protection do
  @moduledoc """
  Token for the 1-bit protection flag of an MP3 header.
  """

  defstruct protected?: nil

  @doc "Builds a capturing prototype (no expected value)."
  def new do
    %__MODULE__{}
  end

  @doc "Builds a token carrying the given boolean flag."
  def new(protected?) when protected? in [true, false] do
    %__MODULE__{protected?: protected?}
  end
end
defimpl Grammar.Tokenizer.TokenExtractor, for: Protection do
  # Reads one bit as a boolean. A prototype with a nil flag captures either value;
  # a prototype with a flag accepts only the matching bit.
  def try_read(%Protection{protected?: expected}, <<bit::1, _rest::bitstring>>) do
    flag = bit === 0b1

    if is_nil(expected) or expected === flag do
      {Protection.new(flag), 1}
    else
      nil
    end
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any Protection token; otherwise the prototype
  # and the token must be strictly identical.
  def match?(%Protection{protected?: nil}, %Protection{}), do: true
  def match?(%Protection{} = token_prototype, token), do: token_prototype === token
  def match?(_token_prototype, _token), do: false
end
defmodule BitrateIndex do
  @moduledoc """
  Token for the 4-bit bitrate index field of an MP3 header.
  """

  defstruct [:id]

  @doc "Builds a capturing prototype (no expected value)."
  def new, do: %__MODULE__{}

  @doc """
  Builds a token carrying `id`.

  No clause matches when `id` equals `0xF`, so a `FunctionClauseError` is raised.
  """
  # The rejected value must be 0xF, the one the BitrateIndex token extractor
  # refuses to read. The previous guard (`id != 1`) made `new(1)` raise, which
  # crashed `try_read` on a bitrate index of 1.
  def new(id) when id != 0xF do
    %__MODULE__{id: id}
  end
end
defimpl Grammar.Tokenizer.TokenExtractor, for: BitrateIndex do
  # Reads the 4-bit bitrate index. The value 0xF is never accepted. A prototype
  # with a nil id captures any other value; a prototype with an id accepts only
  # that exact value.
  def try_read(%BitrateIndex{id: expected}, <<read::size(4), _rest::bitstring>>)
      when read != 0xF and (is_nil(expected) or expected === read) do
    {BitrateIndex.new(read), 4}
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any BitrateIndex token; otherwise the two
  # terms must be strictly identical.
  def match?(%BitrateIndex{id: nil}, %BitrateIndex{}), do: true
  def match?(token_prototype, token), do: token_prototype === token
end
defmodule SamplingRateFreqIdx do
  @moduledoc """
  Token for the 2-bit sampling rate frequency index field of an MP3 header.
  """

  defstruct id: nil

  @doc "Builds a capturing prototype (no expected value)."
  def new do
    %__MODULE__{}
  end

  @doc """
  Builds a token carrying `id`.

  No clause matches when `id` equals `0b11`, so a `FunctionClauseError` is raised.
  """
  def new(id) when id != 0b11, do: %__MODULE__{id: id}
end
defimpl Grammar.Tokenizer.TokenExtractor, for: SamplingRateFreqIdx do
  # Reads the 2-bit sampling rate index. The value 0b11 is never accepted. A
  # prototype with a nil id captures any other value; a prototype with an id
  # accepts only that exact value.
  def try_read(%SamplingRateFreqIdx{id: expected}, <<read::size(2), _rest::bitstring>>)
      when read != 0b11 and (is_nil(expected) or expected === read) do
    {SamplingRateFreqIdx.new(read), 2}
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any SamplingRateFreqIdx token; otherwise the
  # two terms must be strictly identical.
  def match?(%SamplingRateFreqIdx{id: nil}, %SamplingRateFreqIdx{}), do: true
  def match?(token_prototype, token), do: token_prototype === token
end
defmodule Padding do
  @moduledoc """
  Token for the 1-bit padding flag of an MP3 header.
  """

  defstruct padded?: nil

  @doc "Builds a capturing prototype (no expected value)."
  def new do
    %__MODULE__{}
  end

  @doc "Builds a token carrying the given boolean flag."
  def new(padded?) when padded? in [true, false] do
    %__MODULE__{padded?: padded?}
  end
end
defimpl Grammar.Tokenizer.TokenExtractor, for: Padding do
  # Reads one bit as a boolean. A prototype with a nil flag captures either value;
  # a prototype with a flag accepts only the matching bit.
  def try_read(%Padding{padded?: expected}, <<bit::1, _rest::bitstring>>) do
    flag = bit === 0b1

    if is_nil(expected) or expected === flag do
      {Padding.new(flag), 1}
    else
      nil
    end
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any Padding token; otherwise the prototype and
  # the token must be strictly identical.
  def match?(%Padding{padded?: nil}, %Padding{}), do: true
  def match?(%Padding{} = token_prototype, token), do: token_prototype === token
  def match?(_token_prototype, _token), do: false
end
defmodule Private do
  @moduledoc """
  Token for the 1-bit private flag of an MP3 header.
  """

  defstruct private?: nil

  @doc "Builds a capturing prototype (no expected value)."
  def new do
    %__MODULE__{}
  end

  @doc "Builds a token carrying the given boolean flag."
  def new(private?) when private? in [true, false] do
    %__MODULE__{private?: private?}
  end
end
defimpl Grammar.Tokenizer.TokenExtractor, for: Private do
  # Reads one bit as a boolean. A prototype with a nil flag captures either value;
  # a prototype with a flag accepts only the matching bit.
  def try_read(%Private{private?: expected}, <<bit::1, _rest::bitstring>>) do
    flag = bit === 0b1

    if is_nil(expected) or expected === flag do
      {Private.new(flag), 1}
    else
      nil
    end
  end

  def try_read(_token_prototype, _input), do: nil

  # A capturing prototype matches any Private token; otherwise the prototype and
  # the token must be strictly identical.
  def match?(%Private{private?: nil}, %Private{}), do: true
  def match?(%Private{} = token_prototype, token), do: token_prototype === token
  def match?(_token_prototype, _token), do: false
end
# Sample header assembled field by field: 11 + 2 + 2 + 1 + 4 + 2 + 1 + 1 = 24 bits.
# Every field holds a value that the token types defined in this file accept.
test_header = <<
0xFF::8, # Frame sync
0x7::3, # Frame sync
2::2, # MPEG Audio version ID
1::2, # Layer description
1::1, # Protection bit
3::4, # Bitrate index
1::2, # Sampling rate frequency index
0::1, # Padding bit
1::1 # Private bit
>>
With all those token types, the grammar is basically a one-liner, i.e. a single call to add_clause.
# Build a grammar with a single :mp3_header clause whose body is the eight token
# prototypes, listed in the order the fields appear in the header.
Grammar.new()
|> Grammar.add_clause(:mp3_header, [
FrameSync.new(),
AudioVersionID.new(),
LayerDescription.new(),
Protection.new(),
BitrateIndex.new(),
SamplingRateFreqIdx.new(),
Padding.new(),
Private.new()
# The callback destructures a list of eight tokens, in the same order as the
# prototypes above, and flattens them into a plain map. The frame sync token
# carries no information and is discarded.
], fn [
_magic,
audio_id,
layer_desc,
protection,
bitrate_idx,
sampling_rf_idx,
padding,
private] ->
%{
audio_id: audio_id.id,
layer_desc: layer_desc.id,
protection?: protection.protected?,
bitrate_idx: bitrate_idx.id,
sampling_rf_idx: sampling_rf_idx.id,
padding?: padding.padded?,
private: private.private?
}
end)
|> Grammar.prepare!()
|> Grammar.start(:mp3_header)
# NOTE(review): the meaning of the `false, true` arguments is not visible here —
# presumably the last one enables sub-byte matching; confirm against the
# Grammar.Tokenizer documentation.
|> Grammar.loop(Grammar.Tokenizer.new(test_header, false, true))
A finer approach with generic tokens and rules
Let's define a generic token that matches a span of bits of a specific length.
When using a BitSpan entity as a constant value in a rule definition, the user is responsible for providing a value that accounts for endianness, which is big-endian (Elixir's default) in the following implementation.
The same applies when interpreting the value extracted by the token in callback functions.
defmodule BitSpan do
  @moduledoc """
  Generic token describing a span of `size` bits, optionally with an expected `value`.
  """

  defstruct size: nil, value: nil

  @doc """
  Builds a bit span of `size` bits.

  With `value` left as `nil` the span is a capturing prototype; otherwise it
  carries `value`. `size` must be an integer.
  """
  def new(size, value \\ nil)

  def new(size, value) when is_integer(size),
    do: %__MODULE__{size: size, value: value}
end
# TokenExtractor implementation for BitSpan: reads `size` bits from the head of
# the input as an integer (Elixir's default integer segment: unsigned, big-endian).
defimpl Grammar.Tokenizer.TokenExtractor, for: BitSpan do
# Capture mode (prototype value is nil): accept any `size`-bit value and return
# it in a new BitSpan together with the number of bits consumed.
def try_read(%BitSpan{size: size, value: nil}, input) do
case input do
<<value::size(size), _rest::bitstring>> -> {BitSpan.new(size, value), size}
# The input does not start with a `size`-bit segment.
_ -> nil
end
end
# Match mode (prototype carries a value): the first `size` bits must equal that
# value exactly, hence the pinned `^value`.
def try_read(%BitSpan{size: size, value: value}, input) do
case input do
<<^value::size(^size), _rest::bitstring>> -> {BitSpan.new(size, value), size}
_ -> nil
end
end
# Fallback: only reachable when the first argument is not a BitSpan, since the
# two clauses above cover every BitSpan struct.
def try_read(_token_prototype, _input), do: nil
# A capturing prototype matches any BitSpan token, whatever its size.
def match?(%BitSpan{value: nil}, %BitSpan{}), do: true
# Otherwise the prototype and the token must be identical terms (same size and value).
def match?(version, version), do: true
def match?(_token_prototype, _token), do: false
end
alias Grammar.Tokenizer.TokenExtractor

# Capture: a span without a value returns whatever it reads.
{%{value: 2, size: 2}, 2} = TokenExtractor.try_read(BitSpan.new(2), <<2::2>>)
# Match: a span with a value accepts that exact value.
{%{value: 3, size: 2}, 2} = TokenExtractor.try_read(BitSpan.new(2, 3), <<3::2>>)
# Match error: the input holds 3 while the span expects 1.
nil = TokenExtractor.try_read(BitSpan.new(2, 1), <<3::2>>)
# Capture across a byte boundary: the first 11 of the 12 input bits are
# 0b11111111000, i.e. 2040.
{%{value: 2040, size: 11}, 11} = TokenExtractor.try_read(BitSpan.new(11), <<0xFF::8, 0x1::4>>)
Using generic bit span matching requires handling errors in callback functions.
The generic token cannot detect invalid patterns (e.g. 0x01 for the audio version ID field), so the callback must either handle or propagate errors upstream, or raise an exception.
Let's now define the MP3Header struct that will gather the extracted information.
defmodule MP3Header do
  @moduledoc """
  Gathers the header fields extracted by the grammar.

  Each field is `nil` until set, then holds either the extracted integer or
  `:error` when the rule callback flagged the value as invalid.
  """

  defstruct [:audio, :layer]

  # The previous type only declared `audio`; both fields share the same shape.
  @type t :: %__MODULE__{
          audio: integer() | :error | nil,
          layer: integer() | :error | nil
        }

  @doc """
  Stores the outcome of a rule callback in `field`.

  `{:ok, value}` stores `value`; `:error` stores `:error`. Raises `KeyError`
  when `field` is not a field of the struct.
  """
  @spec set_field(t(), atom(), :error | {:ok, term()}) :: t()
  # struct!/2 rather than struct/2, so a misspelled field name raises instead of
  # being silently ignored.
  def set_field(%__MODULE__{} = header, field, :error), do: struct!(header, [{field, :error}])
  def set_field(%__MODULE__{} = header, field, {:ok, value}), do: struct!(header, [{field, value}])
end
# Field layout: frame sync (8 + 3 bits), audio version (2), layer (2), then
# 1 + 4 + 2 + 1 + 1 bits for the remaining header fields.
# Audio version 2 and layer 1: neither value is flagged by the rule callbacks.
valid_header = <<0xFF::8, 0x7::3, 2::2, 1::2, 1::1, 3::4, 1::2, 0::1, 1::1>>
# Audio version 1: the value the :audio_version rule maps to :error.
wrong_audio_header = <<0xFF::8, 0x7::3, 1::2, 1::2, 1::1, 3::4, 1::2, 0::1, 1::1>>
# Layer 0: the value the :layer_desc rule maps to :error.
wrong_layer_header = <<0xFF::8, 0x7::3, 0::2, 0::2, 1::1, 3::4, 1::2, 0::1, 1::1>>
Now we expose a Grammar
that uses the MP3Header
struct and only BitSpan
tokens to extract data from the input bitstring.
Note how we must manually handle erroneous values for the header sub-parts.
# Same header parsed with generic BitSpan tokens only. Validation moves into the
# rule callbacks, which return :error or {:ok, value} for MP3Header.set_field/3.
Grammar.new()
|> Grammar.add_clause(:mp3_header, [:frame_sync, :audio_version, :layer_desc], fn [_, audio, layer] ->
%MP3Header{}
|> MP3Header.set_field(:audio, audio)
|> MP3Header.set_field(:layer, layer)
end)
# Frame sync: 11 bits that must all be 1 (2047).
|> Grammar.add_clause(:frame_sync, [BitSpan.new(11, 2047)], fn [%{value: value}] -> value end)
# Audio version: any 2-bit value is read, and 1 is flagged as an error.
|> Grammar.add_clause(:audio_version, [BitSpan.new(2)], fn
[%{value: 1}] -> :error
[%{value: value}] -> {:ok, value}
end)
# Layer description: any 2-bit value is read, and 0 is flagged as an error.
|> Grammar.add_clause(:layer_desc, [BitSpan.new(2)], fn
[%{value: 0}] -> :error
[%{value: value}] -> {:ok, value}
end)
# Placeholders for the header fields not handled yet.
# |> Grammar.add_clause(:protection, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:bitrate, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:sampling, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:padding, ["foo"], fn _ -> :ok end)
# |> Grammar.add_clause(:private, ["foo"], fn _ -> :ok end)
|> Grammar.prepare!()
|> Grammar.start(:mp3_header)
# Runs on the header whose layer field is 0, so the :layer_desc callback returns :error.
|> Grammar.loop(Grammar.Tokenizer.new(wrong_layer_header, false, true))
Using the DSL
The DSL can be used to express the same grammar. Just add the option sub_byte_matching: true
when using Grammar
to enable sub byte token extraction.
# DSL version of the BitSpan-based grammar: one `rule` per clause.
# NOTE(review): `params` is never bound in the rule bodies, so it is presumably
# injected by the `rule` macro and holds the matched sub-results — confirm against
# the Grammar DSL documentation.
defmodule MP3HeaderParser do
use Grammar, sub_byte_matching: true
# Entry rule: combines the audio version and layer outcomes into an MP3Header.
rule mp3_header(:frame_sync, :audio_version, :layer_desc) do
[_, audio, layer] = params
%MP3Header{}
|> MP3Header.set_field(:audio, audio)
|> MP3Header.set_field(:layer, layer)
end
# Frame sync: 11 bits that must all be 1 (2047).
rule frame_sync(BitSpan.new(11, 2047)) do
[%{value: value}] = params
value
end
# Audio version: any 2-bit value is read, and 1 is flagged as an error.
rule audio_version(BitSpan.new(2)) do
case params do
[%{value: 1}] -> :error
[%{value: value}] -> {:ok, value}
end
end
# Layer description: any 2-bit value is read, and 0 is flagged as an error.
rule layer_desc(BitSpan.new(2)) do
case params do
[%{value: 0}] -> :error
[%{value: value}] -> {:ok, value}
end
end
end
# Parse the sample header defined earlier in this file.
MP3HeaderParser.parse(test_header)