Skip to content

ar_deep_hash.erl - Arweave Deep Hash Algorithm

Overview

Purpose: Deterministic deep hashing of nested data structures
Module: ar_deep_hash
Hash Function: SHA-384
Use Cases: Transaction signing, data verification, content addressing

Deep hash is a cryptographic hash algorithm that recursively hashes nested lists and binary data structures. It provides a deterministic way to hash complex data structures while preserving structural information through tagged hashing.

Dependencies

  • Erlang/OTP: crypto
  • External: None

Public Functions Overview

%% Deep Hashing
-spec hash(List) -> Hash
    when
        List :: list(),
        Hash :: binary().

Public Functions

1. hash/1

-spec hash(List) -> Hash
    when
        List :: list(),
        Hash :: binary().

Description: Compute deep hash of a list. The algorithm recursively hashes nested structures with size-tagged prefixes to ensure structural integrity. List elements can be binaries or nested lists.

Algorithm:
  1. For Binaries (within lists): Hash as SHA384(SHA384("blob" + size) + SHA384(data))
  2. For Lists: Hash as recursive accumulation with "list" + length tag

Output: 48-byte (384-bit) SHA-384 hash

Test Code:
-module(ar_deep_hash_hash_test).
-include_lib("eunit/include/eunit.hrl").
 
hash_empty_list_test() ->
    Hash = ar_deep_hash:hash([]),
    ?assert(is_binary(Hash)),
    ?assertEqual(48, byte_size(Hash)).
 
hash_single_element_list_test() ->
    Data = [<<"Element">>],
    Hash = ar_deep_hash:hash(Data),
    ?assert(is_binary(Hash)),
    ?assertEqual(48, byte_size(Hash)).
 
hash_single_binary_wrapped_test() ->
    Hash = ar_deep_hash:hash([<<"Hello, Arweave!">>]),
    ?assert(is_binary(Hash)),
    ?assertEqual(48, byte_size(Hash)).
 
hash_empty_binary_wrapped_test() ->
    Hash = ar_deep_hash:hash([<<>>]),
    ?assert(is_binary(Hash)),
    ?assertEqual(48, byte_size(Hash)).
 
hash_deterministic_test() ->
    Data = [<<"Test data">>],
    Hash1 = ar_deep_hash:hash(Data),
    Hash2 = ar_deep_hash:hash(Data),
    ?assertEqual(Hash1, Hash2).
 
hash_different_data_test() ->
    Hash1 = ar_deep_hash:hash([<<"Data1">>]),
    Hash2 = ar_deep_hash:hash([<<"Data2">>]),
    ?assertNotEqual(Hash1, Hash2).
 
hash_multiple_elements_test() ->
    Data = [<<"First">>, <<"Second">>, <<"Third">>],
    Hash = ar_deep_hash:hash(Data),
    ?assertEqual(48, byte_size(Hash)).
 
hash_nested_list_test() ->
    Data = [<<"Outer">>, [<<"Inner1">>, <<"Inner2">>], <<"Final">>],
    Hash = ar_deep_hash:hash(Data),
    ?assertEqual(48, byte_size(Hash)).
 
hash_deep_nesting_test() ->
    Data = [
        <<"Level1">>,
        [
            <<"Level2">>,
            [
                <<"Level3">>,
                [<<"Level4">>]
            ]
        ]
    ],
    Hash = ar_deep_hash:hash(Data),
    ?assertEqual(48, byte_size(Hash)).
 
hash_order_matters_test() ->
    List1 = [<<"A">>, <<"B">>, <<"C">>],
    List2 = [<<"C">>, <<"B">>, <<"A">>],
    Hash1 = ar_deep_hash:hash(List1),
    Hash2 = ar_deep_hash:hash(List2),
    ?assertNotEqual(Hash1, Hash2).
 
hash_structure_matters_test() ->
    % Flat list vs nested list with same content
    Flat = [<<"A">>, <<"B">>, <<"C">>],
    Nested = [<<"A">>, [<<"B">>, <<"C">>]],
    HashFlat = ar_deep_hash:hash(Flat),
    HashNested = ar_deep_hash:hash(Nested),
    ?assertNotEqual(HashFlat, HashNested).
 
hash_size_prefix_test() ->
    % Same content but different structure should hash differently
    Single = [<<"AB">>],
    Double = [<<"A">>, <<"B">>],
    HashSingle = ar_deep_hash:hash(Single),
    HashDouble = ar_deep_hash:hash(Double),
    ?assertNotEqual(HashSingle, HashDouble).
 
hash_large_data_test() ->
    LargeData = [binary:copy(<<"X">>, 1000000)],  % 1MB wrapped in list
    Hash = ar_deep_hash:hash(LargeData),
    ?assertEqual(48, byte_size(Hash)).
 
hash_binary_list_test() ->
    Data = [
        <<"String 1">>,
        <<"String 2">>,
        <<"String 3">>,
        <<"String 4">>
    ],
    Hash = ar_deep_hash:hash(Data),
    ?assertEqual(48, byte_size(Hash)).
 
hash_transaction_like_test() ->
    % Simulating transaction signature data
    TxData = [
        <<"2">>,                           % Format
        crypto:strong_rand_bytes(512),     % Owner
        <<>>,                              % Target
        <<"0">>,                           % Quantity
        <<"1000000">>,                     % Reward
        <<>>,                              % Anchor
        <<"0">>,                           % Data size
        <<>>                               % Data root
    ],
    Hash = ar_deep_hash:hash(TxData),
    ?assertEqual(48, byte_size(Hash)).

Algorithm Details

Binary Hashing (Internal)

For a binary B within a list:

Tag = "blob" + byte_size(B)
Hash = SHA384(SHA384(Tag) + SHA384(B))
Example:
Data = <<"Hello">>,
Tag = <<"blob5">>,  % "blob" + byte_size(<<"Hello">>)
Hash = crypto:hash(sha384, <<
    (crypto:hash(sha384, Tag))/binary,
    (crypto:hash(sha384, Data))/binary
>>).

List Hashing

For a list [H1, H2, ..., Hn]:

Tag = "list" + length(List)
Acc0 = SHA384(Tag)
Acc1 = SHA384(Acc0 + DeepHash(H1))
Acc2 = SHA384(Acc1 + DeepHash(H2))
...
AccN = SHA384(AccN-1 + DeepHash(Hn))
Result = AccN
Example:
List = [<<"A">>, <<"B">>],
Tag = <<"list2">>,  % "list" + length(List)
Acc0 = crypto:hash(sha384, Tag),
Acc1 = crypto:hash(sha384, <<Acc0/binary, (hash_bin_or_list(<<"A">>))/binary>>),
Acc2 = crypto:hash(sha384, <<Acc1/binary, (hash_bin_or_list(<<"B">>))/binary>>),
Result = Acc2.

Properties

Determinism

The same input always produces the same hash:

Hash1 = ar_deep_hash:hash(Data),
Hash2 = ar_deep_hash:hash(Data),
Hash1 =:= Hash2.  % Always true

Structure Preservation

Different structures with same content hash differently:

Flat = [<<"A">>, <<"B">>, <<"C">>],
Nested = [<<"A">>, [<<"B">>, <<"C">>]],
ar_deep_hash:hash(Flat) =/= ar_deep_hash:hash(Nested).  % True

Order Sensitivity

Element order affects the hash:

List1 = [<<"A">>, <<"B">>],
List2 = [<<"B">>, <<"A">>],
ar_deep_hash:hash(List1) =/= ar_deep_hash:hash(List2).  % True

Size Awareness

Content size is embedded in the hash:

List1 = [<<"AB">>],
List2 = [<<"A">>],
ar_deep_hash:hash(List1) =/= ar_deep_hash:hash(List2).  % True

Common Patterns

%% Hash a single binary (must wrap in list)
Data = <<"Transaction data">>,
Hash = ar_deep_hash:hash([Data]).
 
%% Hash transaction signature data (from ar_tx.erl)
SignatureData = [
    integer_to_binary(Format),
    Owner,
    Target,
    list_to_binary(integer_to_list(Quantity)),
    list_to_binary(integer_to_list(Reward)),
    Anchor,
    integer_to_binary(DataSize),
    DataRoot
],
TxHash = ar_deep_hash:hash(SignatureData).
 
%% Hash nested structure
NestedData = [
    <<"Header">>,
    [
        <<"Field1">>, <<"Value1">>,
        <<"Field2">>, <<"Value2">>
    ],
    <<"Footer">>
],
StructHash = ar_deep_hash:hash(NestedData).
 
%% Hash for content addressing
Content = <<"File content">>,
ContentID = ar_deep_hash:hash([Content]).
 
%% Verify data integrity
OriginalHash = ar_deep_hash:hash([Data]),
% ... store or transmit data ...
ReceivedHash = ar_deep_hash:hash([ReceivedData]),
Valid = (OriginalHash =:= ReceivedHash).
 
%% Hash empty structures
EmptyList = ar_deep_hash:hash([]).
EmptyBinaryInList = ar_deep_hash:hash([<<>>]).

Use Cases in Arweave

1. Transaction Signing

% From ar_tx.erl - signature_data_segment/1
SignatureData = [
    integer_to_binary(TX#tx.format),
    TX#tx.owner,
    TX#tx.target,
    list_to_binary(integer_to_list(TX#tx.quantity)),
    list_to_binary(integer_to_list(TX#tx.reward)),
    TX#tx.anchor,
    integer_to_binary(TX#tx.data_size),
    TX#tx.data_root
],
Hash = ar_deep_hash:hash(SignatureData),
Signature = ar_wallet:sign(PrivKey, Hash).

2. Data Item Verification (ANS-104)

% Verify data item signature data
SignatureData = [
    <<"dataitem">>,
    <<"1">>,                    % Version
    SignatureType,
    Target,
    Anchor,
    Tags,
    Data
],
Hash = ar_deep_hash:hash(SignatureData).

3. Merkle Tree Construction

% Hash merkle tree nodes
LeafHash = ar_deep_hash:hash([<<"leaf">>, Data]),
NodeHash = ar_deep_hash:hash([<<"node">>, LeftHash, RightHash]).

4. Content Addressing

% Create content-addressed identifier
ContentHash = ar_deep_hash:hash([FileData]),
% Use as unique, deterministic ID

Security Properties

Collision Resistance

SHA-384 provides strong collision resistance:

  • Finding two inputs with the same hash is computationally infeasible
  • 384-bit output space (2^384 possible hashes)

Preimage Resistance

Given a hash, finding the original input is computationally infeasible.

Second Preimage Resistance

Given an input, finding a different input with the same hash is computationally infeasible.

Tagged Hashing

The "blob" and "list" tags prevent:

  • Length extension attacks
  • Structural ambiguity attacks
  • Cross-protocol hash reuse

Implementation Notes

Tagged Prefixes

% Binary tag includes size
Tag = <<"blob", (integer_to_binary(byte_size(Bin)))/binary>>
% Example: <<"blob5">> for 5-byte binary
 
% List tag includes length
Tag = <<"list", (integer_to_binary(length(List)))/binary>>
% Example: <<"list3">> for 3-element list

Recursive Structure

The algorithm recursively processes nested structures:

hash_bin_or_list(Bin) when is_binary(Bin) -> ...;
hash_bin_or_list(List) when is_list(List) -> ...

Each element is hashed independently, then combined.

Hash Size

All hashes are 48 bytes (384 bits):

Hash = crypto:hash(sha384, Data),
48 = byte_size(Hash).

Comparison with Other Hash Algorithms

vs. Simple SHA-384

% Simple hash loses structure
Simple = crypto:hash(sha384, <<<<"A">>/binary, <<"B">>/binary>>),
 
% Deep hash preserves structure
Deep1 = ar_deep_hash:hash([<<"A">>, <<"B">>]),
Deep2 = ar_deep_hash:hash([<<"AB">>]),
Deep1 =/= Deep2.  % True - structure preserved

vs. Merkle Trees

% Deep hash is like a Merkle tree with tagged nodes
% But it's a single hash function, not a tree structure

vs. JSON Hashing

% Deep hash doesn't require serialization
% Operates directly on Erlang data structures
% More efficient and deterministic than JSON → hash

Performance Considerations

Complexity

  • Single binary in list: O(1) hash operations (constant)
  • List of N elements: O(N) hash operations (linear)
  • Nested structures: O(total_elements) across all levels

Memory

  • Tail-recursive list processing
  • Constant memory overhead per recursion level
  • No additional data structure allocation

Benchmarks

% Single binary in list (16 bytes): ~1-2 μs
% Large binary in list (1 MB): ~5-10 ms
% List (100 elements): ~200-300 μs
% Nested list (10 levels, 10 each): ~1-2 ms

Test Vectors

%% Empty list
ar_deep_hash:hash([]) = 
    SHA384("list0")
 
%% Single binary
ar_deep_hash:hash([<<"test">>]) = 
    Tag = SHA384("list1"),
    BlobHash = SHA384(SHA384("blob4") + SHA384("test")),
    SHA384(Tag + BlobHash)
 
%% Two binaries
ar_deep_hash:hash([<<"a">>, <<"b">>]) = 
    Tag = SHA384("list2"),
    Acc1 = SHA384(Tag + DeepHash(<<"a">>)),
    Acc2 = SHA384(Acc1 + DeepHash(<<"b">>))
 
%% Nested structure
ar_deep_hash:hash([<<"x">>, [<<"y">>]]) =
    OuterTag = SHA384("list2"),
    InnerList = SHA384("list1"),
    InnerComplete = SHA384(InnerList + DeepHash(<<"y">>)),
    Acc1 = SHA384(OuterTag + DeepHash(<<"x">>)),
    Final = SHA384(Acc1 + InnerComplete)

Error Handling

The module assumes valid input:

  • Lists containing binaries or other lists
  • No atoms, numbers, or other Erlang terms directly
Invalid inputs will cause runtime errors:
% These will crash:
ar_deep_hash:hash(<<"binary">>).  % Must be wrapped in list
ar_deep_hash:hash(123).           % Not a list
ar_deep_hash:hash([atom]).        % Atoms not supported
ar_deep_hash:hash([1, 2, 3]).     % Numbers not supported

References

  • Arweave Yellow Paper - Deep hash specification
  • SHA-384 - NIST FIPS 180-4
  • Merkle Trees - Hash tree structures
  • ar_tx.erl - Uses deep hash for transaction signing
  • ar_bundles.erl - Uses deep hash for data items

Notes

  1. Input Type: The public hash/1 function only accepts lists; wrap binaries in a list
  2. Output Size: Always 48 bytes (384 bits) regardless of input
  3. Deterministic: Same input always produces same hash
  4. Structure Sensitive: Different structures hash differently even with same content
  5. Tagged: Uses "blob" and "list" prefixes to prevent ambiguity
  6. Recursive: Handles arbitrary nesting depth
  7. Order Dependent: Element order affects the hash
  8. Size Embedded: Element/list size is part of the hash
  9. No Serialization: Works directly on Erlang data structures
  10. Performance: Linear in number of elements, efficient for nested structures