hb_metrics_collector.erl - System Metrics Collector for Prometheus

Overview

Purpose: Collect system-level metrics for Prometheus monitoring
Module: hb_metrics_collector
Behavior: prometheus_collector
Metrics: Process uptime and system load average

This module implements a Prometheus collector that exposes Erlang VM and system metrics. It provides process uptime in seconds and system load average (5-minute) for monitoring HyperBEAM node health and performance.

Metrics Collected

process_uptime_seconds - Erlang process uptime
system_load - 5-minute system load average

Dependencies

Prometheus: prometheus, prometheus_model_helpers
Erlang/OTP: erlang (statistics/1), os_mon (cpu_sup)

Public Functions Overview

%% Prometheus Collector Callbacks
-spec deregister_cleanup(Registry) -> ok.
-spec collect_mf(Registry, Callback) -> ok.
-spec collect_metrics(MetricName, Data) -> Metrics.

Public Functions

1. deregister_cleanup/1

-spec deregister_cleanup(Registry) -> ok
    when
        Registry :: atom().

Description: Cleanup callback when collector is deregistered. Currently no-op.

Implementation:

%% prometheus_collector callback run when the collector is deregistered.
%% There is nothing to release, so any registry is acknowledged with ok.
deregister_cleanup(_Registry) -> ok.

Test Code:

-module(hb_metrics_collector_cleanup_test).
-include_lib("eunit/include/eunit.hrl").
 
deregister_cleanup_test() ->
    % The callback ignores its argument, so every registry name yields ok
    Registries = [default, custom_registry, undefined],
    lists:foreach(
        fun(Registry) ->
            ?assertEqual(ok, hb_metrics_collector:deregister_cleanup(Registry))
        end,
        Registries
    ).

2. collect_mf/2

-spec collect_mf(Registry, Callback) -> ok
    when
        Registry :: atom(),
        Callback :: fun((MetricFamily) -> ok).

Description: Main collection function called by Prometheus. Collects all metrics and invokes callback for each metric family.

Metrics Collected:

Process Uptime

% wall_clock is reported in milliseconds; the raw value is handed to the
% gauge here and divided by 1000 later, in collect_metrics/2.
{Uptime, _} = erlang:statistics(wall_clock),
Callback(
    create_gauge(
        process_uptime_seconds,
        "The number of seconds the Erlang process has been up.",
        Uptime
    )
)

Source: erlang:statistics(wall_clock)
Type: Gauge
Unit: Milliseconds (converted to seconds in collect_metrics)

System Load

% The cpu_sup:avg5() result is passed to the gauge without any rescaling.
% NOTE(review): cpu_sup documents its load values as scaled so that 256
% equals a load of 1.00 - confirm that consumers divide by 256.
SystemLoad = cpu_sup:avg5(),
Callback(
    create_gauge(
        system_load,
        "The load values are proportional to how long time a runnable Unix process has to spend in the run queue before it is scheduled. Accordingly, higher values mean more system load",
        SystemLoad
    )
)

Source: cpu_sup:avg5() (5-minute load average)
Type: Gauge
Unit: cpu_sup load units (256 corresponds to a load average of 1.00); exported without rescaling

Test Code:

-module(hb_metrics_collector_mf_test).
-include_lib("eunit/include/eunit.hrl").
 
collect_mf_returns_ok_test() ->
    % Use process dictionary to capture metrics: a fun cannot carry state
    % between invocations on its own.
    put(test_metrics, []),
    Callback = fun(Metric) ->
        put(test_metrics, [Metric | get(test_metrics)]),
        % The callback contract is fun((MetricFamily) -> ok)
        ok
    end,

    % try ... of keeps the assertions outside the protected section, so only
    % a failing collect_mf/2 call (not a failing assertion) reaches catch.
    try hb_metrics_collector:collect_mf(default, Callback) of
        ok ->
            % One family each for uptime and system load
            ?assertMatch([_, _ | _], get(test_metrics))
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            % Dependencies (prometheus / os_mon) not available - just verify export
            code:ensure_loaded(hb_metrics_collector),
            ?assert(erlang:function_exported(hb_metrics_collector, collect_mf, 2))
    after
        % Runs even when an assertion above fails, so the key never leaks
        erase(test_metrics)
    end.
 
collect_mf_invokes_callback_test() ->
    % Track callback invocations as messages tagged with a unique reference,
    % so unrelated messages in the mailbox cannot satisfy the receive.
    Self = self(),
    Tag = make_ref(),
    Callback = fun(Metric) ->
        Self ! {Tag, Metric},
        ok
    end,

    try hb_metrics_collector:collect_mf(default, Callback) of
        ok ->
            % Two metric families are expected (uptime and system load).
            % A timeout means the callback was not invoked and must fail
            % the test rather than pass silently.
            lists:foreach(
                fun(N) ->
                    receive
                        {Tag, _} -> ok
                    after 100 ->
                        erlang:error({missing_metric_callback, N})
                    end
                end,
                [1, 2]
            )
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            % Dependencies not available (e.g. cpu_sup not running)
            ok
    end.

3. collect_metrics/2

-spec collect_metrics(MetricName, Data) -> Metrics
    when
        MetricName :: system_load | process_uptime_seconds,
        Data :: number(),
        Metrics :: [prometheus_model_helpers:gauge_metric()].

Description: Format collected data into Prometheus gauge metrics.

System Load:

%% system_load: the collected value is emitted unchanged as a single
%% sample with an empty label list.
collect_metrics(system_load, SystemLoad) ->
    Sample = {[], SystemLoad},
    prometheus_model_helpers:gauge_metrics([Sample]).

Process Uptime:

%% process_uptime_seconds: the collected value is in milliseconds and is
%% divided by 1000 before being emitted as a single unlabelled sample.
collect_metrics(process_uptime_seconds, Uptime) ->
    prometheus_model_helpers:gauge_metrics([{[], Uptime / 1000}]).

Return Format:

Empty label list [] (no labels for these metrics)
Single value per metric

Test Code:

-module(hb_metrics_collector_format_test).
-include_lib("eunit/include/eunit.hrl").
 
collect_metrics_uptime_test() ->
    % 120 seconds, expressed in the milliseconds collect_metrics/2 receives
    UptimeMs = 120 * 1000,

    % Old-style catch turns a crash (e.g. prometheus missing) into {'EXIT', _}
    Outcome = (catch hb_metrics_collector:collect_metrics(process_uptime_seconds, UptimeMs)),
    case Outcome of
        {'EXIT', _Reason} ->
            % prometheus not available - fall back to checking the export
            code:ensure_loaded(hb_metrics_collector),
            ?assert(erlang:function_exported(hb_metrics_collector, collect_metrics, 2));
        Formatted ->
            ?assert(is_list(Formatted))
    end.
 
collect_metrics_load_test() ->
    SampleLoad = 2.5,

    % Old-style catch turns a crash (e.g. prometheus missing) into {'EXIT', _}
    Outcome = (catch hb_metrics_collector:collect_metrics(system_load, SampleLoad)),
    case Outcome of
        {'EXIT', _Reason} ->
            % prometheus not available - fall back to checking the export
            code:ensure_loaded(hb_metrics_collector),
            ?assert(erlang:function_exported(hb_metrics_collector, collect_metrics, 2));
        Formatted ->
            ?assert(is_list(Formatted))
    end.
 
collect_metrics_uptime_conversion_test() ->
    % Test that uptime is converted from ms to seconds
    UptimeMs = 60000,  % 60 seconds

    % The expected term is built with the same helper from the already
    % converted figure (60000 / 1000 =:= 60.0), so the comparison fails if
    % collect_metrics/2 stops dividing by 1000. No knowledge of the
    % underlying metric record layout is needed.
    try prometheus_model_helpers:gauge_metrics([{[], 60.0}]) of
        Expected ->
            ?assertEqual(
                Expected,
                hb_metrics_collector:collect_metrics(process_uptime_seconds, UptimeMs)
            )
    catch
        error:undef ->
            ok  % prometheus not available
    end.

Helper Functions

create_gauge/3

-spec create_gauge(Name, Help, Data) -> MetricFamily
    when
        Name :: atom(),
        Help :: string(),
        Data :: number(),
        MetricFamily :: prometheus_model_helpers:metric_family().

Description: Create a Prometheus gauge metric family.

Implementation:

%% Builds a gauge metric family through prometheus_model_helpers:create_mf/5,
%% naming this module as the collector.
create_gauge(Name, Help, Data) ->
    Type = gauge,
    prometheus_model_helpers:create_mf(Name, Help, Type, ?MODULE, Data).

Parameters:

Name - Metric name (atom)
Help - Description string
gauge - Metric type
?MODULE - Collector module
Data - Metric value

Metric Details

process_uptime_seconds

Type: Gauge
Description: Duration the Erlang VM has been running
Unit: Seconds (float)
Source: erlang:statistics(wall_clock)
Update Frequency: On each scrape

Calculation:

{UptimeMs, _} = erlang:statistics(wall_clock),
UptimeSeconds = UptimeMs / 1000.

Use Cases:

Detect node restarts
Monitor uptime SLAs
Correlate issues with restart times
Track deployment times

Example PromQL:

# Node uptime in hours
process_uptime_seconds / 3600

# Nodes restarted in last hour
process_uptime_seconds < 3600

# Average uptime across cluster
avg(process_uptime_seconds)

system_load

Type: Gauge
Description: 5-minute system load average
Unit: cpu_sup load units (integer; 256 corresponds to a load average of 1.00)
Source: cpu_sup:avg5()
Update Frequency: On each scrape

Interpretation (of system_load / 256, since cpu_sup reports 256 for a load average of 1.00):

< 1.0 - System underutilized
≈ CPU count - Optimal load
> CPU count - System overloaded

Use Cases:

Detect CPU saturation
Auto-scaling decisions
Performance degradation alerts
Capacity planning

Example PromQL:

# High load alert (cpu_sup reports 256 for a load average of 1.00)
system_load / 256 > 8

# Load per CPU core (assuming 8 cores)
system_load / 256 / 8

# Load spike detection (system_load is a gauge, so use delta() rather than rate())
delta(system_load[5m]) / 256 > 2

Prometheus Integration

Registration

The collector is registered with Prometheus on application startup:

% In application startup
prometheus:register_collector(hb_metrics_collector).

Scrape Endpoint

Metrics exposed at standard Prometheus endpoint:

GET /metrics
 
# Response excerpt:
# TYPE process_uptime_seconds gauge
# HELP process_uptime_seconds The number of seconds the Erlang process has been up.
process_uptime_seconds 3600.5
 
# TYPE system_load gauge
# HELP system_load The load values are proportional to how long time...
# 589 / 256 is a load average of about 2.3 (cpu_sup load units)
system_load 589

Common Patterns

%% Register collector in application
application:ensure_all_started([prometheus, prometheus_cowboy]),
prometheus:register_collector(hb_metrics_collector).
 
%% Query metrics programmatically
Metrics = prometheus_text_format:format(),
% Returns all metrics in Prometheus text format
 
%% Check if collector is registered
Collectors = prometheus_registry:collectors(default),
IsRegistered = lists:member(hb_metrics_collector, Collectors).
 
%% Deregister collector
prometheus:deregister_collector(hb_metrics_collector).

Monitoring Dashboard

Grafana Queries

Uptime Panel:

# Display uptime in human-readable format
process_uptime_seconds

System Load Panel:

# Current load
system_load

# With threshold lines
system_load{job="hyperbeam"}

Combined View:

# Load vs Uptime correlation
system_load / (process_uptime_seconds / 3600)

Alerting Rules

High Load Alert

groups:
  - name: hyperbeam
    rules:
      - alert: HighSystemLoad
        # cpu_sup reports 256 for a load average of 1.00, so rescale first
        expr: system_load / 256 > 8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High system load on {{ $labels.instance }}"
          description: "Load average is {{ $value }}"

Recent Restart Alert

- alert: RecentRestart
  expr: process_uptime_seconds < 300
  for: 1m
  labels:
    severity: info
  annotations:
    summary: "Node {{ $labels.instance }} recently restarted"
    description: "Uptime is only {{ $value }} seconds"

CPU Supervisor

cpu_sup Requirements

The system_load metric requires the cpu_sup process, which is started by the os_mon application:

% Ensure cpu_sup is started
application:ensure_all_started(os_mon).
 
% cpu_sup is part of os_mon application

Platform Support:

Linux: ✅ Full support
macOS: ✅ Full support
Windows: ⚠️ Limited support
FreeBSD: ✅ Full support

Performance Impact

Collection Overhead:

erlang:statistics(wall_clock) - Negligible (~µs)
cpu_sup:avg5() - Low (~ms)

Scrape Frequency: Typically 15-60 seconds (Prometheus's default scrape_interval is 1m)

Memory: Minimal - no historical data stored

Extending the Collector

Adding New Metrics

collect_mf(_Registry, Callback) ->
    % Existing metrics...
    
    % Add new metric
    MemoryUsed = erlang:memory(total),
    Callback(
        create_gauge(
            erlang_memory_bytes_total,
            "Total memory used by Erlang VM",
            MemoryUsed
        )
    ),
    
    ok.
 
% Add corresponding collect_metrics clause
collect_metrics(erlang_memory_bytes_total, Memory) ->
    prometheus_model_helpers:gauge_metrics([{[], Memory}]).

Adding Labels

collect_metrics(metric_with_labels, Data) ->
    prometheus_model_helpers:gauge_metrics([
        {[{node, node()}], Data}
    ]).

Troubleshooting

cpu_sup not available

% Check if cpu_sup is running
case whereis(cpu_sup) of
    undefined ->
        % ensure_all_started/1 also starts the applications os_mon depends
        % on; plain application:start/1 returns an error if they are missing.
        {ok, _Started} = application:ensure_all_started(os_mon),
        ok;
    _ -> ok
end.

Metrics not appearing

% Verify collector is registered
Collectors = prometheus_registry:collectors(default),
case lists:member(hb_metrics_collector, Collectors) of
    true -> io:format("Collector registered~n");
    false ->
        prometheus:register_collector(hb_metrics_collector),
        io:format("Collector registered now~n")
end.

References

Prometheus - Prometheus Documentation
Erlang Prometheus - prometheus.erl
CPU Supervisor - cpu_sup module
Metrics - Prometheus Metric Types

Notes

Gauge Metrics: Both metrics are gauges (point-in-time values)
No Labels: Metrics have no labels (simplest form)
Wall Clock: Uptime from VM wall clock, not OS uptime
5-min Average: System load is 5-minute average, not instant
Unit Conversion: Uptime converted from ms to seconds
Minimal Overhead: Very lightweight collection
Standard Integration: Follows prometheus_collector behavior
No State: Stateless collector (no internal state)
Platform Dependent: cpu_sup availability varies by OS
Scrape Interval: Collection happens on Prometheus scrape
No Caching: Fresh values on each scrape
Extensible: Easy to add more system metrics
Production Ready: Minimal, reliable metrics
No Filtering: All registered collectors scraped together
Standard Format: Prometheus text format output