hb_metrics_collector.erl - System Metrics Collector for Prometheus
Overview
Purpose: Collect system-level metrics for Prometheus monitoring
Module: hb_metrics_collector
Behavior: prometheus_collector
Metrics: Process uptime and system load average
This module implements a Prometheus collector that exposes Erlang VM and system metrics. It provides process uptime in seconds and system load average (5-minute) for monitoring HyperBEAM node health and performance.
Metrics Collected
- process_uptime_seconds - Uptime of the Erlang VM, in seconds
- system_load - 5-minute system load average
Dependencies
- Prometheus: prometheus, prometheus_model_helpers
- Erlang/OTP: erlang (statistics), cpu_sup
Public Functions Overview
%% Prometheus Collector Callbacks
-spec deregister_cleanup(Registry) -> ok.
-spec collect_mf(Registry, Callback) -> ok.
-spec collect_metrics(MetricName, Data) -> Metrics.Public Functions
1. deregister_cleanup/1
-spec deregister_cleanup(Registry) -> ok
when
Registry :: atom().Description: Cleanup callback when collector is deregistered. Currently no-op.
Implementation:deregister_cleanup(_) -> ok.-module(hb_metrics_collector_cleanup_test).
-include_lib("eunit/include/eunit.hrl").
%% deregister_cleanup/1 is a no-op: whatever registry name it is handed,
%% the answer must be ok.
deregister_cleanup_test() ->
    Registries = [default, custom_registry, undefined],
    lists:foreach(
        fun(Registry) ->
            ?assertEqual(ok, hb_metrics_collector:deregister_cleanup(Registry))
        end,
        Registries
    ).
2. collect_mf/2
-spec collect_mf(Registry, Callback) -> ok
when
Registry :: atom(),
Callback :: fun((MetricFamily) -> ok).Description: Main collection function called by Prometheus. Collects all metrics and invokes callback for each metric family.
Metrics Collected:Process Uptime
{Uptime, _} = erlang:statistics(wall_clock),
Callback(
create_gauge(
process_uptime_seconds,
"The number of seconds the Erlang process has been up.",
Uptime
)
)Source: erlang:statistics(wall_clock)
Type: Gauge
Unit: Milliseconds (converted to seconds in collect_metrics)
System Load
SystemLoad = cpu_sup:avg5(),
Callback(
create_gauge(
system_load,
"The load values are proportional to how long time a runnable Unix process has to spend in the run queue before it is scheduled. Accordingly, higher values mean more system load",
SystemLoad
)
)Source: cpu_sup:avg5() (5-minute load average)
Type: Gauge
Unit: Load average value
-module(hb_metrics_collector_mf_test).
-include_lib("eunit/include/eunit.hrl").
%% Verifies that collect_mf/2 returns ok and hands at least two metric
%% families to the callback.
%%
%% The callback accumulates the families in the process dictionary under
%% the key test_metrics; the key is always erased afterwards, even when an
%% assertion fails.
collect_mf_returns_ok_test() ->
    put(test_metrics, []),
    Callback = fun(Metric) ->
        put(test_metrics, [Metric | get(test_metrics)])
    end,
    % collect_mf requires cpu_sup - may fail without os_mon.
    % try/catch is used instead of the old-style `catch` so that a thrown
    % value can never be mistaken for a normal return value.
    try hb_metrics_collector:collect_mf(default, Callback) of
        ok ->
            % At least two metric families must have been reported.
            ?assertMatch([_, _ | _], get(test_metrics))
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            % cpu_sup not available - just verify export
            code:ensure_loaded(hb_metrics_collector),
            ?assert(erlang:function_exported(hb_metrics_collector, collect_mf, 2))
    after
        erase(test_metrics)
    end.
%% Checks that collect_mf/2 really invokes the callback.
%%
%% The callback forwards every metric family to the test process as a
%% {metric, Family} message; the test fails unless at least two such
%% messages arrive.
collect_mf_invokes_callback_test() ->
    Self = self(),
    Callback = fun(Metric) ->
        Self ! {metric, Metric}
    end,
    try hb_metrics_collector:collect_mf(default, Callback) of
        ok ->
            % A timeout used to be treated as success, which let this test
            % pass even when the callback was never called. A timeout now
            % fails the assertion.
            First = receive {metric, _} -> ok after 100 -> timeout end,
            ?assertEqual(ok, First),
            Second = receive {metric, _} -> ok after 100 -> timeout end,
            ?assertEqual(ok, Second)
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            % cpu_sup not running
            ok
    end.
3. collect_metrics/2
-spec collect_metrics(MetricName, Data) -> Metrics
when
MetricName :: system_load | process_uptime_seconds,
Data :: number(),
Metrics :: [prometheus_model_helpers:gauge_metric()].Description: Format collected data into Prometheus gauge metrics.
System Load:collect_metrics(system_load, SystemLoad) ->
prometheus_model_helpers:gauge_metrics([{[], SystemLoad}]).collect_metrics(process_uptime_seconds, Uptime) ->
UptimeSeconds = Uptime / 1000, % Convert ms to seconds
prometheus_model_helpers:gauge_metrics([{[], UptimeSeconds}]).
- Empty label list [] (no labels for these metrics)
- Single value per metric
-module(hb_metrics_collector_format_test).
-include_lib("eunit/include/eunit.hrl").
%% collect_metrics/2 must return a list for the process_uptime_seconds
%% metric. If the call crashes (for example because prometheus is not
%% available) the test only verifies that the function is exported.
collect_metrics_uptime_test() ->
    Uptime = 120000, % 120 seconds in milliseconds
    % try/catch instead of old-style `catch`: with `catch`, a thrown list
    % would have been accepted as if it were the return value.
    try hb_metrics_collector:collect_metrics(process_uptime_seconds, Uptime) of
        Metrics ->
            ?assert(is_list(Metrics))
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            % prometheus not available - verify export
            code:ensure_loaded(hb_metrics_collector),
            ?assert(erlang:function_exported(hb_metrics_collector, collect_metrics, 2))
    end.
%% collect_metrics/2 must return a list for the system_load metric. If the
%% call crashes (for example because prometheus is not available) the test
%% only verifies that the function is exported.
collect_metrics_load_test() ->
    Load = 2.5,
    % try/catch instead of old-style `catch`, so that a thrown term cannot
    % be confused with a successful return value.
    try hb_metrics_collector:collect_metrics(system_load, Load) of
        Metrics ->
            ?assert(is_list(Metrics))
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            % prometheus not available
            code:ensure_loaded(hb_metrics_collector),
            ?assert(erlang:function_exported(hb_metrics_collector, collect_metrics, 2))
    end.
%% Feeds an uptime given in milliseconds to collect_metrics/2 and requires
%% a non-empty list of metrics in return.
%%
%% NOTE(review): the gauge value itself is not inspected here, so the
%% ms -> seconds conversion is not actually verified by this test.
collect_metrics_uptime_conversion_test() ->
    UptimeMs = 60000, % 60 seconds
    try hb_metrics_collector:collect_metrics(process_uptime_seconds, UptimeMs) of
        Metrics ->
            % Pattern match instead of length/1: must be a non-empty list.
            ?assertMatch([_ | _], Metrics)
    catch
        Class:_ when Class =:= error; Class =:= exit ->
            ok % prometheus not available
    end.
Helper Functions
create_gauge/3
-spec create_gauge(Name, Help, Data) -> MetricFamily
when
Name :: atom(),
Help :: string(),
Data :: number(),
MetricFamily :: prometheus_model_helpers:metric_family().Description: Create a Prometheus gauge metric family.
Implementation:create_gauge(Name, Help, Data) ->
prometheus_model_helpers:create_mf(Name, Help, gauge, ?MODULE, Data).
- Name - Metric name (atom)
- Help - Description string
- gauge - Metric type
- ?MODULE - Collector module
- Data - Metric value
Metric Details
process_uptime_seconds
Type: Gauge
Description: Duration the Erlang VM has been running
Unit: Seconds (float)
Source: erlang:statistics(wall_clock)
Update Frequency: On each scrape
{UptimeMs, _} = erlang:statistics(wall_clock),
UptimeSeconds = UptimeMs / 1000.- Detect node restarts
- Monitor uptime SLAs
- Correlate issues with restart times
- Track deployment times
# Node uptime in hours
process_uptime_seconds / 3600
# Nodes restarted in last hour
process_uptime_seconds < 3600
# Average uptime across cluster
avg(process_uptime_seconds)
system_load
Type: Gauge
Description: 5-minute system load average
Unit: Load value as returned by cpu_sup (integer; 256 corresponds to a load of 1.00 as shown by top(1))
Source: cpu_sup:avg5()
Update Frequency: On each scrape
- < 256 (load average below 1.0; cpu_sup scales load by 256) - System underutilized
- ≈ 256 × CPU count - Optimal load
- > 256 × CPU count - System overloaded
- Detect CPU saturation
- Auto-scaling decisions
- Performance degradation alerts
- Capacity planning
# High load alert (cpu_sup values are scaled by 256, so divide to get the load average)
system_load / 256 > 8
# Load per CPU core (assuming 8 cores)
system_load / 256 / 8
# Load spike detection (system_load is a gauge, so use delta() rather than rate(); values are scaled by 256)
delta(system_load[5m]) / 256 > 2
Prometheus Integration
Registration
The collector is registered with Prometheus on application startup:
% In application startup
prometheus_registry:register_collector(hb_metrics_collector).
Scrape Endpoint
Metrics exposed at standard Prometheus endpoint:
GET /metrics
# Response excerpt:
# TYPE process_uptime_seconds gauge
# HELP process_uptime_seconds The number of seconds the Erlang process has been up.
process_uptime_seconds 3600.5
# TYPE system_load gauge
# HELP system_load The load values are proportional to how long time...
system_load 2.3Common Patterns
%% Register collector in application
%% (collector registration lives in prometheus_registry, not in the
%% prometheus application module)
application:ensure_all_started([prometheus, prometheus_cowboy]),
prometheus_registry:register_collector(hb_metrics_collector).
%% Query metrics programmatically
Metrics = prometheus_text_format:format(),
% Returns all metrics in Prometheus text format
%% Check if collector is registered
Collectors = prometheus_registry:collectors(default),
IsRegistered = lists:member(hb_metrics_collector, Collectors).
%% Deregister collector
prometheus_registry:deregister_collector(hb_metrics_collector).
Monitoring Dashboard
Grafana Queries
Uptime Panel:# Display uptime in human-readable format
process_uptime_seconds
# Current load
system_load
# With threshold lines
system_load{job="hyperbeam"}
# Load vs Uptime correlation
system_load / (process_uptime_seconds / 3600)
Alerting Rules
High Load Alert
groups:
- name: hyperbeam
rules:
- alert: HighSystemLoad
expr: system_load / 256 > 8
for: 5m
labels:
severity: warning
annotations:
summary: "High system load on {{ $labels.instance }}"
description: "Load average is {{ $value }}"Recent Restart Alert
- alert: RecentRestart
expr: process_uptime_seconds < 300
for: 1m
labels:
severity: info
annotations:
summary: "Node {{ $labels.instance }} recently restarted"
description: "Uptime is only {{ $value }} seconds"CPU Supervisor
cpu_sup Requirements
The system_load metric requires the cpu_sup process, which is part of the os_mon application:
% Ensure cpu_sup is started
application:ensure_all_started(os_mon).
% cpu_sup is part of os_mon application
- Linux: ✅ Full support
- macOS: ✅ Full support
- Windows: ⚠️ Limited support
- FreeBSD: ✅ Full support
Performance Impact
Collection Overhead:
- erlang:statistics(wall_clock) - Negligible (~µs)
- cpu_sup:avg5() - Low (~ms)
Scrape Frequency: Typically 15-60 seconds (Prometheus default: 1 minute)
Memory: Minimal - no historical data stored
Extending the Collector
Adding New Metrics
%% Example of extending collect_mf/2 with one more gauge: the total memory
%% reported by erlang:memory(total) is wrapped in a metric family and
%% passed to the callback.
collect_mf(_Registry, Callback) ->
    % Existing metrics...
    % Add new metric
    MemoryGauge = create_gauge(
        erlang_memory_bytes_total,
        "Total memory used by Erlang VM",
        erlang:memory(total)
    ),
    Callback(MemoryGauge),
    ok.
% Add corresponding collect_metrics clause
collect_metrics(erlang_memory_bytes_total, Memory) ->
prometheus_model_helpers:gauge_metrics([{[], Memory}]).Adding Labels
collect_metrics(metric_with_labels, Data) ->
prometheus_model_helpers:gauge_metrics([
{[{node, node()}], Data}
]).Troubleshooting
cpu_sup not available
% Check if cpu_sup is running
case whereis(cpu_sup) of
    undefined ->
        % ensure_all_started/1 also starts the applications os_mon depends
        % on; application:start/1 returns {error, {not_started, App}} when
        % one of them is not running yet.
        application:ensure_all_started(os_mon);
    _ -> ok
end.
Metrics not appearing
% Verify collector is registered, and register it if it is missing
% (registration is done through prometheus_registry)
Collectors = prometheus_registry:collectors(default),
case lists:member(hb_metrics_collector, Collectors) of
    true -> io:format("Collector registered~n");
    false ->
        prometheus_registry:register_collector(hb_metrics_collector),
        io:format("Collector registered now~n")
end.
References
- Prometheus - Prometheus Documentation
- Erlang Prometheus - prometheus.erl
- CPU Supervisor - cpu_sup module
- Metrics - Prometheus Metric Types
Notes
- Gauge Metrics: Both metrics are gauges (point-in-time values)
- No Labels: Metrics have no labels (simplest form)
- Wall Clock: Uptime from VM wall clock, not OS uptime
- 5-min Average: System load is 5-minute average, not instant
- Unit Conversion: Uptime converted from ms to seconds
- Minimal Overhead: Very lightweight collection
- Standard Integration: Follows prometheus_collector behavior
- No State: Stateless collector (no internal state)
- Platform Dependent: cpu_sup availability varies by OS
- Scrape Interval: Collection happens on Prometheus scrape
- No Caching: Fresh values on each scrape
- Extensible: Easy to add more system metrics
- Production Ready: Minimal, reliable metrics
- No Filtering: All registered collectors scraped together
- Standard Format: Prometheus text format output