Skip to content

Commit d68b989

Browse files
authored
core: add some very basic metrics (#289)
* core: add some very basic metrics * fix tests
1 parent d8a8956 commit d68b989

File tree

9 files changed

+400
-3
lines changed

9 files changed

+400
-3
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ dependencies = [
3434
"aiohttp>=3.11.0,<4",
3535
"cryptography>=46.0.0",
3636
"numpy>=2.0.0,<3",
37+
"prometheus-client>=0.21.0,<1",
3738
]
3839

3940
[project.license]

src/lean_spec/subspecs/api/server.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""
2-
API server for checkpoint sync and node status endpoints.
2+
API server for checkpoint sync, node status, and metrics endpoints.
33
44
Provides HTTP endpoints for:
55
- /lean/states/finalized - Serve finalized checkpoint state as SSZ
66
- /health - Health check endpoint
7+
- /metrics - Prometheus metrics endpoint
78
89
This matches the checkpoint sync API implemented in zeam.
910
"""
@@ -18,6 +19,8 @@
1819

1920
from aiohttp import web
2021

22+
from lean_spec.subspecs.metrics import generate_metrics
23+
2124
if TYPE_CHECKING:
2225
from lean_spec.subspecs.forkchoice import Store
2326

@@ -34,6 +37,14 @@ async def _handle_health(_request: web.Request) -> web.Response:
3437
return web.json_response({"status": "healthy", "service": "lean-spec-api"})
3538

3639

40+
async def _handle_metrics(_request: web.Request) -> web.Response:
41+
"""Handle Prometheus metrics endpoint."""
42+
return web.Response(
43+
body=generate_metrics(),
44+
content_type="text/plain; version=0.0.4; charset=utf-8",
45+
)
46+
47+
3748
@dataclass(frozen=True, slots=True)
3849
class ApiServerConfig:
3950
"""Configuration for the API server."""
@@ -87,6 +98,7 @@ async def start(self) -> None:
8798
app.add_routes(
8899
[
89100
web.get("/health", _handle_health),
101+
web.get("/metrics", _handle_metrics),
90102
web.get("/lean/states/finalized", self._handle_finalized_state),
91103
]
92104
)
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""
2+
Metrics module for observability.
3+
4+
Provides counters, gauges, and histograms for tracking consensus client behavior.
5+
Exposes metrics in Prometheus text format.
6+
"""
7+
8+
from .registry import (
9+
REGISTRY,
10+
attestations_invalid,
11+
attestations_produced,
12+
attestations_received,
13+
attestations_valid,
14+
block_processing_time,
15+
blocks_processed,
16+
blocks_proposed,
17+
current_slot,
18+
finalized_slot,
19+
generate_metrics,
20+
head_slot,
21+
justified_slot,
22+
peers_connected,
23+
reorgs,
24+
validators_count,
25+
)
26+
27+
__all__ = [
28+
"REGISTRY",
29+
"attestations_invalid",
30+
"attestations_produced",
31+
"attestations_received",
32+
"attestations_valid",
33+
"block_processing_time",
34+
"blocks_processed",
35+
"blocks_proposed",
36+
"current_slot",
37+
"finalized_slot",
38+
"generate_metrics",
39+
"head_slot",
40+
"justified_slot",
41+
"peers_connected",
42+
"reorgs",
43+
"validators_count",
44+
]
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
"""
2+
Metric registry using prometheus_client.
3+
4+
Provides pre-defined metrics for a consensus client.
5+
Exposes metrics in Prometheus text format via the /metrics endpoint.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from prometheus_client import (
11+
CollectorRegistry,
12+
Counter,
13+
Gauge,
14+
Histogram,
15+
generate_latest,
16+
)
17+
18+
# Create a dedicated registry for lean-spec metrics.
19+
#
20+
# Using a dedicated registry avoids pollution from default Python process metrics.
21+
REGISTRY = CollectorRegistry()
22+
23+
# -----------------------------------------------------------------------------
24+
# Node Information
25+
# -----------------------------------------------------------------------------
26+
27+
head_slot = Gauge(
28+
"lean_head_slot",
29+
"Current head slot",
30+
registry=REGISTRY,
31+
)
32+
33+
current_slot = Gauge(
34+
"lean_current_slot",
35+
"Current time slot",
36+
registry=REGISTRY,
37+
)
38+
39+
justified_slot = Gauge(
40+
"lean_justified_slot",
41+
"Latest justified slot",
42+
registry=REGISTRY,
43+
)
44+
45+
finalized_slot = Gauge(
46+
"lean_finalized_slot",
47+
"Latest finalized slot",
48+
registry=REGISTRY,
49+
)
50+
51+
validators_count = Gauge(
52+
"lean_validators_count",
53+
"Active validators",
54+
registry=REGISTRY,
55+
)
56+
57+
# -----------------------------------------------------------------------------
58+
# Block Processing
59+
# -----------------------------------------------------------------------------
60+
61+
blocks_processed = Counter(
62+
"lean_blocks_processed_total",
63+
"Total blocks processed",
64+
registry=REGISTRY,
65+
)
66+
67+
block_processing_time = Histogram(
68+
"lean_block_processing_seconds",
69+
"Block processing duration",
70+
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
71+
registry=REGISTRY,
72+
)
73+
74+
# -----------------------------------------------------------------------------
75+
# Attestations
76+
# -----------------------------------------------------------------------------
77+
78+
attestations_received = Counter(
79+
"lean_attestations_received_total",
80+
"Total attestations received",
81+
registry=REGISTRY,
82+
)
83+
84+
attestations_valid = Counter(
85+
"lean_attestations_valid_total",
86+
"Valid attestations",
87+
registry=REGISTRY,
88+
)
89+
90+
attestations_invalid = Counter(
91+
"lean_attestations_invalid_total",
92+
"Invalid attestations",
93+
registry=REGISTRY,
94+
)
95+
96+
# -----------------------------------------------------------------------------
97+
# Network
98+
# -----------------------------------------------------------------------------
99+
100+
peers_connected = Gauge(
101+
"lean_peers_connected",
102+
"Connected peers",
103+
registry=REGISTRY,
104+
)
105+
106+
# -----------------------------------------------------------------------------
107+
# Consensus Events
108+
# -----------------------------------------------------------------------------
109+
110+
reorgs = Counter(
111+
"lean_reorgs_total",
112+
"Chain reorganizations",
113+
registry=REGISTRY,
114+
)
115+
116+
# -----------------------------------------------------------------------------
117+
# Validator Production
118+
# -----------------------------------------------------------------------------
119+
120+
blocks_proposed = Counter(
121+
"lean_blocks_proposed_total",
122+
"Blocks proposed by this node",
123+
registry=REGISTRY,
124+
)
125+
126+
attestations_produced = Counter(
127+
"lean_attestations_produced_total",
128+
"Attestations produced by this node",
129+
registry=REGISTRY,
130+
)
131+
132+
133+
def generate_metrics() -> bytes:
134+
"""
135+
Generate Prometheus metrics output.
136+
137+
Returns:
138+
Prometheus text format output as bytes.
139+
"""
140+
return generate_latest(REGISTRY)

src/lean_spec/subspecs/sync/service.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from dataclasses import dataclass, field
4242
from typing import TYPE_CHECKING
4343

44+
from lean_spec.subspecs import metrics
4445
from lean_spec.subspecs.chain.clock import SlotClock
4546
from lean_spec.subspecs.containers import Block, SignedBlockWithAttestation
4647
from lean_spec.subspecs.containers.attestation import SignedAttestation
@@ -203,12 +204,24 @@ def _process_block_wrapper(
203204
# Delegate to the actual block processor (typically Store.on_block).
204205
#
205206
# The processor validates the block and updates forkchoice state.
206-
new_store = self.process_block(store, block)
207+
with metrics.block_processing_time.time():
208+
new_store = self.process_block(store, block)
207209

208210
# Track metrics after successful processing.
209211
#
210212
# We only count blocks that pass validation and update the store.
211213
self._blocks_processed += 1
214+
metrics.blocks_processed.inc()
215+
216+
# Update chain state metrics.
217+
metrics.head_slot.set(float(new_store.blocks[new_store.head].slot))
218+
metrics.justified_slot.set(float(new_store.latest_justified.slot))
219+
metrics.finalized_slot.set(float(new_store.latest_finalized.slot))
220+
221+
# Update validator count from head state.
222+
head_state = new_store.states.get(new_store.head)
223+
if head_state is not None:
224+
metrics.validators_count.set(float(len(head_state.validators)))
212225

213226
# Persist block and state to database if available.
214227
#

src/lean_spec/subspecs/validator/service.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from dataclasses import dataclass, field
3030
from typing import TYPE_CHECKING
3131

32+
from lean_spec.subspecs import metrics
3233
from lean_spec.subspecs.chain.clock import SlotClock
3334
from lean_spec.subspecs.chain.config import SECONDS_PER_INTERVAL
3435
from lean_spec.subspecs.containers import (
@@ -184,6 +185,7 @@ async def _maybe_produce_block(self, slot: Slot) -> None:
184185
# Create signed block wrapper for publishing.
185186
signed_block = self._sign_block(block, validator_index, signatures)
186187
self._blocks_produced += 1
188+
metrics.blocks_proposed.inc()
187189

188190
# Emit the block for network propagation.
189191
await self.on_block(signed_block)
@@ -217,6 +219,7 @@ async def _produce_attestations(self, slot: Slot) -> None:
217219
# Sign the attestation using our secret key.
218220
signed_attestation = self._sign_attestation(attestation_data, validator_index)
219221
self._attestations_produced += 1
222+
metrics.attestations_produced.inc()
220223

221224
# Emit the attestation for network propagation.
222225
await self.on_attestation(signed_attestation)

0 commit comments

Comments
 (0)