Skip to content

Commit 22302d7

Browse files
committed
fix: do not register CRDB metrics in init()
1 parent 0e16dd1 commit 22302d7

8 files changed

Lines changed: 91 additions & 64 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
1111
### Fixed
1212
- Regression introduced in 1.49.2: missing spans in ReadSchema calls (https://github.com/authzed/spicedb/pull/2947)
1313
- Long-standing bug in the way postgres revisions were being compared. Sometimes revisions that were actually overlapping were erroneously being ordered. (https://github.com/authzed/spicedb/pull/2958)
14+
- Do not register CRDB metrics in `init()` functions (https://github.com/authzed/spicedb/pull/2966)
1415

1516
## [1.49.2] - 2026-03-02
1617
### Added

internal/datastore/crdb/crdb.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ func newCRDBDatastore(ctx context.Context, url string, options ...Option) (datas
197197
gcWindow: config.gcWindow,
198198
watchEnabled: !config.watchDisabled,
199199
schema: *schema.Schema(config.columnOptimizationOption, config.withIntegrity, false),
200+
healthTracker: healthChecker,
200201
}
201202
ds.SetNowFunc(ds.headRevisionInternal)
202203

@@ -289,6 +290,8 @@ type crdbDatastore struct {
289290
supportsIntegrity bool
290291
watchEnabled bool
291292

293+
healthTracker *pool.NodeHealthTracker
294+
292295
uniqueID atomic.Pointer[string]
293296
}
294297

@@ -481,6 +484,9 @@ func (cds *crdbDatastore) Close() error {
481484
}
482485
cds.readPool.Close()
483486
cds.writePool.Close()
487+
if cds.healthTracker != nil {
488+
cds.healthTracker.Close()
489+
}
484490
for _, collector := range cds.collectors {
485491
ok := prometheus.Unregister(collector)
486492
if !ok {

internal/datastore/crdb/crdb_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -904,7 +904,7 @@ func TestRegisterPrometheusCollectors(t *testing.T) {
904904

905905
writePoolConfig, err := pgxpool.ParseConfig(fmt.Sprintf("postgres://db:password@pg.example.com:5432/mydb?pool_max_conns=%d", writeMaxConns))
906906
require.NoError(t, err)
907-
writePool, err := pool.NewRetryPool(t.Context(), "read", writePoolConfig, nil, 18, 20)
907+
writePool, err := pool.NewRetryPool(t.Context(), "write", writePoolConfig, nil, 18, 20)
908908
require.NoError(t, err)
909909

910910
// Create datastore with those pools

internal/datastore/crdb/pool/balancer.go

Lines changed: 34 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -20,24 +20,6 @@ import (
2020
"github.com/authzed/spicedb/pkg/genutil"
2121
)
2222

23-
var (
24-
connectionsPerCRDBNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
25-
Name: "crdb_connections_per_node",
26-
Help: "The number of active connections SpiceDB holds to each CockroachDB node, by pool (read/write). Imbalanced values across nodes suggest the connection balancer is unable to redistribute connections evenly.",
27-
}, []string{"pool", "node_id"})
28-
29-
pruningTimeHistogram = prometheus.NewHistogramVec(prometheus.HistogramOpts{
30-
Name: "crdb_pruning_duration",
31-
Help: "Duration in milliseconds of one iteration of the CockroachDB connection balancer pruning excess connections from over-represented nodes. Elevated values indicate the balancer is struggling to rebalance connections.",
32-
Buckets: []float64{.1, .2, .5, 1, 2, 5, 10, 20, 50, 100},
33-
}, []string{"pool"})
34-
)
35-
36-
func init() {
37-
prometheus.MustRegister(connectionsPerCRDBNodeCountGauge)
38-
prometheus.MustRegister(pruningTimeHistogram)
39-
}
40-
4123
type balancePoolConn[C balanceConn] interface {
4224
Conn() C
4325
Release()
@@ -81,6 +63,9 @@ type nodeConnectionBalancer[P balancePoolConn[C], C balanceConn] struct {
8163
healthTracker *NodeHealthTracker
8264
rnd *rand.Rand
8365
seed int64
66+
67+
connectionsPerCRDBNodeCountGauge *prometheus.GaugeVec
68+
pruningTimeHistogram prometheus.Histogram
8469
}
8570

8671
// newNodeConnectionBalancer is generic over underlying connection types for
@@ -99,12 +84,35 @@ func newNodeConnectionBalancer[P balancePoolConn[C], C balanceConn](pool balance
9984
seed = 0
10085
}
10186
}
87+
var (
88+
connectionsPerCRDBNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
89+
Name: "crdb_connections_per_node",
90+
ConstLabels: prometheus.Labels{
91+
"pool": pool.ID(),
92+
},
93+
Help: "The number of active connections SpiceDB holds to each CockroachDB node, by pool (read/write). Imbalanced values across nodes suggest the connection balancer is unable to redistribute connections evenly.",
94+
}, []string{"node_id"})
95+
96+
pruningTimeHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
97+
Name: "crdb_pruning_duration",
98+
ConstLabels: prometheus.Labels{
99+
"pool": pool.ID(),
100+
},
101+
Help: "Duration in milliseconds of one iteration of the CockroachDB connection balancer pruning excess connections from over-represented nodes. Elevated values indicate the balancer is struggling to rebalance connections.",
102+
Buckets: []float64{.1, .2, .5, 1, 2, 5, 10, 20, 50, 100},
103+
})
104+
)
105+
106+
prometheus.MustRegister(connectionsPerCRDBNodeCountGauge)
107+
prometheus.MustRegister(pruningTimeHistogram)
102108
return &nodeConnectionBalancer[P, C]{
103-
ticker: time.NewTicker(interval),
104-
sem: semaphore.NewWeighted(1),
105-
healthTracker: healthTracker,
106-
pool: pool,
107-
seed: seed,
109+
ticker: time.NewTicker(interval),
110+
sem: semaphore.NewWeighted(1),
111+
healthTracker: healthTracker,
112+
pool: pool,
113+
seed: seed,
114+
connectionsPerCRDBNodeCountGauge: connectionsPerCRDBNodeCountGauge,
115+
pruningTimeHistogram: pruningTimeHistogram,
108116
// nolint:gosec
109117
// use of non cryptographically secure random number generator is not concern here,
110118
// as it's used for shuffling the nodes to balance the connections when the number of
@@ -137,7 +145,7 @@ func (p *nodeConnectionBalancer[P, C]) Prune(ctx context.Context) {
137145
func (p *nodeConnectionBalancer[P, C]) mustPruneConnections(ctx context.Context) {
138146
start := time.Now()
139147
defer func() {
140-
pruningTimeHistogram.WithLabelValues(p.pool.ID()).Observe(float64(time.Since(start).Milliseconds()))
148+
p.pruningTimeHistogram.Observe(float64(time.Since(start).Milliseconds()))
141149
}()
142150
conns := p.pool.AcquireAllIdle(ctx)
143151
defer func() {
@@ -182,8 +190,7 @@ func (p *nodeConnectionBalancer[P, C]) mustPruneConnections(ctx context.Context)
182190
p.healthTracker.RLock()
183191
for node := range p.healthTracker.nodesEverSeen {
184192
if _, ok := connectionCounts[node]; !ok {
185-
connectionsPerCRDBNodeCountGauge.DeletePartialMatch(map[string]string{
186-
"pool": p.pool.ID(),
193+
p.connectionsPerCRDBNodeCountGauge.DeletePartialMatch(map[string]string{
187194
"node_id": strconv.FormatUint(uint64(node), 10),
188195
})
189196
}
@@ -205,8 +212,7 @@ func (p *nodeConnectionBalancer[P, C]) mustPruneConnections(ctx context.Context)
205212
initialPerNodeMax := p.pool.MaxConns() / nodeCount
206213
for i, node := range nodes {
207214
count := connectionCounts[node]
208-
connectionsPerCRDBNodeCountGauge.WithLabelValues(
209-
p.pool.ID(),
215+
p.connectionsPerCRDBNodeCountGauge.WithLabelValues(
210216
strconv.FormatUint(uint64(node), 10),
211217
).Set(float64(count))
212218

internal/datastore/crdb/pool/balancer_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,9 @@ func TestNodeConnectionBalancerPrune(t *testing.T) {
141141
t.Run(tt.name, func(t *testing.T) {
142142
tracker, err := NewNodeHealthChecker("")
143143
require.NoError(t, err)
144+
t.Cleanup(func() {
145+
tracker.Close()
146+
})
144147
for _, n := range tt.nodes {
145148
tracker.healthyNodes[n] = struct{}{}
146149
}

internal/datastore/crdb/pool/health.go

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package pool
22

33
import (
44
"context"
5+
"fmt"
56
"math/rand"
67
"sync"
78
"time"
@@ -18,26 +19,18 @@ import (
1819

1920
const errorBurst = 2
2021

21-
var healthyCRDBNodeCountGauge = prometheus.NewGauge(prometheus.GaugeOpts{
22-
Name: "crdb_healthy_nodes",
23-
Help: "the number of healthy crdb nodes detected by spicedb",
24-
})
25-
26-
func init() {
27-
prometheus.MustRegister(healthyCRDBNodeCountGauge)
28-
}
29-
3022
// NodeHealthTracker detects changes in the node pool by polling the cluster periodically and recording
3123
// the node ids that are seen. This is used to detect new nodes that come online that have either previously
3224
// been marked unhealthy due to connection errors or due to scale up.
3325
//
3426
// Consumers can manually mark a node healthy or unhealthy as well.
3527
type NodeHealthTracker struct {
3628
sync.RWMutex
37-
connConfig *pgx.ConnConfig
38-
healthyNodes map[uint32]struct{} // GUARDED_BY(RWMutex)
39-
nodesEverSeen map[uint32]*rate.Limiter // GUARDED_BY(RWMutex)
40-
newLimiter func() *rate.Limiter
29+
connConfig *pgx.ConnConfig
30+
healthyNodes map[uint32]struct{} // GUARDED_BY(RWMutex)
31+
nodesEverSeen map[uint32]*rate.Limiter // GUARDED_BY(RWMutex)
32+
newLimiter func() *rate.Limiter
33+
healthyCRDBNodeCountGauge prometheus.Gauge
4134
}
4235

4336
// NewNodeHealthChecker builds a health checker that polls the cluster at the given url.
@@ -47,16 +40,31 @@ func NewNodeHealthChecker(url string) (*NodeHealthTracker, error) {
4740
return nil, err
4841
}
4942

43+
healthyCRDBNodeCountGauge := prometheus.NewGauge(prometheus.GaugeOpts{
44+
Name: "crdb_healthy_nodes",
45+
Help: "the number of healthy crdb nodes detected by spicedb",
46+
})
47+
48+
err = prometheus.Register(healthyCRDBNodeCountGauge)
49+
if err != nil {
50+
return nil, fmt.Errorf("failed to register crdb healthy nodes metric: %w", err)
51+
}
52+
5053
return &NodeHealthTracker{
51-
connConfig: connConfig,
52-
healthyNodes: make(map[uint32]struct{}, 0),
53-
nodesEverSeen: make(map[uint32]*rate.Limiter, 0),
54+
healthyCRDBNodeCountGauge: healthyCRDBNodeCountGauge,
55+
connConfig: connConfig,
56+
healthyNodes: make(map[uint32]struct{}, 0),
57+
nodesEverSeen: make(map[uint32]*rate.Limiter, 0),
5458
newLimiter: func() *rate.Limiter {
5559
return rate.NewLimiter(rate.Every(1*time.Minute), errorBurst)
5660
},
5761
}, nil
5862
}
5963

64+
func (t *NodeHealthTracker) Close() {
65+
_ = prometheus.Unregister(t.healthyCRDBNodeCountGauge)
66+
}
67+
6068
// Poll starts polling the cluster and recording the node IDs that it sees.
6169
func (t *NodeHealthTracker) Poll(ctx context.Context, interval time.Duration) {
6270
ticker := jitterbug.New(interval, jitterbug.Uniform{
@@ -104,7 +112,7 @@ func (t *NodeHealthTracker) SetNodeHealth(nodeID uint32, healthy bool) {
104112
t.Lock()
105113
defer t.Unlock()
106114
defer func() {
107-
healthyCRDBNodeCountGauge.Set(float64(len(t.healthyNodes)))
115+
t.healthyCRDBNodeCountGauge.Set(float64(len(t.healthyNodes)))
108116
}()
109117

110118
if _, ok := t.nodesEverSeen[nodeID]; !ok {

internal/datastore/crdb/pool/pool.go

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,6 @@ type pgxPool interface {
2828
Stat() *pgxpool.Stat
2929
}
3030

31-
var resetHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
32-
Name: "crdb_client_resets",
33-
Help: "Distribution of the number of client-side transaction restarts per transaction attempt. Restarts occur when CockroachDB returns a serialization failure (40001) and the driver retries the transaction from scratch. Sustained high values indicate transaction contention.",
34-
Buckets: []float64{0, 1, 2, 5, 10, 20, 50},
35-
})
36-
37-
func init() {
38-
prometheus.MustRegister(resetHistogram)
39-
}
40-
4131
type ctxDisableRetries struct{}
4232

4333
var (
@@ -46,9 +36,10 @@ var (
4636
)
4737

4838
type RetryPool struct {
49-
pool pgxPool
50-
id string
51-
healthTracker *NodeHealthTracker
39+
pool pgxPool
40+
id string
41+
healthTracker *NodeHealthTracker
42+
resetHistogram prometheus.Histogram
5243

5344
sync.RWMutex
5445
maxRetries uint8
@@ -144,6 +135,20 @@ func NewRetryPool(ctx context.Context, name string, config *pgxpool.Config, heal
144135
return nil, err
145136
}
146137

138+
p.resetHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
139+
Name: "crdb_client_resets",
140+
ConstLabels: prometheus.Labels{
141+
"pool_name": name, // this is needed to avoid "duplicate metrics collector registration attempted"
142+
},
143+
Help: "Distribution of the number of client-side transaction restarts per transaction attempt. Restarts occur when CockroachDB returns a serialization failure (40001) and the driver retries the transaction from scratch. Sustained high values indicate transaction contention.",
144+
Buckets: []float64{0, 1, 2, 5, 10, 20, 50},
145+
})
146+
147+
err = prometheus.Register(p.resetHistogram)
148+
if err != nil {
149+
return nil, fmt.Errorf("error registering CRDB reset histogram: %w", err)
150+
}
151+
147152
p.pool = pool
148153
return p, nil
149154
}
@@ -253,6 +258,7 @@ func (p *RetryPool) Config() *pgxpool.Config {
253258
// Close closes the underlying pgxpool.Pool
254259
func (p *RetryPool) Close() {
255260
p.pool.Close()
261+
prometheus.Unregister(p.resetHistogram)
256262
}
257263

258264
// Stat returns the underlying pgxpool.Pool stats
@@ -315,7 +321,7 @@ func (p *RetryPool) withRetries(ctx context.Context, acquireTimeout time.Duratio
315321

316322
var retries uint8
317323
defer func() {
318-
resetHistogram.Observe(float64(retries))
324+
p.resetHistogram.Observe(float64(retries))
319325
}()
320326

321327
maxRetries := p.maxRetries

internal/datastore/proxy/schemacaching/watchingcache.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,10 +56,6 @@ var definitionsReadTotalCounter = prometheus.NewCounterVec(prometheus.CounterOpt
5656

5757
const maximumRetryCount = 10
5858

59-
func init() {
60-
prometheus.MustRegister(namespacesFallbackModeGauge, caveatsFallbackModeGauge, schemaCacheRevisionGauge, definitionsReadCachedCounter, definitionsReadTotalCounter)
61-
}
62-
6359
// watchingCachingProxy is a datastore proxy that caches schema (namespaces and caveat definitions)
6460
// and updates its cache via a WatchSchema call. If the supplied datastore to be wrapped does not support
6561
// this API, or the data is not available in this case or an error occurs, the updating cache falls back
@@ -78,6 +74,7 @@ type watchingCachingProxy struct {
7874

7975
// createWatchingCacheProxy creates and returns a watching cache proxy.
8076
func createWatchingCacheProxy(delegate datastore.Datastore, c cache.Cache[cache.StringKey, *cacheEntry], gcWindow time.Duration, watchHeartbeat time.Duration) *watchingCachingProxy {
77+
prometheus.MustRegister(namespacesFallbackModeGauge, caveatsFallbackModeGauge, schemaCacheRevisionGauge, definitionsReadCachedCounter, definitionsReadTotalCounter)
8178
fallbackCache := &definitionCachingProxy{
8279
Datastore: delegate,
8380
c: c,

0 commit comments

Comments
 (0)