Skip to content

Commit d5dc1a1

Browse files
committed
feat(health): Add max time between metrics check
1 parent b5fe07d commit d5dc1a1

4 files changed

Lines changed: 135 additions & 13 deletions

File tree

plugins/outputs/health/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
4646
# tls_cert = "/etc/telegraf/cert.pem"
4747
# tls_key = "/etc/telegraf/key.pem"
4848

49+
## Maximum expected time between metrics being written
50+
## Enforces an unhealthy state if there was no new metric seen for at least
51+
## the specified time. The check is disabled by default and only used if a
52+
## positive time is specified.
53+
# max_time_between_metrics = "0s"
54+
4955
## NOTE: Due to the way TOML is parsed, tables must be at the END of the
5056
## plugin definition, otherwise additional config options are read as part of
5157
## the table
@@ -67,6 +73,20 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
6773
## field = "buffer_size"
6874
```
6975

76+
### Maximum time between metrics
77+
78+
The health plugin can assert that metrics are being delivered to it at an
79+
expected rate when setting `max_time_between_metrics` to a positive duration.
80+
On each health request, the check measures the time elapsed since the most
81+
recent write to the plugin and compares it to `max_time_between_metrics`. When
82+
the elapsed time reaches the configured maximum, the plugin will report an
83+
unhealthy status. As soon as metrics are written again, this check passes and
84+
the status depends only on the other configured checks. Before the first
85+
metrics are received, this check always passes.
86+
87+
Note that the metric timestamps are not taken into account; the check uses the
88+
time at which the metrics are written to the plugin.
89+
7090
### compares
7191

7292
The `compares` check is used to assert basic mathematical relationships. Use

plugins/outputs/health/health.go

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -41,17 +41,20 @@ type Health struct {
4141
BasicPassword string `toml:"basic_password"`
4242
common_tls.ServerConfig
4343

44-
Compares []*Compares `toml:"compares"`
45-
Contains []*Contains `toml:"contains"`
44+
Compares []*Compares `toml:"compares"`
45+
Contains []*Contains `toml:"contains"`
46+
MaxTimeBetweenMetrics config.Duration `toml:"max_time_between_metrics"`
47+
4648
Log telegraf.Logger `toml:"-"`
4749
checkers []Checker
4850

49-
wg sync.WaitGroup
50-
server *http.Server
51-
origin string
52-
network string
53-
address string
54-
tlsConf *tls.Config
51+
wg sync.WaitGroup
52+
server *http.Server
53+
origin string
54+
network string
55+
address string
56+
tlsConf *tls.Config
57+
lastMetricTime time.Time
5558

5659
mu sync.Mutex
5760
healthy bool
@@ -143,7 +146,13 @@ func (h *Health) listen() (net.Listener, error) {
143146

144147
func (h *Health) ServeHTTP(rw http.ResponseWriter, _ *http.Request) {
145148
var code = http.StatusOK
146-
if !h.isHealthy() {
149+
150+
healthy := h.isHealthy()
151+
if h.MaxTimeBetweenMetrics > 0 && !h.lastMetricTime.IsZero() {
152+
healthy = healthy && time.Since(h.lastMetricTime) < time.Duration(h.MaxTimeBetweenMetrics)
153+
}
154+
155+
if !healthy {
147156
code = http.StatusServiceUnavailable
148157
}
149158

@@ -153,6 +162,7 @@ func (h *Health) ServeHTTP(rw http.ResponseWriter, _ *http.Request) {
153162

154163
// Write runs all checks over the metric batch and adjust health state.
155164
func (h *Health) Write(metrics []telegraf.Metric) error {
165+
h.lastMetricTime = time.Now()
156166
healthy := true
157167
for _, checker := range h.checkers {
158168
success := checker.Check(metrics)
@@ -219,10 +229,12 @@ func (h *Health) isHealthy() bool {
219229

220230
func NewHealth() *Health {
221231
return &Health{
222-
ServiceAddress: defaultServiceAddress,
223-
ReadTimeout: config.Duration(defaultReadTimeout),
224-
WriteTimeout: config.Duration(defaultWriteTimeout),
225-
healthy: true,
232+
ServiceAddress: defaultServiceAddress,
233+
ReadTimeout: config.Duration(defaultReadTimeout),
234+
WriteTimeout: config.Duration(defaultWriteTimeout),
235+
MaxTimeBetweenMetrics: 0,
236+
healthy: true,
237+
lastMetricTime: time.Time{},
226238
}
227239
}
228240

plugins/outputs/health/health_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/stretchr/testify/require"
1010

1111
"github.com/influxdata/telegraf"
12+
"github.com/influxdata/telegraf/config"
1213
"github.com/influxdata/telegraf/plugins/outputs/health"
1314
"github.com/influxdata/telegraf/testutil"
1415
)
@@ -214,3 +215,86 @@ func TestInitServiceAddress(t *testing.T) {
214215
})
215216
}
216217
}
218+
219+
func TestTimeBetweenMetrics(t *testing.T) {
220+
now := time.Now()
221+
past := time.Time{}.AddDate(2002, 0, 0)
222+
tests := []struct {
223+
name string
224+
maxTimeBetweenMetrics config.Duration
225+
metrics []telegraf.Metric
226+
expectedCode int
227+
}{
228+
{
229+
name: "healthy when disabled and old metric",
230+
maxTimeBetweenMetrics: config.Duration(0),
231+
metrics: []telegraf.Metric{
232+
testutil.MustMetric(
233+
"cpu",
234+
map[string]string{},
235+
map[string]any{
236+
"time_idle": 42,
237+
},
238+
past),
239+
},
240+
expectedCode: 200,
241+
},
242+
{
243+
name: "healthy when enabled and recent metric",
244+
maxTimeBetweenMetrics: config.Duration(5 * time.Second),
245+
metrics: []telegraf.Metric{
246+
testutil.MustMetric(
247+
"cpu",
248+
map[string]string{},
249+
map[string]any{
250+
"time_idle": 42,
251+
},
252+
now),
253+
},
254+
expectedCode: 200,
255+
},
256+
{
257+
name: "unhealthy when enabled and old metric",
258+
maxTimeBetweenMetrics: config.Duration(1 * time.Nanosecond),
259+
metrics: []telegraf.Metric{
260+
testutil.MustMetric(
261+
"cpu",
262+
map[string]string{},
263+
map[string]any{
264+
"time_idle": 42,
265+
},
266+
past),
267+
},
268+
expectedCode: 503,
269+
},
270+
}
271+
272+
for _, tt := range tests {
273+
t.Run(tt.name, func(t *testing.T) {
274+
dut := health.NewHealth()
275+
dut.ServiceAddress = "tcp://127.0.0.1:0"
276+
dut.Log = testutil.Logger{}
277+
dut.MaxTimeBetweenMetrics = tt.maxTimeBetweenMetrics
278+
279+
err := dut.Init()
280+
require.NoError(t, err)
281+
282+
err = dut.Connect()
283+
require.NoError(t, err)
284+
285+
err = dut.Write(tt.metrics)
286+
require.NoError(t, err)
287+
288+
resp, err := http.Get(dut.Origin())
289+
require.NoError(t, err)
290+
defer resp.Body.Close()
291+
require.Equal(t, tt.expectedCode, resp.StatusCode)
292+
293+
_, err = io.ReadAll(resp.Body)
294+
require.NoError(t, err)
295+
296+
err = dut.Close()
297+
require.NoError(t, err)
298+
})
299+
}
300+
}

plugins/outputs/health/sample.conf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@
2121
# tls_cert = "/etc/telegraf/cert.pem"
2222
# tls_key = "/etc/telegraf/key.pem"
2323

24+
## Maximum expected time between metrics being written
25+
## Enforces an unhealthy state if there was no new metric seen for at least
26+
## the specified time. The check is disabled by default and only used if a
27+
## positive time is specified.
28+
# max_time_between_metrics = "0s"
29+
2430
## NOTE: Due to the way TOML is parsed, tables must be at the END of the
2531
## plugin definition, otherwise additional config options are read as part of
2632
## the table

0 commit comments

Comments
 (0)