cmd/alertmanager/main.go (96 additions, 0 deletions)
@@ -15,6 +15,8 @@ package main

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"log/slog"
@@ -140,6 +142,7 @@ func run() int {

	var (
		configFile               = kingpin.Flag("config.file", "Alertmanager configuration file name.").Default("alertmanager.yml").String()
		configAutoReloadInterval = kingpin.Flag("config.auto-reload-interval", "Interval for checking and automatically reloading the Alertmanager configuration file. Set to 0 to disable.").Default("0s").Duration()
		dataDir                  = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
		retention                = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
		maintenanceInterval      = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
@@ -659,6 +662,14 @@ func run() int {
	signal.Notify(hup, syscall.SIGHUP)
	signal.Notify(term, os.Interrupt, syscall.SIGTERM)

	// Start the auto-reload watcher if the interval is non-zero.
	if *configAutoReloadInterval > 0 {
		logger.Info("Auto-reload enabled: checking for configuration changes", "interval", *configAutoReloadInterval, "file", *configFile)
		watcherCtx, cancelWatcher := context.WithCancel(context.Background())
		defer cancelWatcher()
		go runConfigWatcher(watcherCtx, *configFile, *configAutoReloadInterval, webReload, logger)
	}

	for {
		select {
		case <-hup:
@@ -680,6 +691,91 @@ func run() int {
	}
}

// configFileChecksum reads the file at path and returns its SHA256 hex digest.
// It returns an error if the file cannot be read.
func configFileChecksum(path string) (string, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("reading config file for checksum: %w", err)
	}
	sum := sha256.Sum256(b)
	return hex.EncodeToString(sum[:]), nil
}

// runConfigWatcher polls the config file checksum every interval and sends
// to reloadCh when a change is detected. It exits when ctx is cancelled.
// Interval must be > 0; callers are responsible for checking this.
func runConfigWatcher(
	ctx context.Context,
	configFile string,
	interval time.Duration,
	reloadCh chan<- chan error,
	logger *slog.Logger,
) {
	// Compute the initial checksum at startup so we only reload on *changes*,
	// not on the first tick unconditionally.
	lastChecksum, err := configFileChecksum(configFile)
	hasChecksum := err == nil
	if err != nil {
		// Log but don't abort - the coordinator already validated the file at
		// startup, so this is likely a transient read error. We'll retry next tick.
		logger.Warn("Auto-reload: failed to compute initial config checksum", "file", configFile, "err", err)
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			logger.Info("Auto-reload: watcher stopped", "file", configFile)
			return
		case <-ticker.C:
			checksum, err := configFileChecksum(configFile)
			if err != nil {
				logger.Warn("Auto-reload: failed to read config file", "file", configFile, "err", err)
				continue // don't update lastChecksum; retry next tick
			}

			if !hasChecksum {
				// Startup read failed; seed the baseline now without reloading.
				lastChecksum = checksum
				hasChecksum = true
				continue
			}

			if checksum == lastChecksum {
				continue // no change
			}

			logger.Info("Auto-reload: config file changed, reloading", "file", configFile)

			// Trigger reload via the same channel that SIGHUP and POST /-/reload use.
			// Use a select so that a simultaneous SIGTERM doesn't leave this
			// goroutine blocked on the send or the result receive.
			errCh := make(chan error)
			select {
			case reloadCh <- errCh:
			case <-ctx.Done():
				return
			}
			select {
			case err := <-errCh:
				if err != nil {
					logger.Error("Auto-reload: reload failed", "file", configFile, "err", err)
					// Don't update lastChecksum so we retry on the next tick.
					continue
				}
			case <-ctx.Done():
				return
			}

			lastChecksum = checksum
			logger.Info("Auto-reload: config reload successful", "file", configFile)
		}
	}
}

// clusterWait returns a function that inspects the current peer state and returns
// a duration of one base timeout for each peer with a higher ID than ourselves.
func clusterWait(p *cluster.Peer, timeout time.Duration) func() time.Duration {
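For orientation, here is the consumer side of the handshake the watcher relies on: in `run()`, the watcher sends on `webReload`, the same channel served by `POST /-/reload`, then blocks for the single result. The sketch below shows only that implied protocol; `consumeReloads` and the `reload` callback are hypothetical stand-ins, not the actual Alertmanager code.

```go
package main

import (
	"fmt"
	"log/slog"
)

// consumeReloads mimics the consumer side of the reload channel: for each
// request it runs the reload and sends exactly one result back (nil on
// success), which the sender - SIGHUP handler, /-/reload, or the auto-reload
// watcher - is blocked waiting on.
func consumeReloads(reloadCh <-chan chan error, reload func() error, logger *slog.Logger) {
	for errCh := range reloadCh {
		err := reload()
		if err != nil {
			logger.Error("config reload failed", "err", err)
		}
		errCh <- err
	}
}

func main() {
	reloadCh := make(chan chan error)
	go consumeReloads(reloadCh, func() error { return nil }, slog.Default())

	// One round of the handshake, exactly as runConfigWatcher performs it.
	errCh := make(chan error)
	reloadCh <- errCh
	fmt.Println("reload result:", <-errCh)
}
```

Because the watcher blocks until it receives the result, the consumer must answer every request exactly once, which is why the tests below always send into `errCh`.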
cmd/alertmanager/main_test.go (186 additions, 0 deletions)
@@ -14,8 +14,13 @@
package main

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/prometheus/common/promslog"
	"github.com/stretchr/testify/require"
@@ -95,3 +100,184 @@ func TestExternalURL(t *testing.T) {
		})
	}
}

func TestConfigFileChecksum_ReturnsConsistentHash(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("content: a"), 0o644))

	sum1, err := configFileChecksum(path)
	require.NoError(t, err)
	sum2, err := configFileChecksum(path)
	require.NoError(t, err)

	require.Equal(t, sum1, sum2, "same content should produce same checksum")
}

func TestConfigFileChecksum_DifferentContentDifferentHash(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")

	require.NoError(t, os.WriteFile(path, []byte("content: a"), 0o644))
	sumA, err := configFileChecksum(path)
	require.NoError(t, err)

	require.NoError(t, os.WriteFile(path, []byte("content: b"), 0o644))
	sumB, err := configFileChecksum(path)
	require.NoError(t, err)

	require.NotEqual(t, sumA, sumB, "different content must produce different checksum")
}

func TestConfigFileChecksum_MissingFileReturnsError(t *testing.T) {
	_, err := configFileChecksum("/nonexistent/path/alertmanager.yml")
	require.Error(t, err)
}

func TestRunConfigWatcher_NoReloadWhenFileUnchanged(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("route:\n receiver: default\n"), 0o644))

	reloadCh := make(chan chan error, 1)
	ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond)
	defer cancel()

	go runConfigWatcher(ctx, path, 50*time.Millisecond, reloadCh, slog.Default())

	// Let the watcher run for roughly three ticks.
	<-ctx.Done()

	// reloadCh must be empty: no reload should have been triggered.
	require.Empty(t, reloadCh, "no reload expected when file is unchanged")
}

func TestRunConfigWatcher_TriggersReloadOnChange(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))

	reloadCh := make(chan chan error, 1)
	ctx := t.Context()

	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())

	// Give the watcher time to start and record the initial checksum.
	time.Sleep(50 * time.Millisecond)

	// Change the file.
	require.NoError(t, os.WriteFile(path, []byte("changed"), 0o644))

	// Wait for the watcher to detect the change and send to reloadCh.
	select {
	case errCh := <-reloadCh:
		errCh <- nil // Simulate a successful reload.
	case <-time.After(300 * time.Millisecond):
		t.Fatal("timed out waiting for reload signal after file change")
	}
}

func TestRunConfigWatcher_DoesNotRetriggerAfterSuccessfulReload(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))

	reloadCh := make(chan chan error, 2) // buffer=2 to catch any spurious second reload
	ctx := t.Context()

	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())

	time.Sleep(50 * time.Millisecond) // allow the initial checksum to be recorded

	// Change the file once.
	require.NoError(t, os.WriteFile(path, []byte("changed"), 0o644))

	// Consume the first (expected) reload.
	select {
	case errCh := <-reloadCh:
		errCh <- nil
	case <-time.After(300 * time.Millisecond):
		t.Fatal("expected first reload not received")
	}

	// Let three more ticks pass; the file is unchanged, so no second reload should come.
	time.Sleep(150 * time.Millisecond)

	require.Empty(t, reloadCh, "no second reload expected after successful reload of same content")
}

func TestRunConfigWatcher_RetriesAfterFailedReload(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))

	reloadCh := make(chan chan error, 2)
	ctx := t.Context()

	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())

	time.Sleep(50 * time.Millisecond)

	require.NoError(t, os.WriteFile(path, []byte("invalid-config"), 0o644))

	// First attempt: simulate a reload failure.
	select {
	case errCh := <-reloadCh:
		errCh <- fmt.Errorf("simulated parse error")
	case <-time.After(300 * time.Millisecond):
		t.Fatal("expected first reload attempt not received")
	}

	// Second attempt: the watcher must retry because lastChecksum was not updated.
	select {
	case errCh := <-reloadCh:
		errCh <- nil // Now succeeds.
	case <-time.After(300 * time.Millisecond):
		t.Fatal("expected retry reload attempt not received after failed reload")
	}
}

func TestRunConfigWatcher_HandlesUnreadableFile(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("original"), 0o644))

	reloadCh := make(chan chan error, 1)
	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()

	go runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())

	time.Sleep(50 * time.Millisecond)

	// Remove the file entirely to simulate a transient FS issue.
	require.NoError(t, os.Remove(path))

	// The watcher should log a warning but NOT send to reloadCh.
	<-ctx.Done()
	require.Empty(t, reloadCh, "no reload expected when file is unreadable")
}

func TestRunConfigWatcher_ExitsWhenContextCancelled(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "alertmanager.yml")
	require.NoError(t, os.WriteFile(path, []byte("content"), 0o644))

	reloadCh := make(chan chan error, 1)
	ctx, cancel := context.WithCancel(context.Background())

	done := make(chan struct{})
	go func() {
		runConfigWatcher(ctx, path, 30*time.Millisecond, reloadCh, slog.Default())
		close(done)
	}()

	cancel() // Cancel immediately.

	select {
	case <-done:
		// Watcher exited cleanly.
	case <-time.After(500 * time.Millisecond):
		t.Fatal("watcher goroutine did not exit after context cancellation")
	}
}
docs/configuration.md (16 additions, 0 deletions)
@@ -20,6 +20,22 @@ is not well-formed, the changes will not be applied and an error is logged.
A configuration reload is triggered by sending a `SIGHUP` to the process or
sending an HTTP POST request to the `/-/reload` endpoint.

## Auto-reload

To have Alertmanager automatically reload its configuration when the file
changes on disk, set `--config.auto-reload-interval` to a non-zero duration.
Alertmanager will poll the file for changes at that interval and apply the new
configuration if it detects a difference. The feature is off by default.

```bash
alertmanager --config.file=alertmanager.yml --config.auto-reload-interval=30s
```

This is especially useful in Kubernetes, where ConfigMaps and Secrets are
updated through mounted files. The reload follows the same code path as
`SIGHUP` and `POST /-/reload`, so an invalid configuration is rejected and
logged without affecting the running configuration.

## Limits

Alertmanager supports a number of configurable limits via command-line flags.
docs/management_api.md (4 additions, 0 deletions)
@@ -35,3 +35,7 @@ POST /-/reload
This endpoint triggers a reload of the Alertmanager configuration file.

An alternative way to trigger a configuration reload is by sending a `SIGHUP` to the Alertmanager process.

Configuration can also be reloaded automatically on file change using the
`--config.auto-reload-interval` flag. See the
[configuration documentation](configuration.md) for details.