
Commit 20d2c08

xxmplus and claude committed
replay: fix race between FlushEnd and refreshMetrics
527ea2b added a FlushEnd handler that closes compactionMu.ch to wake
refreshMetrics when a flush completes. Unlike CompactionEnd, the handler does
not increment the completed counter because that counter only tracks
compactions. This creates a gap: nextCompactionCompletes has no way to detect
that a flush notification was already delivered, so it can create a new
channel that nobody will ever close.

The race occurs in refreshMetrics between r.d.Metrics() and
nextCompactionCompletes(). If a flush completes in this window:

1. r.d.Metrics() acquires d.mu, observes flushing=true
   (Flush.NumInProgress=1), releases d.mu.
2. The flush goroutine acquires d.mu, completes, fires FlushEnd (under d.mu).
   The replay handler closes compactionMu.ch and nils it.
3. nextCompactionCompletes sees ch==nil, creates a new channel. With no
   counter increment to detect the flush, it returns alreadyOccurred=false.
4. compactionsAppearQuiesced uses the stale metrics from step 1
   (NumInProgress=1) and returns false.
5. The loop re-enters the first select with a channel nobody will close and
   stepsApplied==nil (blocks forever). Permanent hang.

Fix this with three changes:

1. Track both flushes and compactions in the started/completed counters.
   Rename compactionMu to compactionOrFlushMu and add a FlushBegin handler
   that increments started. The FlushEnd handler now also increments
   completed, matching CompactionEnd. This allows
   nextCompactionOrFlushCompletes to detect flush completions through the
   counter, eliminating the race.

2. Switch compactionsAppearQuiesced to use only the started/completed counter
   (started == completed) instead of checking DB.Metrics().NumInProgress.
   There is a scheduling window between AddInProgressLocked (which increments
   NumInProgress under d.mu) and CompactionBegin (which fires in a separate
   goroutine that must re-acquire d.mu). During this window
   NumInProgress > 0 but started == completed. Using NumInProgress would
   block quiescence detection during cascading compactions. The counter does
   not have this window, and the 1-second quiescence confirmation handles any
   false positives from compactions that are scheduled but have not yet fired
   CompactionBegin.

3. Fix a pre-existing tight loop in refreshMetrics: when
   nextCompactionOrFlushCompletes detects a completion via the counter
   (alreadyOccurred=true), the old code skipped the first select and
   immediately re-acquired d.mu to collect metrics, even though the
   quiescence check would be skipped anyway. Under heavy compaction load this
   tight loop contends with event handlers for d.mu. Fix this by catching up
   the counter without collecting metrics when alreadyOccurred is true, then
   falling through to collect metrics once caught up.

Fixes #5820.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3e24138 commit 20d2c08

2 files changed: 95 additions & 84 deletions
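To make the counter-plus-channel scheme in the commit message concrete, here is a minimal, self-contained Go sketch of the pattern (illustrative only: the `notifier` type and its method names are hypothetical, not the replay package's actual API). The key property is that the `completed` counter, not the channel, is the source of truth, so a waiter can always detect a completion it slept through by re-checking the counter:

```go
package main

import (
	"fmt"
	"sync"
)

// notifier is a hypothetical stand-in for compactionOrFlushMu: a wakeup
// channel plus started/completed counters guarded by one mutex.
type notifier struct {
	mu        sync.Mutex
	ch        chan struct{} // created on demand, closed on each completion
	started   int64
	completed int64
}

func (n *notifier) begin() { // FlushBegin / CompactionBegin
	n.mu.Lock()
	defer n.mu.Unlock()
	n.started++
}

func (n *notifier) end() { // FlushEnd / CompactionEnd
	n.mu.Lock()
	defer n.mu.Unlock()
	n.completed++ // the counter half; the old flush path skipped this
	if n.ch != nil {
		close(n.ch) // the wakeup half
		n.ch = nil
	}
}

// quiesced reports whether everything that started has also completed.
func (n *notifier) quiesced() bool {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.started == n.completed
}

// next returns (completed, true, nil) if a completion occurred after
// lastObserved; otherwise it returns a channel closed on the next one.
func (n *notifier) next(lastObserved int64) (int64, bool, chan struct{}) {
	n.mu.Lock()
	defer n.mu.Unlock()
	if lastObserved < n.completed {
		return n.completed, true, nil
	}
	if n.ch == nil {
		n.ch = make(chan struct{})
	}
	return lastObserved, false, n.ch
}

func main() {
	var n notifier
	count, _, ch := n.next(0) // no completions yet; get a wait channel
	n.begin()
	n.end() // completed++ and close(ch)
	<-ch    // a parked waiter wakes immediately
	count, already, _ := n.next(count)
	fmt.Println(count, already, n.quiesced()) // 1 true true
}
```

The bug fixed by this commit was precisely the flush path performing the `close(ch)` half of `end()` without the `completed++` half, leaving a waiter that re-called `next` with no evidence that anything had happened.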

replay/replay.go (87 additions, 76 deletions)
```diff
@@ -303,10 +303,21 @@ type Runner struct {
 		countByReason    map[string]int
 		durationByReason map[string]time.Duration
 	}
-	// compactionMu holds state for tracking the number of compactions
-	// started and completed and waking waiting goroutines when a new compaction
-	// completes. See nextCompactionCompletes.
-	compactionMu struct {
+	// compactionOrFlushMu holds state for tracking the number of compactions
+	// and flushes started and completed, and waking waiting goroutines when
+	// one completes. See nextCompactionOrFlushCompletes.
+	//
+	// State transitions:
+	//   FlushBegin / CompactionBegin: started++
+	//   FlushEnd / CompactionEnd:     completed++; close(ch); ch = nil
+	//
+	// The channel ch is created on-demand by nextCompactionOrFlushCompletes
+	// and closed by the End handlers to wake any goroutine waiting for
+	// activity to finish. The started/completed counters allow
+	// nextCompactionOrFlushCompletes to detect events that occurred between
+	// the caller's last observation and the current call, without relying
+	// solely on the channel.
+	compactionOrFlushMu struct {
 		sync.Mutex
 		ch        chan struct{}
 		started   int64
@@ -359,7 +370,7 @@ func (r *Runner) Run(ctx context.Context) error {
 
 	// Extend the user-provided Options with extensions necessary for replay
 	// mechanics.
-	r.compactionMu.ch = make(chan struct{})
+	r.compactionOrFlushMu.ch = make(chan struct{})
 	r.Opts.AddEventListener(r.eventListener())
 	r.writeStallMetrics.countByReason = make(map[string]int)
 	r.writeStallMetrics.durationByReason = make(map[string]time.Duration)
@@ -398,14 +409,15 @@ func (r *Runner) refreshMetrics(ctx context.Context) error {
 	var workloadExhausted bool
 	var workloadExhaustedAt time.Time
 	stepsApplied := r.stepsApplied
-	compactionCount, alreadyCompleted, compactionCh := r.nextCompactionCompletes(0)
+	compactionCount, alreadyCompleted, compactionCh := r.nextCompactionOrFlushCompletes(0)
 	for {
 		if !alreadyCompleted {
 			select {
 			case <-ctx.Done():
 				return ctx.Err()
 			case <-compactionCh:
-				// Fall through to refreshing dbMetrics.
+				// A compaction or flush completed. Fall through to
+				// refreshing dbMetrics.
 			case _, ok := <-stepsApplied:
 				if !ok {
 					workloadExhausted = true
@@ -418,6 +430,15 @@ func (r *Runner) refreshMetrics(ctx context.Context) error {
 				}
 				// Fall through to refreshing dbMetrics.
 			}
+		} else {
+			// One or more completions were already detected via the
+			// counter. Catch up to the latest count without collecting
+			// metrics (which would acquire d.mu and contend with event
+			// handlers). Once caught up, fall through to collect fresh
+			// metrics and check quiescence.
+			for alreadyCompleted {
+				compactionCount, alreadyCompleted, _ = r.nextCompactionOrFlushCompletes(compactionCount)
+			}
 		}
 
 		m := r.d.Metrics()
@@ -434,7 +455,7 @@ func (r *Runner) refreshMetrics(ctx context.Context) error {
 		r.metrics.totalSize.record(int64(m.DiskSpaceUsage()))
 		r.metrics.writeThroughput.record(int64(r.metrics.writeBytes.Load()))
 
-		compactionCount, alreadyCompleted, compactionCh = r.nextCompactionCompletes(compactionCount)
+		compactionCount, alreadyCompleted, compactionCh = r.nextCompactionOrFlushCompletes(compactionCount)
 		// Consider whether replaying is complete. There are two necessary
 		// conditions:
 		//
@@ -453,7 +474,7 @@ func (r *Runner) refreshMetrics(ctx context.Context) error {
 		// progress). If it appears that compactions have quiesced, pause for a
 		// fixed duration to see if a new one is scheduled. If not, consider
 		// compactions quiesced.
-		if workloadExhausted && !alreadyCompleted && r.compactionsAppearQuiesced(m) {
+		if workloadExhausted && !alreadyCompleted && r.compactionsAppearQuiesced() {
 			select {
 			case <-compactionCh:
 				// A new compaction just finished; compactions have not
@@ -466,7 +487,7 @@ func (r *Runner) refreshMetrics(ctx context.Context) error {
 				// from the moment quiescence was confirmed, rather than
 				// re-fetching (which could race with new compactions).
 				finalM := r.d.Metrics()
-				if r.compactionsAppearQuiesced(finalM) {
+				if r.compactionsAppearQuiesced() {
 					r.metrics.quiesceDuration = time.Since(workloadExhaustedAt)
 					r.finalMetrics = finalM
 					return nil
@@ -476,58 +497,51 @@
 		}
 	}
 
-// compactionsAppearQuiesced returns true if the database may have quiesced, and
-// there likely won't be additional compactions scheduled. Detecting quiescence
-// is a bit fraught: The various signals that Pebble makes available are
-// adjusted at different points in the compaction lifecycle, and database
-// mutexes are dropped and acquired between them. This makes it difficult to
-// reliably identify when compactions quiesce.
-//
-// For example, our call to DB.Metrics() may acquire the DB.mu mutex when a
-// compaction has just successfully completed, but before it's managed to
-// schedule the next compaction (DB.mu is dropped while it attempts to acquire
-// the manifest lock).
-func (r *Runner) compactionsAppearQuiesced(m *pebble.Metrics) bool {
-	r.compactionMu.Lock()
-	defer r.compactionMu.Unlock()
-	if m.Flush.NumInProgress > 0 {
-		return false
-	} else if m.Compact.NumInProgress > 0 && r.compactionMu.started != r.compactionMu.completed {
-		return false
-	}
-	return true
+// compactionsAppearQuiesced returns true if all flushes and compactions
+// that have started (FlushBegin/CompactionBegin) have also completed
+// (FlushEnd/CompactionEnd). This relies solely on the started/completed
+// counters rather than DB.Metrics().NumInProgress, because there is a
+// scheduling window between when a compaction is added to the in-progress
+// set (under d.mu) and when CompactionBegin fires (in a separate
+// goroutine). During this window NumInProgress > 0 but
+// started == completed, and using NumInProgress would prevent quiescence
+// detection during cascading compactions.
+func (r *Runner) compactionsAppearQuiesced() bool {
+	r.compactionOrFlushMu.Lock()
+	defer r.compactionOrFlushMu.Unlock()
+	return r.compactionOrFlushMu.started == r.compactionOrFlushMu.completed
 }
 
-// nextCompactionCompletes may be used to be notified when new compactions
-// complete. The caller is responsible for holding on to a monotonically
-// increasing count representing the number of compactions that have been
-// observed, beginning at zero.
+// nextCompactionOrFlushCompletes may be used to be notified when a new
+// compaction or flush completes. The caller is responsible for holding on to a
+// monotonically increasing count representing the number of completions that
+// have been observed, beginning at zero.
 //
-// The caller passes their current count as an argument. If a new compaction has
-// already completed since their provided count, nextCompactionCompletes returns
-// the new count and a true boolean return value. If a new compaction has not
-// yet completed, it returns a channel that will be closed when the next
-// compaction completes. This scheme allows the caller to select{...},
-// performing some action on every compaction completion.
-func (r *Runner) nextCompactionCompletes(
+// The caller passes their current count as an argument. If a new compaction or
+// flush has already completed since their provided count,
+// nextCompactionOrFlushCompletes returns the new count and a true boolean
+// return value. If neither has completed, it returns a channel that will be
+// closed when the next completion occurs. This scheme allows the caller to
+// select{...}, performing some action on every compaction or flush completion.
+func (r *Runner) nextCompactionOrFlushCompletes(
 	lastObserved int64,
 ) (count int64, alreadyOccurred bool, ch chan struct{}) {
-	r.compactionMu.Lock()
-	defer r.compactionMu.Unlock()
+	r.compactionOrFlushMu.Lock()
+	defer r.compactionOrFlushMu.Unlock()
 
-	if lastObserved < r.compactionMu.completed {
-		// There has already been another compaction since the last one observed
-		// by this caller. Return immediately.
-		return r.compactionMu.completed, true, nil
+	if lastObserved < r.compactionOrFlushMu.completed {
+		// There has already been another compaction or flush since the last
+		// one observed by this caller. Return immediately.
		return r.compactionOrFlushMu.completed, true, nil
 	}
 
-	// The last observed compaction is still the most recent compaction.
-	// Return a channel that the caller can wait on to be notified when the
-	// next compaction occurs.
-	if r.compactionMu.ch == nil {
-		r.compactionMu.ch = make(chan struct{})
+	// No new completions since the caller's last observation. Return a
+	// channel that the caller can wait on to be notified when the next
+	// compaction or flush completes.
+	if r.compactionOrFlushMu.ch == nil {
+		r.compactionOrFlushMu.ch = make(chan struct{})
 	}
-	return lastObserved, false, r.compactionMu.ch
+	return lastObserved, false, r.compactionOrFlushMu.ch
 }
 
 // Wait waits for the workload replay to complete. Wait returns once the entire
```
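The caller-side discipline that this comment describes can be sketched as follows (hypothetical code reusing the `notifier` from the earlier sketch, with `"context"` imported; the real `refreshMetrics` additionally collects metrics and consumes workload steps):

```go
// waitForQuiesce mirrors the shape of refreshMetrics after the fix: catch up
// through the counter first (change 3), check quiescence using only
// started == completed (change 2), and only then block on the channel.
func waitForQuiesce(ctx context.Context, n *notifier) error {
	var count int64
	for {
		c, already, ch := n.next(count)
		if already {
			// A completion was detected via the counter alone; catch up
			// without doing any other expensive work under contention.
			count = c
			continue
		}
		if n.quiesced() {
			return nil // everything that started has completed
		}
		select {
		case <-ch:
			// ch was closed by an end() handler; loop to observe the count.
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}
```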
```diff
@@ -657,35 +671,32 @@ func (r *Runner) eventListener() pebble.EventListener {
 			defer r.writeStallMetrics.Unlock()
 			r.writeStallMetrics.durationByReason[writeStallReason] += time.Since(writeStallBegin)
 		},
+		FlushBegin: func(_ pebble.FlushInfo) {
+			r.compactionOrFlushMu.Lock()
+			defer r.compactionOrFlushMu.Unlock()
+			r.compactionOrFlushMu.started++
+		},
 		FlushEnd: func(_ pebble.FlushInfo) {
-			// Close compactionMu.ch to wake refreshMetrics so it can re-check
-			// quiescence. This is necessary because compactionsAppearQuiesced
-			// checks Flush.NumInProgress; if the last in-flight operation is a flush
-			// and no compaction follows, refreshMetrics would block on
-			// compactionMu.ch forever.
-			r.compactionMu.Lock()
-			defer r.compactionMu.Unlock()
-			if r.compactionMu.ch != nil {
-				close(r.compactionMu.ch)
-				r.compactionMu.ch = nil
+			r.compactionOrFlushMu.Lock()
+			defer r.compactionOrFlushMu.Unlock()
+			r.compactionOrFlushMu.completed++
+			if r.compactionOrFlushMu.ch != nil {
+				close(r.compactionOrFlushMu.ch)
+				r.compactionOrFlushMu.ch = nil
 			}
 		},
 		CompactionBegin: func(_ pebble.CompactionInfo) {
-			r.compactionMu.Lock()
-			defer r.compactionMu.Unlock()
-			r.compactionMu.started++
+			r.compactionOrFlushMu.Lock()
+			defer r.compactionOrFlushMu.Unlock()
+			r.compactionOrFlushMu.started++
 		},
 		CompactionEnd: func(_ pebble.CompactionInfo) {
-			// Keep track of the number of compactions that complete and notify
-			// anyone waiting for a compaction to complete. See the function
-			// nextCompactionCompletes for the corresponding receiver side.
-			r.compactionMu.Lock()
-			defer r.compactionMu.Unlock()
-			r.compactionMu.completed++
-			if r.compactionMu.ch != nil {
-				// Signal that a compaction has completed.
-				close(r.compactionMu.ch)
-				r.compactionMu.ch = nil
+			r.compactionOrFlushMu.Lock()
+			defer r.compactionOrFlushMu.Unlock()
+			r.compactionOrFlushMu.completed++
+			if r.compactionOrFlushMu.ch != nil {
+				close(r.compactionOrFlushMu.ch)
+				r.compactionOrFlushMu.ch = nil
 			}
 		},
 	}
```
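For reference, handlers like these hang off Pebble's public event-listener API; `pebble.EventListener`, `(*pebble.Options).AddEventListener`, `pebble.FlushInfo`, and `pebble.CompactionInfo` are the same types used in the diff above. A rough sketch of wiring the hypothetical `notifier` into a fresh DB (assuming imports of `log`, `github.com/cockroachdb/pebble`, and `github.com/cockroachdb/pebble/vfs`):

```go
// Count begin/end events for flushes and compactions on an in-memory DB.
opts := &pebble.Options{FS: vfs.NewMem()}
var n notifier
opts.AddEventListener(pebble.EventListener{
	FlushBegin:      func(pebble.FlushInfo) { n.begin() },
	FlushEnd:        func(pebble.FlushInfo) { n.end() },
	CompactionBegin: func(pebble.CompactionInfo) { n.begin() },
	CompactionEnd:   func(pebble.CompactionInfo) { n.end() },
})
db, err := pebble.Open("demo", opts)
if err != nil {
	log.Fatal(err)
}
defer db.Close()
```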

replay/replay_test.go (8 additions, 8 deletions)
```diff
@@ -628,11 +628,11 @@ func TestCompactionsQuiesce(t *testing.T) {
 }
 
 // TestFlushEndNotifiesRefreshMetrics is a regression test for a hang where
-// refreshMetrics blocks forever on compactionMu.ch when the last in-flight
-// operation is a flush and no compaction follows. With
-// DisableAutomaticCompactions, no CompactionEnd event ever fires, so the only
-// way for refreshMetrics to make progress is via the FlushEnd handler closing
-// compactionMu.ch. Without the fix, this test hangs.
+// refreshMetrics blocks forever when the last in-flight operation is a flush
+// and no compaction follows. With DisableAutomaticCompactions, no CompactionEnd
+// event ever fires, so the only way for refreshMetrics to make progress is via
+// the FlushEnd handler incrementing compactionOrFlushMu.completed and closing
+// the notification channel. Without the fix, this test hangs.
 func TestFlushEndNotifiesRefreshMetrics(t *testing.T) {
 	// Build a workload that consists of a single flush and no compactions.
 	workloadFS := buildFlushOnlyWorkload(t)
@@ -667,9 +667,9 @@ func TestFlushEndNotifiesRefreshMetrics(t *testing.T) {
 	} else if invariants.Enabled {
 		wait = 30 * time.Second
 	}
-	// Without the FlushEnd handler closing compactionMu.ch, Wait would hang
-	// forever because DisableAutomaticCompactions prevents any CompactionEnd
-	// event from ever firing.
+	// Without the FlushEnd handler incrementing compactionOrFlushMu.completed,
+	// Wait would hang forever because DisableAutomaticCompactions prevents any
+	// CompactionEnd event from ever firing.
 	require.Eventually(t, func() bool { return done.Load() },
 		wait, time.Millisecond, "(*replay.Runner).Wait didn't terminate")
 	require.NoError(t, err)
```
