Skip to content

Commit 0b09a0a

Browse files
committed
Add test failover support for DR placement control
Introduce non-destructive TestFailover action to verify secondary cluster readiness without committing to failover.

- Add VRGActionTestFailover and update CRD enums/YAML
- Implement placement logic, cleanup, and action execution refactor
- Exclude test primaries from multi-primary checks
- Restore original placement decisions after test failover
- Treat TestFailover like Failover for resync and VolSync restore
- Skip LastAppDeploymentCluster updates during test failover
- Improve comments, readability, and lint compliance

Signed-off-by: Benamar Mekhissi <[email protected]>
1 parent eb31090 commit 0b09a0a

9 files changed

Lines changed: 134 additions & 14 deletions

api/v1alpha1/drplacementcontrol_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ const (
132132
ProgressionDeleting = ProgressionStatus("Deleting")
133133
ProgressionDeleted = ProgressionStatus("Deleted")
134134
ProgressionActionPaused = ProgressionStatus("Paused")
135-
ProgressionTestingFailover = ProgressionStatus("TestingFailover")
135+
ProgressionTestingFailover = ProgressionStatus("TestingFailover")
136136
)
137137

138138
// DRPlacementControlSpec defines the desired state of DRPlacementControl

api/v1alpha1/volumereplicationgroup_types.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ type MoverConfig struct {
160160
}
161161

162162
// VRGAction which will be either a Failover, Relocate, or TestFailover
163-
// +kubebuilder:validation:Enum=Failover;Relocate
163+
// +kubebuilder:validation:Enum=Failover;Relocate;TestFailover
164164
type VRGAction string
165165

166166
// These are the valid values for VRGAction
@@ -172,6 +172,11 @@ const (
172172
// Relocate, VRG was relocated to/from this cluster,
173173
// the to/from is determined by VRG spec.ReplicationState values of Primary/Secondary respectively
174174
VRGActionRelocate = VRGAction("Relocate")
175+
176+
// TestFailover, VRG is in a test failover state where the secondary is temporarily promoted
177+
// to primary to verify readiness and data consistency without committing to the actual failover.
178+
// Test failover is non-destructive and can be aborted to return to the original state.
179+
VRGActionTestFailover = VRGAction("TestFailover")
175180
)
176181

177182
type KubeObjectProtectionSpec struct {

config/crd/bases/ramendr.openshift.io_drplacementcontrols.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ spec:
7979
enum:
8080
- Failover
8181
- Relocate
82+
- TestFailover
8283
type: string
8384
drPolicyRef:
8485
description: DRPolicyRef is the reference to the DRPolicy participating

config/crd/bases/ramendr.openshift.io_protectedvolumereplicationgrouplists.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ spec:
115115
enum:
116116
- Failover
117117
- Relocate
118+
- TestFailover
118119
type: string
119120
async:
120121
description: VRGAsyncSpec has the parameters associated

config/crd/bases/ramendr.openshift.io_volumereplicationgroups.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ spec:
6666
enum:
6767
- Failover
6868
- Relocate
69+
- TestFailover
6970
type: string
7071
async:
7172
description: VRGAsyncSpec has the parameters associated with RegionalDR

internal/controller/drplacementcontrol.go

Lines changed: 40 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,35 @@ func (d *DRPCInstance) startProcessing() bool {
108108
func (d *DRPCInstance) processPlacement() (bool, error) {
109109
d.log.Info("Process DRPC Placement", "DRAction", d.instance.Spec.Action)
110110

111+
// Handle test failover cleanup when switching away from test failover to another action
111112
if d.instance.Spec.Action != rmn.ActionTestFailover {
112113
rmnutil.AddAnnotation(d.instance, "ramendr.openshift.io/last-action", string(d.instance.Spec.Action))
114+
115+
// If we're exiting a test failover (were in TestingFailover progression), clean up placement
116+
// decisions that were retained during the test, then requeue for the new action.
117+
if d.instance.Status.Progression == rmn.ProgressionTestingFailover {
118+
for clusterName, vrg := range d.vrgs {
119+
// Find the primary VRG that was used for testing and clean up after it
120+
if isVRGPrimary(vrg) && vrg.Spec.Action == rmn.VRGActionTestFailover {
121+
// Remove cluster decision that was retained during test failover
122+
if err := d.reconciler.removeClusterDecisionAfterTestFailover(d.ctx, d.userPlacement, clusterName); err != nil {
123+
return false, err
124+
}
125+
126+
// Reset progression state back to completed for the new action
127+
d.setProgression(rmn.ProgressionCompleted)
128+
129+
// Requeue to process the new action with a clean slate (placement decisions reset)
130+
return false, nil
131+
}
132+
}
133+
}
113134
}
114135

136+
return d.executeAction()
137+
}
138+
139+
func (d *DRPCInstance) executeAction() (bool, error) {
115140
switch d.instance.Spec.Action {
116141
case rmn.ActionFailover:
117142
return d.RunFailover()
@@ -150,13 +175,7 @@ func (d *DRPCInstance) RunInitialDeployment() (bool, error) {
150175
// Check if we already deployed in the homeCluster or elsewhere
151176
deployed, clusterName := d.isDeployed(homeCluster)
152177
if deployed && clusterName != homeCluster {
153-
err := d.ensureVRGManifestWork(clusterName)
154-
if err != nil {
155-
return !done, err
156-
}
157-
158-
// IF deployed on cluster that is not the preferred HomeCluster, then we are done
159-
return done, nil
178+
return d.ensureInitialDeployActionCompleted(homeCluster)
160179
}
161180

162181
// Ensure that initial deployment is complete
@@ -398,8 +417,7 @@ func (d *DRPCInstance) RunFailover() (bool, error) {
398417
if d.instance.Spec.Action == rmn.ActionTestFailover {
399418
d.setProgression(rmn.ProgressionTestingFailover)
400419

401-
err := d.ensurePlacement(failoverCluster)
402-
if err != nil {
420+
if err := d.ensurePlacement(failoverCluster); err != nil {
403421
return !done, err
404422
}
405423

@@ -967,6 +985,14 @@ func (d *DRPCInstance) ensureFailoverActionCompleted(srcCluster string) (bool, e
967985
return d.ensureActionCompleted(srcCluster)
968986
}
969987

988+
func (d *DRPCInstance) ensureInitialDeployActionCompleted(srcCluster string) (bool, error) {
989+
// This function was added to handle cleanup in case test failover was initiated while VRG was still
990+
// in the initial deployment state. It ensures proper finalization and cleanup if necessary.
991+
d.setProgression(rmn.ProgressionCleaningUp)
992+
993+
return d.ensureActionCompleted(srcCluster)
994+
}
995+
970996
func isDiscoveredApp(drpc *rmn.DRPlacementControl) bool {
971997
return drpc.Spec.ProtectedNamespaces != nil && len(*drpc.Spec.ProtectedNamespaces) > 0
972998
}
@@ -1150,10 +1176,12 @@ func (d *DRPCInstance) runFinalSync(homeCluster string) (bool, error) {
11501176
}
11511177

11521178
func (d *DRPCInstance) areMultipleVRGsPrimary() bool {
1179+
// Count actual primaries, excluding test failover primaries which are temporary and can coexist
1180+
// with the original primary during testing without being considered a conflicting state.
11531181
numOfPrimaries := 0
11541182

11551183
for _, vrg := range d.vrgs {
1156-
if isVRGPrimary(vrg) {
1184+
if isVRGPrimary(vrg) && vrg.Spec.Action != rmn.VRGActionTestFailover {
11571185
numOfPrimaries++
11581186
}
11591187
}
@@ -1927,6 +1955,8 @@ func vrgAction(drpcAction rmn.DRAction) rmn.VRGAction {
19271955
return rmn.VRGActionFailover
19281956
case rmn.ActionRelocate:
19291957
return rmn.VRGActionRelocate
1958+
case rmn.ActionTestFailover:
1959+
return rmn.VRGActionTestFailover
19301960
default:
19311961
return ""
19321962
}

internal/controller/drplacementcontrol_controller.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,6 +2004,86 @@ func (r *DRPlacementControlReconciler) removePlacementClusterDecisionForFailover
20042004
return nil
20052005
}
20062006

2007+
func (r *DRPlacementControlReconciler) removeClusterDecisionAfterTestFailover(
2008+
ctx context.Context,
2009+
placement interface{},
2010+
clusterName string,
2011+
) error {
2012+
switch obj := placement.(type) {
2013+
case *plrv1.PlacementRule:
2014+
return r.removePlacementRuleClusterDecisionAfterTestFailover(ctx, obj, clusterName)
2015+
case *clrapiv1beta1.Placement:
2016+
return r.removePlacementClusterDecisionAfterTestFailover(ctx, obj, clusterName)
2017+
default:
2018+
return fmt.Errorf("failed to find Placement or PlacementRule")
2019+
}
2020+
}
2021+
2022+
func (r *DRPlacementControlReconciler) removePlacementRuleClusterDecisionAfterTestFailover(
2023+
_ context.Context,
2024+
_ *plrv1.PlacementRule,
2025+
_ string,
2026+
) error {
2027+
// PlacementRule support for test failover cleanup is not yet implemented.
2028+
// PlacementRule is a legacy API; modern clusters use Placement (PlacementDecision).
2029+
return nil
2030+
}
2031+
2032+
// removePlacementClusterDecisionAfterTestFailover removes a cluster decision that matches the passed in clusterName
2033+
// after test failover cleanup to restore the placement to its original state.
2034+
func (r *DRPlacementControlReconciler) removePlacementClusterDecisionAfterTestFailover(
2035+
ctx context.Context,
2036+
placement *clrapiv1beta1.Placement,
2037+
clusterName string,
2038+
) error {
2039+
plDecision, err := r.getPlacementDecisionFromPlacement(placement)
2040+
if err != nil {
2041+
return err
2042+
}
2043+
2044+
if plDecision == nil {
2045+
return nil
2046+
}
2047+
2048+
dropped := false
2049+
decisions := []clrapiv1beta1.ClusterDecision{}
2050+
2051+
for idx := range plDecision.Status.Decisions {
2052+
if plDecision.Status.Decisions[idx].ClusterName == clusterName {
2053+
dropped = true
2054+
2055+
continue
2056+
}
2057+
2058+
plDecision.Status.Decisions[idx].Reason = plDecision.Status.Decisions[idx].ClusterName
2059+
decisions = append(decisions, plDecision.Status.Decisions[idx])
2060+
}
2061+
2062+
if !dropped {
2063+
return nil
2064+
}
2065+
2066+
plDecision.Status = clrapiv1beta1.PlacementDecisionStatus{
2067+
Decisions: decisions,
2068+
}
2069+
2070+
if err := r.Status().Update(ctx, plDecision); err != nil {
2071+
return fmt.Errorf(
2072+
"failed to update placementDecision status to drop cluster decision (%s) for 'test' failover (%w)",
2073+
clusterName,
2074+
err,
2075+
)
2076+
}
2077+
2078+
r.Log.Info(
2079+
"Updated PlacementDecision to drop cluster decision for 'test' failover",
2080+
"ClusterName", clusterName,
2081+
"PlacementDecision", plDecision.Status.Decisions,
2082+
)
2083+
2084+
return nil
2085+
}
2086+
20072087
// retainClusterDecisionAsFailover retains a cluster decision in the placement with the reason as
20082088
// PlacementDecisionReasonFailoverRetained
20092089
func (r *DRPlacementControlReconciler) retainClusterDecisionAsFailover(

internal/controller/vrg_volrep.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1263,7 +1263,8 @@ func (v *VRGInstance) autoResync(state volrep.ReplicationState) bool {
12631263
return false
12641264
}
12651265

1266-
if v.instance.Spec.Action != ramendrv1alpha1.VRGActionFailover {
1266+
if v.instance.Spec.Action != ramendrv1alpha1.VRGActionFailover &&
1267+
v.instance.Spec.Action != ramendrv1alpha1.VRGActionTestFailover {
12671268
return false
12681269
}
12691270

internal/controller/vrg_volsync.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ func (v *VRGInstance) restorePVsAndPVCsForVolSync() (int, error) {
3333
numPVsRestored := 0
3434

3535
for _, rdSpec := range v.instance.Spec.VolSync.RDSpec {
36-
failoverAction := v.instance.Spec.Action == ramendrv1alpha1.VRGActionFailover
36+
failoverAction := (v.instance.Spec.Action == ramendrv1alpha1.VRGActionFailover) ||
37+
(v.instance.Spec.Action == ramendrv1alpha1.VRGActionTestFailover)
3738

3839
var err error
3940
// Source conditions are not needed and should not be added to vrg.status.ProtectedPVCs,

0 commit comments

Comments
 (0)