Skip to content

Commit 1178d74

Browse files
committed
fixes from review
1 parent e3b9fb2 commit 1178d74

File tree

16 files changed

+194
-77
lines changed

16 files changed

+194
-77
lines changed

cmd/thv-operator/api/v1alpha1/virtualmcpserver_types.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,13 @@ type OperationalConfig struct {
388388
// FailureHandling configures failure handling behavior
389389
// +optional
390390
FailureHandling *FailureHandlingConfig `json:"failureHandling,omitempty"`
391+
392+
// CapabilityCacheTTL is the time-to-live for cached capability entries.
393+
// Defaults to 5 minutes if not specified.
394+
// Shorter values (e.g., "10s") can be used for testing to see capability updates sooner.
395+
// +kubebuilder:default="5m"
396+
// +optional
397+
CapabilityCacheTTL string `json:"capabilityCacheTTL,omitempty"`
391398
}
392399

393400
// TimeoutConfig configures timeout settings

cmd/thv-operator/controllers/virtualmcpserver_controller.go

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,7 +1128,13 @@ func countBackendHealth(ctx context.Context, backends []mcpv1alpha1.DiscoveredBa
11281128
return ready, unhealthy
11291129
}
11301130

1131-
// determineStatusFromBackends evaluates backend health to determine status
1131+
// determineStatusFromBackends evaluates backend health to determine status.
1132+
// Backend health is based on:
1133+
// - Workload phase (MCPServer/MCPRemoteProxy phase): Always evaluated
1134+
// - Runtime health checks: Only when health monitoring is configured
1135+
//
1136+
// When health monitoring is disabled, backends with healthy workload phase are marked as Ready.
1137+
// When health monitoring is enabled, backends must pass both workload phase checks and runtime health checks.
11321138
func (*VirtualMCPServerReconciler) determineStatusFromBackends(
11331139
ctx context.Context,
11341140
vmcp *mcpv1alpha1.VirtualMCPServer,
@@ -1640,6 +1646,11 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
16401646
discoveredBackends := make([]mcpv1alpha1.DiscoveredBackend, 0, len(typedWorkloads))
16411647
now := metav1.Now()
16421648

1649+
// Check if health monitoring is enabled for this VirtualMCPServer
1650+
healthMonitoringEnabled := vmcp.Spec.Operational != nil &&
1651+
vmcp.Spec.Operational.FailureHandling != nil &&
1652+
vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != ""
1653+
16431654
// Convert vmcp.Backend to DiscoveredBackend for all workloads in the group
16441655
for _, workloadInfo := range typedWorkloads {
16451656
backend, found := discoveredBackendMap[workloadInfo.Name]
@@ -1674,6 +1685,13 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
16741685
ctxLogger.V(1).Info("Backend MCPServer not ready, marking as unavailable",
16751686
"name", workloadInfo.Name,
16761687
"phase", mcpServer.Status.Phase)
1688+
} else if backendStatus == mcpv1alpha1.BackendStatusUnknown && !healthMonitoringEnabled {
1689+
// When health monitoring is disabled and workload phase is healthy,
1690+
// treat unknown health status as ready (no runtime health checks configured)
1691+
backendStatus = mcpv1alpha1.BackendStatusReady
1692+
ctxLogger.V(1).Info("Health monitoring disabled with healthy phase, marking backend as ready",
1693+
"name", workloadInfo.Name,
1694+
"phase", mcpServer.Status.Phase)
16771695
}
16781696
}
16791697
case workloads.WorkloadTypeMCPRemoteProxy:
@@ -1688,6 +1706,13 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
16881706
ctxLogger.V(1).Info("Backend MCPRemoteProxy not ready, marking as unavailable",
16891707
"name", workloadInfo.Name,
16901708
"phase", mcpRemoteProxy.Status.Phase)
1709+
} else if backendStatus == mcpv1alpha1.BackendStatusUnknown && !healthMonitoringEnabled {
1710+
// When health monitoring is disabled and workload phase is healthy,
1711+
// treat unknown health status as ready (no runtime health checks configured)
1712+
backendStatus = mcpv1alpha1.BackendStatusReady
1713+
ctxLogger.V(1).Info("Health monitoring disabled with healthy phase, marking backend as ready",
1714+
"name", workloadInfo.Name,
1715+
"phase", mcpRemoteProxy.Status.Phase)
16911716
}
16921717
}
16931718
}
@@ -1715,14 +1740,11 @@ func (r *VirtualMCPServerReconciler) discoverBackends(
17151740
// Performance: Health status responses are cached with healthStatusCacheTTL to reduce HTTP
17161741
// overhead from frequent reconciliations while maintaining relatively fresh health data.
17171742
// The vmcp health endpoint itself returns cached results from periodic health checks.
1718-
if vmcp.Status.URL != "" {
1743+
if vmcp.Status.URL != "" && healthMonitoringEnabled {
17191744
// Parse health check interval from spec to derive query timeout
17201745
var healthCheckInterval time.Duration
1721-
if vmcp.Spec.Operational != nil && vmcp.Spec.Operational.FailureHandling != nil &&
1722-
vmcp.Spec.Operational.FailureHandling.HealthCheckInterval != "" {
1723-
if interval, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval); err == nil {
1724-
healthCheckInterval = interval
1725-
}
1746+
if interval, err := time.ParseDuration(vmcp.Spec.Operational.FailureHandling.HealthCheckInterval); err == nil {
1747+
healthCheckInterval = interval
17261748
}
17271749

17281750
healthStatus := r.queryVMCPHealthStatus(ctx, vmcp.Status.URL, healthCheckInterval)

cmd/thv-operator/pkg/vmcpconfig/converter.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -921,5 +921,12 @@ func (*Converter) convertOperational(
921921
}
922922
}
923923

924+
// Parse capability cache TTL
925+
if vmcp.Spec.Operational.CapabilityCacheTTL != "" {
926+
if duration, err := time.ParseDuration(vmcp.Spec.Operational.CapabilityCacheTTL); err == nil {
927+
operational.CapabilityCacheTTL = vmcpconfig.Duration(duration)
928+
}
929+
}
930+
924931
return operational
925932
}

cmd/vmcp/app/commands.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,16 @@ func runServe(cmd *cobra.Command, _ []string) error {
295295
agg := aggregator.NewDefaultAggregator(backendClient, conflictResolver, cfg.Aggregation.Tools)
296296

297297
// Create discovery manager for lazy per-user capability discovery
298-
discoveryMgr, err := discovery.NewManager(agg)
298+
// Use configured cache TTL if available, otherwise use default
299+
var discoveryMgr discovery.Manager
300+
if cfg.Operational != nil && cfg.Operational.CapabilityCacheTTL > 0 {
301+
cacheTTL := time.Duration(cfg.Operational.CapabilityCacheTTL)
302+
logger.Debugf("Creating discovery manager with custom cache TTL: %v", cacheTTL)
303+
discoveryMgr, err = discovery.NewManagerWithTTL(agg, cacheTTL)
304+
} else {
305+
logger.Debugf("Creating discovery manager with default cache TTL: %v", discovery.DefaultCacheTTL)
306+
discoveryMgr, err = discovery.NewManager(agg)
307+
}
299308
if err != nil {
300309
return fmt.Errorf("failed to create discovery manager: %w", err)
301310
}

deploy/charts/operator-crds/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@ apiVersion: v2
22
name: toolhive-operator-crds
33
description: A Helm chart for installing the ToolHive Operator CRDs into Kubernetes.
44
type: application
5-
version: 0.0.88
5+
version: 0.0.89
66
appVersion: "0.0.1"

deploy/charts/operator-crds/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# ToolHive Operator CRDs Helm Chart
22

3-
![Version: 0.0.88](https://img.shields.io/badge/Version-0.0.88-informational?style=flat-square)
3+
![Version: 0.0.89](https://img.shields.io/badge/Version-0.0.89-informational?style=flat-square)
44
![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square)
55

66
A Helm chart for installing the ToolHive Operator CRDs into Kubernetes.

deploy/charts/operator-crds/files/crds/toolhive.stacklok.dev_virtualmcpservers.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,6 +644,13 @@ spec:
644644
description: Operational defines operational settings like timeouts
645645
and health checks
646646
properties:
647+
capabilityCacheTTL:
648+
default: 5m
649+
description: |-
650+
CapabilityCacheTTL is the time-to-live for cached capability entries.
651+
Defaults to 5 minutes if not specified.
652+
Shorter values (e.g., "10s") can be used for testing to see capability updates sooner.
653+
type: string
647654
failureHandling:
648655
description: FailureHandling configures failure handling behavior
649656
properties:

deploy/charts/operator-crds/templates/toolhive.stacklok.dev_virtualmcpservers.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,13 @@ spec:
647647
description: Operational defines operational settings like timeouts
648648
and health checks
649649
properties:
650+
capabilityCacheTTL:
651+
default: 5m
652+
description: |-
653+
CapabilityCacheTTL is the time-to-live for cached capability entries.
654+
Defaults to 5 minutes if not specified.
655+
Shorter values (e.g., "10s") can be used for testing to see capability updates sooner.
656+
type: string
650657
failureHandling:
651658
description: FailureHandling configures failure handling behavior
652659
properties:

docs/operator/crd-api.md

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/vmcp/config/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,11 @@ type OperationalConfig struct {
242242

243243
// FailureHandling configures failure handling.
244244
FailureHandling *FailureHandlingConfig `json:"failure_handling,omitempty" yaml:"failure_handling,omitempty"`
245+
246+
// CapabilityCacheTTL is the time-to-live for cached capability entries.
247+
// Defaults to 5 minutes if not specified or zero.
248+
// Shorter values (e.g., 10s) can be used for testing to see capability updates sooner.
249+
CapabilityCacheTTL Duration `json:"capability_cache_ttl,omitempty" yaml:"capability_cache_ttl,omitempty"`
245250
}
246251

247252
// TimeoutConfig configures timeouts.

0 commit comments

Comments
 (0)