Skip to content

Commit d580b44

Browse files
author
Radovan Fuchs
committed
fix
1 parent adc7dd9 commit d580b44

File tree

3 files changed

+125
-15
lines changed

3 files changed

+125
-15
lines changed

tests/e2e-prow/rhoai/pipeline-konflux.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,9 @@ fi
236236
# So behave/e2e-ops can kill this listener before rebinding 8080 (restart-lightspeed hooks).
237237
# Debug hook/port churn: export E2E_OPS_VERBOSE=1 before running pipeline.sh
238238
export E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightspeed-port-forward.pid}"
239+
export E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}"
239240
rm -f "$E2E_LSC_PORT_FORWARD_PID_FILE"
241+
rm -f "$E2E_LLAMA_PORT_FORWARD_PID_FILE"
240242

241243
oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n $NAMESPACE
242244

@@ -275,6 +277,7 @@ PF_JWKS_PID=$!
275277
log "Starting port-forward for llama-stack (MCP / llama_stack_client hooks)..."
276278
oc port-forward svc/llama-stack-service-svc 8321:8321 -n $NAMESPACE &
277279
PF_LLAMA_PID=$!
280+
echo "$PF_LLAMA_PID" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE"
278281

279282
# Wait for port-forward to be usable (app may not be listening immediately; port-forward can drop)
280283
log "Waiting for port-forward to lightspeed-stack to be ready..."
@@ -324,6 +327,7 @@ for i in $(seq 1 36); do
324327
log "Llama port-forward died, restarting (attempt $i)..."
325328
oc port-forward svc/llama-stack-service-svc 8321:8321 -n $NAMESPACE &
326329
PF_LLAMA_PID=$!
330+
echo "$PF_LLAMA_PID" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE"
327331
fi
328332
sleep 5
329333
done
@@ -364,6 +368,13 @@ if [[ -n "${E2E_LSC_PORT_FORWARD_PID_FILE:-}" && -f "$E2E_LSC_PORT_FORWARD_PID_F
364368
fi
365369
rm -f "$E2E_LSC_PORT_FORWARD_PID_FILE"
366370
fi
371+
if [[ -n "${E2E_LLAMA_PORT_FORWARD_PID_FILE:-}" && -f "$E2E_LLAMA_PORT_FORWARD_PID_FILE" ]]; then
372+
read -r _ll_pf <"$E2E_LLAMA_PORT_FORWARD_PID_FILE" 2>/dev/null || true
373+
if [[ "${_ll_pf:-}" =~ ^[0-9]+$ ]]; then
374+
kill -9 "$_ll_pf" 2>/dev/null || true
375+
fi
376+
rm -f "$E2E_LLAMA_PORT_FORWARD_PID_FILE"
377+
fi
367378
kill $PF_LCS_PID 2>/dev/null || true
368379
kill $PF_JWKS_PID 2>/dev/null || true
369380
kill $PF_LLAMA_PID 2>/dev/null || true

tests/e2e-prow/rhoai/scripts/e2e-ops.sh

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,17 @@
88
# - In-cluster pod logs (Uvicorn up, Llama OK) do not reflect localhost bind races; "address already in use"
99
# is the CI runner, not the application.
1010
# - E2E_LSC_PORT_FORWARD_PID_FILE coordinates the handoff.
11+
# - pipeline-konflux.sh (and hooks) forward llama-stack-service-svc to localhost:8321 for
12+
# Behave steps that call Llama Stack directly (MCP toolgroups, shields). When the llama
13+
# pod is recreated, that forward must be restarted or you get "PodSandbox ... not found" /
14+
# APIConnectionError on subsequent scenarios.
15+
# - E2E_LLAMA_PORT_FORWARD_PID_FILE coordinates killing/restarting the 8321 forward.
1116
#
1217
# Commands:
1318
# restart-lightspeed - Restart lightspeed-stack pod and port-forward
14-
# restart-llama-stack - Restart/restore llama-stack pod
19+
# restart-llama-stack - Restart/restore llama-stack pod and localhost:8321 forward
1520
# restart-port-forward - Re-establish port-forward for lightspeed
21+
# restart-llama-port-forward - Re-establish port-forward for Llama Stack (8321)
1622
# wait-for-pod <name> [attempts] - Wait for a pod to be ready
1723
# update-configmap <name> <file> - Update ConfigMap from file
1824
# get-configmap-content <name> - Get ConfigMap content (outputs to stdout)
@@ -25,6 +31,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
2531
MANIFEST_DIR="$SCRIPT_DIR/../manifests/lightspeed"
2632
# Written by pipeline.sh when it starts LCS port-forward; e2e-ops kills this PID before rebinding 8080.
2733
E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightspeed-port-forward.pid}"
34+
E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}"
2835

2936
# ============================================================================
3037
# Helper functions
@@ -123,6 +130,24 @@ kill_stale_lightspeed_forward() {
123130
free_local_tcp_port "$port"
124131
}
125132

133+
# Kill anything likely to hold the Llama Stack local forward (localhost:8321).
#
# Globals:
#   E2E_LLAMA_PORT_FORWARD_PID_FILE (read; the file is removed once consumed)
# Arguments:
#   $1 - local TCP port the forward binds (default: 8321)
# Returns:
#   Status of the final free_local_tcp_port call; kill/pkill failures are
#   deliberately ignored (best-effort cleanup).
kill_stale_llama_forward() {
    local port="${1:-8321}"
    # Initialised so the expansion below is safe under `set -u` even when the
    # redirect fails and `read` never assigns it.
    local saved_pf=""

    if [[ -f "$E2E_LLAMA_PORT_FORWARD_PID_FILE" ]]; then
        read -r saved_pf <"$E2E_LLAMA_PORT_FORWARD_PID_FILE" 2>/dev/null || true
        if [[ "${saved_pf:-}" =~ ^[0-9]+$ ]]; then
            kill -9 "$saved_pf" 2>/dev/null || true
        fi
        # Drop the consumed PID file: leaving it behind means a later call
        # could SIGKILL an unrelated process that has since reused this PID.
        rm -f -- "$E2E_LLAMA_PORT_FORWARD_PID_FILE"
    fi

    # Catch forwards not recorded in the PID file. The service pattern also
    # covers the literal "oc port-forward svc/llama-stack-service-svc P:P"
    # command line, so a separate pkill for that form is unnecessary.
    pkill -9 -f "port-forward.*llama-stack-service-svc.*${port}:${port}" 2>/dev/null || true
    pkill -9 -f "port-forward pod/llama-stack-service.*${port}:${port}" 2>/dev/null || true

    # Free the port, pause one second, then free it again.
    free_local_tcp_port "$port"
    sleep 1
    free_local_tcp_port "$port"
}
150+
126151
# After oc port-forward dies in <2s, show recent oc stderr from the log file.
127152
e2e_ops_emit_port_forward_immediate_failure_diag() {
128153
echo "[e2e-ops] /tmp/port-forward.log (tail 25):"
@@ -266,6 +291,11 @@ cmd_restart_llama_stack() {
266291
oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
267292
fi
268293

294+
if ! cmd_restart_llama_port_forward; then
295+
echo "ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:-8321} port-forward failed"
296+
exit 1
297+
fi
298+
269299
echo "===== Llama-stack restore complete ====="
270300
}
271301

@@ -334,6 +364,84 @@ cmd_restart_port_forward() {
334364
return 1
335365
}
336366

367+
# Poll the Llama Stack health endpoint through the local port-forward.
#
# Globals:
#   LOCAL_LLAMA_PORT (read; used as the default port when $2 is omitted)
# Arguments:
#   $1 - maximum number of probe attempts (default: 15)
#   $2 - local port to probe (default: $LOCAL_LLAMA_PORT, else 8321)
# Outputs:
#   On failure, one line on stdout with the last HTTP code seen.
# Returns:
#   0 as soon as GET /v1/health answers 200; 1 once all attempts are used up.
verify_llama_local_forward() {
    local max_attempts="${1:-15}"
    # Probe the same port the forward was bound to instead of a fixed 8321,
    # so a LOCAL_LLAMA_PORT override is checked on the right port.
    local port="${2:-${LOCAL_LLAMA_PORT:-8321}}"
    local http_code=""
    local attempt

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        # Fall back to "000" if curl exits non-zero.
        http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:${port}/v1/health" 2>/dev/null) || http_code="000"
        if [[ "$http_code" == "200" ]]; then
            return 0
        fi
        # No point sleeping after the final attempt.
        if [[ $attempt -lt $max_attempts ]]; then
            sleep 2
        fi
    done

    echo "Llama Stack localhost:${port} connectivity check failed (HTTP: ${http_code:-unknown})"
    return 1
}
384+
385+
# Re-establish the localhost port-forward to Llama Stack and verify it.
#
# Globals:
#   LOCAL_LLAMA_PORT  (read; local port, default 8321)
#   REMOTE_LLAMA_PORT (read; remote port, default 8321)
#   NAMESPACE         (read; passed to `oc port-forward -n`)
#   E2E_LLAMA_PORT_FORWARD_PID_FILE (written with the forward's PID on success)
# Outputs:
#   Progress and diagnostics on stdout; oc output goes to
#   /tmp/port-forward-llama.log and its tail is echoed on failure.
# Returns:
#   0 once a forward answers the health check; 1 after all attempts fail.
cmd_restart_llama_port_forward() {
    local local_port="${LOCAL_LLAMA_PORT:-8321}"
    local remote_port="${REMOTE_LLAMA_PORT:-8321}"
    local max_attempts=6
    local attempt # kept local so the counter does not leak into the caller
    local pf_pid
    local pf_resource
    local llama_pf_log="/tmp/port-forward-llama.log"

    echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..."

    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        kill_stale_llama_forward "$local_port"
        sleep 3

        # The first two attempts go through the Service; later attempts
        # target the pod directly.
        if [[ $attempt -le 2 ]]; then
            pf_resource="svc/llama-stack-service-svc"
        else
            pf_resource="pod/llama-stack-service"
        fi
        echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource"

        # Truncate the log so diagnostics only show this attempt.
        : >"$llama_pf_log"
        nohup oc port-forward "$pf_resource" "$local_port:$remote_port" -n "$NAMESPACE" \
            </dev/null >"$llama_pf_log" 2>&1 &
        pf_pid=$!
        # Detach from this shell's job table so the forward outlives it.
        disown "$pf_pid" 2>/dev/null || true
        sleep 3

        if ! kill -0 "$pf_pid" 2>/dev/null; then
            echo "Llama port-forward process exited immediately:"
            if [[ -s "$llama_pf_log" ]]; then
                tail -25 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
            fi
            kill_stale_llama_forward "$local_port"
            sleep 2
            continue
        fi
        sleep 4

        # The port is passed explicitly so the health check probes the port
        # that was just bound.
        if verify_llama_local_forward 12 "$local_port"; then
            echo "$pf_pid" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE"
            echo "[e2e-ops] Llama through port-forward: GET http://127.0.0.1:$local_port/v1/health -> OK"
            echo "✓ Llama Stack port-forward established (PID: $pf_pid, $pf_resource)"
            return 0
        fi

        # Always reap the unverified forward, including on the last attempt;
        # otherwise a failed run leaves an orphaned `oc port-forward` holding
        # the port with no PID file pointing at it.
        kill -9 "$pf_pid" 2>/dev/null || true
        if [[ $attempt -lt $max_attempts ]]; then
            echo "Llama forward attempt $attempt failed, retrying..."
            sleep 2
        fi
    done

    echo "Failed to establish Llama Stack port-forward on :$local_port"
    if [[ -s "$llama_pf_log" ]]; then
        tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
    fi
    return 1
}
444+
337445
cmd_wait_for_pod() {
338446
local pod_name="${1:?Pod name required}"
339447
local max_attempts="${2:-24}"
@@ -395,6 +503,9 @@ case "$COMMAND" in
395503
restart-llama-stack)
396504
cmd_restart_llama_stack
397505
;;
506+
restart-llama-port-forward)
507+
cmd_restart_llama_port_forward
508+
;;
398509
restart-port-forward)
399510
cmd_restart_port_forward
400511
;;
@@ -416,6 +527,7 @@ case "$COMMAND" in
416527
echo "Commands:"
417528
echo " restart-lightspeed - Restart lightspeed-stack pod and port-forward"
418529
echo " restart-llama-stack - Restart/restore llama-stack pod"
530+
echo " restart-llama-port-forward - Re-establish port-forward for Llama (8321)"
419531
echo " restart-port-forward - Re-establish port-forward for lightspeed"
420532
echo " wait-for-pod <name> [attempts] - Wait for a pod to be ready"
421533
echo " update-configmap <name> <file> - Update ConfigMap from file"

tests/e2e/test_list.txt

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,5 @@
1+
features/faiss.feature
12
features/smoketests.feature
23
features/authorized_noop.feature
3-
features/authorized_noop_token.feature
4-
features/authorized_rh_identity.feature
5-
features/rbac.feature
6-
features/conversations.feature
7-
features/conversation_cache_v2.feature
8-
features/feedback.feature
9-
features/health.feature
10-
features/info.feature
11-
features/responses.feature
12-
features/query.feature
13-
features/rlsapi_v1.feature
14-
features/rlsapi_v1_errors.feature
15-
features/streaming_query.feature
16-
features/rest_api.feature
174
features/mcp.feature
185
features/models.feature

0 commit comments

Comments
 (0)