# - In-cluster pod logs (Uvicorn up, Llama OK) do not reflect localhost bind races; an
#   "address already in use" error comes from the CI runner, not from the application.
# - E2E_LSC_PORT_FORWARD_PID_FILE coordinates the handoff.
# - pipeline-konflux.sh (and hooks) forward llama-stack-service-svc to localhost:8321 for
#   Behave steps that call Llama Stack directly (MCP toolgroups, shields). When the llama
#   pod is recreated, that forward must be restarted or you get "PodSandbox ... not found" /
#   APIConnectionError on subsequent scenarios.
# - E2E_LLAMA_PORT_FORWARD_PID_FILE coordinates killing/restarting the 8321 forward.
#
1217# Commands:
1318# restart-lightspeed - Restart lightspeed-stack pod and port-forward
14- # restart-llama-stack - Restart/restore llama-stack pod
19+ # restart-llama-stack - Restart/restore llama-stack pod and localhost:8321 forward
1520# restart-port-forward - Re-establish port-forward for lightspeed
21+ # restart-llama-port-forward - Re-establish port-forward for Llama Stack (8321)
1622# wait-for-pod <name> [attempts] - Wait for a pod to be ready
1723# update-configmap <name> <file> - Update ConfigMap from file
1824# get-configmap-content <name> - Get ConfigMap content (outputs to stdout)
@@ -25,6 +31,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
2531MANIFEST_DIR=" $SCRIPT_DIR /../manifests/lightspeed"
2632# Written by pipeline.sh when it starts LCS port-forward; e2e-ops kills this PID before rebinding 8080.
2733E2E_LSC_PORT_FORWARD_PID_FILE=" ${E2E_LSC_PORT_FORWARD_PID_FILE:-/ tmp/ e2e-lightspeed-port-forward.pid} "
# PID file for the Llama Stack localhost forward: cmd_restart_llama_port_forward writes the oc PID here after a healthy forward; kill_stale_llama_forward reads it to kill that process.
34+ E2E_LLAMA_PORT_FORWARD_PID_FILE=" ${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/ tmp/ e2e-llama-port-forward.pid} "
2835
2936# ============================================================================
3037# Helper functions
@@ -123,6 +130,24 @@ kill_stale_lightspeed_forward() {
123130 free_local_tcp_port " $port "
124131}
125132
#######################################
# Kill anything likely to hold the Llama Stack local forward (default localhost:8321).
# Globals:   E2E_LLAMA_PORT_FORWARD_PID_FILE (read; the file is removed once consumed)
# Arguments: $1 - local TCP port of the forward (default: 8321)
# Returns:   exit status of the final free_local_tcp_port call
#######################################
kill_stale_llama_forward() {
  local port="${1:-8321}"
  local saved_pf=""
  local pid_file="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-}"

  if [[ -n "$pid_file" && -f "$pid_file" ]]; then
    # Best effort: an unreadable or empty PID file simply leaves saved_pf empty.
    read -r saved_pf < "$pid_file" 2>/dev/null || true
    if [[ "$saved_pf" =~ ^[0-9]+$ ]]; then
      kill -9 "$saved_pf" 2>/dev/null || true
    fi
    # Drop the file once its PID has been handled; otherwise a later call would
    # kill -9 whatever unrelated process has since been assigned the same PID.
    rm -f -- "$pid_file" 2>/dev/null || true
  fi

  # pkill -f matches an extended regex against the full command line. Match any
  # remote port, so forwards started with a remote port that differs from the
  # local one are also caught. The svc pattern also covers the literal
  # "oc port-forward svc/llama-stack-service-svc <port>:<port>" form.
  pkill -9 -f "port-forward.*llama-stack-service-svc.*${port}:[0-9]+" 2>/dev/null || true
  pkill -9 -f "port-forward pod/llama-stack-service.*${port}:[0-9]+" 2>/dev/null || true

  # Called twice with a pause in between, as in the original sequence.
  # NOTE(review): presumably the second pass catches a listener that re-binds
  # the port right after the first pass - confirm against free_local_tcp_port.
  free_local_tcp_port "$port"
  sleep 1
  free_local_tcp_port "$port"
}
150+
126151# After oc port-forward dies in <2s, show recent oc stderr from the log file.
127152e2e_ops_emit_port_forward_immediate_failure_diag () {
128153 echo " [e2e-ops] /tmp/port-forward.log (tail 25):"
@@ -266,6 +291,11 @@ cmd_restart_llama_stack() {
266291 oc label pod llama-stack-service pod=llama-stack-service -n " $NAMESPACE " --overwrite
267292 fi
268293
294+ if ! cmd_restart_llama_port_forward; then
295+ echo " ERROR: Llama pod is up but localhost:${LOCAL_LLAMA_PORT:- 8321} port-forward failed"
296+ exit 1
297+ fi
298+
269299 echo " ===== Llama-stack restore complete ====="
270300}
271301
@@ -334,6 +364,84 @@ cmd_restart_port_forward() {
334364 return 1
335365}
336366
#######################################
# Poll the Llama Stack health endpoint through the local port-forward.
# Issues GET http://127.0.0.1:<port>/v1/health (5s timeout per request) until it
# answers HTTP 200, sleeping 2s between attempts.
# Globals:   LOCAL_LLAMA_PORT (read; default port when $2 is not given)
# Arguments: $1 - maximum number of attempts (default: 15)
#            $2 - local port to probe (default: $LOCAL_LLAMA_PORT, else 8321)
# Outputs:   a one-line failure message on stdout when every attempt fails
# Returns:   0 once the endpoint returns 200, 1 otherwise
#######################################
verify_llama_local_forward() {
  local max_attempts="${1:-15}"
  # Probe the port the forward is bound to rather than a fixed 8321.
  local port="${2:-${LOCAL_LLAMA_PORT:-8321}}"
  local http_code=""
  local attempt

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # A curl failure (refused connection, timeout) is reported as code 000.
    http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 \
      "http://127.0.0.1:${port}/v1/health" 2>/dev/null) || http_code="000"
    if [[ "$http_code" == "200" ]]; then
      return 0
    fi
    # No point sleeping after the last attempt.
    if ((attempt < max_attempts)); then
      sleep 2
    fi
  done

  echo "Llama Stack localhost:${port} connectivity check failed (HTTP: ${http_code:-unknown})"
  return 1
}
384+
#######################################
# Re-establish the localhost port-forward to Llama Stack.
# Makes up to 6 attempts: the first two forward svc/llama-stack-service-svc, the
# rest forward pod/llama-stack-service directly. Each attempt clears stale
# forwards, starts a detached `oc port-forward`, and accepts it only after
# verify_llama_local_forward succeeds.
# Globals:   LOCAL_LLAMA_PORT, REMOTE_LLAMA_PORT (read; both default to 8321)
#            NAMESPACE (read)
#            E2E_LLAMA_PORT_FORWARD_PID_FILE (file written on success)
# Outputs:   progress and diagnostics on stdout; oc output goes to
#            /tmp/port-forward-llama.log
# Returns:   0 when a healthy forward is running, 1 after all attempts fail
#######################################
cmd_restart_llama_port_forward() {
  local local_port="${LOCAL_LLAMA_PORT:-8321}"
  local remote_port="${REMOTE_LLAMA_PORT:-8321}"
  local max_attempts=6
  local attempt # keep the loop counter out of the caller's scope
  local pf_pid
  local pf_resource
  local llama_pf_log="/tmp/port-forward-llama.log"

  echo "Re-establishing Llama Stack port-forward on $local_port:$remote_port..."

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    kill_stale_llama_forward "$local_port"
    sleep 3

    # Try the Service first; fall back to the pod if the Service forward keeps failing.
    if ((attempt <= 2)); then
      pf_resource="svc/llama-stack-service-svc"
    else
      pf_resource="pod/llama-stack-service"
    fi
    echo "Llama port-forward attempt $attempt/$max_attempts -> $pf_resource"

    # Truncate the log so failure diagnostics only show this attempt.
    : > "$llama_pf_log"
    # nohup + disown + stdin from /dev/null detach the forward from this shell.
    nohup oc port-forward "$pf_resource" "$local_port:$remote_port" -n "$NAMESPACE" \
      < /dev/null > "$llama_pf_log" 2>&1 &
    pf_pid=$!
    disown "$pf_pid" 2>/dev/null || true
    sleep 3

    if ! kill -0 "$pf_pid" 2>/dev/null; then
      echo "Llama port-forward process exited immediately:"
      if [[ -s "$llama_pf_log" ]]; then
        tail -25 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
      fi
      kill_stale_llama_forward "$local_port"
      sleep 2
      continue
    fi
    sleep 4

    if verify_llama_local_forward 12; then
      printf '%s\n' "$pf_pid" > "$E2E_LLAMA_PORT_FORWARD_PID_FILE"
      echo "[e2e-ops] Llama through port-forward: GET http://127.0.0.1:$local_port/v1/health -> OK"
      echo "✓ Llama Stack port-forward established (PID: $pf_pid, $pf_resource)"
      return 0
    fi

    # The forward process is alive but unhealthy. Always tear it down, including
    # on the final attempt, so no orphaned oc process keeps holding the local
    # port after this function reports failure.
    kill -9 "$pf_pid" 2>/dev/null || true
    if ((attempt < max_attempts)); then
      echo "Llama forward attempt $attempt failed, retrying..."
      sleep 2
    fi
  done

  echo "Failed to establish Llama Stack port-forward on :$local_port"
  if [[ -s "$llama_pf_log" ]]; then
    tail -30 "$llama_pf_log" 2>/dev/null | sed 's/^/[e2e-ops] /' || true
  fi
  return 1
}
444+
337445cmd_wait_for_pod () {
338446 local pod_name=" ${1:? Pod name required} "
339447 local max_attempts=" ${2:- 24} "
@@ -395,6 +503,9 @@ case "$COMMAND" in
395503 restart-llama-stack)
396504 cmd_restart_llama_stack
397505 ;;
506+ restart-llama-port-forward)
507+ cmd_restart_llama_port_forward
508+ ;;
398509 restart-port-forward)
399510 cmd_restart_port_forward
400511 ;;
@@ -416,6 +527,7 @@ case "$COMMAND" in
416527 echo " Commands:"
417528 echo " restart-lightspeed - Restart lightspeed-stack pod and port-forward"
418529 echo " restart-llama-stack - Restart/restore llama-stack pod"
530+ echo " restart-llama-port-forward - Re-establish port-forward for Llama (8321)"
419531 echo " restart-port-forward - Re-establish port-forward for lightspeed"
420532 echo " wait-for-pod <name> [attempts] - Wait for a pod to be ready"
421533 echo " update-configmap <name> <file> - Update ConfigMap from file"
0 commit comments