|
| 1 | +#!/bin/bash |
| 2 | + |
# Print a timestamped-style log line to stdout with a '>>> ' prefix so it
# stands out in CI output.
# Arguments: $1 - message text
syslog() {
  printf '>>> %s\n' "${1}"
}
| 6 | + |
# Report a fatal test failure and terminate the whole script with status 1.
# Arguments: $1 - human-readable description of what went wrong
panic() {
  local reason="${1}"
  syslog "backward compatibility test failed: ${reason}"
  exit 1
}
| 12 | + |
# Wait until every pod in the fluid-system namespace is Running.
# Prints the unique control-plane image tags first, then polls every 5s.
# Logs a status summary every 36 polls (~3 minutes) and panics after
# 360 polls (~30 minutes) without all pods running.
# Globals: none. Arguments: none. Exits via panic() on timeout.
check_control_plane_status() {
  echo "=== Unique image tags used by Fluid control plane ==="
  kubectl get pod -n fluid-system -o jsonpath='
  {range .items[*]}{range .spec.containers[*]}{.image}{"\n"}{end}{range .spec.initContainers[*]}{.image}{"\n"}{end}{end}' \
    | sed 's/.*://' \
    | sort -u

  # Timeout counter (30 minutes = 360*5 seconds)
  local timeout=360
  local counter=0
  local status_interval=36

  while true; do
    # NOTE: grep -c exits non-zero when the count is 0, which would abort
    # the whole script under 'set -e' (enabled in main); '|| true' makes a
    # zero count a normal result instead of a fatal error.
    total_pods=$(kubectl get pod -n fluid-system --no-headers | grep -cv "Completed" || true)
    running_pods=$(kubectl get pod -n fluid-system --no-headers | grep -c "Running" || true)
    not_running_pods=$((total_pods - running_pods))

    if ((counter % status_interval == 0)); then
      syslog "[Status Check $((counter / status_interval))] Pod status: ${running_pods}/${total_pods} running (${not_running_pods} not ready)"
      if [[ "${not_running_pods}" -gt 0 ]]; then
        echo "=== Not running pods ==="
        kubectl get pods -n fluid-system \
          --field-selector=status.phase!=Running \
          -o=custom-columns='NAME:.metadata.name,STATUS:.status.phase,REASON:.status.reason'
      fi
    fi

    if [[ "${total_pods}" -ne 0 ]] && [[ "${total_pods}" -eq "${running_pods}" ]]; then
      break
    fi

    if [[ "${counter}" -ge "${timeout}" ]]; then
      panic "Timeout waiting for control plane after ${counter} checks!"
    fi

    sleep 5
    # NOT '((counter++))': that expression evaluates to 0 (status 1) when
    # counter is 0, which would kill the script under 'set -e'.
    counter=$((counter + 1))
  done
  syslog "Fluid control plane is ready after ${counter} checks!"
}
| 53 | + |
# Wait for a Fluid dataset to reach status.phase == Bound.
# Polls every 5s, logs progress every 3 polls (15s), and panics once the
# 180s deadline is exceeded.
# Arguments: $1 - dataset name. Exits via panic() on timeout.
wait_dataset_bound() {
  local dataset_name="${1}"
  local deadline=180
  local log_interval=0
  local log_times=0
  while true; do
    # '|| echo ""' keeps a missing object/field from aborting under 'set -e'
    last_state=$(kubectl get dataset "${dataset_name}" -ojsonpath='{.status.phase}' 2>/dev/null || echo "")

    if [[ "${log_interval}" -eq 3 ]]; then
      # NOT '((log_times++))': returns status 1 when log_times is 0 and
      # would abort the script under 'set -e' (enabled in main).
      log_times=$((log_times + 1))
      syslog "checking dataset.status.phase==Bound (already $((log_times * 3 * 5))s, last state: ${last_state:-None})"
      if [[ $((log_times * 3 * 5)) -ge "${deadline}" ]]; then
        panic "timeout for ${deadline}s waiting for dataset bound!"
      fi
      log_interval=0
    fi

    if [[ "${last_state}" == "Bound" ]]; then
      break
    fi
    # Same 'set -e' hazard as above: avoid '((log_interval++))' at 0.
    log_interval=$((log_interval + 1))
    sleep 5
  done
  syslog "Found dataset ${dataset_name} status.phase==Bound"
}
| 80 | + |
# Wait for a Kubernetes Job to finish. Panics immediately if the job
# reports any failed pods, or after 10 minutes without success.
# Arguments: $1 - job name. Exits via panic() on failure or timeout.
wait_job_completed() {
  local job_name="${1}"
  local deadline=600 # 10 minutes
  local counter=0
  while true; do
    # Missing .status fields produce empty output; fall back to "0".
    succeed=$(kubectl get job "${job_name}" -ojsonpath='{.status.succeeded}' 2>/dev/null || echo "0")
    failed=$(kubectl get job "${job_name}" -ojsonpath='{.status.failed}' 2>/dev/null || echo "0")

    # Ensure variables are treated as integers
    [[ -z "${succeed}" ]] && succeed=0
    [[ -z "${failed}" ]] && failed=0

    if [[ "${failed}" -gt 0 ]]; then
      panic "job ${job_name} failed when accessing data"
    fi
    if [[ "${succeed}" -gt 0 ]]; then
      break
    fi

    # NOT '((counter++))': returns status 1 while counter is 0 and would
    # abort the script on the first iteration under 'set -e'.
    counter=$((counter + 1))
    if [[ $((counter * 5)) -ge "${deadline}" ]]; then
      panic "timeout for ${deadline}s waiting for job ${job_name} completion!"
    fi
    sleep 5
  done
  syslog "Found succeeded job ${job_name}"
}
| 109 | + |
# Install the latest released Fluid chart (the "old" version to upgrade
# from) into the fluid-system namespace, then block until its control
# plane pods are all running.
# Globals: none. Arguments: none.
setup_old_fluid() {
  syslog "Setting up older version of Fluid from charts"
  helm repo add fluid https://fluid-cloudnative.github.io/charts
  helm repo update fluid

  # We ignore errors in case namespace exists
  kubectl create ns fluid-system || true

  # --wait makes helm block until resources are ready; the explicit
  # check afterwards double-checks every pod is actually Running.
  helm install fluid fluid/fluid --namespace fluid-system --wait
  check_control_plane_status
}
| 121 | + |
# Create the Alluxio-backed test dataset (named "zookeeper" in the
# manifest) and wait for it to reach the Bound phase.
# Globals: none. Arguments: none.
create_dataset() {
  syslog "Creating alluxio dataset..."
  kubectl apply -f test/gha-e2e/alluxio/dataset.yaml
  # give it 10s to let the API server and controller settle
  sleep 10
  wait_dataset_bound "zookeeper"
}
| 129 | + |
# Deploy the locally built (current) Fluid over the released install and
# wait for the upgraded control plane to become ready.
# Globals: none. Arguments: none.
upgrade_fluid() {
  syslog "Upgrading Fluid to the locally built current version..."
  ./.github/scripts/deploy-fluid-to-kind.sh
  check_control_plane_status
}
| 135 | + |
# Verify resources created by the old Fluid still work after the upgrade:
# the dataset must remain Bound and its data must still be readable
# through the runtime.
# Globals: none. Arguments: none.
verify_backward_compatibility() {
  syslog "Verifying backward compatibility..."
  # Ensure the dataset created earlier is still bound
  wait_dataset_bound "zookeeper"

  # create job to access data over the runtime
  kubectl apply -f test/gha-e2e/alluxio/job.yaml
  wait_job_completed "fluid-test"

  # Clean up
  kubectl delete -f test/gha-e2e/alluxio/
}
| 148 | + |
# Entry point: install the released Fluid, create a dataset, upgrade to
# the locally built Fluid, then verify the pre-existing dataset still
# works. Aborts on the first failing command via 'set -e'.
main() {
  set -e
  syslog "[BACKWARD COMPATIBILITY TEST STARTS AT $(date)]"

  setup_old_fluid
  create_dataset
  upgrade_fluid
  verify_backward_compatibility

  syslog "[BACKWARD COMPATIBILITY TEST SUCCEEDED AT $(date)]"
}
| 160 | + |
# Forward any script arguments to main (idiomatic; main currently
# ignores them, so behavior is unchanged).
main "$@"
0 commit comments