diff --git a/create_dev_admin.sh b/create_dev_admin.sh index 1dbf9a7..9304fce 100755 --- a/create_dev_admin.sh +++ b/create_dev_admin.sh @@ -1,5 +1,20 @@ #!/bin/bash +# Parse --track flag (default: standard) +TRACK="standard" +for arg in "$@"; do + case "$arg" in + --track=*) TRACK="${arg#*=}" ;; + esac +done + +if [[ "$TRACK" != "standard" && "$TRACK" != "nix" ]]; then + echo "ERROR: unsupported track '$TRACK' -- must be 'standard' or 'nix'" + exit 1 +fi + +echo "Track: $TRACK" +echo "" echo "Select auth type:" echo " 1) IBM Cloud IAM" echo " 2) GitHub" @@ -45,25 +60,33 @@ oc adm policy add-role-to-user edit "$IDENTITY" -n $USERNAME # create RBAC for the user oc apply -f <(sed -e "s//$USERNAME/g" -e "s//$IDENTITY/g" rbac.yml) -# create PVC for the user (auto-detect storage class) -if oc get sc ocs-storagecluster-cephfs &>/dev/null; then - oc apply -f <(sed "s//$USERNAME/g" pvc/persistent-workspace-pvc.yml) - CREDS_SOURCE_NS="openshift-storage" -elif oc get sc lvms-nvme-vg &>/dev/null; then - oc apply -f <(sed "s//$USERNAME/g" pvc/lvms-user-pvc.yml) - CREDS_SOURCE_NS="nfs-server" -elif oc get sc nfs-rwx &>/dev/null; then - oc apply -f <(sed "s//$USERNAME/g" pvc/pytorch-nfs-rwx-pvc.yml) - CREDS_SOURCE_NS="nfs-server" -else - echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)" - exit 1 -fi +# -- Track-specific resources -- +# Nix track: PVCs and deployments are managed by the Helm chart; skip them here. +# Standard track: create workspace PVC and tooling ConfigMaps as before. 
+if [ "$TRACK" = "standard" ]; then + # create PVC for the user (auto-detect storage class) + if oc get sc ocs-storagecluster-cephfs &>/dev/null; then + oc apply -f <(sed "s//$USERNAME/g" pvc/persistent-workspace-pvc.yml) + CREDS_SOURCE_NS="openshift-storage" + elif oc get sc lvms-nvme-vg &>/dev/null; then + oc apply -f <(sed "s//$USERNAME/g" pvc/lvms-user-pvc.yml) + CREDS_SOURCE_NS="nfs-server" + elif oc get sc nfs-rwx &>/dev/null; then + oc apply -f <(sed "s//$USERNAME/g" pvc/pytorch-nfs-rwx-pvc.yml) + CREDS_SOURCE_NS="nfs-server" + else + echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)" + exit 1 + fi + + # create configmaps for bazel and gdbinit + oc apply -f <(sed "s//$USERNAME/g" config_map/bazel-configmap.yml) + oc apply -f <(sed "s//$USERNAME/g" config_map/gdbinit-configmap.yml) -# copy COS backup credentials to user namespace -if oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" &>/dev/null; then - oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" -o json | \ - python3 -c " + # copy COS backup credentials to user namespace + if [ -n "$CREDS_SOURCE_NS" ] && oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" &>/dev/null; then + oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" -o json | \ + python3 -c " import sys, json s = json.load(sys.stdin) s['metadata']['namespace'] = '$USERNAME' @@ -71,14 +94,13 @@ for k in ['uid', 'resourceVersion', 'creationTimestamp', 'managedFields']: s['metadata'].pop(k, None) json.dump(s, sys.stdout) " | oc apply -f - + fi +else + echo "Skipping workspace PVC and ConfigMaps (managed by Helm chart for track: $TRACK)" fi # push quay image secret to pull image from quay oc apply -f <(sed "s//$USERNAME/g" rh-ee-sampark-dev-bot-secret.yml) -# create configmaps for bazel and gdbinit -oc apply -f <(sed "s//$USERNAME/g" config_map/bazel-configmap.yml) -oc apply -f <(sed "s//$USERNAME/g" config_map/gdbinit-configmap.yml) - # create resourcequotas oc apply -f <(sed 
"s//$USERNAME/g" resourcequotas.yml) \ No newline at end of file diff --git a/create_dev_user.sh b/create_dev_user.sh index 4d9350d..5d0f3ae 100755 --- a/create_dev_user.sh +++ b/create_dev_user.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Parse --track flag (default: standard) +TRACK="standard" +for arg in "$@"; do + case "$arg" in + --track=*) TRACK="${arg#*=}" ;; + esac +done + +# Reject unknown tracks (same check as create_dev_admin.sh) so a typo cannot fall through to the Helm branch below. +if [[ "$TRACK" != "standard" && "$TRACK" != "nix" ]]; then + echo "ERROR: unsupported track '$TRACK' -- must be 'standard' or 'nix'" + exit 1 +fi + read -p "Enter openshift username: " USERNAME read -e -p "Enter ssh private key path for github: " SSH_KEY_PATH read -e -p "Enter gcloud application default credentials path: " GCLOUD_CREDENTIALS @@ -16,10 +30,40 @@ oc create secret generic $USERNAME-gcloud-config \ --namespace=$USERNAME \ --from-file=$GCLOUD_CREDENTIALS -# create deployment for the user -oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-18g.yml) -oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-35g.yml) -oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-10g-rdma.yml) -oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-20g-rdma.yml) +# -- Track-specific resources -- +# Nix track: auto-detect storage class (same logic as create_dev_admin.sh) +# and deploy via Helm chart. +# Standard track: apply the static deployment YAMLs. 
+if [ "$TRACK" = "standard" ]; then + oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-18g.yml) + oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-35g.yml) + oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-10g-rdma.yml) + oc apply -f <(sed "s//$USERNAME/g" deployment/deployment-mig-20g-rdma.yml) +else + # Auto-detect storage class (mirrors create_dev_admin.sh detection order) + if oc get sc ocs-storagecluster-cephfs &>/dev/null; then + STORAGE_CLASS="ocs-storagecluster-cephfs" + ACCESS_MODE="ReadWriteMany" + elif oc get sc lvms-nvme-vg &>/dev/null; then + STORAGE_CLASS="lvms-nvme-vg" + ACCESS_MODE="ReadWriteOnce" + elif oc get sc nfs-rwx &>/dev/null; then + STORAGE_CLASS="nfs-rwx" + ACCESS_MODE="ReadWriteMany" + else + echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)" + exit 1 + fi + echo "Detected storage class: $STORAGE_CLASS ($ACCESS_MODE)" + + helm install "$USERNAME-dev" devcontainers/nix/chart \ + --namespace "$USERNAME" \ + --set username="$USERNAME" \ + --set storage.home.storageClass="$STORAGE_CLASS" \ + --set storage.home.accessMode="$ACCESS_MODE" \ + --set storage.nixCache.storageClass="$STORAGE_CLASS" \ + --set storage.nixCache.accessMode="$ACCESS_MODE" \ + --set imagePullSecret=rh-ee-sampark-dev-bot-pull-secret +fi oc project $USERNAME \ No newline at end of file diff --git a/devcontainers/nix/README.md b/devcontainers/nix/README.md new file mode 100644 index 0000000..8a9fbb1 --- /dev/null +++ b/devcontainers/nix/README.md @@ -0,0 +1,242 @@ +# Nix Dev Environment + +Helm chart that deploys a nix-managed GPU dev environment on OpenShift. +A minimal Fedora + Nix container image is used -- all development tools +(CUDA, PyTorch build deps, shell, editor, etc.) are installed via +home-manager at runtime. Your home directory and nix binary cache persist +across pod restarts. 
+ +## Prerequisites + +- Namespace provisioned by an admin (`create_dev_admin.sh --track=nix` -- this skips the PVC + creation step, the Helm chart manages its own storage) +- `helm` CLI installed +- `oc` CLI installed and authenticated (`oc login`) +- SSH key registered on your GitHub account + (https://github.com/settings/keys) +- GCloud application default credentials for Vertex AI + +## Setup + +### 1. Create secrets + +Run the existing user setup script from the repo root. It will prompt for +your username, SSH key path, and gcloud credentials path: + +```bash +./create_dev_user.sh +``` + +This creates the `<username>-git-ssh-key` and `<username>-gcloud-config` +secrets in your namespace. You can skip the deployment step at the end -- +the Helm chart handles that. + +**Important:** The SSH key must be the one registered on your GitHub account. +Verify with `ssh -T git@github.com` -- it should print +`Hi <github-user>! You've successfully authenticated`. If your SSH agent +uses a different key than the key file you provide, the settings repo clone +will fail. See [Troubleshooting](#ssh-key-rejected-during-init) below. + +### 2. Deploy + +```bash +helm install <username>-dev ./devcontainers/nix/chart \ + --set username=<username> \ + -n <username> +``` + +This creates a deployment (paused at 0 replicas) and two persistent volumes +(100Gi home directory + 50Gi nix binary cache). + +To use a personal settings repo: + +```bash +helm install <username>-dev ./devcontainers/nix/chart \ + --set username=<username> \ + --set nix.settings.repo=git@github.com:<github-user>/my-settings.git \ + --set nix.settings.profile=default \ + -n <username> +``` + +### 3. 
Start the pod + +```bash +oc scale deployment <username>-dev -n <username> --replicas=1 +``` + +The pod goes through two init stages before it's ready: + +``` +Init:0/2 -- Seeding /nix from the container image (~2s) +Init:1/2 -- Cloning settings + running home-manager switch + First boot: ~5-10 min (downloads packages from cache.nixos.org) + Subsequent boots: ~30s (uses persistent nix binary cache) +Running -- Dev environment ready +``` + +Watch progress: + +```bash +oc get pods -n <username> -w +``` + +### 4. Connect + +**Terminal:** + +```bash +oc exec -it deployment/<username>-dev -n <username> -- zsh +``` + +**SSH (for VS Code Remote, port forwarding, etc.):** + +```bash +oc port-forward deployment/<username>-dev -n <username> 2222:22 +# Then in another terminal, or in VS Code SSH config: +ssh -p 2222 root@localhost +``` + +## GPU profiles + +The default GPU is MIG 2g.35gb. To use a different GPU, pass `--set` flags +during `helm install` or `helm upgrade`: + +| GPU | Flags | Use case | +|---|---|---| +| MIG 2g.35gb | *(default, no flags needed)* | Standard development | +| MIG 1g.18gb | `--set gpu.type=nvidia.com/mig-1g.18gb --set gpu.runtimeClassName=nvidia-cdi` | Smaller workloads, testing | +| MIG 1g.18gb x2 | `--set gpu.type=nvidia.com/mig-1g.18gb --set gpu.count=2 --set gpu.runtimeClassName=nvidia-cdi` | Parallel workloads | +| Full GPU | `--set gpu.type=nvidia.com/gpu` | Performance benchmarking | +| CPU only | `--set gpu.enabled=false` | Builds, code review | + +Or use the bundled profile files as a shorthand: + +```bash +helm install <username>-dev ./devcontainers/nix/chart \ + --set username=<username> \ + -f devcontainers/nix/chart/profiles/mig-18g.yaml \ + -n <username> +``` + +### Switching profiles + +Scale down, upgrade, scale back up: + +```bash +oc scale deployment <username>-dev -n <username> --replicas=0 +helm upgrade <username>-dev ./devcontainers/nix/chart \ + --set username=<username> \ + --set gpu.type=nvidia.com/mig-1g.18gb \ + --set gpu.runtimeClassName=nvidia-cdi \ + -n <username> +oc scale deployment <username>-dev -n <username> --replicas=1 +``` + +## Customization + +The dev environment is configured 
through a `settings.nix` file. If you +provided a settings repo during deploy, it was cloned to +`~/workspace/settings`. To modify your environment after connecting: + +```bash +vim ~/workspace/settings/settings.nix +torched apply +``` + +If you deployed without a settings repo, a default template was initialized. +See [torched-devcontainer](https://github.com/hinriksnaer/torched-devcontainer) +for the template format and available options. + +## Teardown + +Remove the deployment but keep your data (home directory + nix cache): + +```bash +helm uninstall <username>-dev -n <username> +``` + +The PVCs are preserved -- a future `helm install` will reuse them and boot +quickly from the cached nix store. + +To delete everything including persistent data: + +```bash +helm uninstall <username>-dev -n <username> +oc delete pvc home-<username> nix-store-<username> -n <username> +``` + +## Troubleshooting + +### SSH key rejected during init + +``` +git@github.com: Permission denied (publickey). +``` + +The SSH key in the secret doesn't match what's registered on GitHub. Check +which key your local SSH actually uses: + +```bash +ssh -vT git@github.com 2>&1 | grep "Offering public key" +``` + +Compare the fingerprint with what's in the secret: + +```bash +oc exec deployment/<username>-dev -n <username> -- \ + ssh-keygen -l -f /root/.ssh-keys/id_ed25519 +``` + +If they don't match, recreate the secret with the correct key: + +```bash +oc delete secret <username>-git-ssh-key -n <username> +oc create secret generic <username>-git-ssh-key \ + --namespace=<username> \ + --from-file=ssh-privatekey=$HOME/.ssh/<key-file> \ + --from-file=ssh-publickey=$HOME/.ssh/<key-file>.pub \ + --from-file=known_hosts=<(ssh-keyscan github.com 2>/dev/null) +``` + +Then restart the pod: `oc scale` to 0 then back to 1. + +### Init-config takes a long time + +First boot downloads ~250MB of nix packages from cache.nixos.org. This is +normal and takes 5-10 minutes. Subsequent boots use the persistent nix +binary cache and complete in ~30 seconds. 
+ +Check progress: + +```bash +oc logs deployment/<username>-dev -c init-config -n <username> -f +``` + +### Quota exceeded + +``` +exceeded quota: gpu-quota +``` + +The nix track creates 150Gi of PVCs (100Gi home + 50Gi nix cache). If the +admin script was run without `--track=nix` and created a 250Gi workspace PVC, the total may exceed +the 300Gi storage quota. Delete the unused workspace PVC: + +```bash +oc delete pvc pytorch-ibmc-storage-<username> -n <username> +``` + +### Pod stuck in CrashLoopBackOff + +Check the main container logs: + +```bash +oc logs deployment/<username>-dev -c dev -n <username> +``` + +### Checking init container logs + +```bash +oc logs deployment/<username>-dev -c init-nix -n <username> +oc logs deployment/<username>-dev -c init-config -n <username> +``` diff --git a/devcontainers/nix/chart/.helmignore b/devcontainers/nix/chart/.helmignore new file mode 100644 index 0000000..c471a9a --- /dev/null +++ b/devcontainers/nix/chart/.helmignore @@ -0,0 +1,7 @@ +# Patterns to ignore when building helm packages. +.git +.gitignore +*.swp +*.bak +*.tmp +.DS_Store diff --git a/devcontainers/nix/chart/Chart.yaml b/devcontainers/nix/chart/Chart.yaml new file mode 100644 index 0000000..1f3cf9a --- /dev/null +++ b/devcontainers/nix/chart/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: devcontainers +description: GPU development environments for OpenShift +version: 0.1.0 +type: application +appVersion: "1.0.0" +annotations: + charts.openshift.io/name: Dev Containers + charts.openshift.io/provider: TorchedHat diff --git a/devcontainers/nix/chart/profiles/cpu-only.yaml b/devcontainers/nix/chart/profiles/cpu-only.yaml new file mode 100644 index 0000000..9a42374 --- /dev/null +++ b/devcontainers/nix/chart/profiles/cpu-only.yaml @@ -0,0 +1,13 @@ +gpu: + enabled: false + +resources: + requests: + cpu: "8" + memory: 48Gi + limits: + cpu: "8" + memory: 48Gi + +env: + MAX_JOBS: "4" diff --git a/devcontainers/nix/chart/profiles/mig-18g-2g.yaml b/devcontainers/nix/chart/profiles/mig-18g-2g.yaml new file mode 100644 index 0000000..2174c97 --- /dev/null +++ 
b/devcontainers/nix/chart/profiles/mig-18g-2g.yaml @@ -0,0 +1,4 @@ +gpu: + type: nvidia.com/mig-1g.18gb + count: 2 + runtimeClassName: nvidia-cdi diff --git a/devcontainers/nix/chart/profiles/mig-18g.yaml b/devcontainers/nix/chart/profiles/mig-18g.yaml new file mode 100644 index 0000000..6a1133a --- /dev/null +++ b/devcontainers/nix/chart/profiles/mig-18g.yaml @@ -0,0 +1,4 @@ +gpu: + type: nvidia.com/mig-1g.18gb + count: 1 + runtimeClassName: nvidia-cdi diff --git a/devcontainers/nix/chart/profiles/mig-35g.yaml b/devcontainers/nix/chart/profiles/mig-35g.yaml new file mode 100644 index 0000000..6f9a438 --- /dev/null +++ b/devcontainers/nix/chart/profiles/mig-35g.yaml @@ -0,0 +1,4 @@ +gpu: + type: nvidia.com/mig-2g.35gb + count: 1 + runtimeClassName: "" diff --git a/devcontainers/nix/chart/templates/NOTES.txt b/devcontainers/nix/chart/templates/NOTES.txt new file mode 100644 index 0000000..8e0b079 --- /dev/null +++ b/devcontainers/nix/chart/templates/NOTES.txt @@ -0,0 +1,34 @@ +{{ .Values.track | upper }} dev environment deployed for {{ .Values.username }}. + + Replicas: {{ .Values.replicas }} + GPU: {{ if .Values.gpu.enabled }}{{ .Values.gpu.type }} x{{ .Values.gpu.count }}{{ else }}none{{ end }} + Image: {{ include "devcontainers.image" . }} +{{- if eq .Values.track "nix" }} + Settings: {{ if .Values.nix.settings.repo }}{{ .Values.nix.settings.repo }}{{ else }}template (default){{ end }} +{{- end }} +{{- if .Values.hostNetwork }} + Network: hostNetwork (requires hostnetwork SCC) +{{- end }} +{{- if .Values.rdma.enabled }} + RDMA: {{ .Values.rdma.resource }} x{{ .Values.rdma.count }} +{{- end }} + +Prerequisites (via create_dev_admin.sh --track={{ .Values.track }}): + - Namespace, RBAC, anyuid SCC, resource quota + - cluster-reader role (for node access) + - hostnetwork SCC (provisioned for all users; required if hostNetwork=true) + +Scale up: + oc scale deployment {{ include "devcontainers.name" . 
}} -n {{ .Values.username }} --replicas=1 + +Connect: + oc exec -it deployment/{{ include "devcontainers.name" . }} -n {{ .Values.username }} -- zsh + +{{- if eq .Values.track "nix" }} + +Customize: + vim ~/workspace/settings/settings.nix && torched apply +{{- end }} + +Teardown (PVCs preserved): + helm uninstall {{ .Release.Name }} -n {{ .Values.username }} diff --git a/devcontainers/nix/chart/templates/_helpers.tpl b/devcontainers/nix/chart/templates/_helpers.tpl new file mode 100644 index 0000000..310ce7b --- /dev/null +++ b/devcontainers/nix/chart/templates/_helpers.tpl @@ -0,0 +1,80 @@ +{{/* +Validate required values. Called at the top of every template. +*/}} +{{- define "devcontainers.validate" -}} +{{- if not .Values.username -}} + {{- fail "username is required: --set username=" -}} +{{- end -}} +{{- $supported := list "nix" -}} +{{- if not (has .Values.track $supported) -}} + {{- fail (printf "unsupported track %q -- must be one of: %s" .Values.track (join ", " $supported)) -}} +{{- end -}} +{{- end }} + +{{/* +Deployment name: -dev +*/}} +{{- define "devcontainers.name" -}} +{{- printf "%s-dev" .Values.username -}} +{{- end }} + +{{/* +Namespace: always the username +*/}} +{{- define "devcontainers.namespace" -}} +{{- .Values.username -}} +{{- end }} + +{{/* +Full container image reference +*/}} +{{- define "devcontainers.image" -}} +{{- printf "%s:%s" .Values.image.repository .Values.image.tag -}} +{{- end }} + +{{/* +Standard Kubernetes labels applied to all resources. +Follows the app.kubernetes.io labelling convention. +*/}} +{{- define "devcontainers.labels" -}} +app: {{ include "devcontainers.name" . 
}} +app.kubernetes.io/name: {{ .Chart.Name }} +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +helm.sh/chart: {{ printf "%s-%s" .Chart.Name .Chart.Version }} +devcontainers/track: {{ .Values.track }} +devcontainers/username: {{ .Values.username }} +{{- end }} + +{{/* +Selector labels for deployment matchLabels and pod labels. +Must be immutable after creation -- keep minimal. +*/}} +{{- define "devcontainers.selectorLabels" -}} +app: {{ include "devcontainers.name" . }} +app.kubernetes.io/name: {{ .Chart.Name }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Home PVC claim name: uses existingClaim if set, otherwise home- +*/}} +{{- define "devcontainers.homeClaim" -}} +{{- if .Values.storage.home.existingClaim -}} +{{- .Values.storage.home.existingClaim -}} +{{- else -}} +{{- printf "home-%s" .Values.username -}} +{{- end -}} +{{- end }} + +{{/* +Nix cache PVC claim name: uses existingClaim if set, otherwise nix-store- +*/}} +{{- define "devcontainers.nixCacheClaim" -}} +{{- if .Values.storage.nixCache.existingClaim -}} +{{- .Values.storage.nixCache.existingClaim -}} +{{- else -}} +{{- printf "nix-store-%s" .Values.username -}} +{{- end -}} +{{- end }} diff --git a/devcontainers/nix/chart/templates/deployment.yaml b/devcontainers/nix/chart/templates/deployment.yaml new file mode 100644 index 0000000..52c91d3 --- /dev/null +++ b/devcontainers/nix/chart/templates/deployment.yaml @@ -0,0 +1,259 @@ +{{- include "devcontainers.validate" . -}} +--- +kind: Deployment +apiVersion: apps/v1 +metadata: + name: {{ include "devcontainers.name" . }} + namespace: {{ include "devcontainers.namespace" . }} + labels: + {{- include "devcontainers.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicas }} + selector: + matchLabels: + {{- include "devcontainers.selectorLabels" . 
| nindent 6 }} + template: + metadata: + labels: + {{- include "devcontainers.selectorLabels" . | nindent 8 }} + spec: + {{- if .Values.gpu.runtimeClassName }} + runtimeClassName: {{ .Values.gpu.runtimeClassName }} + {{- end }} + {{- if .Values.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + {{- if .Values.imagePullSecret }} + imagePullSecrets: + - name: {{ .Values.imagePullSecret }} + {{- end }} + + {{/* ---- Volumes ---- */}} + volumes: + - name: ssh-key + secret: + secretName: {{ .Values.username }}-git-ssh-key + items: + - key: ssh-privatekey + path: id_ed25519 + - key: ssh-publickey + path: id_ed25519.pub + - key: known_hosts + path: known_hosts + defaultMode: 0600 + - name: gcloud-config + secret: + secretName: {{ .Values.username }}-gcloud-config + defaultMode: 0600 + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ .Values.storage.shm.sizeLimit }} + {{- if eq .Values.track "nix" }} + - name: nix-local + emptyDir: + sizeLimit: {{ .Values.storage.nixLocal.sizeLimit }} + - name: nix-cache + persistentVolumeClaim: + claimName: {{ include "devcontainers.nixCacheClaim" . }} + - name: home + persistentVolumeClaim: + claimName: {{ include "devcontainers.homeClaim" . }} + {{- end }} + + {{/* ---- Init containers ---- */}} + initContainers: + {{- if eq .Values.track "nix" }} + - name: init-nix + image: {{ include "devcontainers.image" . | quote }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["/bin/bash", "-c"] + args: + - | + echo ":: seeding local nix store..." + cp -a /nix/. /mnt/nix/ + resources: + requests: { cpu: 500m, memory: 512Mi } + limits: { cpu: 500m, memory: 512Mi } + volumeMounts: + - name: nix-local + mountPath: /mnt/nix + - name: init-config + image: {{ include "devcontainers.image" . 
| quote }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: HM_PROFILE + value: {{ .Values.nix.settings.profile | quote }} + - name: SETTINGS_REPO + value: {{ .Values.nix.settings.repo | quote }} + command: ["/bin/bash", "-c"] + args: + - | + # Configure nix for local store + NFS binary cache + mkdir -p /root/.config/nix + cat > /root/.config/nix/nix.conf <<'NIXCONF' + experimental-features = nix-command flakes + build-users-group = + build-dir = /tmp/nix-builds + substituters = file:///nix-cache https://cache.nixos.org + trusted-public-keys = cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= + require-sigs = false + NIXCONF + + . /root/.nix-profile/etc/profile.d/nix.sh 2>/dev/null || true + + # SSH keys for git clone + mkdir -p /root/.ssh + cp /root/.ssh-keys/id_ed25519 /root/.ssh/id_ed25519 2>/dev/null + cp /root/.ssh-keys/known_hosts /root/.ssh/known_hosts 2>/dev/null + chmod 700 /root/.ssh 2>/dev/null; chmod 600 /root/.ssh/* 2>/dev/null + + # Clone/pull settings + if [ -n "$SETTINGS_REPO" ]; then + if [ ! -d ~/workspace/settings ]; then + echo ":: cloning settings repo..." + /usr/bin/git clone "$SETTINGS_REPO" ~/workspace/settings + elif [ ! -d ~/workspace/settings/.git ]; then + echo ":: backing up template, cloning settings repo..." + mv ~/workspace/settings ~/workspace/settings.bak + /usr/bin/git clone "$SETTINGS_REPO" ~/workspace/settings + else + echo ":: updating settings repo..." + /usr/bin/git -C ~/workspace/settings pull --ff-only 2>/dev/null || true + fi + else + if [ ! -f ~/workspace/settings/flake.nix ]; then + echo ":: initializing settings from template..." + mkdir -p ~/workspace/settings + cd ~/workspace/settings + nix flake init -t github:hinriksnaer/torched-devcontainer --refresh + fi + fi + + # Home-manager switch + echo ":: applying home-manager config (profile: ${HM_PROFILE:-default})..." 
+ if timeout 600 nix run --print-build-logs home-manager/master -- switch -b backup \ + --flake "/root/workspace/settings#${HM_PROFILE:-default}" --print-build-logs; then + echo ":: pushing closures to nix cache..." + hm_path=$(readlink -f /root/.local/state/home-manager/gcroots/current-home 2>/dev/null) + [ -n "$hm_path" ] && nix copy --to file:///nix-cache "$hm_path" --no-check-sigs 2>/dev/null || true + devshell_path=$(nix path-info --flake "/root/workspace/settings#devShells.x86_64-linux.default" 2>/dev/null) + [ -n "$devshell_path" ] && nix copy --to file:///nix-cache "$devshell_path" --no-check-sigs 2>/dev/null || true + else + echo ":: WARNING: home-manager switch failed or timed out." + echo ":: Run 'torched apply' manually after the pod starts." + fi + resources: + requests: + cpu: {{ .Values.nix.initConfig.resources.requests.cpu | quote }} + memory: {{ .Values.nix.initConfig.resources.requests.memory }} + limits: + cpu: {{ .Values.nix.initConfig.resources.limits.cpu | quote }} + memory: {{ .Values.nix.initConfig.resources.limits.memory }} + volumeMounts: + - name: nix-local + mountPath: /nix + - name: nix-cache + mountPath: /nix-cache + - name: home + mountPath: /root + - name: ssh-key + mountPath: /root/.ssh-keys + readOnly: true + - name: gcloud-config + mountPath: /root/.config/gcloud + readOnly: true + {{- end }} + + {{/* ---- Main container ---- */}} + containers: + - name: dev + image: {{ include "devcontainers.image" . 
| quote }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + stdin: true + tty: true + {{- if eq .Values.track "nix" }} + command: ["/bin/bash", "-c"] + args: + - | + chmod 755 /root + cp /root/.ssh-keys/id_ed25519.pub /root/.ssh/authorized_keys 2>/dev/null + chmod 600 /root/.ssh/authorized_keys 2>/dev/null + {{/* Start dropbear only when ssh.enabled is true; the SSH containerPort and probes are gated on the same value. */}}{{ if .Values.ssh.enabled }}/usr/sbin/dropbear -R -E -p {{ .Values.ssh.port }}{{ else }}echo ":: ssh.enabled=false -- not starting dropbear"{{ end }} + exec sleep infinity + {{- end }} + ports: + {{- if and (eq .Values.track "nix") .Values.ssh.enabled }} + - containerPort: {{ .Values.ssh.port }} + protocol: TCP + {{- end }} + - containerPort: 8080 + protocol: TCP + {{- if and (eq .Values.track "nix") .Values.ssh.enabled }} + startupProbe: + tcpSocket: + port: {{ .Values.ssh.port }} + initialDelaySeconds: 2 + periodSeconds: 5 + failureThreshold: 6 + livenessProbe: + tcpSocket: + port: {{ .Values.ssh.port }} + periodSeconds: 30 + failureThreshold: 3 + {{- end }} + env: + - name: MAX_JOBS + value: {{ .Values.env.MAX_JOBS | quote }} + {{- range $k, $v := .Values.extraEnv }} + - name: {{ $k }} + value: {{ $v | quote }} + {{- end }} + resources: + requests: + cpu: {{ .Values.resources.requests.cpu | quote }} + memory: {{ .Values.resources.requests.memory }} + {{- if .Values.gpu.enabled }} + {{ .Values.gpu.type }}: {{ .Values.gpu.count | quote }} + {{- end }} + {{- if .Values.rdma.enabled }} + {{ .Values.rdma.resource }}: {{ .Values.rdma.count | quote }} + {{- end }} + limits: + cpu: {{ .Values.resources.limits.cpu | quote }} + memory: {{ .Values.resources.limits.memory }} + {{- if .Values.gpu.enabled }} + {{ .Values.gpu.type }}: {{ .Values.gpu.count | quote }} + {{- end }} + {{- if .Values.rdma.enabled }} + {{ .Values.rdma.resource }}: {{ .Values.rdma.count | quote }} + {{- end }} + volumeMounts: + {{- if eq .Values.track "nix" }} + - name: home + mountPath: /root + - name: nix-local + mountPath: /nix + - name: nix-cache + mountPath: /nix-cache + - name: ssh-key + mountPath: /root/.ssh-keys + readOnly: true + - name: gcloud-config + mountPath: 
/root/.config/gcloud + readOnly: true + {{- end }} + - name: dshm + mountPath: /dev/shm + + terminationGracePeriodSeconds: 30 + {{- if eq .Values.track "nix" }} + securityContext: + runAsUser: 0 + runAsGroup: 0 + {{- end }} + strategy: + type: {{ .Values.strategy }} + revisionHistoryLimit: 3 + progressDeadlineSeconds: {{ .Values.progressDeadlineSeconds }} diff --git a/devcontainers/nix/chart/templates/pvcs.yaml b/devcontainers/nix/chart/templates/pvcs.yaml new file mode 100644 index 0000000..a2238aa --- /dev/null +++ b/devcontainers/nix/chart/templates/pvcs.yaml @@ -0,0 +1,42 @@ +{{- include "devcontainers.validate" . -}} +{{- if eq .Values.track "nix" -}} + +{{- if .Values.storage.home.create }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "devcontainers.homeClaim" . }} + namespace: {{ include "devcontainers.namespace" . }} + annotations: + "helm.sh/resource-policy": keep + labels: + {{- include "devcontainers.labels" . | nindent 4 }} +spec: + accessModes: [{{ .Values.storage.home.accessMode }}] + storageClassName: {{ .Values.storage.home.storageClass }} + resources: + requests: + storage: {{ .Values.storage.home.size }} +{{- end }} + +{{- if .Values.storage.nixCache.create }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "devcontainers.nixCacheClaim" . }} + namespace: {{ include "devcontainers.namespace" . }} + annotations: + "helm.sh/resource-policy": keep + labels: + {{- include "devcontainers.labels" . 
| nindent 4 }} +spec: + accessModes: [{{ .Values.storage.nixCache.accessMode }}] + storageClassName: {{ .Values.storage.nixCache.storageClass }} + resources: + requests: + storage: {{ .Values.storage.nixCache.size }} +{{- end }} + +{{- end }} diff --git a/devcontainers/nix/chart/values.schema.json b/devcontainers/nix/chart/values.schema.json new file mode 100644 index 0000000..7445c1b --- /dev/null +++ b/devcontainers/nix/chart/values.schema.json @@ -0,0 +1,347 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": ["username"], + "properties": { + "username": { + "type": "string", + "title": "Username", + "description": "OpenShift username. Used as the namespace and resource name prefix.", + "minLength": 1 + }, + "track": { + "type": "string", + "title": "Track", + "description": "Deployment track. Determines how the dev environment is configured.", + "enum": ["nix"], + "default": "nix" + }, + "gpu": { + "type": "object", + "title": "GPU", + "required": ["enabled", "type"], + "properties": { + "enabled": { + "type": "boolean", + "title": "Enable GPU", + "description": "Attach a GPU resource to the pod.", + "default": true + }, + "type": { + "type": "string", + "title": "GPU Type", + "description": "Kubernetes device resource name.", + "enum": [ + "nvidia.com/gpu", + "nvidia.com/mig-1g.18gb", + "nvidia.com/mig-2g.35gb" + ], + "default": "nvidia.com/gpu" + }, + "count": { + "type": "integer", + "title": "GPU Count", + "description": "Number of GPU slices to request.", + "default": 1, + "minimum": 1, + "maximum": 8 + }, + "runtimeClassName": { + "type": "string", + "title": "Runtime Class", + "description": "Set to nvidia-cdi for MIG slices. Leave empty for full GPUs or CPU-only.", + "enum": ["", "nvidia-cdi"], + "default": "" + } + } + }, + "nix": { + "type": "object", + "title": "Nix Settings", + "description": "Configuration for the nix-managed dev environment. 
Only used when track is nix.", + "properties": { + "settings": { + "type": "object", + "title": "User Settings", + "properties": { + "repo": { + "type": "string", + "title": "Settings Repo", + "description": "Git SSH URL for your personal settings repo. Leave empty to use the default template.", + "default": "" + }, + "profile": { + "type": "string", + "title": "Home-Manager Profile", + "description": "The flake output profile name for home-manager.", + "default": "default" + } + } + }, + "initConfig": { + "type": "object", + "title": "Init Config Resources", + "description": "Resource limits for the init container that runs home-manager switch.", + "properties": { + "resources": { + "type": "object", + "properties": { + "requests": { + "type": "object", + "properties": { + "cpu": { "type": "string", "title": "CPU Request", "default": "1" }, + "memory": { "type": "string", "title": "Memory Request", "default": "4Gi" } + } + }, + "limits": { + "type": "object", + "properties": { + "cpu": { "type": "string", "title": "CPU Limit", "default": "4" }, + "memory": { "type": "string", "title": "Memory Limit", "default": "8Gi" } + } + } + } + } + } + } + } + }, + "image": { + "type": "object", + "title": "Image", + "description": "Container image used for all containers (init + main).", + "properties": { + "repository": { + "type": "string", + "title": "Repository", + "default": "quay.io/rh-ee-hgudmund/torched-devcontainer" + }, + "tag": { + "type": "string", + "title": "Tag", + "default": "latest" + }, + "pullPolicy": { + "type": "string", + "title": "Pull Policy", + "enum": ["Always", "IfNotPresent", "Never"], + "default": "Always" + } + } + }, + "imagePullSecret": { + "type": "string", + "title": "Image Pull Secret", + "description": "Name of an existing image pull secret. 
Leave empty if the image is public.", + "default": "" + }, + "resources": { + "type": "object", + "title": "Resources", + "description": "CPU and memory for the main dev container.", + "properties": { + "requests": { + "type": "object", + "title": "Requests", + "properties": { + "cpu": { "type": "string", "title": "CPU Request", "default": "4" }, + "memory": { "type": "string", "title": "Memory Request", "default": "32Gi" } + } + }, + "limits": { + "type": "object", + "title": "Limits", + "properties": { + "cpu": { "type": "string", "title": "CPU Limit", "default": "16" }, + "memory": { "type": "string", "title": "Memory Limit", "default": "32Gi" } + } + } + } + }, + "storage": { + "type": "object", + "title": "Storage", + "description": "Persistent volume configuration.", + "properties": { + "home": { + "type": "object", + "title": "Home Directory", + "properties": { + "create": { + "type": "boolean", + "title": "Create PVC", + "description": "Set to false to use an existing PVC.", + "default": true + }, + "existingClaim": { + "type": "string", + "title": "Existing Claim", + "description": "Name of an existing PVC to use instead of creating one.", + "default": "" + }, + "size": { + "type": "string", + "title": "Size", + "default": "100Gi" + }, + "storageClass": { + "type": "string", + "title": "Storage Class", + "description": "Storage class for the home PVC. 
Auto-detected by create_dev_user.sh: ocs-storagecluster-cephfs (CephFS), lvms-nvme-vg (LVMS), or nfs-rwx (NFS).", + "default": "" + }, + "accessMode": { + "type": "string", + "title": "Access Mode", + "enum": ["", "ReadWriteMany", "ReadWriteOnce"], + "default": "" + } + } + }, + "nixCache": { + "type": "object", + "title": "Nix Binary Cache", + "properties": { + "create": { + "type": "boolean", + "title": "Create PVC", + "default": true + }, + "existingClaim": { + "type": "string", + "title": "Existing Claim", + "default": "" + }, + "size": { + "type": "string", + "title": "Size", + "default": "50Gi" + }, + "storageClass": { + "type": "string", + "title": "Storage Class", + "description": "Storage class for the nix cache PVC. Auto-detected by create_dev_user.sh.", + "default": "" + }, + "accessMode": { + "type": "string", + "title": "Access Mode", + "enum": ["", "ReadWriteMany", "ReadWriteOnce"], + "default": "" + } + } + }, + "nixLocal": { + "type": "object", + "title": "Local Nix Store", + "description": "Ephemeral emptyDir for the nix store. Rebuilt on each pod restart.", + "properties": { + "sizeLimit": { + "type": "string", + "title": "Size Limit", + "default": "50Gi" + } + } + }, + "shm": { + "type": "object", + "title": "Shared Memory", + "properties": { + "sizeLimit": { + "type": "string", + "title": "Size Limit", + "default": "16Gi" + } + } + } + } + }, + "hostNetwork": { + "type": "boolean", + "title": "Host Network", + "description": "Enable host networking for UCX-based RDMA workloads. 
Requires hostnetwork SCC.", + "default": false + }, + "rdma": { + "type": "object", + "title": "RDMA", + "description": "RDMA device passthrough for multi-node training.", + "properties": { + "enabled": { + "type": "boolean", + "title": "Enable RDMA", + "default": false + }, + "resource": { + "type": "string", + "title": "RDMA Resource", + "description": "Kubernetes device resource name for RDMA.", + "default": "rdma/rdma_shared_device_a" + }, + "count": { + "type": "integer", + "title": "RDMA Device Count", + "default": 1, + "minimum": 1 + } + } + }, + "ssh": { + "type": "object", + "title": "SSH", + "properties": { + "enabled": { + "type": "boolean", + "title": "Enable SSH Server", + "description": "Run a dropbear SSH server in the container.", + "default": true + }, + "port": { + "type": "integer", + "title": "SSH Port", + "default": 22 + } + } + }, + "env": { + "type": "object", + "title": "Environment", + "properties": { + "MAX_JOBS": { + "type": "string", + "title": "MAX_JOBS", + "description": "Maximum parallel build jobs.", + "default": "8" + } + } + }, + "extraEnv": { + "type": "object", + "title": "Extra Environment Variables", + "description": "Additional environment variables as key-value pairs.", + "additionalProperties": { + "type": "string" + } + }, + "replicas": { + "type": "integer", + "title": "Replicas", + "description": "Set to 0 to create the deployment without starting a pod. 
Scale up to 1 when ready.", + "default": 0, + "minimum": 0, + "maximum": 1 + }, + "strategy": { + "type": "string", + "title": "Update Strategy", + "enum": ["Recreate", "RollingUpdate"], + "default": "Recreate" + }, + "progressDeadlineSeconds": { + "type": "integer", + "title": "Progress Deadline", + "description": "Seconds before a deployment is considered failed.", + "default": 900 + } + } +} diff --git a/devcontainers/nix/chart/values.yaml b/devcontainers/nix/chart/values.yaml new file mode 100644 index 0000000..b557915 --- /dev/null +++ b/devcontainers/nix/chart/values.yaml @@ -0,0 +1,132 @@ +# ============================================================================= +# Core identity +# ============================================================================= + +# -- Required. OpenShift username and namespace. +username: "" + +# -- Deployment track. Determines volumes, init containers, security context, +# -- and container startup behavior. +# -- Supported: "nix" +# -- Future: "fat-image", "torch-dev-containers" +track: nix + +# ============================================================================= +# Container image +# ============================================================================= + +image: + repository: quay.io/rh-ee-hgudmund/torched-devcontainer + tag: latest + pullPolicy: Always + +# -- Optional image pull secret name. Leave empty if image is public. +imagePullSecret: "" + +# ============================================================================= +# Track-specific: nix +# -- Configuration for the nix-managed dev environment track. +# -- These values are only used when track == "nix". +# ============================================================================= + +nix: + # -- Git SSH URL for user settings repo (optional). + # -- Falls back to nix flake init template if empty. + settings: + repo: "" + profile: default # Home-manager profile name + + # -- Resources for the init-config container (runs home-manager switch). 
+ initConfig: + resources: + requests: + cpu: "1" + memory: 4Gi + limits: + cpu: "4" + memory: 8Gi + +# ============================================================================= +# GPU +# ============================================================================= + +gpu: + enabled: true + type: nvidia.com/gpu + count: 1 + # -- Runtime class. Set to nvidia-cdi for MIG slices, leave empty for full GPUs. + runtimeClassName: "" + +# ============================================================================= +# Resources (main container) +# ============================================================================= + +resources: + requests: + cpu: "4" + memory: 32Gi + limits: + cpu: "16" + memory: 32Gi + +# ============================================================================= +# Storage +# ============================================================================= + +storage: + # -- Home directory PVC. + home: + create: true # Set false to use an existing PVC + existingClaim: "" # If set, references this PVC instead of creating one + size: 100Gi + storageClass: "" # Auto-detected by create_dev_user.sh (CephFS > LVMS > NFS) + accessMode: "" # Set by create_dev_user.sh based on detected storage class + + # -- Nix binary cache PVC (nix track only). + nixCache: + create: true + existingClaim: "" + size: 50Gi + storageClass: "" # Auto-detected by create_dev_user.sh (CephFS > LVMS > NFS) + accessMode: "" # Set by create_dev_user.sh based on detected storage class + + # -- Ephemeral local nix store (nix track only). + nixLocal: + sizeLimit: 50Gi + + # -- Shared memory (/dev/shm). + shm: + sizeLimit: 16Gi + +# ============================================================================= +# Networking +# ============================================================================= + +# -- Enable host networking (required for UCX-based RDMA workloads). +# -- Requires hostnetwork SCC granted by create_dev_admin.sh. 
+hostNetwork: false + +# -- RDMA device passthrough for multi-node training. +rdma: + enabled: false + resource: rdma/rdma_shared_device_a + count: 1 + +ssh: + enabled: true + port: 22 + +# ============================================================================= +# Misc +# ============================================================================= + +env: + MAX_JOBS: "8" + +# -- Additional environment variables injected into the main container. +# -- Example: { CUDA_VISIBLE_DEVICES: "0", DEBUG: "1" } +extraEnv: {} + +replicas: 0 +strategy: Recreate +progressDeadlineSeconds: 900