TorchedHat · hinriksnaer · Jun 9, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/create_dev_admin.sh b/create_dev_admin.sh
@@ -1,5 +1,27 @@
 #!/bin/bash
 
+# Parse the optional --track=<standard|nix> flag (default: standard).
+# Only the --track=VALUE form is accepted; a bare "--track" is rejected so that
+# "--track nix" cannot silently fall back to the standard track.
+TRACK="standard"
+for arg in "$@"; do
+  case "$arg" in
+    --track=*) TRACK="${arg#*=}" ;;
+    --track)
+      echo "ERROR: --track requires a value -- use --track=standard or --track=nix" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Reject anything other than the two supported tracks before any other step runs.
+if [[ "$TRACK" != "standard" && "$TRACK" != "nix" ]]; then
+  echo "ERROR: unsupported track '$TRACK' -- must be 'standard' or 'nix'" >&2
+  exit 1
+fi
+
+echo "Track: $TRACK"
+echo ""
 echo "Select auth type:"
 echo "  1) IBM Cloud IAM"
 echo "  2) GitHub"
@@ -45,40 +60,47 @@ oc adm policy add-role-to-user edit "$IDENTITY" -n $USERNAME
 # create RBAC for the user
 oc apply -f <(sed -e "s/<username>/$USERNAME/g" -e "s/<email>/$IDENTITY/g" rbac.yml)
 
-# create PVC for the user (auto-detect storage class)
-if oc get sc ocs-storagecluster-cephfs &>/dev/null; then
-  oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/persistent-workspace-pvc.yml)
-  CREDS_SOURCE_NS="openshift-storage"
-elif oc get sc lvms-nvme-vg &>/dev/null; then
-  oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/lvms-user-pvc.yml)
-  CREDS_SOURCE_NS="nfs-server"
-elif oc get sc nfs-rwx &>/dev/null; then
-  oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/pytorch-nfs-rwx-pvc.yml)
-  CREDS_SOURCE_NS="nfs-server"
-else
-  echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)"
-  exit 1
-fi
+# -- Track-specific resources --
+# Standard track: create the workspace PVC, tooling ConfigMaps and COS credentials.
+# Nix track: skip all three; the Helm chart manages its own storage.
+if [ "$TRACK" = "standard" ]; then
+  # create PVC for the user; the first storage class found decides both the PVC
+  # manifest and the namespace the COS backup secret is copied from
+  if oc get sc ocs-storagecluster-cephfs &>/dev/null; then
+    oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/persistent-workspace-pvc.yml)
+    CREDS_SOURCE_NS="openshift-storage"
+  elif oc get sc lvms-nvme-vg &>/dev/null; then
+    oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/lvms-user-pvc.yml)
+    CREDS_SOURCE_NS="nfs-server"
+  elif oc get sc nfs-rwx &>/dev/null; then
+    oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/pytorch-nfs-rwx-pvc.yml)
+    CREDS_SOURCE_NS="nfs-server"
+  else
+    echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)" >&2
+    exit 1
+  fi
+  # create configmaps for bazel and gdbinit
+  oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/bazel-configmap.yml)
+  oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/gdbinit-configmap.yml)
 
-# copy COS backup credentials to user namespace
-if oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" &>/dev/null; then
-  oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" -o json | \
-    python3 -c "
+  # copy COS backup credentials to user namespace (silently skipped if the secret is absent)
+  if [ -n "$CREDS_SOURCE_NS" ] && oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" &>/dev/null; then
+    oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" -o json | \
+      python3 -c "
 import sys, json
 s = json.load(sys.stdin)
 s['metadata']['namespace'] = '$USERNAME'
 for k in ['uid', 'resourceVersion', 'creationTimestamp', 'managedFields']:
     s['metadata'].pop(k, None)
 json.dump(s, sys.stdout)
 " | oc apply -f -
+  fi
+else
+  echo "Skipping workspace PVC, ConfigMaps and COS backup credentials for track: $TRACK (PVC and ConfigMaps are managed by the Helm chart)"
 fi
 
 # push quay image secret to pull image from quay
 oc apply -f <(sed "s/<username>/$USERNAME/g" rh-ee-sampark-dev-bot-secret.yml)
 
-# create configmaps for bazel and gdbinit
-oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/bazel-configmap.yml)
-oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/gdbinit-configmap.yml)
-
 # create resourcequotas
 oc apply -f <(sed "s/<username>/$USERNAME/g" resourcequotas.yml)
diff --git a/create_dev_user.sh b/create_dev_user.sh
@@ -1,5 +1,20 @@
 #!/bin/bash
 
+# Parse the optional --track=<standard|nix> flag (default: standard)
+TRACK="standard"
+for arg in "$@"; do
+  case "$arg" in
+    --track=*) TRACK="${arg#*=}" ;;
+  esac
+done
+
+# Validate the track up front (same check as create_dev_admin.sh); otherwise any
+# unrecognised value would fall through to the Helm (nix) branch below.
+if [[ "$TRACK" != "standard" && "$TRACK" != "nix" ]]; then
+  echo "ERROR: unsupported track '$TRACK' -- must be 'standard' or 'nix'" >&2
+  exit 1
+fi
+
 read -p "Enter openshift username: " USERNAME
 read -e -p "Enter ssh private key path for github: " SSH_KEY_PATH
 read -e -p "Enter gcloud application default credentials path: " GCLOUD_CREDENTIALS
@@ -16,10 +24,46 @@ oc create secret generic $USERNAME-gcloud-config \
   --namespace=$USERNAME \
   --from-file=$GCLOUD_CREDENTIALS
 
-# create deployment for the user
-oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-18g.yml)
-oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-35g.yml)
-oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-10g-rdma.yml)
-oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-20g-rdma.yml)
+# -- Track-specific resources --
+# Standard track: apply the static deployment YAMLs.
+# Nix track: auto-detect the storage class (same detection order as
+#            create_dev_admin.sh) and install the Helm chart.
+if [ "$TRACK" = "standard" ]; then
+  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-18g.yml)
+  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-35g.yml)
+  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-10g-rdma.yml)
+  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-20g-rdma.yml)
+else
+  # Probe the cluster for a supported storage class; the first match wins and
+  # also fixes the access mode passed for the home and nixCache storage values.
+  if oc get sc ocs-storagecluster-cephfs &>/dev/null; then
+    STORAGE_CLASS="ocs-storagecluster-cephfs"
+    ACCESS_MODE="ReadWriteMany"
+  elif oc get sc lvms-nvme-vg &>/dev/null; then
+    STORAGE_CLASS="lvms-nvme-vg"
+    ACCESS_MODE="ReadWriteOnce"
+  elif oc get sc nfs-rwx &>/dev/null; then
+    STORAGE_CLASS="nfs-rwx"
+    ACCESS_MODE="ReadWriteMany"
+  else
+    echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)" >&2
+    exit 1
+  fi
+  echo "Detected storage class: $STORAGE_CLASS ($ACCESS_MODE)"
+
+  # Quote every expansion so an unusual username cannot be word-split or globbed;
+  # stop here if the install fails instead of carrying on silently.
+  if ! helm install "$USERNAME-dev" devcontainers/nix/chart \
+    --namespace "$USERNAME" \
+    --set "username=$USERNAME" \
+    --set "storage.home.storageClass=$STORAGE_CLASS" \
+    --set "storage.home.accessMode=$ACCESS_MODE" \
+    --set "storage.nixCache.storageClass=$STORAGE_CLASS" \
+    --set "storage.nixCache.accessMode=$ACCESS_MODE" \
+    --set imagePullSecret=rh-ee-sampark-dev-bot-pull-secret; then
+    echo "ERROR: helm install failed for release $USERNAME-dev" >&2
+    exit 1
+  fi
+fi
 
 oc project $USERNAME
diff --git a/devcontainers/nix/README.md b/devcontainers/nix/README.md
@@ -0,0 +1,242 @@
+# Nix Dev Environment
+
+Helm chart that deploys a nix-managed GPU dev environment on OpenShift.
+A minimal Fedora + Nix container image is used -- all development tools
+(CUDA, PyTorch build deps, shell, editor, etc.) are installed via
+home-manager at runtime. Your home directory and nix binary cache persist
+across pod restarts.
+
+## Prerequisites
+
+- Namespace provisioned by an admin (`create_dev_admin.sh --track=nix`, which
+  skips the workspace PVC -- the Helm chart manages its own storage)
+- `helm` CLI installed
+- `oc` CLI installed and authenticated (`oc login`)
+- SSH key registered on your GitHub account
+  (https://github.com/settings/keys)
+- GCloud application default credentials for Vertex AI
+
+## Setup
+
+### 1. Create secrets
+
+Run the existing user setup script from the repo root. It will prompt for
+your username, SSH key path, and gcloud credentials path:
+
+```bash
+./create_dev_user.sh --track=nix
+```
+
+This creates the `<username>-git-ssh-key` and `<username>-gcloud-config`
+secrets in your namespace. With `--track=nix` the script then runs the
+`helm install` from step 2 itself, using an auto-detected storage class.
+
+**Important:** The SSH key must be the one registered on your GitHub account.
+Verify with `ssh -T git@github.com` -- it should print
+`Hi <your-username>! You've successfully authenticated`. If your SSH agent
+uses a different key than the key file you provide, the settings repo clone
+will fail. See [Troubleshooting](#ssh-key-rejected-during-init) below.
+
+### 2. Deploy
+
+```bash
+helm install <username>-dev ./devcontainers/nix/chart \
+  --set username=<username> \
+  -n <username>
+```
+
+This creates a deployment (paused at 0 replicas) and two persistent volumes
+(100Gi home directory + 50Gi nix binary cache).
+
+To use a personal settings repo:
+
+```bash
+helm install <username>-dev ./devcontainers/nix/chart \
+  --set username=<username> \
+  --set nix.settings.repo=git@github.com:<user>/my-settings.git \
+  --set nix.settings.profile=default \
+  -n <username>
+```
+
+### 3. Start the pod
+
+```bash
+oc scale deployment <username>-dev -n <username> --replicas=1
+```
+
+The pod goes through two init stages before it's ready:
+
+```
+Init:0/2   -- Seeding /nix from the container image (~2s)
+Init:1/2   -- Cloning settings + running home-manager switch
+               First boot: ~5-10 min (downloads packages from cache.nixos.org)
+               Subsequent boots: ~30s (uses persistent nix binary cache)
+Running    -- Dev environment ready
+```
+
+Watch progress:
+
+```bash
+oc get pods -n <username> -w
+```
+
+### 4. Connect
+
+**Terminal:**
+
+```bash
+oc exec -it deployment/<username>-dev -n <username> -- zsh
+```
+
+**SSH (for VS Code Remote, port forwarding, etc.):**
+
+```bash
+oc port-forward deployment/<username>-dev -n <username> 2222:22
+# Then in another terminal, or in VS Code SSH config:
+ssh -p 2222 root@localhost
+```
+
+## GPU profiles
+
+The default GPU is MIG 2g.35gb. To use a different GPU, pass `--set` flags
+during `helm install` or `helm upgrade`:
+
+| GPU | Flags | Use case |
+|---|---|---|
+| MIG 2g.35gb | *(default, no flags needed)* | Standard development |
+| MIG 1g.18gb | `--set gpu.type=nvidia.com/mig-1g.18gb --set gpu.runtimeClassName=nvidia-cdi` | Smaller workloads, testing |
+| MIG 1g.18gb x2 | `--set gpu.type=nvidia.com/mig-1g.18gb --set gpu.count=2 --set gpu.runtimeClassName=nvidia-cdi` | Parallel workloads |
+| Full GPU | `--set gpu.type=nvidia.com/gpu` | Performance benchmarking |
+| CPU only | `--set gpu.enabled=false` | Builds, code review |
+
+Or use the bundled profile files as a shorthand:
+
+```bash
+helm install <username>-dev ./devcontainers/nix/chart \
+  --set username=<username> \
+  -f devcontainers/nix/chart/profiles/mig-18g.yaml \
+  -n <username>
+```
+
+### Switching profiles
+
+Scale down, upgrade, scale back up:
+
+```bash
+oc scale deployment <username>-dev -n <username> --replicas=0
+helm upgrade <username>-dev ./devcontainers/nix/chart \
+  --set username=<username> \
+  --set gpu.type=nvidia.com/mig-1g.18gb \
+  --set gpu.runtimeClassName=nvidia-cdi \
+  -n <username>
+oc scale deployment <username>-dev -n <username> --replicas=1
+```
+
+## Customization
+
+The dev environment is configured through a `settings.nix` file. If you
+provided a settings repo during deploy, it was cloned to
+`~/workspace/settings`. To modify your environment after connecting:
+
+```bash
+vim ~/workspace/settings/settings.nix
+torched apply
+```
+
+If you deployed without a settings repo, a default template was initialized.
+See [torched-devcontainer](https://github.com/hinriksnaer/torched-devcontainer)
+for the template format and available options.
+
+## Teardown
+
+Remove the deployment but keep your data (home directory + nix cache):
+
+```bash
+helm uninstall <username>-dev -n <username>
+```
+
+The PVCs are preserved -- a future `helm install` will reuse them and boot
+quickly from the cached nix store.
+
+To delete everything including persistent data:
+
+```bash
+helm uninstall <username>-dev -n <username>
+oc delete pvc home-<username> nix-store-<username> -n <username>
+```
+
+## Troubleshooting
+
+### SSH key rejected during init
+
+```
+git@github.com: Permission denied (publickey).
+```
+
+The SSH key in the secret doesn't match what's registered on GitHub. Check
+which key your local SSH actually uses:
+
+```bash
+ssh -vT git@github.com 2>&1 | grep "Offering public key"
+```
+
+Compare the fingerprint with what's in the secret:
+
+```bash
+oc exec deployment/<username>-dev -n <username> -- \
+  ssh-keygen -l -f /root/.ssh-keys/id_ed25519
+```
+
+If they don't match, recreate the secret with the correct key:
+
+```bash
+oc delete secret <username>-git-ssh-key -n <username>
+oc create secret generic <username>-git-ssh-key \
+  --namespace=<username> \
+  --from-file=ssh-privatekey=$HOME/.ssh/<correct-key> \
+  --from-file=ssh-publickey=$HOME/.ssh/<correct-key>.pub \
+  --from-file=known_hosts=<(ssh-keyscan github.com 2>/dev/null)
+```
+
+Then restart the pod: `oc scale` to 0 then back to 1.
+
+### Init-config takes a long time
+
+First boot downloads ~250MB of nix packages from cache.nixos.org. This is
+normal and takes 5-10 minutes. Subsequent boots use the persistent nix
+binary cache and complete in ~30 seconds.
+
+Check progress:
+
+```bash
+oc logs deployment/<username>-dev -c init-config -n <username> -f
+```
+
+### Quota exceeded
+
+```
+exceeded quota: gpu-quota
+```
+
+The nix track creates 150Gi of PVCs (100Gi home + 50Gi nix cache). If the
+admin script already created a 250Gi workspace PVC, the total may exceed
+the 300Gi storage quota. Delete the unused workspace PVC:
+
+```bash
+oc delete pvc pytorch-ibmc-storage-<username> -n <username>
+```
+
+### Pod stuck in CrashLoopBackOff
+
+Check the main container logs:
+
+```bash
+oc logs deployment/<username>-dev -c dev -n <username>
+```
+
+### Checking init container logs
+
+```bash
+oc logs deployment/<username>-dev -c init-nix -n <username>
+oc logs deployment/<username>-dev -c init-config -n <username>
+```