Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 44 additions & 22 deletions create_dev_admin.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
#!/bin/bash

# Resolve the provisioning track from the command line (default: standard).
# Only the --track=<name> form is recognised; if it is passed more than once
# the last occurrence wins, and every other argument is ignored here.
TRACK="standard"
for arg in "$@"; do
  if [[ "$arg" == --track=* ]]; then
    TRACK="${arg#*=}"
  fi
done

# Abort early on anything other than the two supported tracks.
case "$TRACK" in
  standard | nix) ;;
  *)
    echo "ERROR: unsupported track '$TRACK' -- must be 'standard' or 'nix'"
    exit 1
    ;;
esac

echo "Track: $TRACK"
echo ""
echo "Select auth type:"
echo " 1) IBM Cloud IAM"
echo " 2) GitHub"
Expand Down Expand Up @@ -45,40 +60,47 @@ oc adm policy add-role-to-user edit "$IDENTITY" -n $USERNAME
# create RBAC for the user
oc apply -f <(sed -e "s/<username>/$USERNAME/g" -e "s/<email>/$IDENTITY/g" rbac.yml)

# create PVC for the user (auto-detect storage class)
if oc get sc ocs-storagecluster-cephfs &>/dev/null; then
oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/persistent-workspace-pvc.yml)
CREDS_SOURCE_NS="openshift-storage"
elif oc get sc lvms-nvme-vg &>/dev/null; then
oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/lvms-user-pvc.yml)
CREDS_SOURCE_NS="nfs-server"
elif oc get sc nfs-rwx &>/dev/null; then
oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/pytorch-nfs-rwx-pvc.yml)
CREDS_SOURCE_NS="nfs-server"
else
echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)"
exit 1
fi
# -- Track-specific resources --
# Nix track: PVCs and deployments are managed by the Helm chart; skip them here.
# Standard track: create workspace PVC and tooling ConfigMaps as before.
if [ "$TRACK" = "standard" ]; then
# create PVC for the user (auto-detect storage class)
if oc get sc ocs-storagecluster-cephfs &>/dev/null; then
oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/persistent-workspace-pvc.yml)
CREDS_SOURCE_NS="openshift-storage"
elif oc get sc lvms-nvme-vg &>/dev/null; then
oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/lvms-user-pvc.yml)
CREDS_SOURCE_NS="nfs-server"
elif oc get sc nfs-rwx &>/dev/null; then
oc apply -f <(sed "s/<username>/$USERNAME/g" pvc/pytorch-nfs-rwx-pvc.yml)
CREDS_SOURCE_NS="nfs-server"
else
echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)"
exit 1
fi

# create configmaps for bazel and gdbinit
oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/bazel-configmap.yml)
oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/gdbinit-configmap.yml)

# copy COS backup credentials to user namespace
if oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" &>/dev/null; then
oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" -o json | \
python3 -c "
# copy COS backup credentials to user namespace
if [ -n "$CREDS_SOURCE_NS" ] && oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" &>/dev/null; then
oc get secret cos-backup-creds -n "$CREDS_SOURCE_NS" -o json | \
python3 -c "
import sys, json
s = json.load(sys.stdin)
s['metadata']['namespace'] = '$USERNAME'
for k in ['uid', 'resourceVersion', 'creationTimestamp', 'managedFields']:
s['metadata'].pop(k, None)
json.dump(s, sys.stdout)
" | oc apply -f -
fi
else
echo "Skipping workspace PVC and ConfigMaps (managed by Helm chart for track: $TRACK)"
fi

# push quay image secret to pull image from quay
oc apply -f <(sed "s/<username>/$USERNAME/g" rh-ee-sampark-dev-bot-secret.yml)

# create configmaps for bazel and gdbinit
oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/bazel-configmap.yml)
oc apply -f <(sed "s/<username>/$USERNAME/g" config_map/gdbinit-configmap.yml)

# create resourcequotas
oc apply -f <(sed "s/<username>/$USERNAME/g" resourcequotas.yml)
48 changes: 43 additions & 5 deletions create_dev_user.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
#!/bin/bash

# Parse and validate the --track=<name> flag (default: standard).
#
# Globals:   TRACK (written) - "standard" or "nix"
# Arguments: the script's command-line arguments; only --track=<name> is
#            inspected, and the last occurrence wins
# Returns:   exits the script with status 1 on an unsupported track
#
# Validation matters because the track-specific section further down treats
# every value other than "standard" as the Helm (nix) path, so a misspelled
# track would otherwise trigger a Helm install instead of an error.
parse_track() {
  local arg
  TRACK="standard"
  for arg in "$@"; do
    case "$arg" in
      --track=*) TRACK="${arg#*=}" ;;
    esac
  done

  if [[ "$TRACK" != "standard" && "$TRACK" != "nix" ]]; then
    echo "ERROR: unsupported track '$TRACK' -- must be 'standard' or 'nix'" >&2
    exit 1
  fi
}
parse_track "$@"

read -p "Enter openshift username: " USERNAME
read -e -p "Enter ssh private key path for github: " SSH_KEY_PATH
read -e -p "Enter gcloud application default credentials path: " GCLOUD_CREDENTIALS
Expand All @@ -16,10 +24,40 @@ oc create secret generic $USERNAME-gcloud-config \
--namespace=$USERNAME \
--from-file=$GCLOUD_CREDENTIALS

# create deployment for the user
oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-18g.yml)
oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-35g.yml)
oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-10g-rdma.yml)
oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-20g-rdma.yml)
# -- Track-specific resources --
# Standard track: apply the four static deployment manifests, substituting the
# <username> placeholder in each.
# Any other track (nix): pick the first available storage class, using the same
# detection order as create_dev_admin.sh, and deploy through the Helm chart.
if [[ "$TRACK" == "standard" ]]; then
  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-18g.yml)
  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-35g.yml)
  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-10g-rdma.yml)
  oc apply -f <(sed "s/<username>/$USERNAME/g" deployment/deployment-mig-20g-rdma.yml)
else
  # Probe storage classes in priority order; the access mode handed to the
  # chart depends on which class is found.
  if oc get sc ocs-storagecluster-cephfs &>/dev/null; then
    STORAGE_CLASS="ocs-storagecluster-cephfs"
    ACCESS_MODE="ReadWriteMany"
  elif oc get sc lvms-nvme-vg &>/dev/null; then
    STORAGE_CLASS="lvms-nvme-vg"
    ACCESS_MODE="ReadWriteOnce"
  elif oc get sc nfs-rwx &>/dev/null; then
    STORAGE_CLASS="nfs-rwx"
    ACCESS_MODE="ReadWriteMany"
  else
    echo "ERROR: No supported storage class found (ocs-storagecluster-cephfs, lvms-nvme-vg, or nfs-rwx)"
    exit 1
  fi
  echo "Detected storage class: $STORAGE_CLASS ($ACCESS_MODE)"

  # Every expansion is quoted so that a value containing whitespace or glob
  # characters reaches helm as a single argument instead of being word-split.
  helm install "$USERNAME-dev" devcontainers/nix/chart \
    --namespace "$USERNAME" \
    --set "username=$USERNAME" \
    --set "storage.home.storageClass=$STORAGE_CLASS" \
    --set "storage.home.accessMode=$ACCESS_MODE" \
    --set "storage.nixCache.storageClass=$STORAGE_CLASS" \
    --set "storage.nixCache.accessMode=$ACCESS_MODE" \
    --set imagePullSecret=rh-ee-sampark-dev-bot-pull-secret
fi

oc project $USERNAME
242 changes: 242 additions & 0 deletions devcontainers/nix/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
# Nix Dev Environment

Helm chart that deploys a nix-managed GPU dev environment on OpenShift.
A minimal Fedora + Nix container image is used -- all development tools
(CUDA, PyTorch build deps, shell, editor, etc.) are installed via
home-manager at runtime. Your home directory and nix binary cache persist
across pod restarts.

## Prerequisites

- Namespace provisioned by an admin (`./create_dev_admin.sh --track=nix`, which
skips the workspace PVC and ConfigMap creation -- the Helm chart manages its
own storage)
- `helm` CLI installed
- `oc` CLI installed and authenticated (`oc login`)
- SSH key registered on your GitHub account
(https://github.com/settings/keys)
- GCloud application default credentials for Vertex AI

## Setup

### 1. Create secrets

Run the existing user setup script from the repo root. It will prompt for
your username, SSH key path, and gcloud credentials path:

```bash
./create_dev_user.sh
```

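This creates the `<username>-git-ssh-key` and `<username>-gcloud-config`
secrets in your namespace. Note that the script does not stop after creating
the secrets: by default it also applies the standard-track deployment YAMLs,
which the nix track does not need. Passing `--track=nix` makes it auto-detect
the storage class and run `helm install` itself instead, in which case step 2
below is already done.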

**Important:** The SSH key must be the one registered on your GitHub account.
Verify with `ssh -T git@github.com` -- it should print
`Hi <your-username>! You've successfully authenticated`. If your SSH agent
uses a different key than the key file you provide, the settings repo clone
will fail. See [Troubleshooting](#ssh-key-rejected-during-init) below.

### 2. Deploy

```bash
helm install <username>-dev ./devcontainers/nix/chart \
--set username=<username> \
-n <username>
```

This creates a deployment (paused at 0 replicas) and two persistent volume
claims (100Gi home directory + 50Gi nix binary cache).

To use a personal settings repo:

```bash
helm install <username>-dev ./devcontainers/nix/chart \
--set username=<username> \
--set nix.settings.repo=git@github.com:<user>/my-settings.git \
--set nix.settings.profile=default \
-n <username>
```

### 3. Start the pod

```bash
oc scale deployment <username>-dev -n <username> --replicas=1
```

The pod goes through two init stages before it's ready:

```
Init:0/2 -- Seeding /nix from the container image (~2s)
Init:1/2 -- Cloning settings + running home-manager switch
First boot: ~5-10 min (downloads packages from cache.nixos.org)
Subsequent boots: ~30s (uses persistent nix binary cache)
Running -- Dev environment ready
```

Watch progress:

```bash
oc get pods -n <username> -w
```

### 4. Connect

**Terminal:**

```bash
oc exec -it deployment/<username>-dev -n <username> -- zsh
```

**SSH (for VS Code Remote, port forwarding, etc.):**

```bash
oc port-forward deployment/<username>-dev -n <username> 2222:22
# Then in another terminal, or in VS Code SSH config:
ssh -p 2222 root@localhost
```

## GPU profiles

The default GPU is MIG 2g.35gb. To use a different GPU, pass `--set` flags
during `helm install` or `helm upgrade`:

| GPU | Flags | Use case |
|---|---|---|
| MIG 2g.35gb | *(default, no flags needed)* | Standard development |
| MIG 1g.18gb | `--set gpu.type=nvidia.com/mig-1g.18gb --set gpu.runtimeClassName=nvidia-cdi` | Smaller workloads, testing |
| MIG 1g.18gb x2 | `--set gpu.type=nvidia.com/mig-1g.18gb --set gpu.count=2 --set gpu.runtimeClassName=nvidia-cdi` | Parallel workloads |
| Full GPU | `--set gpu.type=nvidia.com/gpu` | Performance benchmarking |
| CPU only | `--set gpu.enabled=false` | Builds, code review |

Or use the bundled profile files as a shorthand:

```bash
helm install <username>-dev ./devcontainers/nix/chart \
--set username=<username> \
-f devcontainers/nix/chart/profiles/mig-18g.yaml \
-n <username>
```

### Switching profiles

Scale down, upgrade, scale back up:

```bash
oc scale deployment <username>-dev -n <username> --replicas=0
helm upgrade <username>-dev ./devcontainers/nix/chart \
--set username=<username> \
--set gpu.type=nvidia.com/mig-1g.18gb \
--set gpu.runtimeClassName=nvidia-cdi \
-n <username>
oc scale deployment <username>-dev -n <username> --replicas=1
```

## Customization

The dev environment is configured through a `settings.nix` file. If you
provided a settings repo during deploy, it was cloned to
`~/workspace/settings`. To modify your environment after connecting:

```bash
vim ~/workspace/settings/settings.nix
torched apply
```

If you deployed without a settings repo, a default template was initialized.
See [torched-devcontainer](https://github.com/hinriksnaer/torched-devcontainer)
for the template format and available options.

## Teardown

Remove the deployment but keep your data (home directory + nix cache):

```bash
helm uninstall <username>-dev -n <username>
```

The PVCs are preserved -- a future `helm install` will reuse them and boot
quickly from the cached nix store.

To delete everything including persistent data:

```bash
helm uninstall <username>-dev -n <username>
oc delete pvc home-<username> nix-store-<username> -n <username>
```

## Troubleshooting

### SSH key rejected during init

```
git@github.com: Permission denied (publickey).
```

The SSH key in the secret doesn't match what's registered on GitHub. Check
which key your local SSH actually uses:

```bash
ssh -vT git@github.com 2>&1 | grep "Offering public key"
```

Compare the fingerprint with what's in the secret:

```bash
oc exec deployment/<username>-dev -n <username> -- \
ssh-keygen -l -f /root/.ssh-keys/id_ed25519
```

If they don't match, recreate the secret with the correct key:

```bash
oc delete secret <username>-git-ssh-key -n <username>
oc create secret generic <username>-git-ssh-key \
--namespace=<username> \
--from-file=ssh-privatekey=$HOME/.ssh/<correct-key> \
--from-file=ssh-publickey=$HOME/.ssh/<correct-key>.pub \
--from-file=known_hosts=<(ssh-keyscan github.com 2>/dev/null)
```

Then restart the pod: run
`oc scale deployment <username>-dev -n <username> --replicas=0`, then the same
command with `--replicas=1`.

### Init-config takes a long time

First boot downloads ~250MB of nix packages from cache.nixos.org. This is
normal and takes 5-10 minutes. Subsequent boots use the persistent nix
binary cache and complete in ~30 seconds.

Check progress:

```bash
oc logs deployment/<username>-dev -c init-config -n <username> -f
```

### Quota exceeded

```
exceeded quota: gpu-quota
```

The nix track creates 150Gi of PVCs (100Gi home + 50Gi nix cache). If the
admin script already created a 250Gi workspace PVC, the total may exceed
the 300Gi storage quota. Delete the unused workspace PVC:

```bash
oc delete pvc pytorch-ibmc-storage-<username> -n <username>
```

### Pod stuck in CrashLoopBackOff

Check the main container logs:

```bash
oc logs deployment/<username>-dev -c dev -n <username>
```

### Checking init container logs

```bash
oc logs deployment/<username>-dev -c init-nix -n <username>
oc logs deployment/<username>-dev -c init-config -n <username>
```
Loading