Skip to content

Commit 00f0573

Browse files
committed
docs: add slurm page
1 parent b7a59b8 commit 00f0573

File tree

6 files changed

+172
-78
lines changed

6 files changed

+172
-78
lines changed

docs/src/.vitepress/config.mts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ export default defineConfig({
131131
text: 'Advanced',
132132
items: [
133133
{ text: 'Tuning', link: '/tuning' },
134+
{ text: 'Slurm', link: '/slurm' },
134135
{ text: 'Backend', link: '/backend' },
135136
]
136137
}

docs/src/slurm.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Slurm (Multi-node)
2+
3+
PySR supports running across multiple nodes on Slurm via `cluster_manager="slurm"`.
4+
This backend is **allocation-based**: you request resources with Slurm (`sbatch`/`salloc`), then PySR launches Julia workers inside that allocation (using `SlurmClusterManager.jl`).
5+
6+
Here is a minimal `sbatch` example using 3 workers on each of 2 nodes (6 workers total).
7+
8+
Save this as `pysr_job.sh`:
9+
10+
```bash
11+
#!/bin/bash
12+
#SBATCH --job-name=pysr
13+
#SBATCH --partition=normal
14+
#SBATCH --nodes=2
15+
#SBATCH --ntasks-per-node=3
16+
#SBATCH --time=01:00:00
17+
18+
set -euo pipefail
19+
python pysr_script.py
20+
```
21+
22+
Save this as `pysr_script.py` in the same directory:
23+
24+
```python
25+
import numpy as np
26+
from pysr import PySRRegressor
27+
28+
X = np.random.RandomState(0).randn(1000, 2)
29+
y = X[:, 0] + 2 * X[:, 1]
30+
31+
model = PySRRegressor(
32+
niterations=200,
33+
populations=2,
34+
parallelism="multiprocessing",
35+
cluster_manager="slurm",
36+
procs=6, # must match the Slurm allocation's total task count
37+
)
38+
model.fit(X, y)
39+
print(model)
40+
```
41+
42+
Submit it with:
43+
44+
```bash
45+
sbatch pysr_job.sh
46+
```
47+
48+
## Notes
49+
50+
- `procs` is the number of Julia worker processes. It must match the total number of tasks in the Slurm allocation (i.e., `--ntasks`, or `--nodes` multiplied by `--ntasks-per-node`).
51+
- Run the Python script once (as the master) inside the allocation; do not wrap it in `srun`, since the worker launcher manages task placement itself and an outer `srun` would start duplicate copies of the script.

docs/src/tuning.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ First, my general tips would be to avoid using redundant operators, like how `po
66

77
When running PySR, I usually do the following:
88

9-
I run from IPython (Jupyter Notebooks don't work as well[^1]) on the head node of a slurm cluster. Passing `cluster_manager="slurm"` will make PySR set up a run over the entire allocation. I set `procs` equal to the total number of cores over my entire allocation.
9+
I run from IPython (Jupyter Notebooks don't work as well[^1]) on the head node of a Slurm cluster. Passing `cluster_manager="slurm"` will make PySR set up a run over the entire allocation. I set `procs` equal to the total number of tasks across my entire allocation (see the [Slurm page](slurm.md) for a complete multi-node example).
1010

1111
I use the [tensorboard feature](https://ai.damtp.cam.ac.uk/pysr/examples/#12-using-tensorboard-for-logging) for experiment tracking.
1212

pysr/test/slurm_docker_cluster/config/slurm.conf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,6 @@ SlurmctldLogFile=/var/log/slurm/slurmctld.log
3737
SlurmdDebug=info
3838
SlurmdLogFile=/var/log/slurm/slurmd.log
3939

40-
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
41-
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
40+
NodeName=c1 CPUs=4 RealMemory=1000 State=UNKNOWN
41+
NodeName=c2 CPUs=4 RealMemory=1000 State=UNKNOWN
4242
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP

pysr/test/slurm_docker_cluster/docker-compose.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ services:
1717
command: ["slurmd"]
1818
hostname: c1
1919
working_dir: /data
20-
cpus: 2
20+
cpus: 4
2121
privileged: true
2222
volumes:
2323
- etc_munge:/etc/munge
@@ -32,7 +32,7 @@ services:
3232
command: ["slurmd"]
3333
hostname: c2
3434
working_dir: /data
35-
cpus: 2
35+
cpus: 4
3636
privileged: true
3737
volumes:
3838
- etc_munge:/etc/munge

pysr/test/test_slurm.py

Lines changed: 115 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -330,82 +330,124 @@ def _wait_for_cluster_ready(self, *, timeout_s: int):
330330
time.sleep(2)
331331

332332
def test_pysr_slurm_cluster_manager(self):
333-
slurm_job = self.data_dir / "pysr_slurm_job.sh"
334-
slurm_job.write_text(
335-
"\n".join(
336-
[
337-
"#!/bin/bash",
338-
"#SBATCH --job-name=pysr-slurm-test",
339-
"#SBATCH --partition=normal",
340-
"#SBATCH --nodes=2",
341-
"#SBATCH --ntasks-per-node=1",
342-
"#SBATCH --time=40:00",
343-
"set -euo pipefail",
344-
'jobid="${SLURM_JOB_ID:-${SLURM_JOBID:-}}"',
345-
'if [ -z "$jobid" ]; then echo "Missing SLURM_JOB_ID/SLURM_JOBID" >&2; exit 1; fi',
346-
"monitor_steps() {",
347-
" while true; do",
348-
" echo PYSR_SCONTROL_STEP_SAMPLE",
349-
' scontrol show step "$jobid" -o 2>/dev/null || true',
350-
" sleep 1",
351-
" done",
352-
"}",
353-
"monitor_steps &",
354-
"MONITOR_PID=$!",
355-
"trap 'kill $MONITOR_PID 2>/dev/null || true' EXIT",
356-
"python3 - <<'PY'",
357-
"import os",
358-
"os.environ['JULIA_DEBUG'] = 'SlurmClusterManager'",
359-
"import numpy as np",
360-
"from pysr import PySRRegressor",
361-
"X = np.random.RandomState(0).randn(30, 2)",
362-
"y = X[:, 0] + 1.0",
363-
"model = PySRRegressor(",
364-
" niterations=2,",
365-
" populations=2,",
366-
" progress=False,",
367-
" temp_equation_file=True,",
368-
" parallelism='multiprocessing',",
369-
" procs=2,",
370-
" cluster_manager='slurm',",
371-
" verbosity=0,",
372-
")",
373-
"model.fit(X, y)",
374-
"print('PYSR_SLURM_OK:slurm')",
375-
"PY",
376-
]
333+
def _assert_worker_distribution(
334+
output: str,
335+
*,
336+
expected_procs: int,
337+
expected_nodes: int,
338+
expected_per_node: int,
339+
) -> None:
340+
worker_hosts = re.findall(
341+
r"Worker \d+ ready on host ([^,]+), port \d+",
342+
output,
343+
)
344+
self.assertEqual(
345+
len(worker_hosts),
346+
expected_procs,
347+
msg=f"Expected {expected_procs} workers.\n\n{output}",
348+
)
349+
counts: dict[str, int] = {}
350+
for host in worker_hosts:
351+
counts[host] = counts.get(host, 0) + 1
352+
self.assertEqual(
353+
len(counts),
354+
expected_nodes,
355+
msg=f"Expected workers on {expected_nodes} nodes.\n\n{output}",
356+
)
357+
self.assertTrue(
358+
all(v == expected_per_node for v in counts.values()),
359+
msg=f"Expected exactly {expected_per_node} workers per node.\n\n{output}",
377360
)
378-
+ "\n"
379-
)
380-
slurm_job.chmod(0o755)
381-
382-
slurm_output = self._run_sbatch(slurm_job)
383-
self.assertIn("PYSR_SLURM_OK:slurm", slurm_output)
384-
self._assert_scontrol_step_usage(
385-
slurm_output,
386-
expected_tasks=2,
387-
expected_nodes=2,
388-
expected_nodelist={"c1,c2", "c[1-2]"},
389-
label="slurm",
390-
)
391-
self.assertEqual(
392-
len(re.findall(r"^PYSR_SLURM_OK:slurm$", slurm_output, flags=re.MULTILINE)),
393-
1,
394-
msg=f"Expected slurm marker exactly once.\n\n{slurm_output}",
395-
)
396361

397-
self.assertEqual(
398-
len(
399-
re.findall(
400-
r"^\[ Info: Starting SLURM job .*", slurm_output, re.MULTILINE
362+
def _run_case(*, ntasks_per_node: int, procs: int, seed: int) -> str:
363+
marker = f"PYSR_SLURM_OK:slurm:{procs}"
364+
job = self.data_dir / f"pysr_slurm_job_{procs}.sh"
365+
job.write_text(
366+
"\n".join(
367+
[
368+
"#!/bin/bash",
369+
f"#SBATCH --job-name=pysr-slurm-test-{procs}",
370+
"#SBATCH --partition=normal",
371+
"#SBATCH --nodes=2",
372+
f"#SBATCH --ntasks-per-node={ntasks_per_node}",
373+
"#SBATCH --time=40:00",
374+
"set -euo pipefail",
375+
'jobid="${SLURM_JOB_ID:-${SLURM_JOBID:-}}"',
376+
'if [ -z "$jobid" ]; then echo "Missing SLURM_JOB_ID/SLURM_JOBID" >&2; exit 1; fi',
377+
"monitor_steps() {",
378+
" while true; do",
379+
" echo PYSR_SCONTROL_STEP_SAMPLE",
380+
' scontrol show step "$jobid" -o 2>/dev/null || true',
381+
" sleep 1",
382+
" done",
383+
"}",
384+
"monitor_steps &",
385+
"MONITOR_PID=$!",
386+
"trap 'kill $MONITOR_PID 2>/dev/null || true' EXIT",
387+
"python3 - <<'PY'",
388+
"import os",
389+
"os.environ['JULIA_DEBUG'] = 'SlurmClusterManager'",
390+
"import numpy as np",
391+
"from pysr import PySRRegressor",
392+
f"X = np.random.RandomState({seed}).randn(30, 2)",
393+
"y = X[:, 0] + 1.0",
394+
"model = PySRRegressor(",
395+
" niterations=2,",
396+
" populations=2,",
397+
" progress=False,",
398+
" temp_equation_file=True,",
399+
" parallelism='multiprocessing',",
400+
f" procs={procs},",
401+
" cluster_manager='slurm',",
402+
" verbosity=0,",
403+
")",
404+
"model.fit(X, y)",
405+
f"print('{marker}')",
406+
"PY",
407+
]
401408
)
402-
),
403-
0,
404-
msg=(
405-
"Expected Slurm backend to use SlurmClusterManager (allocation-based), "
406-
"not ClusterManagers.\n\n" + slurm_output
407-
),
408-
)
409+
+ "\n"
410+
)
411+
job.chmod(0o755)
412+
413+
output = self._run_sbatch(job)
414+
self.assertIn(marker, output)
415+
self.assertEqual(
416+
len(re.findall(rf"^{re.escape(marker)}$", output, flags=re.MULTILINE)),
417+
1,
418+
msg=f"Expected marker exactly once.\n\n{output}",
419+
)
420+
self.assertEqual(
421+
len(
422+
re.findall(r"^\[ Info: Starting SLURM job .*", output, re.MULTILINE)
423+
),
424+
0,
425+
msg=(
426+
"Expected Slurm backend to use SlurmClusterManager (allocation-based), "
427+
"not ClusterManagers.\n\n" + output
428+
),
429+
)
430+
return output
431+
432+
cases = [
433+
dict(ntasks_per_node=1, procs=2, seed=0),
434+
dict(ntasks_per_node=3, procs=6, seed=2),
435+
]
436+
for case in cases:
437+
output = _run_case(**case)
438+
self._assert_scontrol_step_usage(
439+
output,
440+
expected_tasks=case["procs"],
441+
expected_nodes=2,
442+
expected_nodelist={"c1,c2", "c[1-2]"},
443+
label=f"slurm({case['procs']})",
444+
)
445+
_assert_worker_distribution(
446+
output,
447+
expected_procs=case["procs"],
448+
expected_nodes=2,
449+
expected_per_node=case["ntasks_per_node"],
450+
)
409451

410452

411453
def runtests(just_tests=False):

0 commit comments

Comments
 (0)