VectorInstitute · aravind-3105 · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026
diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
@@ -58,3 +58,4 @@ jobs:
           ignore-vulns: |
             PYSEC-2025-49
             CVE-2024-6345
+            CVE-2026-4539
diff --git a/implementations/preference_alignment/01_intoduction_and_dataset_construction.ipynb b/implementations/preference_alignment/01_intoduction_and_dataset_construction.ipynb
@@ -98,21 +98,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "No .parquet files found in /home/coder/interpretability-llms-agents/implementations/preference_alignment/data_sky. Please download or generate the filtered dataset and place it in this folder.",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 34\u001b[39m\n\u001b[32m     31\u001b[39m parquet_files = \u001b[38;5;28mlist\u001b[39m(DATA_DIR.glob(\u001b[33m\"\u001b[39m\u001b[33m*.parquet\u001b[39m\u001b[33m\"\u001b[39m))\n\u001b[32m     33\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m parquet_files:\n\u001b[32m---> \u001b[39m\u001b[32m34\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m     35\u001b[39m         \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNo .parquet files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     36\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33mPlease download or generate the filtered dataset and place it in this folder.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     37\u001b[39m     )\n\u001b[32m     39\u001b[39m PARQUET_PATH = parquet_files[\u001b[32m0\u001b[39m]\n\u001b[32m     41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mUsing dataset file: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPARQUET_PATH\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
-      "\u001b[31mFileNotFoundError\u001b[39m: No .parquet files found in /home/coder/interpretability-llms-agents/implementations/preference_alignment/data_sky. Please download or generate the filtered dataset and place it in this folder."
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from pathlib import Path\n",
     "\n",

diff --git a/implementations/preference_alignment/02_inference_runner.ipynb b/implementations/preference_alignment/02_inference_runner.ipynb
@@ -36,21 +36,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "/home/coder/interpretability-llms-agents/implementations/preference_alignment/output_data_sky not found. Please run 01_dataset_construction.ipynb first.",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
-      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
-      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 33\u001b[39m\n\u001b[32m     30\u001b[39m PROCESSED_DATASET_DIR = REPO_ROOT / \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33moutput_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_FOLDER_NAME\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m     32\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m PROCESSED_DATASET_DIR.exists():\n\u001b[32m---> \u001b[39m\u001b[32m33\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPROCESSED_DATASET_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not found. Please run 01_dataset_construction.ipynb first.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m     35\u001b[39m \u001b[38;5;66;03m# HuggingFace Arrow Dataset Path (auto-detect)\u001b[39;00m\n\u001b[32m     36\u001b[39m train_dir = PROCESSED_DATASET_DIR / \u001b[33m\"\u001b[39m\u001b[33mtrain\u001b[39m\u001b[33m\"\u001b[39m\n",
-      "\u001b[31mFileNotFoundError\u001b[39m: /home/coder/interpretability-llms-agents/implementations/preference_alignment/output_data_sky not found. Please run 01_dataset_construction.ipynb first."
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import contextlib\n",
     "import os\n",
@@ -132,7 +120,7 @@
     "    MODEL_ID,\n",
     "    device_map=\"auto\",\n",
     "    torch_dtype=torch.bfloat16,\n",
-    "    attn_implementation=\"flash_attention_2\",\n",
+    "    attn_implementation=\"sdpa\",  # Choose \"flash_attention_2\" or \"sdpa\" based on your GPU capabilities\n",
     ").eval()\n",
     "\n",
     "print(\"Model loaded:\", MODEL_ID)\n",
@@ -165,7 +153,7 @@
    "outputs": [],
    "source": [
     "if MODE == \"best_of_n\":\n",
-    "    records = load_disk_records(train_dir, limit=200)\n",
+    "    records = load_disk_records(train_dir, limit=20)  # Change to 200 for larger runs\n",
     "\n",
     "    run_best_of_n(\n",
     "        records=records,\n",
@@ -230,7 +218,7 @@
     "        ],\n",
     "    }\n",
     "\n",
-    "    raw = load_arrow_records(str(ARROW_DATASET_PATH), limit=200)\n",
+    "    raw = load_arrow_records(str(ARROW_DATASET_PATH), limit=20)  # Change to 200 for larger runs\n",
     "\n",
     "    batched_outputs = {}\n",
     "\n",

diff --git a/implementations/preference_alignment/04_dpo_training.ipynb b/implementations/preference_alignment/04_dpo_training.ipynb
@@ -59,9 +59,11 @@
     "from pathlib import Path\n",
     "\n",
     "\n",
-    "MAX_SEQ_LENGTH = 2048\n",
+    "MAX_SEQ_LENGTH = 1024  # reduced from 2048 to fit 2× L4 (22GB each); increase if you have more VRAM\n",
     "MODEL_NAME = \"Qwen/Qwen2-7B-Instruct\"\n",
     "\n",
+    "os.environ[\"PYTORCH_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
+    "\n",
     "# Must match previous notebooks\n",
     "DATA_FOLDER_NAME = \"data_sky\"  # <-- change if needed\n",
     "\n",

diff --git a/implementations/preference_alignment/05_evaluation.ipynb b/implementations/preference_alignment/05_evaluation.ipynb
@@ -38,7 +38,7 @@
     "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
     "from utils.evaluation_helpers import (\n",
     "    extract_qa,\n",
-    "    judge_with_openai,\n",
+    "    judge_with_llm,\n",
     "    run_local_inference,\n",
     ")\n",
     "\n",
@@ -55,28 +55,34 @@
     "from pathlib import Path\n",
     "\n",
     "\n",
-    "# Resolve repo root\n",
-    "try:\n",
-    "    REPO_ROOT = Path(__file__).resolve().parents[1]\n",
-    "except NameError:\n",
-    "    REPO_ROOT = Path.cwd()\n",
+    "def _find_repo_root(start: Path) -> Path:\n",
+    "    \"\"\"Walk up from start until a directory containing .env is found.\"\"\"\n",
+    "    for directory in [start, *start.parents]:\n",
+    "        if (directory / \".env\").exists():\n",
+    "            return directory\n",
+    "    return start  # fallback: assume cwd is root\n",
+    "\n",
+    "\n",
+    "REPO_ROOT = _find_repo_root(Path.cwd())\n",
     "\n",
     "\n",
     "class Config(BaseSettings):\n",
     "    \"\"\"Configuration settings for evaluation.\"\"\"\n",
     "\n",
-    "    OPENAI_API_KEY: str\n",
+    "    GEMINI_API_KEY: str = \"\"\n",
+    "    OPENAI_API_KEY: str = \"\"\n",
     "\n",
     "    model_config = SettingsConfigDict(\n",
     "        env_file=REPO_ROOT / \".env\",\n",
     "        env_file_encoding=\"utf-8\",\n",
+    "        extra=\"ignore\",\n",
     "    )\n",
     "\n",
     "\n",
     "config = Config()\n",
-    "client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)\n",
-    "\n",
-    "semaphore = Semaphore(5)"
+    "semaphore = Semaphore(5)\n",
+    "print(f\"Repo root: {REPO_ROOT}\")\n",
+    "print(f\"Reading .env from: {REPO_ROOT / '.env'}\")"
    ]
   },
   {
@@ -89,8 +95,27 @@
     "\n",
     "\n",
     "BASE_MODEL = \"Qwen/Qwen2-7B-Instruct\"\n",
-    "JUDGE_MODEL = \"gpt-4o-mini\"\n",
     "\n",
+    "# ── Judge configuration ──────────────────────────────────────────────────────\n",
+    "# Set JUDGE_PROVIDER to \"gemini\" (default) or \"openai\".\n",
+    "# Gemini is used by default; OpenAI keys are not provided on this platform.\n",
+    "JUDGE_PROVIDER = \"gemini\"  # \"gemini\" | \"openai\"\n",
+    "\n",
+    "if JUDGE_PROVIDER == \"gemini\":\n",
+    "    assert config.GEMINI_API_KEY, \"Set GEMINI_API_KEY in your .env file\"\n",
+    "    client = AsyncOpenAI(\n",
+    "        api_key=config.GEMINI_API_KEY,\n",
+    "        base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\",\n",
+    "    )\n",
+    "    JUDGE_MODEL = \"gemini-2.5-flash-lite\"\n",
+    "elif JUDGE_PROVIDER == \"openai\":\n",
+    "    assert config.OPENAI_API_KEY, \"Set OPENAI_API_KEY in your .env file\"\n",
+    "    client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)\n",
+    "    JUDGE_MODEL = \"gpt-4o-mini\"\n",
+    "else:\n",
+    "    raise ValueError(f\"Unknown JUDGE_PROVIDER: {JUDGE_PROVIDER!r}\")\n",
+    "\n",
+    "# ─────────────────────────────────────────────────────────────────────────────\n",
     "MAX_NEW_TOKENS = 2048\n",
     "TEMPERATURE = 0.0\n",
     "SAMPLE_LIMIT = 50\n",
@@ -99,27 +124,29 @@
     "# Must match previous notebooks\n",
     "DATA_FOLDER_NAME = \"data_sky\"  # <-- change if needed\n",
     "\n",
+    "# Resolve notebook directory the same way as 04_dpo_training.ipynb\n",
     "try:\n",
-    "    REPO_ROOT = Path(__file__).resolve().parents[1]\n",
+    "    NOTEBOOK_DIR = Path(__file__).resolve().parent\n",
     "except NameError:\n",
-    "    REPO_ROOT = Path.cwd()\n",
+    "    NOTEBOOK_DIR = Path.cwd()\n",
     "\n",
     "# Model directory (same logic as training notebook)\n",
     "MODEL_TAG = BASE_MODEL.rsplit(\"/\", maxsplit=1)[-1].replace(\"-\", \"\")\n",
-    "DPO_MODEL_PATH = REPO_ROOT / \"models\" / f\"{DATA_FOLDER_NAME}_DPO_{MODEL_TAG}\"\n",
+    "DPO_MODEL_PATH = NOTEBOOK_DIR / \"models\" / f\"{DATA_FOLDER_NAME}_DPO_{MODEL_TAG}\"\n",
     "\n",
     "if not DPO_MODEL_PATH.exists():\n",
     "    raise FileNotFoundError(f\"{DPO_MODEL_PATH} not found. Please run 04_dpo_training.ipynb first.\")\n",
     "\n",
     "# DPO dataset directory\n",
-    "DATASET_PATH = REPO_ROOT / f\"dpo_dataset_{DATA_FOLDER_NAME}\"\n",
+    "DATASET_PATH = NOTEBOOK_DIR / f\"dpo_dataset_{DATA_FOLDER_NAME}\"\n",
     "\n",
     "if not DATASET_PATH.exists():\n",
     "    raise FileNotFoundError(f\"{DATASET_PATH} not found. Please run 03_dpo_pair_construction.ipynb first.\")\n",
     "\n",
     "# Output results file\n",
-    "OUTPUT_JSON = REPO_ROOT / f\"llm_judge_results_{DATA_FOLDER_NAME}.json\"\n",
+    "OUTPUT_JSON = NOTEBOOK_DIR / f\"llm_judge_results_{DATA_FOLDER_NAME}.json\"\n",
     "\n",
+    "print(f\"Judge: {JUDGE_PROVIDER} / {JUDGE_MODEL}\")\n",
     "print(f\"Evaluating model at: {DPO_MODEL_PATH}\")\n",
     "print(f\"Using dataset at: {DATASET_PATH}\")\n",
     "print(f\"Results will be saved to: {OUTPUT_JSON}\")"
@@ -138,14 +165,14 @@
     "    BASE_MODEL,\n",
     "    device_map=\"auto\",\n",
     "    torch_dtype=torch.bfloat16,\n",
-    "    attn_implementation=\"flash_attention_2\",\n",
+    "    attn_implementation=\"sdpa\",  # Choose \"flash_attention_2\" or \"sdpa\" based on your GPU capabilities\n",
     ").eval()\n",
     "\n",
     "dpo_model = AutoModelForCausalLM.from_pretrained(\n",
     "    DPO_MODEL_PATH,\n",
     "    device_map=\"auto\",\n",
     "    torch_dtype=torch.bfloat16,\n",
-    "    attn_implementation=\"flash_attention_2\",\n",
+    "    attn_implementation=\"sdpa\",  # Choose \"flash_attention_2\" or \"sdpa\" based on your GPU capabilities\n",
     ").eval()"
    ]
   },
@@ -269,7 +296,7 @@
     "        base_out = run_local_inference(base_model, tokenizer, prompt, MAX_NEW_TOKENS, TEMPERATURE)\n",
     "        dpo_out = run_local_inference(dpo_model, tokenizer, prompt, MAX_NEW_TOKENS, TEMPERATURE)\n",
     "\n",
-    "        judgment = await judge_with_openai(client, semaphore, JUDGE_MODEL, q, a1, a2, base_out, dpo_out)\n",
+    "        judgment = await judge_with_llm(client, semaphore, JUDGE_MODEL, q, a1, a2, base_out, dpo_out)\n",
     "\n",
     "        b = float(judgment[\"base_score\"])\n",
     "        d = float(judgment[\"dpo_score\"])\n",

diff --git a/implementations/preference_alignment/README.md b/implementations/preference_alignment/README.md
@@ -74,7 +74,7 @@ This directly optimizes the model to prefer correct judgments while maintaining
 
 ## Dataset Preparation
 
-The filtered `.parquet` files are not included in this repository.
+The filtered `.parquet` files are hosted in a GCP bucket and downloaded separately.
 
 ### Download Pre-Filtered Dataset (Recommended)
 
@@ -105,12 +105,26 @@ gcloud storage cp gs://interp-bootcamp-data/preference-alignment/data.zip .
 unzip data.zip
 ```
 
-Files are placed correctly after extraction — no manual reorganisation needed.
+The zip extracts a `data/` folder. Move its contents up and remove the wrapper:
 
-#### 3) Cleanup temporary files:
+```bash
+mv data/data_sky . && mv data/data_hh_rlhf . && rm -rf data
+```
+
+#### 3) Cleanup temporary files
 
 ```bash
-rm -f __MACOSX data.zip data/.DS_Store
+rm -rf __MACOSX data.zip .DS_Store
+```
+
+After setup, your directory should look like:
+
+```
+implementations/preference_alignment/
+├── data_sky/
+├── data_hh_rlhf/
+├── 01_intoduction_and_dataset_construction.ipynb
+└── ...
 ```
 
 > **Note:** Use `train_sponsor_filtered.parquet` (for `data_sky`) and `train_singleturn_sponsor_filtered.parquet` (for `data_hh_rlhf`).
@@ -196,36 +210,43 @@ Once formatted properly, the rest of the pipeline (LLM-as-a-Judge → DPO → Ev
 From the **root of the repository**, install the `preference-alignment` dependency group using `uv`:
 
 ```bash
-uv sync --group preference-alignment
+uv sync --active --group preference-alignment
+```
+
+The `--active` flag ensures packages are installed into the currently activated virtual environment. After syncing, re-activate the venv to ensure your shell picks up the correct paths:
+
+```bash
+deactivate
 source .venv/bin/activate
+which python3  # should point to .venv/bin/python inside the repo root
 ```
 
-> **CUDA note:** `torch==2.6.0` from PyPI includes CUDA support on Linux. If you specifically need the CUDA 12.4 build, run:
->
-> ```bash
-> uv sync --group preference-alignment \
->   --index-url https://download.pytorch.org/whl/cu124 # Not required for Linux users, but may be needed for Windows users
-> ```
->
 > **Conflict note:** The `preference-alignment` and `xai-refresher` groups cannot be installed together. Install only one at a time.
 
-### Installing `flash-attn` (optional, for faster attention)
+<details>
+<summary>Installing <code>flash-attn</code> (optional, for faster attention)</summary>
 
-`flash-attn` requires CUDA headers and `setuptools` at compile time and cannot be installed via `uv sync`. After activating the venv, install it manually:
+`flash-attn` cannot be built from source on login nodes (no `nvcc`/`CUDA_HOME`). Install a pre-built wheel directly:
 
 ```bash
-uv pip install flash-attn==2.7.3 --no-build-isolation
+/path/to/.venv/bin/python -m pip install \
+  "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3%2Bcu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl"
 ```
 
-> **Note:** This step requires a GPU node with CUDA available. Skip it if you are running on a CPU-only machine.
+> Use `/path/to/.venv/bin/python -m pip` (explicit venv python) rather than bare `pip` or `python`, to avoid installing into the wrong environment.
+>
+> The `cu12torch2.8` wheel is compatible with `torch 2.10+cu12x`. Do **not** use `pip install flash-attn==2.7.3 --no-build-isolation` — PyPI ships only a source distribution for `flash-attn`, so that command runs the package's build step, which can fall back to compiling from source and fail on machines without `nvcc`/`CUDA_HOME`.
+>
+> Skip this step entirely if running on a CPU-only machine.
+
+</details>
 
 ## Notes
 
 - Run notebooks sequentially from **01 → 05**.
 - Ensure GPU availability before running ```02_inference_runner.ipynb``` and ```04_dpo_training.ipynb```.
 - The quality of alignment depends strongly on the judge model and prompt design.
 - Our results may show a lower win rate because we used only 300 samples for training; for better results, use a larger amount of data.
-
 # Discussion & Conceptual Checkpoints
 
 These questions are intended to help participants reflect on the design choices behind Con-J and DPO.

diff --git a/implementations/preference_alignment/utils/dpo_training_helpers.py b/implementations/preference_alignment/utils/dpo_training_helpers.py
@@ -7,6 +7,14 @@
 from unsloth import FastLanguageModel, PatchDPOTrainer
 
 
+try:
+    import flash_attn  # noqa: F401
+
+    HAS_FLASH_ATTN = True
+except ImportError:
+    HAS_FLASH_ATTN = False
+
+
 def extract_prompt_from_conversations(convs) -> str:
     """
     Isolate the user's initial prompt from a conversation history.
@@ -89,15 +97,16 @@ def load_unsloth_model(model_name: str, max_seq_length: int) -> tuple[Any, Any]:
         max_seq_length=max_seq_length,
         load_in_4bit=True,
         dtype=None,
-        device_map=None,  # handled by Accelerate
+        device_map="auto",  # splits model layers across all available GPUs
     )
 
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "left"
     tokenizer.truncation_side = "left"
     tokenizer.model_max_length = max_seq_length
 
-    model.config.use_flash_attention_2 = True
+    if HAS_FLASH_ATTN:
+        model.config.use_flash_attention_2 = True
     model.config.max_position_embeddings = max_seq_length
 
     return model, tokenizer
@@ -171,9 +180,9 @@ def build_dpo_trainer(
         output_dir=output_dir,
         beta=0.1,
         num_train_epochs=3,
-        per_device_train_batch_size=2,
-        per_device_eval_batch_size=2,
-        gradient_accumulation_steps=16,
+        per_device_train_batch_size=1,
+        per_device_eval_batch_size=1,
+        gradient_accumulation_steps=32,  # doubled to keep effective batch size the same as before (was 2×16)
         learning_rate=1.8e-6,
         warmup_ratio=0.25,
         lr_scheduler_type="cosine",