Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/code_checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,4 @@ jobs:
ignore-vulns: |
PYSEC-2025-49
CVE-2024-6345
CVE-2026-4539
Original file line number Diff line number Diff line change
Expand Up @@ -98,21 +98,9 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "No .parquet files found in /home/coder/interpretability-llms-agents/implementations/preference_alignment/data_sky. Please download or generate the filtered dataset and place it in this folder.",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 34\u001b[39m\n\u001b[32m 31\u001b[39m parquet_files = \u001b[38;5;28mlist\u001b[39m(DATA_DIR.glob(\u001b[33m\"\u001b[39m\u001b[33m*.parquet\u001b[39m\u001b[33m\"\u001b[39m))\n\u001b[32m 33\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m parquet_files:\n\u001b[32m---> \u001b[39m\u001b[32m34\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m 35\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mNo .parquet files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m. \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 36\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mPlease download or generate the filtered dataset and place it in this folder.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 37\u001b[39m )\n\u001b[32m 39\u001b[39m PARQUET_PATH = parquet_files[\u001b[32m0\u001b[39m]\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mUsing dataset file: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPARQUET_PATH\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mFileNotFoundError\u001b[39m: No .parquet files found in /home/coder/interpretability-llms-agents/implementations/preference_alignment/data_sky. Please download or generate the filtered dataset and place it in this folder."
]
}
],
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
Expand Down
22 changes: 5 additions & 17 deletions implementations/preference_alignment/02_inference_runner.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "/home/coder/interpretability-llms-agents/implementations/preference_alignment/output_data_sky not found. Please run 01_dataset_construction.ipynb first.",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 33\u001b[39m\n\u001b[32m 30\u001b[39m PROCESSED_DATASET_DIR = REPO_ROOT / \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33moutput_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_FOLDER_NAME\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m 32\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m PROCESSED_DATASET_DIR.exists():\n\u001b[32m---> \u001b[39m\u001b[32m33\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mPROCESSED_DATASET_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not found. Please run 01_dataset_construction.ipynb first.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 35\u001b[39m \u001b[38;5;66;03m# HuggingFace Arrow Dataset Path (auto-detect)\u001b[39;00m\n\u001b[32m 36\u001b[39m train_dir = PROCESSED_DATASET_DIR / \u001b[33m\"\u001b[39m\u001b[33mtrain\u001b[39m\u001b[33m\"\u001b[39m\n",
"\u001b[31mFileNotFoundError\u001b[39m: /home/coder/interpretability-llms-agents/implementations/preference_alignment/output_data_sky not found. Please run 01_dataset_construction.ipynb first."
]
}
],
"outputs": [],
"source": [
"import contextlib\n",
"import os\n",
Expand Down Expand Up @@ -132,7 +120,7 @@
" MODEL_ID,\n",
" device_map=\"auto\",\n",
" torch_dtype=torch.bfloat16,\n",
" attn_implementation=\"flash_attention_2\",\n",
" attn_implementation=\"sdpa\", # Choose \"flash_attention_2\" or \"sdpa\" based on your GPU capabilities\n",
").eval()\n",
"\n",
"print(\"Model loaded:\", MODEL_ID)\n",
Expand Down Expand Up @@ -165,7 +153,7 @@
"outputs": [],
"source": [
"if MODE == \"best_of_n\":\n",
" records = load_disk_records(train_dir, limit=200)\n",
" records = load_disk_records(train_dir, limit=20) # Change to 200 for larger runs\n",
"\n",
" run_best_of_n(\n",
" records=records,\n",
Expand Down Expand Up @@ -230,7 +218,7 @@
" ],\n",
" }\n",
"\n",
" raw = load_arrow_records(str(ARROW_DATASET_PATH), limit=200)\n",
" raw = load_arrow_records(str(ARROW_DATASET_PATH), limit=20) # Change to 200 for larger runs\n",
"\n",
" batched_outputs = {}\n",
"\n",
Expand Down
4 changes: 3 additions & 1 deletion implementations/preference_alignment/04_dpo_training.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,11 @@
"from pathlib import Path\n",
"\n",
"\n",
"MAX_SEQ_LENGTH = 2048\n",
"MAX_SEQ_LENGTH = 1024 # reduced from 2048 to fit 2× L4 (22GB each); increase if you have more VRAM\n",
"MODEL_NAME = \"Qwen/Qwen2-7B-Instruct\"\n",
"\n",
"os.environ[\"PYTORCH_ALLOC_CONF\"] = \"expandable_segments:True\"\n",
"\n",
"# Must match previous notebooks\n",
"DATA_FOLDER_NAME = \"data_sky\" # <-- change if needed\n",
"\n",
Expand Down
65 changes: 46 additions & 19 deletions implementations/preference_alignment/05_evaluation.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from utils.evaluation_helpers import (\n",
" extract_qa,\n",
" judge_with_openai,\n",
" judge_with_llm,\n",
" run_local_inference,\n",
")\n",
"\n",
Expand All @@ -55,28 +55,34 @@
"from pathlib import Path\n",
"\n",
"\n",
"# Resolve repo root\n",
"try:\n",
" REPO_ROOT = Path(__file__).resolve().parents[1]\n",
"except NameError:\n",
" REPO_ROOT = Path.cwd()\n",
"def _find_repo_root(start: Path) -> Path:\n",
" \"\"\"Walk up from start until a directory containing .env is found.\"\"\"\n",
" for directory in [start, *start.parents]:\n",
" if (directory / \".env\").exists():\n",
" return directory\n",
" return start # fallback: assume cwd is root\n",
"\n",
"\n",
"REPO_ROOT = _find_repo_root(Path.cwd())\n",
"\n",
"\n",
"class Config(BaseSettings):\n",
" \"\"\"Configuration settings for evaluation.\"\"\"\n",
"\n",
" OPENAI_API_KEY: str\n",
" GEMINI_API_KEY: str = \"\"\n",
" OPENAI_API_KEY: str = \"\"\n",
"\n",
" model_config = SettingsConfigDict(\n",
" env_file=REPO_ROOT / \".env\",\n",
" env_file_encoding=\"utf-8\",\n",
" extra=\"ignore\",\n",
" )\n",
"\n",
"\n",
"config = Config()\n",
"client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)\n",
"\n",
"semaphore = Semaphore(5)"
"semaphore = Semaphore(5)\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"print(f\"Reading .env from: {REPO_ROOT / '.env'}\")"
]
},
{
Expand All @@ -89,8 +95,27 @@
"\n",
"\n",
"BASE_MODEL = \"Qwen/Qwen2-7B-Instruct\"\n",
"JUDGE_MODEL = \"gpt-4o-mini\"\n",
"\n",
"# ── Judge configuration ──────────────────────────────────────────────────────\n",
"# Set JUDGE_PROVIDER to \"gemini\" (default) or \"openai\".\n",
"# Gemini is used by default; OpenAI keys are not provided on this platform.\n",
"JUDGE_PROVIDER = \"gemini\" # \"gemini\" | \"openai\"\n",
"\n",
"if JUDGE_PROVIDER == \"gemini\":\n",
" assert config.GEMINI_API_KEY, \"Set GEMINI_API_KEY in your .env file\"\n",
" client = AsyncOpenAI(\n",
" api_key=config.GEMINI_API_KEY,\n",
" base_url=\"https://generativelanguage.googleapis.com/v1beta/openai/\",\n",
" )\n",
" JUDGE_MODEL = \"gemini-2.5-flash-lite\"\n",
"elif JUDGE_PROVIDER == \"openai\":\n",
" assert config.OPENAI_API_KEY, \"Set OPENAI_API_KEY in your .env file\"\n",
" client = AsyncOpenAI(api_key=config.OPENAI_API_KEY)\n",
" JUDGE_MODEL = \"gpt-4o-mini\"\n",
"else:\n",
" raise ValueError(f\"Unknown JUDGE_PROVIDER: {JUDGE_PROVIDER!r}\")\n",
"\n",
"# ─────────────────────────────────────────────────────────────────────────────\n",
"MAX_NEW_TOKENS = 2048\n",
"TEMPERATURE = 0.0\n",
"SAMPLE_LIMIT = 50\n",
Expand All @@ -99,27 +124,29 @@
"# Must match previous notebooks\n",
"DATA_FOLDER_NAME = \"data_sky\" # <-- change if needed\n",
"\n",
"# Resolve notebook directory the same way as 04_dpo_training.ipynb\n",
"try:\n",
" REPO_ROOT = Path(__file__).resolve().parents[1]\n",
" NOTEBOOK_DIR = Path(__file__).resolve().parent\n",
"except NameError:\n",
" REPO_ROOT = Path.cwd()\n",
" NOTEBOOK_DIR = Path.cwd()\n",
"\n",
"# Model directory (same logic as training notebook)\n",
"MODEL_TAG = BASE_MODEL.rsplit(\"/\", maxsplit=1)[-1].replace(\"-\", \"\")\n",
"DPO_MODEL_PATH = REPO_ROOT / \"models\" / f\"{DATA_FOLDER_NAME}_DPO_{MODEL_TAG}\"\n",
"DPO_MODEL_PATH = NOTEBOOK_DIR / \"models\" / f\"{DATA_FOLDER_NAME}_DPO_{MODEL_TAG}\"\n",
"\n",
"if not DPO_MODEL_PATH.exists():\n",
" raise FileNotFoundError(f\"{DPO_MODEL_PATH} not found. Please run 04_dpo_training.ipynb first.\")\n",
"\n",
"# DPO dataset directory\n",
"DATASET_PATH = REPO_ROOT / f\"dpo_dataset_{DATA_FOLDER_NAME}\"\n",
"DATASET_PATH = NOTEBOOK_DIR / f\"dpo_dataset_{DATA_FOLDER_NAME}\"\n",
"\n",
"if not DATASET_PATH.exists():\n",
" raise FileNotFoundError(f\"{DATASET_PATH} not found. Please run 03_dpo_pair_construction.ipynb first.\")\n",
"\n",
"# Output results file\n",
"OUTPUT_JSON = REPO_ROOT / f\"llm_judge_results_{DATA_FOLDER_NAME}.json\"\n",
"OUTPUT_JSON = NOTEBOOK_DIR / f\"llm_judge_results_{DATA_FOLDER_NAME}.json\"\n",
"\n",
"print(f\"Judge: {JUDGE_PROVIDER} / {JUDGE_MODEL}\")\n",
"print(f\"Evaluating model at: {DPO_MODEL_PATH}\")\n",
"print(f\"Using dataset at: {DATASET_PATH}\")\n",
"print(f\"Results will be saved to: {OUTPUT_JSON}\")"
Expand All @@ -138,14 +165,14 @@
" BASE_MODEL,\n",
" device_map=\"auto\",\n",
" torch_dtype=torch.bfloat16,\n",
" attn_implementation=\"flash_attention_2\",\n",
" attn_implementation=\"sdpa\", # Choose \"flash_attention_2\" or \"sdpa\" based on your GPU capabilities\n",
").eval()\n",
"\n",
"dpo_model = AutoModelForCausalLM.from_pretrained(\n",
" DPO_MODEL_PATH,\n",
" device_map=\"auto\",\n",
" torch_dtype=torch.bfloat16,\n",
" attn_implementation=\"flash_attention_2\",\n",
" attn_implementation=\"sdpa\", # Choose \"flash_attention_2\" or \"sdpa\" based on your GPU capabilities\n",
").eval()"
]
},
Expand Down Expand Up @@ -269,7 +296,7 @@
" base_out = run_local_inference(base_model, tokenizer, prompt, MAX_NEW_TOKENS, TEMPERATURE)\n",
" dpo_out = run_local_inference(dpo_model, tokenizer, prompt, MAX_NEW_TOKENS, TEMPERATURE)\n",
"\n",
" judgment = await judge_with_openai(client, semaphore, JUDGE_MODEL, q, a1, a2, base_out, dpo_out)\n",
" judgment = await judge_with_llm(client, semaphore, JUDGE_MODEL, q, a1, a2, base_out, dpo_out)\n",
"\n",
" b = float(judgment[\"base_score\"])\n",
" d = float(judgment[\"dpo_score\"])\n",
Expand Down
55 changes: 38 additions & 17 deletions implementations/preference_alignment/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ This directly optimizes the model to prefer correct judgments while maintaining

## Dataset Preparation

The filtered `.parquet` files are not included in this repository.
The filtered `.parquet` files are hosted in a GCP bucket and downloaded separately.

### Download Pre-Filtered Dataset (Recommended)

Expand Down Expand Up @@ -105,12 +105,26 @@ gcloud storage cp gs://interp-bootcamp-data/preference-alignment/data.zip .
unzip data.zip
```

Files are placed correctly after extraction — no manual reorganisation needed.
The zip extracts a `data/` folder. Move its contents up and remove the wrapper:

#### 3) Cleanup temporary files:
```bash
mv data/data_sky . && mv data/data_hh_rlhf . && rm -rf data
```

#### 3) Cleanup temporary files

```bash
rm -f __MACOSX data.zip data/.DS_Store
rm -rf __MACOSX data.zip .DS_Store
```

After setup, your directory should look like:

```
implementations/preference_alignment/
├── data_sky/
├── data_hh_rlhf/
├── 01_dataset_construction.ipynb
└── ...
```

> **Note:** Use `train_sponsor_filtered.parquet` (for `data_sky`) and `train_singleturn_sponsor_filtered.parquet` (for `data_hh_rlhf`).
Expand Down Expand Up @@ -196,36 +210,43 @@ Once formatted properly, the rest of the pipeline (LLM-as-a-Judge → DPO → Ev
From the **root of the repository**, install the `preference-alignment` dependency group using `uv`:

```bash
uv sync --group preference-alignment
uv sync --active --group preference-alignment
```

The `--active` flag ensures packages are installed into the currently activated virtual environment. After syncing, re-activate the venv to ensure your shell picks up the correct paths:

```bash
deactivate
source .venv/bin/activate
which python3 # should point to .venv/bin/python inside the repo root
```

> **CUDA note:** `torch==2.6.0` from PyPI includes CUDA support on Linux. If you specifically need the CUDA 12.4 build, run:
>
> ```bash
> uv sync --group preference-alignment \
> --index-url https://download.pytorch.org/whl/cu124 # Not required for Linux users, but may be needed for Windows users
> ```
>
> **Conflict note:** The `preference-alignment` and `xai-refresher` groups cannot be installed together. Install only one at a time.

### Installing `flash-attn` (optional, for faster attention)
<details>
<summary>Installing <code>flash-attn</code> (optional, for faster attention)</summary>

`flash-attn` requires CUDA headers and `setuptools` at compile time and cannot be installed via `uv sync`. After activating the venv, install it manually:
`flash-attn` cannot be built from source on login nodes (no `nvcc`/`CUDA_HOME`). Install a pre-built wheel directly:

```bash
uv pip install flash-attn==2.7.3 --no-build-isolation
/path/to/.venv/bin/python -m pip install \
"https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3%2Bcu12torch2.8cxx11abiTRUE-cp312-cp312-linux_x86_64.whl"
```

> **Note:** This step requires a GPU node with CUDA available. Skip it if you are running on a CPU-only machine.
> Use `/path/to/.venv/bin/python -m pip` (explicit venv python) rather than bare `pip` or `python`, to avoid installing into the wrong environment.
>
> The `cu12torch2.8` wheel is compatible with `torch 2.10+cu12x`. Do **not** use `pip install flash-attn==2.7.3 --no-build-isolation` — pip will reject pre-built wheels with local version labels (e.g. `2.7.3+cu12...`) when an exact version like `==2.7.3` is requested.
>
> Skip this step entirely if running on a CPU-only machine.

</details>

## Notes

- Run notebooks sequentially from **01 → 05**.
- Ensure GPU availability before running ```02_inference_runner.ipynb``` and ```04_dpo_training.ipynb```.
- The quality of alignment depends strongly on the judge model and prompt design.
- Our reported win rate may be lower than expected because we trained on only 300 samples; for better results, train on a larger amount of data.

# Discussion & Conceptual Checkpoints

These questions are intended to help participants reflect on the design choices behind Con-J and DPO.
Expand Down
19 changes: 14 additions & 5 deletions implementations/preference_alignment/utils/dpo_training_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@
from unsloth import FastLanguageModel, PatchDPOTrainer


try:
import flash_attn # noqa: F401

HAS_FLASH_ATTN = True
except ImportError:
HAS_FLASH_ATTN = False


def extract_prompt_from_conversations(convs) -> str:
"""
Isolate the user's initial prompt from a conversation history.
Expand Down Expand Up @@ -89,15 +97,16 @@ def load_unsloth_model(model_name: str, max_seq_length: int) -> tuple[Any, Any]:
max_seq_length=max_seq_length,
load_in_4bit=True,
dtype=None,
device_map=None, # handled by Accelerate
device_map="auto", # splits model layers across all available GPUs
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"
tokenizer.model_max_length = max_seq_length

model.config.use_flash_attention_2 = True
if HAS_FLASH_ATTN:
model.config.use_flash_attention_2 = True
model.config.max_position_embeddings = max_seq_length

return model, tokenizer
Expand Down Expand Up @@ -171,9 +180,9 @@ def build_dpo_trainer(
output_dir=output_dir,
beta=0.1,
num_train_epochs=3,
per_device_train_batch_size=2,
per_device_eval_batch_size=2,
gradient_accumulation_steps=16,
per_device_train_batch_size=1,
per_device_eval_batch_size=1,
gradient_accumulation_steps=32, # doubled to keep effective batch size the same as before (was 2×16)
learning_rate=1.8e-6,
warmup_ratio=0.25,
lr_scheduler_type="cosine",
Expand Down
Loading
Loading