2 changes: 2 additions & 0 deletions .gitignore
@@ -207,3 +207,5 @@ src/training/cache_embeddings/aws/vllm-inventory-*.ini
grafana/
grafana-data/
prometheus-data/
src/semantic-router/.cache/
src/semantic-router/.cache/
54 changes: 54 additions & 0 deletions src/training/model_eval/onboard/README.md
@@ -0,0 +1,54 @@
# Onboarding Evaluation

This module provides a unified onboarding workflow for model evaluation,
threshold policy reporting, and optional routing model updates.

## CLI

The CLI is exposed via:

```bash
python src/training/model_eval/onboard_eval.py --help
```

## Run System Eval + Threshold Report

```bash
python src/training/model_eval/onboard_eval.py \
--config onboarding_config.json \
--test-name system_eval \
--datasets mmlu-pro-en mmlu-prox-zh fact-check-en feedback-en \
--max-samples 50 \
--report-out system_eval_report.json \
--thresholds-out onboarding_thresholds.json \
--min-accuracy 0.7 \
--max-latency-ms 2000
```
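
The same flow can be driven from Python. Below is a minimal sketch based on the wiring in `cli.py`, assuming `src/training/model_eval` is on `PYTHONPATH` so the package imports as `onboard`:

```python
import json
from pathlib import Path

from onboard import OnboardEvaluate
from onboard.thresholds import build_threshold_report

config_path = Path("onboarding_config.json")
config = json.loads(config_path.read_text(encoding="utf-8"))

# Mirrors cli.main(): construct, parse the config, then run the named test.
evaluator = OnboardEvaluate(config_path=str(config_path))
evaluator.parse(config)

result = evaluator.run_performance_test(
    "system_eval",
    datasets=["mmlu-pro-en", "fact-check-en"],
    max_samples=50,
)
evaluator.generate_report("system_eval_report.json")

# Same pass/fail policy as --min-accuracy / --max-latency-ms.
if evaluator.system_eval_summary:
    report = build_threshold_report(evaluator.system_eval_summary, 0.7, 2000.0)
    Path("onboarding_thresholds.json").write_text(
        json.dumps(report, indent=2) + "\n", encoding="utf-8"
    )
```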

## Write Thresholds Back Into Config

```bash
python src/training/model_eval/onboard_eval.py \
--config onboarding_config.json \
--test-name system_eval \
--datasets mmlu-pro-en fact-check-en \
--thresholds-out onboarding_thresholds.json \
--update-config
```

This writes an `onboarding_thresholds` object into the JSON config file (or into
`--config-out` if provided) so the evaluation policy is stored alongside the
model onboarding config.
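
To consume the stored policy later, read it back out of the config. A short sketch; the exact keys inside `onboarding_thresholds` come from `build_threshold_report` and are not shown in this diff:

```python
import json

with open("onboarding_config.json", encoding="utf-8") as handle:
    config = json.load(handle)

# Written by the CLI when --update-config is passed; the object's shape is
# defined by build_threshold_report(), which this excerpt does not include.
thresholds = config.get("onboarding_thresholds", {})
print(json.dumps(thresholds, indent=2))
```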

## Optional: Update Routing Models

```bash
python src/training/model_eval/onboard_eval.py \
--config onboarding_config.json \
--test-name system_eval \
--datasets mmlu-pro-en \
--ml-benchmark-queries queries.jsonl \
--ml-benchmark-model-config models.yaml \
--ml-benchmark-output benchmark_output.jsonl \
--ml-train-output ./ml-models
```
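
When the `--ml-benchmark-*` flags are set, the CLI shells out to `benchmark.py` and then `train.py` under `--ml-selection-dir` (default `src/training/model_selection/ml_model_selection`); the benchmark output JSONL is fed to training unless `--ml-train-data` points at an existing file.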
4 changes: 4 additions & 0 deletions src/training/model_eval/onboard/__init__.py
@@ -0,0 +1,4 @@
from .evaluator import OnboardEvaluate
from .types import ModelConfig, TestResult

__all__ = ["OnboardEvaluate", "ModelConfig", "TestResult"]
5 changes: 5 additions & 0 deletions src/training/model_eval/onboard/__main__.py
@@ -0,0 +1,5 @@
from .cli import main


if __name__ == "__main__":
raise SystemExit(main())
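
This entry point also makes the package runnable as a module via `python -m`; the exact module path depends on how `src/training/model_eval` ends up on `PYTHONPATH`.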
120 changes: 120 additions & 0 deletions src/training/model_eval/onboard/arc_eval.py
@@ -0,0 +1,120 @@
from typing import Any, Dict, Optional

import pandas as pd
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm

from .constants import ANSWER_PATTERN_ARC
from .types import TestResult


class ArcEvalMixin:
def _run_arc_challenge(self, samples: Optional[int] = 20) -> TestResult:
"""Run ARC Challenge test"""
print(f"Starting ARC Challenge test, samples: {samples}")

dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="train")
df = pd.DataFrame(dataset)

if samples and len(df) > samples:
df = df.sample(samples, random_state=self.model_config.seed)

results_df = self._evaluate_arc(df)
self.arc_results = results_df

valid_results = results_df[results_df["success"]]
overall_accuracy = (
valid_results["is_correct"].mean() if not valid_results.empty else 0.0
)

test_result = TestResult(
model_name=self.model_config.model_name,
test_name="arc_challenge",
score=overall_accuracy,
metrics={
"overall": overall_accuracy,
"total_questions": len(results_df),
"successful_queries": len(valid_results),
"failed_queries": len(results_df) - len(valid_results),
},
details={"split": "train", "samples_evaluated": len(df)},
)

self.test_results.append(test_result)
return test_result

def _evaluate_arc(self, df: pd.DataFrame) -> pd.DataFrame:
"""Evaluate ARC"""
client = OpenAI(
base_url=self.model_config.endpoint,
api_key=self.model_config.api_key or "dummy",
)

results = []
for _, row in tqdm(
df.iterrows(), total=len(df), desc="Evaluating ARC Challenge"
):
result = self._process_arc_question(client, row.to_dict())
results.append(result)

return pd.DataFrame(results)

def _process_arc_question(
self, client: OpenAI, question_data: Dict[str, Any]
) -> Dict[str, Any]:
"""Process a single ARC question"""
question = question_data["question"]
choices = question_data["choices"]
correct_answer = question_data["answerKey"]

formatted_options = ""
for label, text in zip(choices["label"], choices["text"]):
formatted_options += f"{label}) {text}\n"

if self.model_config.use_cot:
prompt = (
"Question: "
f"{question}\n\nOptions:\n{formatted_options}\n\n"
"Please solve this step-by-step, then provide your final answer in the format "
"'Answer: [letter]'."
)
else:
prompt = (
"Question: "
f"{question}\n\nOptions:\n{formatted_options}\n\n"
"Please choose the correct answer from the options above. Provide your answer "
"in the format 'Answer: [letter]'."
)

        try:
            response = client.chat.completions.create(
                model=self.model_config.model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=self.model_config.max_tokens,
                temperature=self.model_config.temperature,
            )
            response_text = response.choices[0].message.content
        except Exception as exc:  # record the failure so metrics can count it
            return {
                "id": question_data["id"],
                "question": question,
                "correct_answer": correct_answer,
                "predicted_answer": None,
                "is_correct": False,
                "success": False,
                "error": str(exc),
            }

        predicted_answer = self._extract_answer_arc(response_text)
        is_correct = predicted_answer == correct_answer

        return {
            "id": question_data["id"],
            "question": question,
            "correct_answer": correct_answer,
            "predicted_answer": predicted_answer,
            "is_correct": is_correct,
            "success": True,
        }

    def _extract_answer_arc(self, response: Optional[str]) -> Optional[str]:
        """Extract the ARC answer letter from a model response."""
        if not response:
            return None

        match = ANSWER_PATTERN_ARC.search(response)
        if match:
            return match.group(1).upper()

        # Fallback: take the last A-D letter anywhere in the reply.
        for char in reversed(response):
            if char.upper() in "ABCD":
                return char.upper()

        return None
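
Extraction is two-stage: the `ANSWER_PATTERN_ARC` regex from `constants.py` first, then the last-letter scan. `constants.py` is not part of this diff, so the following is only a hedged illustration of the kind of pattern involved; the real definition may differ:

```python
import re

# Hypothetical stand-in for constants.ANSWER_PATTERN_ARC; the actual
# pattern lives in onboard/constants.py, which this diff does not show.
ANSWER_PATTERN_ARC = re.compile(r"Answer:\s*\[?([A-Da-d])\]?")

print(ANSWER_PATTERN_ARC.search("Thinking... Answer: b").group(1).upper())  # -> B
```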
202 changes: 202 additions & 0 deletions src/training/model_eval/onboard/cli.py
@@ -0,0 +1,202 @@
import argparse
import json
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional

from .evaluator import OnboardEvaluate
from .thresholds import build_threshold_report


def _load_config(config_path: Path) -> Dict[str, Any]:
with config_path.open("r", encoding="utf-8") as handle:
return json.load(handle)


def _write_config(config_path: Path, config: Dict[str, Any]) -> None:
with config_path.open("w", encoding="utf-8") as handle:
json.dump(config, handle, indent=2)
handle.write("\n")


def _run_benchmark(args: argparse.Namespace) -> Optional[Path]:
if not args.ml_benchmark_queries:
return None

queries = Path(args.ml_benchmark_queries)
output = Path(args.ml_benchmark_output)
bench_dir = Path(args.ml_selection_dir)
cmd = [
"python",
str(bench_dir / "benchmark.py"),
"--queries",
str(queries),
"--output",
str(output),
]

if args.ml_benchmark_model_config:
cmd.extend(["--model-config", args.ml_benchmark_model_config])
elif args.ml_benchmark_models:
cmd.extend(["--models", args.ml_benchmark_models])
if args.ml_benchmark_endpoint:
cmd.extend(["--endpoint", args.ml_benchmark_endpoint])
else:
raise ValueError("Provide --ml-benchmark-models or --ml-benchmark-model-config")

if args.ml_benchmark_concurrency:
cmd.extend(["--concurrency", str(args.ml_benchmark_concurrency)])
if args.ml_benchmark_max_tokens:
cmd.extend(["--max-tokens", str(args.ml_benchmark_max_tokens)])
if args.ml_benchmark_temperature is not None:
cmd.extend(["--temperature", str(args.ml_benchmark_temperature)])
if args.ml_benchmark_concise:
cmd.append("--concise")
if args.ml_benchmark_limit:
cmd.extend(["--limit", str(args.ml_benchmark_limit)])

subprocess.run(cmd, check=True)
return output


def _run_training(args: argparse.Namespace, benchmark_output: Optional[Path]) -> None:
if not args.ml_train_output:
return

train_dir = Path(args.ml_selection_dir)
if not args.ml_train_data and benchmark_output is None:
raise ValueError("Provide --ml-train-data or enable ML benchmark first")

data_file = Path(args.ml_train_data or benchmark_output)

cmd = [
"python",
str(train_dir / "train.py"),
"--data-file",
str(data_file),
"--output-dir",
str(args.ml_train_output),
]

if args.ml_train_algorithm:
cmd.extend(["--algorithm", args.ml_train_algorithm])
if args.ml_train_device:
cmd.extend(["--device", args.ml_train_device])
if args.ml_train_embedding_model:
cmd.extend(["--embedding-model", args.ml_train_embedding_model])
if args.ml_train_quality_weight:
cmd.extend(["--quality-weight", str(args.ml_train_quality_weight)])
if args.ml_train_batch_size:
cmd.extend(["--batch-size", str(args.ml_train_batch_size)])

subprocess.run(cmd, check=True)


def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Model onboarding pipeline (eval, thresholds, ML routing models)"
)
parser.add_argument(
"--config", required=True, help="Path to onboarding config JSON"
)
parser.add_argument(
"--test-name",
default="system_eval",
choices=["system_eval", "arc_challenge", "mmlu_pro"],
help="Evaluation to run",
)
parser.add_argument("--datasets", nargs="*", help="System eval dataset IDs")
parser.add_argument("--max-samples", type=int, default=50)
parser.add_argument("--input-path", help="Text/jsonl input for system_eval")
parser.add_argument("--report-out", help="Write JSON report to this path")

parser.add_argument("--thresholds-out", help="Write threshold report JSON")
parser.add_argument("--min-accuracy", type=float, default=0.7)
parser.add_argument("--max-latency-ms", type=float, default=2000.0)
parser.add_argument(
"--update-config",
action="store_true",
help="Write onboarding thresholds back into config JSON",
)
parser.add_argument("--config-out", help="Write updated config to a new path")

parser.add_argument(
"--ml-selection-dir",
default="src/training/model_selection/ml_model_selection",
help="Path to ML model selection directory",
)
parser.add_argument("--ml-benchmark-queries", help="Queries JSONL for ML benchmark")
parser.add_argument("--ml-benchmark-models", help="Comma-separated model list")
parser.add_argument("--ml-benchmark-model-config", help="models.yaml for benchmark")
parser.add_argument("--ml-benchmark-endpoint", help="Endpoint for --models mode")
parser.add_argument("--ml-benchmark-output", default="benchmark_output.jsonl")
parser.add_argument("--ml-benchmark-concurrency", type=int)
parser.add_argument("--ml-benchmark-max-tokens", type=int)
parser.add_argument("--ml-benchmark-temperature", type=float)
parser.add_argument("--ml-benchmark-concise", action="store_true")
parser.add_argument("--ml-benchmark-limit", type=int)

parser.add_argument("--ml-train-data", help="Benchmark output JSONL for training")
parser.add_argument("--ml-train-output", help="Output dir for trained models")
parser.add_argument("--ml-train-algorithm", help="all|knn|kmeans|svm|mlp")
parser.add_argument("--ml-train-device", help="cpu|cuda|mps")
parser.add_argument("--ml-train-embedding-model", help="Embedding model name")
parser.add_argument("--ml-train-quality-weight", type=float)
parser.add_argument("--ml-train-batch-size", type=int)

return parser


def main() -> int:
parser = build_parser()
args = parser.parse_args()

config_path = Path(args.config)
config = _load_config(config_path)

evaluator = OnboardEvaluate(config_path=str(config_path))
evaluator.parse(config)

if args.test_name == "system_eval":
datasets = args.datasets
result = evaluator.run_performance_test(
"system_eval",
datasets=datasets,
max_samples=args.max_samples,
input_path=args.input_path,
)
else:
result = evaluator.run_performance_test(args.test_name)

if args.report_out:
evaluator.generate_report(args.report_out)

threshold_report = None
if args.test_name == "system_eval" and evaluator.system_eval_summary:
threshold_report = build_threshold_report(
evaluator.system_eval_summary,
args.min_accuracy,
args.max_latency_ms,
)
if args.thresholds_out:
threshold_path = Path(args.thresholds_out)
with threshold_path.open("w", encoding="utf-8") as handle:
json.dump(threshold_report, handle, indent=2)
handle.write("\n")

if args.update_config and threshold_report is not None:
config["onboarding_thresholds"] = threshold_report
output_path = Path(args.config_out) if args.config_out else config_path
_write_config(output_path, config)

benchmark_output = _run_benchmark(args)
_run_training(args, benchmark_output)

    print(
        f"Completed {result.test_name} with score {result.score:.4f} "
        f"(samples={result.metrics.get('total_samples') or result.metrics.get('total_questions')})"
    )
return 0


if __name__ == "__main__":
raise SystemExit(main())