Skip to content

Commit d9f5036

Browse files
authored
Merge pull request #2047 from dgageot/board/docke-agent-eval-can-take-a-long-time-ar-10a5596e
eval: reduce redundant work during evaluation runs
2 parents a19756f + 519ec07 commit d9f5036

File tree

3 files changed

+60
-13
lines changed

3 files changed

+60
-13
lines changed

pkg/evaluation/build.go

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ var (
2626

2727
// getOrBuildImage returns a cached image ID or builds a new one.
2828
// Images are cached by working directory to avoid redundant builds.
29+
// Concurrent calls for the same working directory are deduplicated
30+
// using singleflight so that only one build runs at a time per key.
2931
func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string, error) {
3032
r.imageCacheMu.Lock()
3133
if imageID, ok := r.imageCache[workingDir]; ok {
@@ -34,16 +36,27 @@ func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string
3436
}
3537
r.imageCacheMu.Unlock()
3638

37-
imageID, err := r.buildEvalImage(ctx, workingDir)
39+
// singleflight ensures only one build per working directory runs at a time.
40+
// The cache write inside the callback guarantees the result is available
41+
// before singleflight releases the key, so subsequent callers always
42+
// hit the cache above.
43+
v, err, _ := r.imageBuildGroup.Do(workingDir, func() (any, error) {
44+
imageID, err := r.buildEvalImage(ctx, workingDir)
45+
if err != nil {
46+
return "", err
47+
}
48+
49+
r.imageCacheMu.Lock()
50+
r.imageCache[workingDir] = imageID
51+
r.imageCacheMu.Unlock()
52+
53+
return imageID, nil
54+
})
3855
if err != nil {
3956
return "", err
4057
}
4158

42-
r.imageCacheMu.Lock()
43-
r.imageCache[workingDir] = imageID
44-
r.imageCacheMu.Unlock()
45-
46-
return imageID, nil
59+
return v.(string), nil
4760
}
4861

4962
func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string, error) {

pkg/evaluation/eval.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"time"
2020

2121
"github.com/google/uuid"
22+
"golang.org/x/sync/singleflight"
2223

2324
"github.com/docker/docker-agent/pkg/config"
2425
"github.com/docker/docker-agent/pkg/config/latest"
@@ -39,6 +40,9 @@ type Runner struct {
3940
// Key is the working directory (empty string for no working dir).
4041
imageCache map[string]string
4142
imageCacheMu sync.Mutex
43+
44+
// imageBuildGroup deduplicates concurrent image builds for the same working directory.
45+
imageBuildGroup singleflight.Group
4246
}
4347

4448
// newRunner creates a new evaluation runner.
@@ -290,10 +294,12 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
290294
evals = &session.EvalCriteria{}
291295
}
292296

297+
userMessages := getUserMessages(evalSess.Session)
298+
293299
result := Result{
294300
InputPath: evalSess.SourcePath,
295301
Title: evalSess.Title,
296-
Question: strings.Join(getUserMessages(evalSess.Session), "\n"),
302+
Question: strings.Join(userMessages, "\n"),
297303
SizeExpected: evals.Size,
298304
RelevanceExpected: float64(len(evals.Relevance)),
299305
}
@@ -310,7 +316,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
310316
return result, fmt.Errorf("building eval image: %w", err)
311317
}
312318

313-
events, err := r.runDockerAgentInContainer(ctx, imageID, getUserMessages(evalSess.Session), evals.Setup)
319+
events, err := r.runDockerAgentInContainer(ctx, imageID, userMessages, evals.Setup)
314320
if err != nil {
315321
return result, fmt.Errorf("running docker agent in container: %w", err)
316322
}
@@ -323,7 +329,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
323329
result.Size = getResponseSize(result.Response)
324330

325331
// Build session from events for database storage
326-
result.Session = SessionFromEvents(events, evalSess.Title, getUserMessages(evalSess.Session))
332+
result.Session = SessionFromEvents(events, evalSess.Title, userMessages)
327333
result.Session.Evals = evals
328334

329335
if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {

pkg/evaluation/judge.go

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,13 @@ type Judge struct {
6060
model provider.Provider
6161
runConfig *config.RuntimeConfig
6262
concurrency int
63+
64+
// judgeWithSchema is a provider pre-configured with structured output.
65+
// Created lazily on first use and reused across all relevance checks.
66+
// Protected by judgeWithSchemaMu; only cached on success so that
67+
// transient errors (e.g. context cancellation) can be retried.
68+
judgeWithSchema provider.Provider
69+
judgeWithSchemaMu sync.Mutex
6370
}
6471

6572
// NewJudge creates a new Judge that runs relevance checks with the given concurrency.
@@ -141,16 +148,37 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
141148
return passed, failed, errs
142149
}
143150

144-
// checkSingle checks a single relevance criterion against the response.
145-
// It returns whether the check passed, the reason provided by the judge, and any error.
146-
func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (passed bool, reason string, err error) {
151+
// getOrCreateJudgeWithSchema returns a provider pre-configured with structured output.
152+
// The provider is created once and reused across all relevance checks.
153+
// Unlike sync.Once, transient failures (e.g. context cancellation) are not
154+
// cached, allowing subsequent calls to retry.
155+
func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provider, error) {
156+
j.judgeWithSchemaMu.Lock()
157+
defer j.judgeWithSchemaMu.Unlock()
158+
159+
if j.judgeWithSchema != nil {
160+
return j.judgeWithSchema, nil
161+
}
162+
147163
modelCfg := j.model.BaseConfig().ModelConfig
148-
judgeWithSchema, err := provider.New(
164+
p, err := provider.New(
149165
ctx,
150166
&modelCfg,
151167
j.runConfig.EnvProvider(),
152168
options.WithStructuredOutput(judgeResponseSchema),
153169
)
170+
if err != nil {
171+
return nil, err
172+
}
173+
174+
j.judgeWithSchema = p
175+
return j.judgeWithSchema, nil
176+
}
177+
178+
// checkSingle checks a single relevance criterion against the response.
179+
// It returns whether the check passed, the reason provided by the judge, and any error.
180+
func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (passed bool, reason string, err error) {
181+
judgeWithSchema, err := j.getOrCreateJudgeWithSchema(ctx)
154182
if err != nil {
155183
return false, "", fmt.Errorf("creating judge provider with structured output: %w", err)
156184
}

0 commit comments

Comments
 (0)