diff --git a/packages/opencode/src/altimate/observability/tracing.ts b/packages/opencode/src/altimate/observability/tracing.ts index c6a3e7f6e..e0e742bc1 100644 --- a/packages/opencode/src/altimate/observability/tracing.ts +++ b/packages/opencode/src/altimate/observability/tracing.ts @@ -148,6 +148,15 @@ export interface TraceFile { export interface TraceExporter { readonly name: string export(trace: TraceFile): Promise + // altimate_change start — M3 fix: optional crash-guard hook. + // If implemented, Trace.flushSync() calls this with the crashing session's + // id BEFORE writing the canonical crashed file. Exporters that write to a + // shared canonical location (e.g. FileExporter on the local filesystem) + // should use it to suppress in-flight or subsequent exports that would + // race the synchronous crash write. Exporters that POST to remote + // endpoints (e.g. HttpExporter) don't need to implement it. + markCrashed?(sessionId: string): void + // altimate_change end } // --------------------------------------------------------------------------- @@ -165,6 +174,30 @@ export class FileExporter implements TraceExporter { readonly name = "file" private dir: string private maxFiles: number + // altimate_change start — M3 fix: per-session crash-guard. + // We track which sessions have been crashed-out via flushSync rather than + // a single boolean, because the same FileExporter instance is shared + // across many Trace instances in the TUI worker (worker.ts caches a + // single exporter array at module init, then every session does + // `Trace.withExporters([...tracingExporters], ...)` which spreads the + // array but shares the inner objects). A single boolean would cause one + // crashed session to permanently suppress export() for all subsequent + // sessions in the same worker. Keying by sessionId scopes the guard to + // the actual crashing trace. + private crashedSessions = new Set() + /** Mark a session's exports as superseded by a synchronous crash write. + * Subsequent or in-flight export() calls for that session bail at the + * next checkpoint (entry, pre-writeFile, or pre-rename). Idempotent. + * The sessionId is normalized through the same sanitization that + * buildTraceFile / export use for the on-disk file name, so callers + * can pass the raw `this.sessionId` without worrying about whether + * it contains path-unsafe characters. */ + markCrashed(sessionId: string) { + if (!sessionId) return + const safeId = sessionId.replace(/[/\\.:]/g, "_") || "unknown" + this.crashedSessions.add(safeId) + } + // altimate_change end constructor(dir?: string, maxFiles?: number) { this.dir = dir ?? DEFAULT_TRACES_DIR @@ -172,6 +205,10 @@ export class FileExporter implements TraceExporter { } async export(trace: TraceFile): Promise { + // altimate_change start — M3 fix: bail at entry if this session has been + // marked crashed by a concurrent flushSync. + if (this.crashedSessions.has(trace.sessionId)) return undefined + // altimate_change end let tmpPath: string | undefined try { await fs.mkdir(this.dir, { recursive: true }) @@ -181,7 +218,20 @@ export class FileExporter implements TraceExporter { // Atomic write: write to temp file, then rename — prevents partial reads // when concurrent snapshots or exports target the same file tmpPath = filePath + `.tmp.${Date.now()}.${Math.random().toString(36).slice(2, 8)}` + // altimate_change start — M3 fix: re-check before the long writeFile. + // For large traces the writeFile takes 100+ms — a wide window for + // flushSync to interleave. + if (this.crashedSessions.has(trace.sessionId)) return undefined + // altimate_change end await fs.writeFile(tmpPath, JSON.stringify(trace, null, 2)) + // altimate_change start — M3 fix: re-check before the rename. If + // flushSync ran during the writeFile, drop the tmp instead of + // overwriting flushSync's crashed canonical file. + if (this.crashedSessions.has(trace.sessionId)) { + await fs.unlink(tmpPath).catch(() => {}) + return undefined + } + // altimate_change end await fs.rename(tmpPath, filePath) if (this.maxFiles > 0) { @@ -337,6 +387,17 @@ export class Trace { private metadata: TraceFile["metadata"] = {} private snapshotDir: string | undefined private snapshotPending = false + // altimate_change start — M2 fix: if a snapshot was requested while another + // was in flight (and thus dropped by the snapshotPending debounce), we + // schedule exactly one follow-up snapshot when the in-flight one finishes. + // Without this, the on-disk file lags memory until a fresh event arrives + // — and if the process exits in that gap, the burst's tail is lost. + private snapshotRequestedDuringPending = false + // Once endTrace begins its canonical write, no more snapshots may run — + // they would race endTrace's mutation of the root span (endTime, status) + // and could clobber endTrace's content with stale pre-end state. + private endTraceStarted = false + // altimate_change end private snapshotPromise: Promise | undefined // Set true when flushSync runs. Prevents in-flight async snapshot() // calls from racing with the synchronous crash write — without this, @@ -750,9 +811,25 @@ export class Trace { */ private snapshot() { if (!this.snapshotDir || !this.sessionId) return - if (this.snapshotPending) return // Debounce — only one in flight at a time + // altimate_change start — M2 fix: when debounce drops this call, record + // that another snapshot is needed once the in-flight one settles. + // Without this, the disk file lags memory whenever events arrive faster + // than snapshots can complete (typical for bursty LLM turns). + if (this.snapshotPending) { + this.snapshotRequestedDuringPending = true + return + } + // No new snapshots once endTrace has begun: the canonical write is now + // endTrace's job, and a concurrent snapshot would capture state from + // BEFORE endTrace's root-span mutation and could lose to-end-only fields. + if (this.endTraceStarted) return + // altimate_change end if (this.crashed) return // flushSync wrote the canonical crashed file; do NOT race it this.snapshotPending = true + // altimate_change start — M2 fix: reset before kicking off, so any + // snapshot() calls during this write set the dirty flag for our .finally. + this.snapshotRequestedDuringPending = false + // altimate_change end const trace = this.buildTraceFile() const safeId = (this.sessionId || "unknown").replace(/[/\\.:]/g, "_") || "unknown" @@ -779,6 +856,17 @@ export class Trace { .finally(() => { this.snapshotPending = false this.snapshotPromise = undefined + // altimate_change start — M2 fix: if any span was added (or any + // snapshot was requested) while we were writing, schedule exactly + // one follow-up to flush those changes. Bounded: at most one extra + // write per "burst → quiet" cycle, regardless of burst size. + // Skip the follow-up if endTrace has taken over the canonical write, + // or if flushSync has marked the trace crashed. + if (this.snapshotRequestedDuringPending && !this.crashed && !this.endTraceStarted) { + this.snapshotRequestedDuringPending = false + queueMicrotask(() => this.snapshot()) + } + // altimate_change end }) } @@ -793,13 +881,26 @@ export class Trace { } /** - * Wait for any pending async snapshot to complete. - * Use from tests that need to read the trace file deterministically after - * a span completion (instead of `await sleep(50)` which races on slow CI runners). - * No-op if no snapshot is in flight. + * Wait for the on-disk snapshot to catch up with in-memory state. + * + * Drains both the current in-flight snapshot and any M2 follow-up snapshot + * scheduled by its `.finally`. Bounded by `MAX_DRAIN_ITER` to guard against + * pathological cases where events arrive faster than they can be drained. */ async flush(): Promise { - if (this.snapshotPromise) await this.snapshotPromise.catch(() => {}) + // altimate_change start — M2 fix companion: drain follow-up snapshots so + // callers (and tests) see disk == memory after flush() returns. Without + // this drain loop, the M2 fix's queueMicrotask follow-up runs AFTER + // flush() resolves and the disk still trails by one snapshot. + const MAX_DRAIN_ITER = 16 + for (let i = 0; i < MAX_DRAIN_ITER; i++) { + if (this.snapshotPromise) await this.snapshotPromise.catch(() => {}) + if (!this.snapshotRequestedDuringPending && !this.snapshotPromise) return + // Yield so any queued microtask (the M2 follow-up) gets to run and + // populate snapshotPromise for the next iteration to await. + await new Promise((r) => queueMicrotask(r)) + } + // altimate_change end } /** @@ -871,6 +972,29 @@ export class Trace { * Returns the result from the first exporter that succeeds (typically the file path). */ async endTrace(error?: string): Promise { + // altimate_change start — M2 fix companion: claim the canonical write + // BEFORE awaiting the in-flight snapshot so any follow-up snapshot the + // in-flight one might schedule in its `.finally` is suppressed. + // + // Microtask-ordering invariant we rely on below: when an in-flight + // snapshot resolves, its `.finally` runs synchronously inside the + // promise chain — and only THEN does the `await this.snapshotPromise` + // (line below) hand control back to this function. By that point, if + // the .finally tried to schedule a follow-up snapshot via + // `queueMicrotask`, the follow-up is queued but hasn't run yet. We set + // `endTraceStarted=true` BEFORE the await, so whether the .finally + // already checked the flag (during the await) or the queued microtask + // runs after this point (calling `snapshot()` which re-checks at line + // ~803), both paths see the flag and bail. + this.endTraceStarted = true + // M3 fix: if flushSync already wrote the canonical crashed file, return + // the path to that file directly. Calling exporters would either bail + // (FileExporter, via the per-session crash guard) or POST stale + // "completed" data (HttpExporter). The crash signal is authoritative + // across exporters once flushSync has fired. + if (this.crashed) return this.getTracePath() + // altimate_change end + // Wait for any in-flight snapshot to complete before final write if (this.snapshotPromise) await this.snapshotPromise.catch(() => {}) @@ -980,6 +1104,21 @@ export class Trace { // Set crashed BEFORE writing so any in-flight async snapshot() will // bail out at its rename step instead of clobbering our crashed file. this.crashed = true + // altimate_change start — M3 fix: notify every exporter that the current + // session is crashed BEFORE writing the canonical file synchronously + // below. Exporters that share a canonical write target (e.g. FileExporter) + // can suppress in-flight or future writes for this session. Polymorphic + // dispatch via the optional `markCrashed?` method — no instanceof check, + // so custom exporters with similar semantics participate automatically. + // Without this, an in-flight `await endTrace()` whose export() is + // mid-writeFile (a 100+ms window on multi-MB traces) lands its rename + // after this synchronous write and clobbers the crashed content. + if (this.sessionId) { + for (const exp of this.exporters) { + exp.markCrashed?.(this.sessionId) + } + } + // altimate_change end this.currentGenerationSpanId = undefined const rootSpan = this.spans.find((s) => s.spanId === this.rootSpanId) if (rootSpan) { diff --git a/packages/opencode/test/altimate/tracing-display-crash.test.ts b/packages/opencode/test/altimate/tracing-display-crash.test.ts index 0eda918b3..5017cd3c0 100644 --- a/packages/opencode/test/altimate/tracing-display-crash.test.ts +++ b/packages/opencode/test/altimate/tracing-display-crash.test.ts @@ -497,18 +497,23 @@ describe("flushSync — multiple calls", () => { expect(traceFile.summary.status).toBe("crashed") }) - test("flushSync then endTrace — endTrace overwrites crashed status", async () => { + test("flushSync then endTrace — flushSync's crashed status is preserved (M3 fix)", async () => { const tracer = Recap.withExporters([new FileExporter(tmpDir)]) tracer.startTrace("s-flush-then-end", { prompt: "test" }) await new Promise((r) => setTimeout(r, 50)) tracer.flushSync("early crash") + const flushedPath = tracer.getTracePath()! - // But actually the process survived — endTrace completes normally - const filePath = await tracer.endTrace() - const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath!, "utf-8")) - // endTrace should overwrite with "completed" - expect(traceFile.summary.status).toBe("completed") + // After flushSync, endTrace short-circuits and returns the canonical + // crashed file path. The crashed write is authoritative — endTrace + // does not call any exporter once Trace.crashed is set. + const endPath = await tracer.endTrace() + expect(endPath).toBe(flushedPath) + + const traceFile: TraceFile = JSON.parse(await fs.readFile(flushedPath, "utf-8")) + expect(traceFile.summary.status).toBe("crashed") + expect(traceFile.summary.error).toContain("early crash") }) }) diff --git a/packages/opencode/test/altimate/tracing-rename-race.test.ts b/packages/opencode/test/altimate/tracing-rename-race.test.ts new file mode 100644 index 000000000..96420c255 --- /dev/null +++ b/packages/opencode/test/altimate/tracing-rename-race.test.ts @@ -0,0 +1,507 @@ +/** + * Repro for the trace corruption bug: flushSync's synchronous write can be + * clobbered by an in-flight async snapshot's fs.rename. + * + * The race: + * 1. snapshot() is called → buildTraceFile, writeFile(tmp), then check `crashed`, then rename(tmp, final) + * 2. The `crashed` check passes synchronously (false at that point), so fs.rename is scheduled + * 3. flushSync() runs synchronously, sets `crashed=true`, writeFileSync(final, crashed content) + * 4. fs.rename completes — atomic rename overwrites the crashed file with the (stale) snapshot content + * + * The `crashed` flag was added (see Round-3 audit comment at tracing.ts:341-345) to fix this, + * but the fix is incomplete: the check happens BEFORE fs.rename's syscall completes, so once the + * check has passed the rename is already on its way and can't be cancelled. + * + * This reproducer injects a small delay into fs.rename to make the race deterministic. + * In production the same window is opened by slow disk I/O, large trace serialization, or + * a busy event loop. + */ + +import { describe, test, expect, beforeEach, afterEach } from "bun:test" +import fs from "fs/promises" +import path from "path" +import os from "os" +import { Trace, FileExporter, type TraceFile } from "../../src/altimate/observability/tracing" + +describe("trace corruption — flushSync vs in-flight rename race", () => { + let tmpDir: string + let originalRename: typeof fs.rename + + beforeEach(async () => { + tmpDir = path.join(os.tmpdir(), `trace-rename-race-${Date.now()}-${Math.random().toString(36).slice(2)}`) + await fs.mkdir(tmpDir, { recursive: true }) + originalRename = fs.rename + }) + + afterEach(async () => { + // Safety guard: restore fs.rename even if a test failed mid-patch. + // Without this, a patched fs.rename leaks into subsequent test files + // because `fs/promises` is a process-global namespace object. + if ((fs as any).rename !== originalRename) { + ;(fs as any).rename = originalRename + } + await fs.rm(tmpDir, { recursive: true, force: true }).catch(() => {}) + }) + + // M1 — flushSync's content overwritten by an in-flight snapshot's late + // fs.rename. The `crashed` flag added in commit 38463876b checks before + // the rename is initiated, but once initiated the kernel rename completes + // asynchronously and can clobber flushSync's sync write. THIS PR DOES + // NOT CLOSE THIS WINDOW — it remains as an architectural residual at the + // kernel level. On local SSD (APFS/ext4) the window is sub-millisecond + // and naturally protected by the existing `crashed` flag (see M1-natural + // tests below). Risk concentrates on non-POSIX filesystems (network FS, + // FUSE, sync-mediated cloud drives). + // + // This deterministic test (with an injected fs.rename delay) is skipped + // because the residual is documented in the umbrella issue and a known- + // failing test in CI is noise. Restore it locally with `test()` to + // experiment with kernel-rename TOCTOU mitigations. + test.skip("M1 residual — flushSync clobbered by adversarially-delayed snapshot rename (architectural)", async () => { + ;(fs as any).rename = async (from: string, to: string) => { + await new Promise((r) => setTimeout(r, 200)) + return originalRename(from, to) + } + + const sessionId = "race-victim" + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: "long-running task" }) + await new Promise((r) => setTimeout(r, 300)) + + tracer.logToolCall({ + tool: "bash", + callID: "c-pre-crash", + state: { status: "completed", input: {}, output: "ok", time: { start: 1, end: 2 } }, + }) + await new Promise((r) => setTimeout(r, 50)) + tracer.flushSync("SIGTERM received during long session") + await new Promise((r) => setTimeout(r, 300)) + + const filePath = path.join(tmpDir, `${sessionId}.json`) + const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath, "utf-8")) + expect(traceFile.summary.status).toBe("crashed") + }) + + test("natural race — burst of events + flushSync, no fs patch (statistical)", async () => { + // No fs.rename patch — pure stock behavior. We run many iterations with + // a burst of events followed by an immediate flushSync. The natural timing + // between writeFile completion and rename is small but nonzero; over enough + // iterations the race will fire at least once on most machines. + const ITERATIONS = 50 + const failures: Array<{ iter: number; status: string; hasError: boolean }> = [] + + for (let i = 0; i < ITERATIONS; i++) { + const sessionId = `natural-${i}` + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: `iter ${i}` }) + + // Burst of spans — each triggers snapshot(). The debounce drops most; + // the first one is in flight when we crash. + for (let j = 0; j < 8; j++) { + tracer.logToolCall({ + tool: `tool-${j}`, + callID: `c${i}-${j}`, + state: { + status: "completed", + input: { i, j, payload: "x".repeat(500) }, + output: "ok", + time: { start: 1, end: 2 }, + }, + }) + } + + tracer.flushSync(`crash-${i}`) + + // Give the in-flight async rename time to land (or not). + await new Promise((r) => setTimeout(r, 20)) + + const filePath = path.join(tmpDir, `${sessionId}.json`) + const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath, "utf-8")) + if (traceFile.summary.status !== "crashed" || !traceFile.summary.error?.includes(`crash-${i}`)) { + failures.push({ + iter: i, + status: traceFile.summary.status, + hasError: !!traceFile.summary.error, + }) + } + } + + // Report what we observed so the test failure message is actionable + if (failures.length > 0) { + console.error(`Natural race hit ${failures.length}/${ITERATIONS} times. Sample failures:`) + for (const f of failures.slice(0, 5)) console.error(` iter=${f.iter} status=${f.status} hasError=${f.hasError}`) + } + expect(failures).toEqual([]) + }) + + // M2 — debounce drops events from in-flight snapshots, no follow-up scheduled. + // If a crash happens after the in-flight snapshot completes but before the next + // event triggers another snapshot, the on-disk file is stale. + test("M2 — events arriving during in-flight snapshot are not snapshotted until the next event", async () => { + // Slow rename so events can pile up during the in-flight write + ;(fs as any).rename = async (from: string, to: string) => { + await new Promise((r) => setTimeout(r, 150)) + return originalRename(from, to) + } + + const sessionId = "debounce-drop" + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: "test" }) + // Let startTrace's initial snapshot land + await new Promise((r) => setTimeout(r, 250)) + + // Trigger snapshot via tool A; rename will be delayed 150ms + tracer.logToolCall({ + tool: "tool-A", + callID: "a", + state: { status: "completed", input: { i: 1 }, output: "A", time: { start: 1, end: 2 } }, + }) + + // Within the rename's 150ms delay, fire B, C, D. snapshotPending is true, + // so each snapshot() returns early — but the M2 fix sets + // snapshotRequestedDuringPending so a follow-up snapshot is scheduled + // (via queueMicrotask in the in-flight snapshot's .finally). + await new Promise((r) => setTimeout(r, 30)) + tracer.logToolCall({ + tool: "tool-B", + callID: "b", + state: { status: "completed", input: { i: 2 }, output: "B", time: { start: 3, end: 4 } }, + }) + tracer.logToolCall({ + tool: "tool-C", + callID: "c", + state: { status: "completed", input: { i: 3 }, output: "C", time: { start: 5, end: 6 } }, + }) + tracer.logToolCall({ + tool: "tool-D", + callID: "d", + state: { status: "completed", input: { i: 4 }, output: "D", time: { start: 7, end: 8 } }, + }) + + // Wait for A's in-flight snapshot to complete AND the M2 fix's follow-up + // snapshot to also land. The follow-up runs an additional writeFile + + // (patched 150 ms) rename, so we need ~2× the per-snapshot budget. + await new Promise((r) => setTimeout(r, 600)) + + // Read the file. With the M2 fix the .finally() schedules a follow-up + // snapshot when events arrived during the in-flight write; all four + // tools should now land on disk. + const filePath = path.join(tmpDir, `${sessionId}.json`) + const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath, "utf-8")) + const toolNames = traceFile.spans.filter((s) => s.kind === "tool").map((s) => s.name).sort() + + expect(toolNames).toEqual(["tool-A", "tool-B", "tool-C", "tool-D"]) + }) + + // Note: an "M3 deterministic" test that patches fs.rename to add an + // artificial delay would exercise the same kernel-rename TOCTOU as the + // M1 deterministic test above — that residual is architectural and not + // closed by this PR. The natural M3 test below (large trace, no patch) + // is the load-bearing regression test for the M3 fix: it exercises the + // long-writeFile window, which is the only window that fires naturally + // and which FileExporter._crashed actually catches. + + // M1 natural — verifies the EXISTING `crashed` flag (commit 38463876b) + // continues to protect against the snapshot-rename TOCTOU under natural + // timing on local SSD. No fs patches, no workers. + // Regression test for pre-existing protection, not for this PR's new fix. + test("M1-natural — crashed flag protects flushSync against the snapshot rename race", async () => { + let observedClobber = 0 + const TRIALS = 5 + for (let trial = 0; trial < TRIALS; trial++) { + const sessionId = `natural-m1-${trial}` + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: "long-running task simulation" }) + + // Preload some bulky spans so the snapshot's writeFile has real work + const bulk = "x".repeat(2_000) + for (let i = 0; i < 500; i++) { + tracer.logToolCall({ + tool: `preload-${i % 16}`, + callID: `pre-${trial}-${i}`, + state: { + status: "completed", + input: { i, payload: bulk }, + output: bulk, + time: { start: i, end: i + 1 }, + }, + }) + } + await new Promise((r) => setTimeout(r, 50)) + + tracer.logToolCall({ + tool: "trigger", + callID: `trigger-${trial}`, + state: { status: "completed", input: { trial }, output: "go", time: { start: 1, end: 2 } }, + }) + tracer.flushSync(`natural-crash-${trial}`) + await new Promise((r) => setTimeout(r, 300)) + + const filePath = path.join(tmpDir, `${sessionId}.json`) + const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath, "utf-8")) + if (traceFile.summary.status !== "crashed" || !traceFile.summary.error?.includes(`natural-crash-${trial}`)) { + observedClobber++ + } + } + // On local SSD the kernel rename's syscall completes too fast for + // flushSync to interleave; existing `crashed` flag closes the window. + expect(observedClobber).toBe(0) + }, 30_000) + + // M2 under natural conditions: NO fs patch. Just burst events fast. + // If the burst overlaps with an in-flight snapshot, late events get dropped + // and never re-snapshotted. A crash (or process exit without endTrace) at + // that moment leaves the file missing the most recent events. + test("M2-natural — burst of events while snapshot in flight, file ends stale (no fs patch)", async () => { + let observedDrops = 0 + const TRIALS = 20 + const detail: Array<{ trial: number; expected: number; actual: number; missing: number }> = [] + + for (let trial = 0; trial < TRIALS; trial++) { + const sessionId = `natural-m2-${trial}` + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: "burst sim" }) + + // Preload bulk so the snapshot's writeFile takes real time + const bulk = "x".repeat(2_000) + for (let i = 0; i < 800; i++) { + tracer.logToolCall({ + tool: `pre-${i}`, + callID: `pre-${trial}-${i}`, + state: { + status: "completed", + input: { i, payload: bulk }, + output: bulk, + time: { start: i, end: i + 1 }, + }, + }) + } + // Let preload settle + await new Promise((r) => setTimeout(r, 50)) + + // Burst of 8 fresh events — most of which will arrive during an + // in-flight snapshot and get dropped. + const burstNames = [] + for (let i = 0; i < 8; i++) { + const name = `burst-${i}` + burstNames.push(name) + tracer.logToolCall({ + tool: name, + callID: `b-${trial}-${i}`, + state: { status: "completed", input: { i, payload: bulk }, output: "ok", time: { start: 1, end: 2 } }, + }) + } + + // Simulate "process exits without endTrace" by waiting for any + // in-flight write to settle, then reading. NO subsequent event fires + // to trigger a catch-up snapshot. + await new Promise((r) => setTimeout(r, 100)) + + const filePath = path.join(tmpDir, `${sessionId}.json`) + const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath, "utf-8")) + const burstFound = traceFile.spans.filter((s) => s.name.startsWith("burst-")).map((s) => s.name) + const missing = burstNames.length - burstFound.length + if (missing > 0) observedDrops++ + detail.push({ trial, expected: burstNames.length, actual: burstFound.length, missing }) + } + + // Regression: after the M2 fix (follow-up snapshot scheduled in .finally), + // all burst events must land on disk before the file is read. + if (observedDrops > 0) { + console.error(`Natural M2 regression: ${observedDrops}/${TRIALS} had dropped events`) + for (const d of detail.slice(0, 5)) + console.error(` trial=${d.trial} expected=${d.expected} actual=${d.actual} missing=${d.missing}`) + } + expect(observedDrops).toBe(0) + }, 60_000) + + // (M1-natural-aggressive — pool saturation + huge JSON + CPU oversubscription — + // ran 10 trials with 25 MB JSON, 16 worker_threads, 200 concurrent 1 MB noise + // writes; result was 0/10 clobbered. The existing `crashed` flag closes the + // window on local SSD under all the natural pressure we could apply. The + // residual kernel-rename TOCTOU is documented in the umbrella issue. + // Test deleted because it adds no coverage over M1-natural and ran ~10 minutes.) + + // M3 natural: NO fs patch. endTrace's export() does writeFile(tmp) → rename(tmp, final). + // For a large preloaded trace, the writeFile takes real time (tens to hundreds of ms), + // giving flushSync a wide window to interleave. export() has NO crashed-guard, so + // its rename always lands afterward and overwrites flushSync's crashed content. + test("M3-natural — endTrace export() race fires without fs patch on large trace", async () => { + let observedClobber = 0 + const TRIALS = 10 + const detail: Array<{ trial: number; status: string; hasError: boolean; bytes: number }> = [] + + for (let trial = 0; trial < TRIALS; trial++) { + const sessionId = `natural-m20-${trial}` + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: "natural m20" }) + + // Big preload so export's writeFile takes real wall-clock time + const bulk = "x".repeat(5_000) + for (let i = 0; i < 5_000; i++) { + tracer.logToolCall({ + tool: `pre-${i % 32}`, + callID: `pre-${trial}-${i}`, + state: { + status: "completed", + input: { i, payload: bulk }, + output: bulk, + time: { start: i, end: i + 1 }, + }, + }) + } + await new Promise((r) => setTimeout(r, 100)) + + // Kick off endTrace asynchronously — it will call FileExporter.export() + // which writeFiles the tmp, then renames. The writeFile of a multi-MB + // JSON takes real time. + const endTracePromise = tracer.endTrace() + + // Wait briefly for export's writeFile to begin (Promise scheduling). + // Then synchronously flushSync — writes filePath via writeFileSync. + // export's writeFile is still in flight; its rename hasn't run yet. + await new Promise((r) => setTimeout(r, 5)) + tracer.flushSync(`natural-m20-crash-${trial}`) + + // Wait for endTrace's rename to land. With no crashed guard in export(), + // the rename will clobber flushSync's content. + await endTracePromise.catch(() => {}) + await new Promise((r) => setTimeout(r, 200)) + + const filePath = path.join(tmpDir, `${sessionId}.json`) + const buf = await fs.readFile(filePath, "utf-8") + const traceFile: TraceFile = JSON.parse(buf) + const wasClobbered = + traceFile.summary.status !== "crashed" || + !traceFile.summary.error?.includes(`natural-m20-crash-${trial}`) + if (wasClobbered) observedClobber++ + detail.push({ + trial, + status: traceFile.summary.status, + hasError: !!traceFile.summary.error, + bytes: buf.length, + }) + } + + // Regression: after the M3 fix (FileExporter._crashed guard + flushSync + // notifies all FileExporters), endTrace's in-flight export() must not + // clobber flushSync's canonical crashed write. + if (observedClobber > 0) { + console.error(`Natural M3 regression: ${observedClobber}/${TRIALS} clobbered`) + for (const d of detail) + console.error(` trial=${d.trial} status=${d.status} hasError=${d.hasError} bytes=${d.bytes}`) + } + expect(observedClobber).toBe(0) + }, 120_000) + + // Regression for the critical issue raised in the consensus review of PR #867: + // FileExporter._crashed was originally an instance-level boolean, but the TUI + // worker caches a single FileExporter across sessions (worker.ts shares + // `tracingExporters` via `Trace.withExporters([...tracingExporters])`). + // A single boolean meant one crashed session would permanently suppress + // exports for every subsequent session in the same worker. The fix is + // per-session crash tracking via Set. + test("shared FileExporter — flushSync on trace A must NOT suppress trace B's export", async () => { + const sharedExporter = new FileExporter(tmpDir) + + // Trace A: flushSync → marks session-A crashed, writes canonical file + const traceA = Trace.withExporters([sharedExporter]) + traceA.startTrace("session-A", { prompt: "A" }) + await new Promise((r) => setTimeout(r, 50)) + traceA.logToolCall({ + tool: "tool-a", + callID: "ca", + state: { status: "completed", input: {}, output: "ok", time: { start: 1, end: 2 } }, + }) + traceA.flushSync("crash-A") + + // Trace B: normal lifecycle on the SAME exporter instance — must export cleanly + const traceB = Trace.withExporters([sharedExporter]) + traceB.startTrace("session-B", { prompt: "B" }) + traceB.logToolCall({ + tool: "tool-b", + callID: "cb", + state: { status: "completed", input: {}, output: "ok", time: { start: 1, end: 2 } }, + }) + const pathB = await traceB.endTrace() + + // Trace A's file: crashed (flushSync wins) + const traceFileA: TraceFile = JSON.parse( + await fs.readFile(path.join(tmpDir, "session-A.json"), "utf-8"), + ) + expect(traceFileA.summary.status).toBe("crashed") + + // Trace B's file: completed (per-session guard scopes A's crash to A only) + expect(pathB).toBe(path.join(tmpDir, "session-B.json")) + const traceFileB: TraceFile = JSON.parse(await fs.readFile(pathB!, "utf-8")) + expect(traceFileB.summary.status).toBe("completed") + }) + + // Regression for cubic-dev-ai's P1 review finding on PR #867: + // markCrashed() was being called with the raw sessionId, but the export() + // suppression check uses trace.sessionId (which comes from buildTraceFile + // and is sanitized through the `[/\\.:]` → `_` regex). A sessionId + // containing any of those characters would store under one key in the + // crashedSessions Set and be looked up under a different key in export(), + // silently bypassing the crash guard. Fix: sanitize at the store side too. + test("crash guard handles unsafe sessionId chars (sanitization parity)", async () => { + const exporter = new FileExporter(tmpDir) + const tracer = Trace.withExporters([exporter]) + // Pick a sessionId that needs sanitization through the file-name regex + const unsafeId = "weird:session/with.unsafe:chars" + tracer.startTrace(unsafeId, { prompt: "test" }) + await new Promise((r) => setTimeout(r, 50)) + tracer.flushSync("crash with unsafe id") + + // Manually invoke export() with the canonical (sanitized) trace data — + // this is what endTrace would do internally. If sanitization parity is + // broken, the markCrashed Set entry won't match and the export will + // overwrite flushSync's content. + const trace: TraceFile = { + version: 2, + traceId: "t", + sessionId: unsafeId.replace(/[/\\.:]/g, "_"), + startedAt: new Date().toISOString(), + metadata: {}, + spans: [], + summary: { + totalTokens: 0, + totalCost: 0, + totalToolCalls: 0, + totalGenerations: 0, + duration: 0, + status: "completed", + tokens: { input: 0, output: 0, reasoning: 0, cacheRead: 0, cacheWrite: 0 }, + }, + } + const result = await exporter.export(trace) + // export() must bail because the session was marked crashed; the + // sanitized lookup must find the (sanitized) entry from markCrashed. + expect(result).toBeUndefined() + + // And the on-disk file must still be flushSync's crashed content + const safeId = unsafeId.replace(/[/\\.:]/g, "_") + const traceFile: TraceFile = JSON.parse( + await fs.readFile(path.join(tmpDir, `${safeId}.json`), "utf-8"), + ) + expect(traceFile.summary.status).toBe("crashed") + }) + + test("baseline — flushSync alone writes crashed content correctly (no race)", async () => { + // Sanity check: without the rename delay, flushSync's content lands and survives. + // If this fails we have a different bug than the race we're investigating. + const sessionId = "baseline" + const tracer = Trace.withExporters([new FileExporter(tmpDir)]) + tracer.startTrace(sessionId, { prompt: "baseline" }) + + // Wait for the initial snapshot to settle + await new Promise((r) => setTimeout(r, 100)) + + tracer.flushSync("SIGTERM baseline") + + const filePath = path.join(tmpDir, `${sessionId}.json`) + const traceFile: TraceFile = JSON.parse(await fs.readFile(filePath, "utf-8")) + expect(traceFile.summary.status).toBe("crashed") + expect(traceFile.summary.error).toContain("SIGTERM baseline") + }) +}) diff --git a/packages/opencode/test/session/real-tool-simulation.test.ts b/packages/opencode/test/session/real-tool-simulation.test.ts index 44199acee..3b21cc2e2 100644 --- a/packages/opencode/test/session/real-tool-simulation.test.ts +++ b/packages/opencode/test/session/real-tool-simulation.test.ts @@ -298,8 +298,10 @@ describe("REAL EXEC: sql_analyze tool", () => { const result = await execSqlAnalyze("SELCT * FORM users") expect(result.output).toContain("Parse error") - // Still gets suggestion on first call since it didn't throw - expect(result.output).toContain("schema_inspect") + // A `result.error` value flips `isRealFailure` to true in sql-analyze.ts, + // which suppresses the progressive-disclosure suggestion. So no + // `schema_inspect` follow-up should appear — matching the test name. + expect(result.output).not.toContain("schema_inspect") }) test("S28: sql_analyze throws — returns error, no suggestion", async () => {