Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion pkg/sentry/watchdog/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("//tools:defs.bzl", "go_library")
load("//tools:defs.bzl", "go_library", "go_test")

package(
default_applicable_licenses = ["//:license"],
Expand All @@ -17,3 +17,15 @@ go_library(
"//pkg/sync",
],
)

go_test(
name = "watchdog_test",
size = "small",
srcs = [
"watchdog_test.go",
],
library = ":watchdog",
deps = [
"@com_github_google_go_cmp//cmp:go_default_library",
],
)
88 changes: 82 additions & 6 deletions pkg/sentry/watchdog/watchdog.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ package watchdog
import (
"bytes"
"fmt"
"regexp"
"strconv"
"strings"
"time"

"gvisor.dev/gvisor/pkg/log"
Expand Down Expand Up @@ -234,7 +237,9 @@ func (w *Watchdog) waitForStart() {

var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
w.doAction(w.StartupTimeoutAction, false, &buf)
var stuckTasks map[int64]struct{}
forceStackDump := false
w.doAction(w.StartupTimeoutAction, forceStackDump, stuckTasks, &buf)
}

// loop is the main watchdog routine. It only returns when 'Stop()' is called.
Expand Down Expand Up @@ -320,26 +325,30 @@ func (w *Watchdog) runTurn() {
func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
var buf bytes.Buffer
buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
stuckTasks := make(map[int64]struct{})
for t, o := range offenders {
tid := w.k.TaskSet().Root.IDOfTask(t)
buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime)))
stuckTasks[t.GoroutineID()] = struct{}{}
}
buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)")

// Force stack dump only if a new task is detected.
w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
forceStackDump := newTaskFound
w.doAction(w.TaskTimeoutAction, forceStackDump, stuckTasks, &buf)
}

func (w *Watchdog) reportStuckWatchdog() {
var buf bytes.Buffer
buf.WriteString("Watchdog goroutine is stuck")
w.doAction(w.TaskTimeoutAction, false, &buf)
var stuckTasks map[int64]struct{}
forceStackDump := false
w.doAction(w.StartupTimeoutAction, forceStackDump, stuckTasks, &buf)
}

// doAction will take the given action. If the action is LogWarning, the stack
// is not always dumped to the log to prevent log flooding. "forceStack"
// guarantees that the stack will be dumped regardless.
func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
func (w *Watchdog) doAction(action Action, forceStack bool, stuckTasks map[int64]struct{}, msg *bytes.Buffer) {
switch action {
case LogWarning:
// Dump stack only if forced or sometime has passed since the last time a
Expand All @@ -355,7 +364,7 @@ func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
case Panic:
// Panic will skip over running tasks, which is likely the culprit here. So manually
// dump all stacks before panic'ing.
log.TracebackAll(msg.String())
tracebackAllWithStuckSuffix(stuckTasks, msg.Bytes())

// Attempt to flush metrics, timeout and move on in case metrics are stuck as well.
metricsEmitted := make(chan struct{}, 1)
Expand All @@ -375,3 +384,70 @@ func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {

}
}

func tracebackAllWithStuckSuffix(stuckTasks map[int64]struct{}, msg []byte) {
stacks := log.Stacks(true)
// The full stack dump can be too big to search for stuck goroutines.
// Moreover, Syzkaller-produced console logs are limited to 128KB on either
// side of the panic message in doAction(), causing the full dump being
// written here to be truncated. So we add a section where we collate all
// known stuck stacks.
stuckHeader := "***** BEGIN STUCK GOROUTINES (there may be more, read the preceding full dump) *****"
stuckStacks := stuckGoroutineStacks(stacks, stuckTasks)
stuckFooter := "***** END STUCK GOROUTINES *****\n"

log.Warningf("%s\n%s\n\n%s\n%s\n%s", msg, stacks, stuckHeader, stuckStacks, stuckFooter)
}

func stuckGoroutineStacks(stacks []byte, stuckTasks map[int64]struct{}) []byte {
const maxStuckGoroutineSectionBytes = 64 * 1024

// goroutineheader() from runtime/traceback.go always prints the "goroutine <id>" prefix.
// The header always includes a "[<state>" and always ends with a "]:". The goroutine ID is
// the first capture group, and the stuff within the brackets is the second.
anyHdrRegex := regexp.MustCompile(`^\s*goroutine\s(\d+).*\s\[(.*)\]\:`)
stuckStateRegex := regexp.MustCompile(strings.Join([]string{
`semacquire`,
`sync\.(RW)?Mutex\.R?Lock`,
}, "|"))

result := new(bytes.Buffer)
inStuckStack := false
remaining := stacks
result.WriteByte('\n')

for len(remaining) > 0 {
lineEnd := bytes.IndexByte(remaining, '\n')
var line []byte

if lineEnd == -1 {
line = remaining
remaining = nil
} else {
line = remaining[:lineEnd]
remaining = remaining[lineEnd+1:]
}

if match := anyHdrRegex.FindSubmatch(line); match != nil {
isStuckHdr := false
if goroutineID, err := strconv.ParseInt(string(match[1]), 10, 64); err == nil {
if _, ok := stuckTasks[goroutineID]; ok {
isStuckHdr = true
}
}
isStuckHdr = isStuckHdr || stuckStateRegex.Match(match[2])
inStuckStack = isStuckHdr
}

if inStuckStack {
if result.Len() >= maxStuckGoroutineSectionBytes {
fmt.Fprintf(result, "... stuck section truncated, hit the limit of %d bytes\n", maxStuckGoroutineSectionBytes)
break
}
result.Write(line)
result.WriteByte('\n')
}
}

return result.Bytes()
}
67 changes: 67 additions & 0 deletions pkg/sentry/watchdog/watchdog_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package watchdog

import (
"strings"
"testing"

"github.com/google/go-cmp/cmp"
)

func TestStuckGoroutineStacks(t *testing.T) {
innocent0 := `goroutine 124 [select, 1 minutes]:
gvisor.dev/gvisor/pkg/sentry/ktime.(*SampledTimer).runGoroutine(0xc0008466c0)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sentry/ktime/sampled_timer.go:235 +0xd4
created by gvisor.dev/gvisor/pkg/sentry/ktime.(*SampledTimer).init in goroutine 123
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sentry/ktime/sampled_timer.go:102 +0x239`

innocent1 := `goroutine 75 gp=0xc000460c40 m=nil [GC worker (idle), 15 minutes]:
runtime.gopark(0x0?, 0x0?, 0x0?, 0x0?, 0x0?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/proc.go:460 +0xce fp=0xc000466f38 sp=0xc000466f18 pc=0x48094e
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/asm_amd64.s:1693 +0x1 fp=0xc000466fe8 sp=0xc000466fe0 pc=0x4896c1
created by runtime.gcBgMarkStartWorkers in goroutine 1
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/mgc.go:1373 +0x105`

stuckNetstackGoroutine := `goroutine 25916 [sync.RWMutex.RLock, 3 minutes]:
sync.runtime_Semacquire(0x41de33?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/sema.go:71 +0x25
gvisor.dev/gvisor/pkg/sync.(*CrossGoroutineRWMutex).Lock(0xc0006a3b08?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:154 +0x67
gvisor.dev/gvisor/pkg/sync.(*RWMutex).Lock(...)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:292
created by gvisor.dev/gvisor/pkg/tcpip/link/veth.NewPair in goroutine 25922
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/tcpip/link/veth/veth.go:107 +0x2f3`

stuckTaskGoroutine := `goroutine 26128 [deliberately-not-semacquire, 3 minutes]:
sync.runtime_Semacquire(0x47e205?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/go/src/runtime/sema.go:71 +0x25
gvisor.dev/gvisor/pkg/sync.(*CrossGoroutineRWMutex).RLock(...)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:78
gvisor.dev/gvisor/pkg/sync.(*RWMutex).RLock(...)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sync/rwmutex_unsafe.go:259
gvisor.dev/gvisor/pkg/sentry/kernel.(*Task).doSyscall(0xc000fae160?)
/syzkaller/.cache/bazel/_bazel_root/8c3527d6b90f1bcab77f0f2b1b1fec96/sandbox/linux-sandbox/6/execroot/_main/gopath/src/gvisor.dev/gvisor/pkg/sentry/kernel/task_run.go:97 +0x3e8`

stuckTasks := make(map[int64]struct{})
stuckTasks[26128] = struct{}{} // 26128 is stuckTaskGoroutine.
allStacks := []string{innocent0, stuckNetstackGoroutine, innocent1, stuckTaskGoroutine}
wantStacks := "\n" + strings.Join([]string{stuckNetstackGoroutine, stuckTaskGoroutine}, "\n\n") + "\n"

gotStuckStacks := string(stuckGoroutineStacks([]byte(strings.Join(allStacks, "\n\n")), stuckTasks))
if diff := cmp.Diff(wantStacks, gotStuckStacks); diff != "" {
t.Errorf("stuckGoroutineStacks() returned unexpected diff (-want +got):\n%s", diff)
}
}
Loading