Skip to content

URLSession.data() deadlock when task is cancelled while handling a network response #5412

@NathanOlmanst

Description

@NathanOlmanst

Description

URLSession.data() causes a deadlock when the task is cancelled while handling a network response

The issue seems to happen because URLSession.data() is implemented using withTaskCancellationHandler, whose onCancel: closure is called while the Swift Concurrency runtime holds an internal lock (StatusRecordLock). The onCancel: closure calls URLSessionTask.cancel(), which blocks in a synchronous dispatch (DispatchQueue.sync) onto workQueue.

At the same time, a network thread may already be running on workQueue to handle a network response or error. Handling it resumes the continuation created by withCheckedThrowingContinuation, which requires acquiring the same StatusRecordLock. Each thread therefore holds the resource the other one is waiting for.

I've added a stack trace below. Feel free to correct me if I've misunderstood the flow.

Reproduction

See code within collapsible section below:

main.swift
// `swift run main` to execute with session.data(for:)
// `swift run main --fix` to execute with session.dataTask

import Foundation
import FoundationNetworking

// Pick the code path from the command line: passing `--fix` selects the
// dataTask-based variant, otherwise session.data(for:) is exercised.
let useFix = CommandLine.arguments.contains("--fix")
if useFix {
    print("Mode: FIXED -- await (continuation + session.dataTask)")
} else {
    print("Mode: BUGGY -- await session.data(for:)")
}

// Watchdog state: the time at which the main loop last reported progress.
// After initialization it is read and written only inside `wq.sync`.
var lastProgress = Date()
let wq = DispatchQueue(label: "wd")

/// Records the current time as the most recent progress point.
func tick() {
    wq.sync(execute: {
        lastProgress = Date()
    })
}
// Watchdog: once per second, check how long ago tick() was last called.
// After 10 seconds without progress, report the stall and send SIGINT to
// this process.
DispatchQueue.global().async {
    while true {
        Thread.sleep(forTimeInterval: 1)

        // Read the shared timestamp on `wq`, the same queue tick() writes it on.
        let elapsed: TimeInterval = wq.sync {
            Date().timeIntervalSince(lastProgress)
        }
        guard elapsed >= 10 else { continue }

        print("\n*** no progress for \(Int(elapsed))s ***")
        kill(getpid(), SIGINT)
    }
}

// The endpoint does not need to exist; a refused connection (ECONNREFUSED) is sufficient.
let url = URL(string: "http://127.0.1.1:19999/")!
let session = URLSession.shared

/// Requests `url` through the async `session.data(for:)` API and returns the
/// response body, discarding the `URLResponse`.
///
/// - Throws: Whatever error `session.data(for:)` throws.
func buggedFetch() async throws -> Data {
    let request = URLRequest(url: url)
    let result = try await session.data(for: request)
    return result.0
}

/// Requests `url` with a completion-handler data task bridged to async/await
/// through a checked continuation.
///
/// - Returns: The response body, or empty `Data` when the task reports
///   neither an error nor a body.
/// - Throws: The error passed to the data task's completion handler.
func fixedFetch() async throws -> Data {
    return try await withCheckedThrowingContinuation { continuation in
        let dataTask = session.dataTask(with: url) { body, _, error in
            // The completion handler resumes the continuation on exactly one branch.
            if let error {
                continuation.resume(throwing: error)
            } else {
                continuation.resume(returning: body ?? Data())
            }
        }
        dataTask.resume()
    }
}

// Driver: repeatedly start a fetch in its own task and cancel it shortly
// afterwards, so that cancellation races with the handling of the response.
let iterations = 10_000
Task {
    for i in 0..<iterations {
        // Tell the watchdog that the loop is still making progress.
        tick()
        if i.isMultiple(of: 500) {
            print("Iteration \(i)")
        }

        let task = Task<Data, Error> {
            if useFix {
                return try await fixedFetch()
            }
            return try await buggedFetch()
        }

        // Give the fetch task 100,000 ns (0.1 ms) to start/process the HTTP
        // request before cancelling it.
        try? await Task.sleep(nanoseconds: 100_000)
        task.cancel()
    }

    print("Completed \(iterations) iterations")
    exit(0)
}

// Park the main thread and let Dispatch run the queued work; this call never
// returns. The process ends through exit(0) in the driver task or through the
// watchdog's SIGINT.
dispatchMain()

When analyzing the example with gdb, we can see the following deadlock scenario:

  • Thread 25 acquires StatusRecordLock at frame 9
  • Thread 25 waits for workQueue at frame 4
  • Thread 9 acquires workQueue at frame 25
  • Thread 9 waits for StatusRecordLock at frame 4
Stack-trace of the code (when executing `session.data()`)
// gdb thread bt
Thread 25 (Thread 0x7fffa1ffb640 (LWP 110879) "main"):
#0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1  0x00007ffff7606831 in _dispatch_futex_wait () from /usr/lib/swift/linux/libdispatch.so
#2  0x00007ffff760692d in _dispatch_thread_event_wait_slow () from /usr/lib/swift/linux/libdispatch.so
#3  0x00007ffff75fd0c5 in __DISPATCH_WAIT_FOR_QUEUE__ () from /usr/lib/swift/linux/libdispatch.so
#4  0x00007ffff75fcc1e in _dispatch_sync_f_slow () from /usr/lib/swift/linux/libdispatch.so
#5  0x00007ffff75bbc62 in $s8Dispatch0A5QueueC4sync7executeyyyXE_tF () from /usr/lib/swift/linux/libswiftDispatch.so
#6  0x00007ffff674e59d in $s20FoundationNetworking14URLSessionTaskC6cancelyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#7  0x00007ffff6732d96 in $s20FoundationNetworking10URLSessionC11CancelState33_A4EC80B15160B9532B0458D8341D1269LLC6cancelyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#8  0x00007ffff7881738 in void __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus)>::callback_fn<swift_task_cancelImpl(swift::AsyncTask*)::$_0>(long, swift::ActiveTaskStatus) () from /usr/lib/swift/linux/libswift_Concurrency.so
#9  0x00007ffff788062f in withStatusRecordLock(swift::AsyncTask*, swift::ActiveTaskStatus, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus)>, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus, swift::ActiveTaskStatus&)>) () from /usr/lib/swift/linux/libswift_Concurrency.so
#10 0x00007ffff7880efd in swift_task_cancel () from /usr/lib/swift/linux/libswift_Concurrency.so
#11 0x00005555555579c0 in $s4mainyyYacfU0_TY2_ ()
#12 0x00007ffff787b370 in swift::runJobInEstablishedExecutorContext(swift::Job*) () from /usr/lib/swift/linux/libswift_Concurrency.so
#13 0x00007ffff787bf25 in swift_job_run () from /usr/lib/swift/linux/libswift_Concurrency.so
#14 0x00007ffff75f5b8b in _dispatch_continuation_pop () from /usr/lib/swift/linux/libdispatch.so
#15 0x00007ffff75f59bf in _dispatch_async_redirect_invoke () from /usr/lib/swift/linux/libdispatch.so
#16 0x00007ffff75fe936 in _dispatch_worker_thread () from /usr/lib/swift/linux/libdispatch.so
#17 0x00007ffff63f0ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#18 0x00007ffff64828d0 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Thread 9 (Thread 0x7fffdaffd640 (LWP 110863) "main"):
#0  futex_wait (private=0, expected=2, futex_word=0x7fff88000b78) at ../sysdeps/nptl/futex-internal.h:146
#1  __GI___lll_lock_wait (futex=futex@entry=0x7fff88000b78, private=0) at ./nptl/lowlevellock.c:49
#2  0x00007ffff63f4002 in lll_mutex_lock_optimized (mutex=0x7fff88000b78) at ./nptl/pthread_mutex_lock.c:48
#3  ___pthread_mutex_lock (mutex=0x7fff88000b78) at ./nptl/pthread_mutex_lock.c:93
#4  0x00007ffff787fe33 in waitForStatusRecordUnlockIfNotSelfLocked(swift::AsyncTask*, swift::ActiveTaskStatus&) () from /usr/lib/swift/linux/libswift_Concurrency.so
#5  0x00007ffff788050c in withStatusRecordLock(swift::AsyncTask*, swift::ActiveTaskStatus, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus)>, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus, swift::ActiveTaskStatus&)>) () from /usr/lib/swift/linux/libswift_Concurrency.so
#6  0x00007ffff7880a42 in swift::updateStatusRecord(swift::AsyncTask*, swift::TaskStatusRecord*, __swift::__runtime::llvm::function_ref<void ()>, swift::ActiveTaskStatus&, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus, swift::ActiveTaskStatus&)>) () from /usr/lib/swift/linux/libswift_Concurrency.so
#7  0x00007ffff787cdf0 in swift::AsyncTask::flagAsAndEnqueueOnExecutor(swift::SerialExecutorRef) () from /usr/lib/swift/linux/libswift_Concurrency.so
#8  0x00007ffff6734e02 in $s20FoundationNetworking10URLSessionC4data3for8delegate0A10Essentials4DataV_AA11URLResponseCtAA10URLRequestV_AA0C12TaskDelegate_pSgtYaKFAI_AKtyYaKXEfU_yScCyAI_AKts5Error_pGXEfU_yAISg_AKSgsAP_pSgtYbcfU_Tm () from /usr/lib/swift/linux/libFoundationNetworking.so
#9  0x00007ffff6736b75 in $s20FoundationNetworking10URLSessionC8download4from8delegate0A10Essentials3URLV_AA11URLResponseCtAI_AA0C12TaskDelegate_pSgtYaKFAI_AKtyYaKXEfU_yScCyAI_AKts5Error_pGXEfU_yAISg_AKSgsAN_pSgtYbcfU_TATm () from /usr/lib/swift/linux/libFoundationNetworking.so
#10 0x00007ffff6747ba9 in $s20FoundationNetworking15_ProtocolClientC03urlC04task16didFailWithErroryAA14URLSessionTaskC_s0J0_ptFTf4nnd_n () from /usr/lib/swift/linux/libFoundationNetworking.so
#11 0x00007ffff67454c7 in $s20FoundationNetworking15_ProtocolClientCAA011URLProtocolD0A2aDP03urlC0_16didFailWithErroryAA0E0C_s0J0_ptFTW () from /usr/lib/swift/linux/libFoundationNetworking.so
#12 0x00007ffff6727d7c in $s20FoundationNetworking15_NativeProtocolC8failWith5error7requesty0A07NSErrorC_AA10URLRequestVtF () from /usr/lib/swift/linux/libFoundationNetworking.so
#13 0x00007ffff6729cca in $s20FoundationNetworking15_NativeProtocolC17transferCompleted9withErrory0A07NSErrorCSg_tF () from /usr/lib/swift/linux/libFoundationNetworking.so
#14 0x00007ffff6725cee in $s20FoundationNetworking10URLSessionC12_MultiHandleC17completedTransfer33_76FB65EF45BDE9115477B690E87F5E28LL07forEasyE08easyCodeySv_So012CFURLSessionpR0VtF () from /usr/lib/swift/linux/libFoundationNetworking.so
#15 0x00007ffff672593a in $s20FoundationNetworking10URLSessionC12_MultiHandleC25readAndWriteAvailableData33_76FB65EF45BDE9115477B690E87F5E28LL2onys5Int32V_tKF () from /usr/lib/swift/linux/libFoundationNetworking.so
#16 0x00007ffff67254f4 in $s20FoundationNetworking10URLSessionC12_MultiHandleC3addyyAA05_EasyE0CF () from /usr/lib/swift/linux/libFoundationNetworking.so
#17 0x00007ffff6735dde in $s20FoundationNetworking10URLSessionCAA0C8ProtocolA2aDP3add6handleyAA11_EasyHandleC_tFTW () from /usr/lib/swift/linux/libFoundationNetworking.so
#18 0x00007ffff672cb76 in $s20FoundationNetworking15_NativeProtocolC13internalStateAC09_InternalF0OvW () from /usr/lib/swift/linux/libFoundationNetworking.so
#19 0x00007ffff672c42e in $s20FoundationNetworking15_NativeProtocolC6resumeyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#20 0x00007ffff672d031 in $s20FoundationNetworking15_NativeProtocolC16startNewTransfer4withyAA10URLRequestV_tFyAA14URLSessionTaskC5_BodyOcfU_ () from /usr/lib/swift/linux/libFoundationNetworking.so
#21 0x00007ffff672bc6b in $s20FoundationNetworking15_NativeProtocolC16startNewTransfer4withyAA10URLRequestV_tF () from /usr/lib/swift/linux/libFoundationNetworking.so
#22 0x00007ffff672c232 in $s20FoundationNetworking15_NativeProtocolC6resumeyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#23 0x00007ffff673f22a in $s20FoundationNetworking14URLSessionTaskC6resumeyyFyyXEfU_yAA11URLProtocolCSgcfU_yyYbcfU_ () from /usr/lib/swift/linux/libFoundationNetworking.so
#24 0x00007ffff66f6c56 in $sIegh_IeyBh_TR () from /usr/lib/swift/linux/libFoundationNetworking.so
#25 0x00007ffff75ef417 in _dispatch_call_block_and_release () from /usr/lib/swift/linux/libdispatch.so
#26 0x00007ffff75f8b02 in _dispatch_lane_serial_drain () from /usr/lib/swift/linux/libdispatch.so
#27 0x00007ffff75f93a9 in _dispatch_lane_invoke () from /usr/lib/swift/linux/libdispatch.so
#28 0x00007ffff75f8a28 in _dispatch_lane_serial_drain () from /usr/lib/swift/linux/libdispatch.so
#29 0x00007ffff75f93a9 in _dispatch_lane_invoke () from /usr/lib/swift/linux/libdispatch.so
#30 0x00007ffff75f8a28 in _dispatch_lane_serial_drain () from /usr/lib/swift/linux/libdispatch.so
#31 0x00007ffff75f93a9 in _dispatch_lane_invoke () from /usr/lib/swift/linux/libdispatch.so
#32 0x00007ffff75fe936 in _dispatch_worker_thread () from /usr/lib/swift/linux/libdispatch.so
#33 0x00007ffff63f0ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#34 0x00007ffff64828d0 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81

Environment

Ubuntu 22.04.5 LTS
Swift version 6.1.2 (swift-6.1.2-RELEASE)

Additional information

As implemented in the example code, the deadlock can be avoided by using URLSession.dataTask() instead of URLSession.data().

Metadata

Metadata

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions