Description
URLSession.data() causes a deadlock when the task is cancelled while handling a network response
The issue seems to be happening because URLSession.data() is implemented using withTaskCancellationHandler, whose onCancel: closure is called while holding an internal lock (StatusRecordLock). The onCancel: calls URLSessionTask.cancel() which tries to acquire workQueue.
At the same time, a network thread may already be using workQueue to handle a network response or error. This involves resuming the withCheckedThrowingContinuation which requires acquiring the same StatusRecordLock.
I've added a stack trace below. Feel free to correct me if I've misunderstood the flow.
Reproduction
See code within collapsible section below:
main.swift
// `swift run main` to execute with session.data(for:)
// `swift run main --fix` to execute with session.dataTask
import Foundation
import FoundationNetworking
// `--fix` on the command line selects the dataTask-based variant;
// without it the script runs the `session.data(for:)` variant.
let useFix = CommandLine.arguments.contains("--fix")

// Announce which variant this run exercises.
if useFix {
    print("Mode: FIXED -- await (continuation + session.dataTask)")
} else {
    print("Mode: BUGGY -- await session.data(for:)")
}
// Timestamp of the most recent `tick()`; every access goes through `wq`.
var lastProgress = Date()
// Serial queue guarding `lastProgress`.
let wq = DispatchQueue(label: "wd")

/// Records "now" as the latest moment at which progress was made.
func tick() {
    wq.sync {
        lastProgress = Date()
    }
}
// Watchdog: wakes up once per second and, when `tick()` has not been
// called for at least 10 seconds, reports the stall and sends SIGINT
// to this process.
DispatchQueue.global().async {
    while true {
        Thread.sleep(forTimeInterval: 1)
        let elapsed: TimeInterval = wq.sync {
            Date().timeIntervalSince(lastProgress)
        }
        // Still making progress: keep polling.
        guard elapsed >= 10 else { continue }
        print("\n*** no progress for \(Int(elapsed))s ***")
        kill(getpid(), SIGINT)
    }
}
// The endpoint does not need to exist; an ECONNREFUSED is sufficient
// to reproduce the issue.
let url = URL(string: "http://127.0.1.1:19999/")!
// Shared session used by both `buggedFetch()` and `fixedFetch()`.
let session = URLSession.shared
/// Requests `url` through the async `URLSession.data(for:)` API and
/// returns the response body, discarding the `URLResponse`.
///
/// This is the variant selected when `--fix` is NOT passed, i.e. the
/// code path the report identifies as deadlocking under cancellation.
///
/// - Throws: Whatever `session.data(for:)` throws.
func buggedFetch() async throws -> Data {
    let request = URLRequest(url: url)
    let result = try await session.data(for: request)
    return result.0
}
/// Requests `url` with a completion-handler `dataTask`, bridged to
/// async/await through a checked continuation.
///
/// This is the variant selected when `--fix` is passed. The completion
/// handler resumes the continuation exactly once: with the error when
/// one is reported, otherwise with the received data (or empty `Data`
/// when none was delivered).
///
/// - Throws: The error passed to the data task's completion handler.
func fixedFetch() async throws -> Data {
    try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<Data, Error>) in
        let dataTask = session.dataTask(with: url) { body, _, failure in
            if let failure {
                continuation.resume(throwing: failure)
            } else {
                continuation.resume(returning: body ?? Data())
            }
        }
        dataTask.resume()
    }
}
// Number of start-then-cancel rounds to attempt.
let iterations = 10_000

// Driver: each round starts a fetch in its own task, waits briefly,
// then cancels it. Reaching the end means no round got stuck.
Task {
    for i in 0..<iterations {
        // Tell the watchdog the loop is still alive.
        tick()
        if i.isMultiple(of: 500) {
            print("Iteration \(i)")
        }

        let task = Task<Data, Error> {
            if useFix {
                return try await fixedFetch()
            }
            return try await buggedFetch()
        }

        // Give the child task time to start/process the HTTP request.
        try? await Task.sleep(nanoseconds: 100_000)
        task.cancel()
    }

    print("Completed \(iterations) iterations")
    exit(0)
}

// Park the main thread; the process ends via `exit(0)` above or the
// watchdog's SIGINT.
dispatchMain()
When analyzing the example with gdb, we can see the following deadlock scenario:
- Thread 25 acquires StatusRecordLock at frame 9
- Thread 25 waits for workQueue at frame 4
- Thread 9 acquires workQueue at frame 25
- Thread 9 waits for StatusRecordLock at frame 4
Stack-trace of the code (when executing `session.data()`)
// gdb thread bt
Thread 25 (Thread 0x7fffa1ffb640 (LWP 110879) "main"):
#0 syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1 0x00007ffff7606831 in _dispatch_futex_wait () from /usr/lib/swift/linux/libdispatch.so
#2 0x00007ffff760692d in _dispatch_thread_event_wait_slow () from /usr/lib/swift/linux/libdispatch.so
#3 0x00007ffff75fd0c5 in __DISPATCH_WAIT_FOR_QUEUE__ () from /usr/lib/swift/linux/libdispatch.so
#4 0x00007ffff75fcc1e in _dispatch_sync_f_slow () from /usr/lib/swift/linux/libdispatch.so
#5 0x00007ffff75bbc62 in $s8Dispatch0A5QueueC4sync7executeyyyXE_tF () from /usr/lib/swift/linux/libswiftDispatch.so
#6 0x00007ffff674e59d in $s20FoundationNetworking14URLSessionTaskC6cancelyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#7 0x00007ffff6732d96 in $s20FoundationNetworking10URLSessionC11CancelState33_A4EC80B15160B9532B0458D8341D1269LLC6cancelyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#8 0x00007ffff7881738 in void __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus)>::callback_fn<swift_task_cancelImpl(swift::AsyncTask*)::$_0>(long, swift::ActiveTaskStatus) () from /usr/lib/swift/linux/libswift_Concurrency.so
#9 0x00007ffff788062f in withStatusRecordLock(swift::AsyncTask*, swift::ActiveTaskStatus, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus)>, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus, swift::ActiveTaskStatus&)>) () from /usr/lib/swift/linux/libswift_Concurrency.so
#10 0x00007ffff7880efd in swift_task_cancel () from /usr/lib/swift/linux/libswift_Concurrency.so
#11 0x00005555555579c0 in $s4mainyyYacfU0_TY2_ ()
#12 0x00007ffff787b370 in swift::runJobInEstablishedExecutorContext(swift::Job*) () from /usr/lib/swift/linux/libswift_Concurrency.so
#13 0x00007ffff787bf25 in swift_job_run () from /usr/lib/swift/linux/libswift_Concurrency.so
#14 0x00007ffff75f5b8b in _dispatch_continuation_pop () from /usr/lib/swift/linux/libdispatch.so
#15 0x00007ffff75f59bf in _dispatch_async_redirect_invoke () from /usr/lib/swift/linux/libdispatch.so
#16 0x00007ffff75fe936 in _dispatch_worker_thread () from /usr/lib/swift/linux/libdispatch.so
#17 0x00007ffff63f0ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#18 0x00007ffff64828d0 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
Thread 9 (Thread 0x7fffdaffd640 (LWP 110863) "main"):
#0 futex_wait (private=0, expected=2, futex_word=0x7fff88000b78) at ../sysdeps/nptl/futex-internal.h:146
#1 __GI___lll_lock_wait (futex=futex@entry=0x7fff88000b78, private=0) at ./nptl/lowlevellock.c:49
#2 0x00007ffff63f4002 in lll_mutex_lock_optimized (mutex=0x7fff88000b78) at ./nptl/pthread_mutex_lock.c:48
#3 ___pthread_mutex_lock (mutex=0x7fff88000b78) at ./nptl/pthread_mutex_lock.c:93
#4 0x00007ffff787fe33 in waitForStatusRecordUnlockIfNotSelfLocked(swift::AsyncTask*, swift::ActiveTaskStatus&) () from /usr/lib/swift/linux/libswift_Concurrency.so
#5 0x00007ffff788050c in withStatusRecordLock(swift::AsyncTask*, swift::ActiveTaskStatus, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus)>, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus, swift::ActiveTaskStatus&)>) () from /usr/lib/swift/linux/libswift_Concurrency.so
#6 0x00007ffff7880a42 in swift::updateStatusRecord(swift::AsyncTask*, swift::TaskStatusRecord*, __swift::__runtime::llvm::function_ref<void ()>, swift::ActiveTaskStatus&, __swift::__runtime::llvm::function_ref<void (swift::ActiveTaskStatus, swift::ActiveTaskStatus&)>) () from /usr/lib/swift/linux/libswift_Concurrency.so
#7 0x00007ffff787cdf0 in swift::AsyncTask::flagAsAndEnqueueOnExecutor(swift::SerialExecutorRef) () from /usr/lib/swift/linux/libswift_Concurrency.so
#8 0x00007ffff6734e02 in $s20FoundationNetworking10URLSessionC4data3for8delegate0A10Essentials4DataV_AA11URLResponseCtAA10URLRequestV_AA0C12TaskDelegate_pSgtYaKFAI_AKtyYaKXEfU_yScCyAI_AKts5Error_pGXEfU_yAISg_AKSgsAP_pSgtYbcfU_Tm () from /usr/lib/swift/linux/libFoundationNetworking.so
#9 0x00007ffff6736b75 in $s20FoundationNetworking10URLSessionC8download4from8delegate0A10Essentials3URLV_AA11URLResponseCtAI_AA0C12TaskDelegate_pSgtYaKFAI_AKtyYaKXEfU_yScCyAI_AKts5Error_pGXEfU_yAISg_AKSgsAN_pSgtYbcfU_TATm () from /usr/lib/swift/linux/libFoundationNetworking.so
#10 0x00007ffff6747ba9 in $s20FoundationNetworking15_ProtocolClientC03urlC04task16didFailWithErroryAA14URLSessionTaskC_s0J0_ptFTf4nnd_n () from /usr/lib/swift/linux/libFoundationNetworking.so
#11 0x00007ffff67454c7 in $s20FoundationNetworking15_ProtocolClientCAA011URLProtocolD0A2aDP03urlC0_16didFailWithErroryAA0E0C_s0J0_ptFTW () from /usr/lib/swift/linux/libFoundationNetworking.so
#12 0x00007ffff6727d7c in $s20FoundationNetworking15_NativeProtocolC8failWith5error7requesty0A07NSErrorC_AA10URLRequestVtF () from /usr/lib/swift/linux/libFoundationNetworking.so
#13 0x00007ffff6729cca in $s20FoundationNetworking15_NativeProtocolC17transferCompleted9withErrory0A07NSErrorCSg_tF () from /usr/lib/swift/linux/libFoundationNetworking.so
#14 0x00007ffff6725cee in $s20FoundationNetworking10URLSessionC12_MultiHandleC17completedTransfer33_76FB65EF45BDE9115477B690E87F5E28LL07forEasyE08easyCodeySv_So012CFURLSessionpR0VtF () from /usr/lib/swift/linux/libFoundationNetworking.so
#15 0x00007ffff672593a in $s20FoundationNetworking10URLSessionC12_MultiHandleC25readAndWriteAvailableData33_76FB65EF45BDE9115477B690E87F5E28LL2onys5Int32V_tKF () from /usr/lib/swift/linux/libFoundationNetworking.so
#16 0x00007ffff67254f4 in $s20FoundationNetworking10URLSessionC12_MultiHandleC3addyyAA05_EasyE0CF () from /usr/lib/swift/linux/libFoundationNetworking.so
#17 0x00007ffff6735dde in $s20FoundationNetworking10URLSessionCAA0C8ProtocolA2aDP3add6handleyAA11_EasyHandleC_tFTW () from /usr/lib/swift/linux/libFoundationNetworking.so
#18 0x00007ffff672cb76 in $s20FoundationNetworking15_NativeProtocolC13internalStateAC09_InternalF0OvW () from /usr/lib/swift/linux/libFoundationNetworking.so
#19 0x00007ffff672c42e in $s20FoundationNetworking15_NativeProtocolC6resumeyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#20 0x00007ffff672d031 in $s20FoundationNetworking15_NativeProtocolC16startNewTransfer4withyAA10URLRequestV_tFyAA14URLSessionTaskC5_BodyOcfU_ () from /usr/lib/swift/linux/libFoundationNetworking.so
#21 0x00007ffff672bc6b in $s20FoundationNetworking15_NativeProtocolC16startNewTransfer4withyAA10URLRequestV_tF () from /usr/lib/swift/linux/libFoundationNetworking.so
#22 0x00007ffff672c232 in $s20FoundationNetworking15_NativeProtocolC6resumeyyF () from /usr/lib/swift/linux/libFoundationNetworking.so
#23 0x00007ffff673f22a in $s20FoundationNetworking14URLSessionTaskC6resumeyyFyyXEfU_yAA11URLProtocolCSgcfU_yyYbcfU_ () from /usr/lib/swift/linux/libFoundationNetworking.so
#24 0x00007ffff66f6c56 in $sIegh_IeyBh_TR () from /usr/lib/swift/linux/libFoundationNetworking.so
#25 0x00007ffff75ef417 in _dispatch_call_block_and_release () from /usr/lib/swift/linux/libdispatch.so
#26 0x00007ffff75f8b02 in _dispatch_lane_serial_drain () from /usr/lib/swift/linux/libdispatch.so
#27 0x00007ffff75f93a9 in _dispatch_lane_invoke () from /usr/lib/swift/linux/libdispatch.so
#28 0x00007ffff75f8a28 in _dispatch_lane_serial_drain () from /usr/lib/swift/linux/libdispatch.so
#29 0x00007ffff75f93a9 in _dispatch_lane_invoke () from /usr/lib/swift/linux/libdispatch.so
#30 0x00007ffff75f8a28 in _dispatch_lane_serial_drain () from /usr/lib/swift/linux/libdispatch.so
#31 0x00007ffff75f93a9 in _dispatch_lane_invoke () from /usr/lib/swift/linux/libdispatch.so
#32 0x00007ffff75fe936 in _dispatch_worker_thread () from /usr/lib/swift/linux/libdispatch.so
#33 0x00007ffff63f0ac3 in start_thread (arg=<optimized out>) at ./nptl/pthread_create.c:442
#34 0x00007ffff64828d0 in clone3 () at ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
Environment
Ubuntu 22.04.5 LTS
Swift version 6.1.2 (swift-6.1.2-RELEASE)
Additional information
As implemented in the example code, the deadlock can be avoided by using URLSession.dataTask() instead of URLSession.data().
Description
URLSession.data() causes a deadlock when the task is cancelled while handling a network response
The issue seems to be happening because URLSession.data() is implemented using withTaskCancellationHandler, whose
onCancel: closure is called while holding an internal lock (StatusRecordLock). The onCancel: calls URLSessionTask.cancel() which tries to acquire workQueue. At the same time, a network thread may already be using workQueue to handle a network response or error. This involves resuming the withCheckedThrowingContinuation which requires acquiring the same StatusRecordLock.
I've added a stack trace below. Feel free to correct me if I've misunderstood the flow.
Reproduction
See code within collapsible section below:
main.swift
When analyzing the example with gdb, we can see the following deadlock scenario:
Stack-trace of the code (when executing `session.data()`)
Environment
Ubuntu 22.04.5 LTS
Swift version 6.1.2 (swift-6.1.2-RELEASE)
Additional information
As implemented in the example code, the deadlock can be avoided by using
URLSession.dataTask() instead of URLSession.data().