Skip to content

Commit 179ce1b

Browse files
authored
[SYCL] Query device timestamp from UR for submission time (#21436)
Remove submission time calculation via base device timestamp and host time delta. Such calculation is unreliable because of the clock drift between device and host and inevitably leads to breaking the specification requirement that submit time must be <= start time. Even though querying device timestamp from UR presumably costs more, it doesn't make sense to have an optimization that breaks correctness, considering that submission time recording is done only when queue has profiling enabled.
1 parent 05593e2 commit 179ce1b

File tree

3 files changed

+9
-63
lines changed

3 files changed

+9
-63
lines changed

sycl/source/detail/device_impl.cpp

Lines changed: 4 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -309,61 +309,11 @@ ur_native_handle_t device_impl::getNative() const {
309309
return Handle;
310310
}
311311

312-
// On the first call this function queries for device timestamp
313-
// along with host synchronized timestamp and stores it in member variable
314-
// MDeviceHostBaseTime. Subsequent calls to this function would just retrieve
315-
// the host timestamp, compute difference against the host timestamp in
316-
// MDeviceHostBaseTime and calculate the device timestamp based on the
317-
// difference.
318-
//
319-
// The MDeviceHostBaseTime is refreshed with new device and host timestamp
320-
// after a certain interval (determined by TimeTillRefresh) to account for
321-
// clock drift between host and device.
322-
//
323312
uint64_t device_impl::getCurrentDeviceTime() {
324-
auto GetGlobalTimestamps = [this](ur_device_handle_t Device,
325-
uint64_t *DeviceTime, uint64_t *HostTime) {
326-
auto Result =
327-
getAdapter().call_nocheck<UrApiKind::urDeviceGetGlobalTimestamps>(
328-
Device, DeviceTime, HostTime);
329-
if (Result == UR_RESULT_ERROR_INVALID_OPERATION) {
330-
// NOTE(UR port): Removed the call to GetLastError because we shouldn't
331-
// be calling it after ERROR_INVALID_OPERATION: there is no
332-
// adapter-specific error.
333-
throw detail::set_ur_error(
334-
sycl::exception(
335-
make_error_code(errc::feature_not_supported),
336-
"Device and/or backend does not support querying timestamp."),
337-
UR_RESULT_ERROR_INVALID_OPERATION);
338-
} else {
339-
getAdapter().checkUrResult<errc::feature_not_supported>(Result);
340-
}
341-
};
342-
343-
uint64_t HostTime = 0;
344-
uint64_t Diff = 0;
345-
// To account for potential clock drift between host clock and device clock.
346-
// The value set is arbitrary: 200 seconds
347-
constexpr uint64_t TimeTillRefresh = 200e9;
348-
// If getCurrentDeviceTime is called for the first time or we have to refresh.
349-
std::shared_lock<std::shared_mutex> ReadLock(MDeviceHostBaseTimeMutex);
350-
if (!MDeviceHostBaseTime.second || Diff > TimeTillRefresh) {
351-
ReadLock.unlock();
352-
std::unique_lock<std::shared_mutex> WriteLock(MDeviceHostBaseTimeMutex);
353-
// Recheck the condition after acquiring the write lock.
354-
if (MDeviceHostBaseTime.second && Diff <= TimeTillRefresh) {
355-
// If we are here, it means that another thread has already updated
356-
// MDeviceHostBaseTime, so we can just return the current device time.
357-
return MDeviceHostBaseTime.first + Diff;
358-
}
359-
GetGlobalTimestamps(MDevice, &MDeviceHostBaseTime.first,
360-
&MDeviceHostBaseTime.second);
361-
} else {
362-
GetGlobalTimestamps(MDevice, nullptr, &HostTime);
363-
assert(HostTime >= MDeviceHostBaseTime.second);
364-
Diff = HostTime - MDeviceHostBaseTime.second;
365-
}
366-
return MDeviceHostBaseTime.first + Diff;
313+
uint64_t DeviceTime = 0;
314+
getAdapter().call<UrApiKind::urDeviceGetGlobalTimestamps>(
315+
MDevice, &DeviceTime, nullptr);
316+
return DeviceTime;
367317
}
368318

369319
bool device_impl::extOneapiCanBuild(

sycl/source/detail/device_impl.hpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2249,9 +2249,6 @@ class device_impl {
22492249
// This is used for getAdapter so should be above other properties.
22502250
platform_impl &MPlatform;
22512251

2252-
std::shared_mutex MDeviceHostBaseTimeMutex;
2253-
std::pair<uint64_t, uint64_t> MDeviceHostBaseTime{0, 0};
2254-
22552252
const ur_device_handle_t MRootDevice;
22562253

22572254
// Devices track a list of active queues on it, to allow for synchronization

sycl/test-e2e/Basic/submit_time.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
// RUN: %{build} -o %t.out
2-
// There is an issue with reported device time for the L0 backend, works only on
3-
// pvc for now. No such problems for other backends.
4-
// RUN: %if (!level_zero || arch-intel_gpu_pvc) %{ %{run} %t.out %}
2+
// RUN: %{run} %t.out
53

6-
// Check that submission time is calculated properly.
4+
// Check that submission time is valid.
75

86
// Test fails on hip flakily, disable temprorarily.
97
// UNSUPPORTED: hip
@@ -19,11 +17,12 @@
1917

2018
int main(void) {
2119
constexpr size_t n = 16;
20+
constexpr size_t iter_count = 100;
2221
sycl::queue q({sycl::property::queue::enable_profiling{}});
2322
int *data = sycl::malloc_host<int>(n, q);
2423
int *dest = sycl::malloc_host<int>(n, q);
2524

26-
for (int i = 0; i < 5; i++) {
25+
for (int i = 0; i < iter_count; i++) {
2726
auto event = q.submit([&](sycl::handler &cgh) {
2827
cgh.parallel_for<class KernelTime>(
2928
sycl::range<1>(n), [=](sycl::id<1> idx) { data[idx] = idx; });
@@ -52,7 +51,7 @@ int main(void) {
5251
uint64_t memcpy_submit_time = 0;
5352
uint64_t memcpy_start_time = 0;
5453
uint64_t memcpy_end_time = 0;
55-
for (int i = 0; i < 5; i++) {
54+
for (int i = 0; i < iter_count; i++) {
5655
auto memcpy_event = q.memcpy(dest, data, sizeof(int) * n);
5756
memcpy_event.wait();
5857

0 commit comments

Comments
 (0)