@@ -100,28 +100,55 @@ using device queries, as shown in the example below.
100100
101101[source,c++]
102102----
103- auto bundle = sycl::get_kernel_bundle(q.get_context());
104- auto kernel = bundle.get_kernel<class KernelName>() ;
105- auto maxWGs = kernel.ext_oneapi_get_info<sycl::ext::oneapi::experimental::info::kernel_queue_specific::max_num_work_group_sync>(q);
106- auto range = sycl::nd_range<1>{maxWGs * 32, 32};
107- auto props = sycl::ext::oneapi::experimental::properties{sycl::ext::oneapi::experimental::use_root_sync};
108- q.parallel_for<class KernelName>(range, props, [=](sycl::nd_item<1> it) {
109-
110- // Get a handle to the root-group
103+ #include < sycl/sycl.hpp>
104+ namespace syclex = sycl::ext::oneapi::experimental ;
105+
106+ struct KernelName {
107+ KernelName(size_t *data) : data{data} {}
108+
109+ void operator()(sycl::nd_item<1> it) const {
110+ // Get a handle to the root-group.
111111 auto root = it.ext_oneapi_get_root_group();
112112
113- // Write to some global memory location
114- data[root.get_local_id ()] = root.get_local_id ();
113+ // Write to some global memory location.
114+ data[root.get_local_linear_id ()] = root.get_local_linear_id ();
115115
116- // Synchronize all work-items executing the kernel, making all writes visible
116+ // Synchronize all work-items executing the kernel, making all writes visible.
117117 sycl::group_barrier(root);
118+ }
118119
119- });
120- ----
120+ auto get(syclex::properties_tag) const {
121+ // Kernels that use root-group synchronization must be decorated with the
122+ // "use_root_sync" kernel property.
123+ return syclex::properties{syclex::use_root_sync};
124+ }
125+
126+ size_t *data;
127+ };
128+
129+ int main() {
130+ sycl::queue q;
131+
132+ // When a kernel uses root-group synchronization, the total number of
133+ // work-groups is limited. This limit can vary depending on the kernel,
134+ // so get a "kernel" object representing the kernel when plan to launch.
135+ auto bundle = sycl::get_kernel_bundle<KernelName, sycl::bundle_state::executable>(
136+ q.get_context());
137+ auto kernel = bundle.get_kernel<KernelName>();
138+
139+ // Get the maximum number of work-groups for this kernel. The limit also
140+ // depends on the size of each work-group (which is 32 in this example), and
141+ // on the amount of work-group local memory (which is 0 in this example).
142+ auto maxWGs = kernel.ext_oneapi_get_info<
143+ syclex::info::kernel_queue_specific::max_num_work_groups>(q, 32, 0);
144+
145+ // Construct an nd-range which launches the maximum number of work-groups.
146+ auto ndr = sycl::nd_range<1>{maxWGs * 32, 32};
121147
122- NOTE: SYCL 2020 requires lambdas to be named in order to locate the associated
123- `sycl::kernel` object used to query information descriptors. Reducing the
124- verbosity of the queries shown above is left to a future extension.
148+ size_t *data = sycl::malloc_device<size_t>(maxWGs * 32, q);
149+ q.parallel_for(ndr, KernelName{data}).wait();
150+ }
151+ ----
125152
126153
127154== Specification
@@ -151,20 +178,14 @@ supports.
151178
152179[source,c++]
153180----
154- namespace sycl {
155- namespace ext {
156- namespace oneapi {
157- namespace experimental {
181+ namespace sycl::ext::oneapi::experimental {
158182
159183struct use_root_sync_key {
160184 using value_t = property_value<use_root_sync_key>;
161185};
162186inline constexpr use_root_sync_key::value_t use_root_sync;
163187
164- } // namespace experimental
165- } // namespace oneapi
166- } // namespace ext
167- } // namespace sycl
188+ } // namespace sycl::ext::oneapi::experimental
168189----
169190
170191|===
@@ -188,10 +209,7 @@ additional functions.
188209
189210[source,c++]
190211----
191- namespace sycl {
192- namespace ext {
193- namespace oneapi {
194- namespace experimental {
212+ namespace sycl::ext::oneapi::experimental {
195213
196214enum class execution_scope {
197215 work_item,
@@ -257,10 +275,7 @@ public:
257275
258276};
259277
260- } // namespace experimental
261- } // namespace oneapi
262- } // namespace ext
263- } // namespace sycl
278+ } // namespace sycl::ext::oneapi::experimental
264279----
265280
266281[source,c++]
@@ -421,13 +436,13 @@ NOTE: Support for passing the `root_group` to other group functions and
421436algorithms may be added in a future version of this extension.
422437
423438These group functions and algorithms act as synchronization points, and can
424- only be used in kernels launched with the `use_root_sync` property.
425- Attempting to call these functions in kernels that were not launched with the
439+ only be used in kernels decorated with the `use_root_sync` property.
440+ Attempting to call these functions in kernels that were not decorated with the
426441`use_root_sync` property results in undefined behavior.
427442
428443NOTE: Implementations are encouraged to throw a synchronous error with the
429444`errc::invalid` error code if they are able to detect that a developer has
430- attempted to synchronize a `root_group` from an incompatible kernel launch .
445+ attempted to synchronize a `root_group` from an incompatible kernel.
431446
432447
433448=== Accessing the `root_group` instance
@@ -506,12 +521,12 @@ implementation of this extension. It is not part of the specification of the
506521extension's API.
507522
508523An implementation of this extension using Level Zero could launch kernels
509- associated with the `use_root_sync` property via
524+ decorated with the `use_root_sync` property via
510525`zeCommandListAppendLaunchCooperativeKernel`, and could query launch
511526configuration requirements using `zeKernelSuggestMaxCooperativeGroupCount`.
512527
513528Similarly, an implementation of this extension using CUDA could launch kernels
514- associated with the `use_root_sync` property via
529+ decorated with the `use_root_sync` property via
515530`cudaLaunchCooperativeKernel`, and could query launch configuration
516531requirements using a combination of
517532`cudaOccupancyMaxActiveBlocksPerMultiprocessor` and
@@ -544,4 +559,3 @@ template argument for `sycl::nd_item<>`. Adding a runtime query would require
544559callstack. It's unclear if this functionality is necessary or just nice to
545560have -- resolution of this issue depends on user and implementation experience.
546561--
547-
0 commit comments