I have this example:
#include <CL/sycl.hpp>
#include <iostream>

int main()
{
    sycl::queue q(sycl::gpu_selector_v);
    auto device = q.get_device();
    auto max_wg_size = device.get_info<sycl::info::device::max_work_group_size>();

    // One work-group's worth of elements: (1, max_wg_size, 1).
    auto N0 = 1;
    auto N1 = max_wg_size;
    auto N2 = 1;
    auto buf_range = sycl::range<3>(N0, N1, N2);
    sycl::buffer<uint8_t, 3> buf(buf_range);

    // Probe work-group sizes from 1 up to the reported device maximum.
    for (size_t wg_size = 1; wg_size <= max_wg_size; wg_size++)
    {
        q.submit([&](sycl::handler &h)
        {
            auto acc = buf.get_access<sycl::access::mode::read_write>(h);
            // A single work-group (N0 == 1) of wg_size work-items.
            h.parallel_for_work_group(
                sycl::range<1>(N0), sycl::range<1>(wg_size),
                [=](sycl::group<1> gr)
                {
                    auto group_id = gr.get_id(0);
                    gr.parallel_for_work_item([&](sycl::h_item<1> item)
                    {
                        auto local_id = item.get_local_id(0);
                        for (int i = 0; i < N2; i++)
                            acc[group_id][local_id][i] = 1;
                    });
                });
        }).wait();
    }
    return 0;
}
This tries to perform the assignment on wg_size work items within a single work-group (one compute unit). On my RTX 3060 Ti it fails with the following error when wg_size = 897, which is still below max_wg_size = 1024:
PI CUDA ERROR:
Value: 701
Name: CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
Description: too many resources requested for launch
Function: cuda_piEnqueueKernelLaunch
Source Location: /root/intel-llvm-mirror/sycl/plugins/cuda/pi_cuda.cpp:3179
The total buffer size is tiny (1 × 1024 × 1 uint8_t elements = 1 KiB), so why isn't the GPU able to handle the load?
Is it possible to use any of the device info to determine the true maximum work group size available?
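For what it's worth, the sort of query I'm hoping for would look something like this (an untested sketch based on SYCL 2020's kernel_device_specific queries; fill_kernel is a placeholder name I would give the kernel so it can be looked up in a kernel bundle):

    // Assumes the kernel is named, e.g. h.parallel_for_work_group<fill_kernel>(...).
    class fill_kernel;

    auto bundle =
        sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
    sycl::kernel k = bundle.get_kernel(sycl::get_kernel_id<fill_kernel>());

    // Per-kernel limit on this device; presumably lower than
    // info::device::max_work_group_size when register/resource usage is high.
    auto kernel_max =
        k.get_info<sycl::info::kernel_device_specific::work_group_size>(device);

But I don't know whether this query reflects the limit that causes the launch failure above.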
GPU-info:
name: NVIDIA GeForce RTX 3060 Ti
vendor: NVIDIA Corporation
version: 0.0
max_compute_units: 38
max_work_group_size: 1024
max_clock_frequency: 1695
global_mem_size: 8360755200
local_mem_size: 49152
max_mem_alloc_size: 1073741824
global_mem_cache_size: 3145728
global_mem_cacheline_size: 0
max_work_item_sizes_1D: 1024
max_work_item_sizes_2D: 1024, 1024
max_work_item_sizes_3D: 64, 1024, 1024
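(These values were printed via the standard device info queries, roughly along these lines; the 1D/2D/3D sizes use SYCL 2020's templated max_work_item_sizes descriptor:)

    std::cout << "name: "
              << device.get_info<sycl::info::device::name>() << "\n";
    std::cout << "max_compute_units: "
              << device.get_info<sycl::info::device::max_compute_units>() << "\n";
    std::cout << "max_work_group_size: "
              << device.get_info<sycl::info::device::max_work_group_size>() << "\n";
    std::cout << "local_mem_size: "
              << device.get_info<sycl::info::device::local_mem_size>() << "\n";
    auto wi3 = device.get_info<sycl::info::device::max_work_item_sizes<3>>();
    std::cout << "max_work_item_sizes_3D: "
              << wi3[0] << ", " << wi3[1] << ", " << wi3[2] << "\n";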