Hello all,
I am running into a problem that I haven’t been able to solve for a couple of days.
I have tried launching multiple kernels but keep running into the following error -
Error: [ComputeCpp:RT0102] The requested kernel name could not be found (Unable to retrieve kernel function, is integration header included? )
My entire code is as follows -
class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;
class testMatMul;
typedef cl::sycl::buffer<float, 1> sycl_buffer;
void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
queue deviceQueue, int numThreads){
* Naive Matrix Multiplication of MxN and NxK
* */
std::cout<<"Starting Matrix Multiplication"<<std::endl;
nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(K / numThreads + 1, M / numThreads + 1),
cl::sycl::range<2>(numThreads, numThreads));
deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh){
auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
auto result_accessor = result.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
(nd_item<2> ndItem){
auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);
if(row_index < M && column_index < K){
float sum = 0.0f;
for (int i = 0; i < N; i++) {
sum += MatA_accessor[N * row_index + i] * MatB_accessor[ i * N + column_index];
result_accessor[K * row_index + column_index] = sum;
std::cout<<"Done with Matmul"<<std::endl;
void testMatrixMultiplication(float* A, float* B, float* C, size_t m, size_t n, size_t k){
queue Queue;
buffer<float, 1> A_buffer(A, (m * n * sizeof(float )));
buffer<float, 1> B_buffer(B, (n * k * sizeof(float )));
buffer<float, 1> result_buffer(C, (m * k * sizeof(float )));
auto device = Queue.get_device();
int max_threads = int(device.get_info<cl::sycl::info::device::max_work_group_size>());
auto threads = int(std::sqrt(max_threads));
nd_range<2> launchParams = nd_range<2>(range<2>(m / threads + 1, k / threads + 1), range<2>(threads, threads));
Queue.submit([&A_buffer, &B_buffer, &result_buffer, m, n, k, launchParams](handler& cgh){
auto A_accessor = A_buffer.get_access<access::mode::read>(cgh);
auto B_accessor = B_buffer.get_access<access::mode::read>(cgh);
auto result_accessor = result_buffer.get_access<access::mode::write>(cgh);
std::cout<<"Starting Parallel For"<<std::endl;
cgh.parallel_for<testMatMul>(launchParams, [A_accessor, B_accessor, result_accessor, m , n, k](nd_item<2> item){
auto row_id = item.get_group(0) * item.get_local_range(0) + item.get_local_id(0);
auto column_id = item.get_group(1) * item.get_local_range(1) + item.get_local_id(1);
if(row_id < m && column_id < k){
float sum = 0.0f;
for(int i=0; i<n; i++){
sum += A_accessor[n * row_id + i] * B_accessor[i * n + column_id];
result_accessor[k * row_id + column_id] = sum;
Queue.submit([&result_buffer, C](handler& cgh){
auto a_accessor = result_buffer.get_access<access::mode::read>(cgh);
cgh.copy(a_accessor, C);
for(int j=0; j < 100; j++)
std::cout<<C[j]<<" ";
int main() {
size_t M = 512;
size_t N = 512;
size_t K = 512;
auto matA = (float*) malloc(M * N * sizeof(float ));
auto matB = (float*) malloc(N * K * sizeof(float ));
auto result = (float*) malloc(M * K * sizeof(float ));
for (int i=0; i< M*N; i++)
matA[i] = 2.0f;
for (int i=0; i< N*K; i++)
matB[i] = 2.0f;
for (int i = 0; i < M*K; ++i)
result[i] = 69.0f;
queue Queue;
auto device = Queue.get_device();
auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
auto thread_max = int(std::sqrt(max_work_group_size));
buffer<float, 1> mata_buffer(matA, range<1>(M * N * sizeof(float )));
buffer<float, 1> matb_buffer(matB, range<1>(N * K * sizeof(float )));
buffer<float, 1> result_buffer(result, range<1>(M * K * sizeof(float )));
//auto mata_shared = std::make_unique<buffer<float, 1>>(mata_buffer);
//auto matb_shared = std::make_unique<buffer<float, 1>>(matb_buffer);
//auto result_shared = std::make_unique<buffer<float, 1>>(result_buffer);
try {
//naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);
testMatrixMultiplication(matA, matB, result, M, N, K);
catch (cl::sycl::exception& exception) {
and my CMake as follows -
cmake_minimum_required(VERSION 3.17)
set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)
#add_sycl_to_target(computecpp_test main.cpp ${CMAKE_CURRENT_BINARY_DIR})
In either of the functions, I get the error function as mentioned above.
My environment is as follows -
Ubuntu 20.04
GCC - 9.3
Computecpp info output is as follows -
ComputeCpp Info (CE 2.3.0 2020/11/19)
SYCL 1.2.1 revision 3
Device Info:
Discovered 2 devices matching:
platform : <any>
device type : <any>
Device 0:
Device is supported : UNTESTED - Untested OS
Bitcode targets : spir64 spirv64
CL_DEVICE_NAME : Intel(R) Gen9 HD Graphics NEO
CL_DEVICE_VENDOR : Intel(R) Corporation
Device 1:
Device is supported : NO - Please refer to the website for more info about the PTX backend on the Platform Support page
Bitcode targets : ptx64
If you encounter problems when using any of these OpenCL devices, please consult
this website for known issues:
and the output of the code is as follows -
Intel(R) Gen9 HD Graphics NEO
Intel(R) Gen9 HD Graphics NEO
Starting Parallel For
Error: [ComputeCpp:RT0102] The requested kernel name could not be found (Unable to retrieve kernel function, is integration header included? )
I am unable to understand what is going wrong here