Requested Kernel Name could not be found

Hello all,
I am running into a problem that I haven’t been able to solve for a couple of days.
I have tried launching multiple kernels but keep running into the following error -
Error: [ComputeCpp:RT0102] The requested kernel name could not be found (Unable to retrieve kernel function, is integration header included? )

My entire code is as follows -

#include <CL/sycl.hpp>
#include <iostream>
#include <cmath>
#include <cstdlib>

using namespace cl::sycl;

// Kernel name declarations.
class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;
class testMatMul;

typedef cl::sycl::buffer<float, 1> sycl_buffer;


void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
                               queue deviceQueue, int numThreads){

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout<<"Starting Matrix Multiplication"<<std::endl;
    nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(K / numThreads + 1, M / numThreads + 1),
                                           cl::sycl::range<2>(numThreads, numThreads));

    deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh){

        auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
        auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
        auto result_accessor = result.get_access<access::mode::read_write>(cgh);


        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem){

            auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
            auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);

            if(row_index < M && column_index < K){
                float sum = 0.0f;
                for (int i = 0; i < N; i++) {
                    // MatB is N x K, so row i of MatB starts at i * K.
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[i * K + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout<<"Done with Matmul"<<std::endl;
}
void testMatrixMultiplication(float* A, float* B, float* C, size_t m, size_t n, size_t k){
    queue Queue;

    // The buffer range is a count of elements, not bytes.
    buffer<float, 1> A_buffer(A, range<1>(m * n));
    buffer<float, 1> B_buffer(B, range<1>(n * k));
    buffer<float, 1> result_buffer(C, range<1>(m * k));

    auto device = Queue.get_device();
    std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;

    int max_threads = int(device.get_info<cl::sycl::info::device::max_work_group_size>());
    auto threads = int(std::sqrt(max_threads));

    nd_range<2> launchParams = nd_range<2>(range<2>(m / threads + 1, k / threads + 1), range<2>(threads, threads));

    Queue.submit([&A_buffer, &B_buffer, &result_buffer, m, n, k, launchParams](handler& cgh){

       auto A_accessor = A_buffer.get_access<access::mode::read>(cgh);
       auto B_accessor = B_buffer.get_access<access::mode::read>(cgh);
       auto result_accessor = result_buffer.get_access<access::mode::write>(cgh);

       std::cout<<"Starting Parallel For"<<std::endl;

       cgh.parallel_for<testMatMul>(launchParams, [A_accessor, B_accessor, result_accessor, m , n, k](nd_item<2> item){

          auto row_id = item.get_group(0) * item.get_local_range(0) + item.get_local_id(0);
          auto column_id = item.get_group(1) * item.get_local_range(1) + item.get_local_id(1);

          if(row_id < m && column_id < k){
              float sum = 0.0f;
              for(int i = 0; i < n; i++){
                  // B is n x k, so row i of B starts at i * k.
                  sum += A_accessor[n * row_id + i] * B_accessor[i * k + column_id];
              }
              result_accessor[k * row_id + column_id] = sum;
          }

       });

    });
    Queue.wait();

    Queue.submit([&result_buffer, C](handler& cgh){
       auto a_accessor = result_buffer.get_access<access::mode::read>(cgh);
       cgh.copy(a_accessor, C);
    });
    Queue.wait();

    for(int j = 0; j < 100; j++)
        std::cout<<C[j]<<"   ";
}

int main() {

    size_t M  = 512;
    size_t N = 512;
    size_t K = 512;

    auto matA = (float*) malloc(M * N * sizeof(float ));
    auto matB = (float*) malloc(N * K * sizeof(float ));
    auto result =  (float*) malloc(M * K * sizeof(float ));

    for (int i=0; i< M*N; i++)
         matA[i] = 2.0f;
    for (int i=0; i< N*K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M*K; ++i)
        result[i] = 69.0f;


    queue Queue;
    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
    std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;
    auto thread_max  = int(std::sqrt(max_work_group_size));
    std::cout<<thread_max<<std::endl;


    // Again, buffer ranges are element counts, not byte counts.
    buffer<float, 1> mata_buffer(matA, range<1>(M * N));
    buffer<float, 1> matb_buffer(matB, range<1>(N * K));
    buffer<float, 1> result_buffer(result, range<1>(M * K));

    //auto mata_shared = std::make_unique<buffer<float, 1>>(mata_buffer);
    //auto matb_shared = std::make_unique<buffer<float, 1>>(matb_buffer);
    //auto result_shared = std::make_unique<buffer<float, 1>>(result_buffer);


    try {
        //naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);
        testMatrixMultiplication(matA, matB, result, M, N, K);
    }
    catch (cl::sycl::exception& exception) {
        std::cout<<std::endl<<exception.what()<<std::endl;
    }

    return 0;
}

and my CMakeLists.txt is as follows -

cmake_minimum_required(VERSION 3.17)
project(computecpp_test)

set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_CXX_FLAGS -g)

set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
#include(FindComputeCpp)
find_package(ComputeCpp)

include_directories($(COMPUTECPP_INCLUDE_DIRECTORY))

add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)
#add_sycl_to_target(computecpp_test main.cpp ${CMAKE_CURRENT_BINARY_DIR})

With either of the functions, I get the error mentioned above.
My environment is as follows -
Ubuntu 20.04
GCC - 9.3

The computecpp_info output is as follows -


ComputeCpp Info (CE 2.3.0 2020/11/19)

SYCL 1.2.1 revision 3

********************************************************************************


Device Info:

Discovered 2 devices matching:
  platform    : <any>
  device type : <any>

--------------------------------------------------------------------------------
Device 0:

  Device is supported                     : UNTESTED - Untested OS
  Bitcode targets                         : spir64 spirv64 
  CL_DEVICE_NAME                          : Intel(R) Gen9 HD Graphics NEO
  CL_DEVICE_VENDOR                        : Intel(R) Corporation
  CL_DRIVER_VERSION                       : 1.0.0
  CL_DEVICE_TYPE                          : CL_DEVICE_TYPE_GPU 
--------------------------------------------------------------------------------
Device 1:

  Device is supported                     : NO - Please refer to the website for more info about the PTX backend on the Platform Support page
  Bitcode targets                         : ptx64 
  CL_DEVICE_NAME                          : GeForce RTX 2070
  CL_DEVICE_VENDOR                        : NVIDIA Corporation
  CL_DRIVER_VERSION                       : 460.84
  CL_DEVICE_TYPE                          : CL_DEVICE_TYPE_GPU 

If you encounter problems when using any of these OpenCL devices, please consult
this website for known issues:
https://developer.codeplay.com/products/computecpp/ce/guides/platform-support?version=2.3.0

********************************************************************************


and the output of the code is as follows -

Intel(R) Gen9 HD Graphics NEO
16
Intel(R) Gen9 HD Graphics NEO
Starting Parallel For

Error: [ComputeCpp:RT0102] The requested kernel name could not be found (Unable to retrieve kernel function, is integration header included? )

I am unable to understand what is going wrong here.

TIA

Hi Atharva, I would recommend simply setting CMAKE_CXX_COMPILER to compute++ and then using the add_sycl_to_target function as normal (i.e. not setting include directories, not setting -sycl-driver). You can also run ninja -v to see what commands are being run to build your code; perhaps some CMake variables are stale.
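Roughly, the CMakeLists.txt would look like this (paths taken from your post; the exact add_sycl_to_target signature differs between computecpp-sdk versions, so check the one shipped with your SDK):

cmake_minimum_required(VERSION 3.17)
project(computecpp_test)

# Point CMake at compute++ and the SDK modules, as in the original post.
set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)

find_package(ComputeCpp REQUIRED)

add_executable(computecpp_test main.cpp)
# Let the SDK helper wire up device compilation and the integration
# header instead of setting -sycl-driver or include paths by hand.
add_sycl_to_target(TARGET computecpp_test SOURCES main.cpp)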

Hello Duncan

My apologies for the late response. I did try the add_sycl_to_target function but still ran into the same error. I also deleted all the CMake caches and everything associated with them, but the error persisted. (I did get different error messages with different compiler flags.)

What seemed to work was changing the launch parameters. Instead of -

    nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(K / numThreads + 1, M / numThreads + 1),
                                           cl::sycl::range<2>(numThreads, numThreads));

I now make the global range a multiple of the local range (SYCL 1.2.1 requires the global size to be divisible by the local size in each dimension) -

    auto local_range = range<2>(numThreads, numThreads);
    auto global_range = range<2>(M / numThreads + 1, N / numThreads + 1) * local_range;

    auto launchParams = nd_range<2>(global_range, local_range);

This works perfectly.
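One note on the arithmetic: M / numThreads + 1 groups over-allocate a full extra work-group even when M is already a multiple of numThreads. A ceiling division avoids that. A minimal sketch (the helper name is mine, not from the code above), relying on the in-kernel bounds check to mask the padded work-items:

    #include <CL/sycl.hpp>

    // Hypothetical helper: round the global range up to the smallest
    // multiple of the local range that covers rows x cols.
    cl::sycl::nd_range<2> make_padded_nd_range(size_t rows, size_t cols, size_t tile) {
        using namespace cl::sycl;
        auto local = range<2>(tile, tile);
        // (x + tile - 1) / tile is a ceiling division.
        auto global = range<2>(((rows + tile - 1) / tile) * tile,
                               ((cols + tile - 1) / tile) * tile);
        return nd_range<2>(global, local);
    }

The existing if(row_index < M && column_index < K) guard in the kernels already turns the padded work-items into no-ops.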