CMake Recipe for Nvidia GPU build

br-ko · 5 May 2024 23:38

Hi,
I am following this Install oneAPI for NVIDIA GPUs - Guides - oneAPI for NVIDIA® GPUs - Products - Codeplay Developer tutorial here, it builds with command line but when I to build with CMake I get

terminate called after throwing an instance of 'sycl::_V1::runtime_error'
  what():  Native API failed. Native API returns: -42 (PI_ERROR_INVALID_BINARY) -42 (PI_ERROR_INVALID_BINARY)
/var/spool/slurmd/job7492534/slurm_script: line 13: 1133566 Aborted                 (core dumped) ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./emu/bfs.gpu

Error.
So I did the following changes:

created a src folder and placed the following files inside it :
simple-sycl-app :

#include <sycl/sycl.hpp>
#include "simple-sycl-app.hpp"
int main() {

  // Creating SYCL queue
  sycl::queue Queue{};

    // Submitting command group(work) to queue
   calculate(Queue);
   return 0;
}

Then single-sycl-app.hpp :


void calculate( sycl::queue &Queue) {
  // Creating buffer of 4 ints to be used inside the kernel code
  sycl::buffer<int, 1> Buffer{4};


  // Size of index space for kernel
  sycl::range<1> NumOfWorkItems{Buffer.size()};
  
  // Submitting command group(work) to queue
  Queue.submit([&](sycl::handler &cgh) {
    // Getting write only access to the buffer on a device
    auto Accessor = Buffer.get_access<sycl::access::mode::write>(cgh);
    // Executing kernel
    cgh.parallel_for<class FillBuffer>(
        NumOfWorkItems, [=](sycl::id<1> WIid) {
          // Fill buffer with indexes
          Accessor[WIid] = static_cast<int>(WIid.get(0));
        });
  });

  // Getting read only access to the buffer on the host.
  // Implicit barrier waiting for queue to complete the work.
  auto HostAccessor = Buffer.get_host_access();

  // Check the results
  bool MismatchFound{false};
  for (size_t I{0}; I < Buffer.size(); ++I) {
    if (HostAccessor[I] != I) {
      std::cout << "The result is incorrect for element: " << I
                << " , expected: " << I << " , got: " << HostAccessor[I]
                << std::endl;
      MismatchFound = true;
    }
  }

  if (!MismatchFound) {
    std::cout << "The results are correct!" << std::endl;
  }

}

src/CMakeLists.txt :

set(SOURCE_FILE simple-sycl-app.cpp)
set(TARGET_NAME bfs)
set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
set(SIMULATOR_TARGET ${TARGET_NAME}.fpga_sim)
set(FPGA_TARGET ${TARGET_NAME}.fpga)
set(GPU_TARGET ${TARGET_NAME}.gpu)
if("${DEVICE}" STREQUAL "INTEL_MAX_GPU")
    message(STATUS "Configuring the design to run on INTEL_MAX_GPU device ${DEVICE}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -fsycl-targets=intel_gpu_pvc -D INTEL_MAX_GPU")
elseif("${DEVICE}" STREQUAL "NVIDIA_GPU")
    message(STATUS "Configuring the design to run on NVIDIA_GPU device ${DEVICE}")
    
else()
    message(STATUS "Configuring the design to run on FPGA device ${DEVICE}")
endif()

# A SYCL ahead-of-time (AoT) compile processes the device code in two stages.
# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V).
# 2. The "link" stage invokes the compiler's FPGA backend before linking.
#    For this reason, FPGA backend flags must be passed as link flags in CMake.
set(EMULATOR_COMPILE_FLAGS_GPU "-fsycl -fsycl-targets=nvptx64-nvidia-cuda ")
set(HARDWARE_LINK_FLAGS_GPU "-fsycl ${USER_HARDWARE_FLAGS}")
###############################################################################
### GPU (NVIDIA)
###############################################################################
add_executable(${GPU_TARGET} ${SOURCE_FILE})
target_include_directories(${GPU_TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/include)
target_include_directories(${GPU_TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/src)
set_target_properties(${GPU_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS_GPU}")
set_target_properties(${GPU_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS_GPU}")
add_custom_target(gpu DEPENDS ${GPU_TARGET})

Then the top directory CMakeLists.txt file :

if(UNIX)
    # Direct CMake to use icpx rather than the default C++ compiler/linker
    set(CMAKE_CXX_COMPILER icpx)

else() # Windows
    # Force CMake to use icx-cl rather than the default C++ compiler/linker 
    # (needed on Windows only)
    include (CMakeForceCompiler)
    CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
    include (Platform/Windows-Clang)
endif()


cmake_minimum_required (VERSION 3.5)

project(FPGACompile CXX)

set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})

add_subdirectory (src)

I built via command line and CMake to cross test

mkdir emu; cd emu; cmake .. -DNVIDIA_GPU;cd .. 
icpx -fsycl -fsycl-targets=nvptx64-nvidia-cuda src/simple-sycl-app.cpp -o simple-sycl-app

Then submitted the job to GPU node via

sycl-ls
cd ~/simple-test
ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./simple-sycl-app
ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./emu/bfs.gpu

I got the following output :

[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, AMD EPYC 7742 64-Core Processor                 OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-SXM4-40GB 8.0 [CUDA 12.2]
SYCL_PI_TRACE[all]: Requested device_type: info::device_type::automatic
SYCL_PI_TRACE[all]: Selected device: -> final score = 1500
SYCL_PI_TRACE[all]:   platform: NVIDIA CUDA BACKEND
SYCL_PI_TRACE[all]:   device: NVIDIA A100-SXM4-40GB
The results are correct!
SYCL_PI_TRACE[all]: Requested device_type: info::device_type::automatic
SYCL_PI_TRACE[all]: Selected device: -> final score = 500
SYCL_PI_TRACE[all]:   platform: NVIDIA CUDA BACKEND
SYCL_PI_TRACE[all]:   device: NVIDIA A100-SXM4-40GB

And this error message :

SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_cuda.so [ PluginVersion: 14.38.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_unified_runtime.so [ PluginVersion: 14.37.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_cuda.so [ PluginVersion: 14.38.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_unified_runtime.so [ PluginVersion: 14.37.1 ]
terminate called after throwing an instance of 'sycl::_V1::runtime_error'
  what():  Native API failed. Native API returns: -42 (PI_ERROR_INVALID_BINARY) -42 (PI_ERROR_INVALID_BINARY)
/var/spool/slurmd/job7492534/slurm_script: line 13: 1133566 Aborted                 (core dumped) ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./emu/bfs.gpu

What am I missing in the configuration of CMake ?

rbielski · 6 May 2024 10:18

Hi @br-ko,
it looks like your simple-sycl-app program built with direct compiler invocation runs fine, but your bfs.gpu program built with CMake fails with an “invalid binary” error.

Could you run your cmake configuration with -DCMAKE_VERBOSE_MAKEFILE=ON and post the full output of the cmake --build <dir> command? This should list the compilation commands cmake executes, and should pinpoint what’s missing there. In particular, how they differ from your command line compilation.

This error would usually mean that you’re missing -fsycl-targets=nvptx64-nvidia-cuda:

I wonder if you’re missing the target flag in your LINK_FLAGS? Although when I tested this hypothesis in 2024.1, it results in a different runtime error (PI_ERROR_INVALID_KERNEL_NAME).

br-ko · 6 May 2024 14:36

Hi @rbielski ,
EDIT: I noticed that it wasn’t passing the -fsycl-targets=nvptx64-nvidia-cuda to the cmake so added it to hardware link flags too as you pointed out and now it worked. Thank you!!
Here is the new output with CMAKE_VERBOSE_MAKEFILE=ON

$ cmake .. -DDEVICE=NVIDIA_GPU -DCMAKE_VERBOSE_MAKEFILE=ON;make gpu;
-- Configuring the design to run on NVIDIA_GPU device NVIDIA_GPU
-- NUM_COMPUTE_UNITS was not specified.                    
Configuring the design to run on the default 4                     
Please refer to the README for information on NUM_COMPUTE_UNITS selection.
-- Configuring done (0.0s)
-- Generating done (0.1s)
-- Build files have been written to: /scratch/hpc-prf-agraph/simple-test/emu
/opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -S/scratch/hpc-prf-agraph/simple-test -B/scratch/hpc-prf-agraph/simple-test/emu --check-build-system CMakeFiles/Makefile.cmake 0
make  -f CMakeFiles/Makefile2 gpu
make[1]: Entering directory '/scratch/hpc-prf-agraph/simple-test/emu'
/opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -S/scratch/hpc-prf-agraph/simple-test -B/scratch/hpc-prf-agraph/simple-test/emu --check-build-system CMakeFiles/Makefile.cmake 0
/opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -E cmake_progress_start /scratch/hpc-prf-agraph/simple-test/emu/CMakeFiles 2
make  -f CMakeFiles/Makefile2 src/CMakeFiles/gpu.dir/all
make[2]: Entering directory '/scratch/hpc-prf-agraph/simple-test/emu'
make  -f src/CMakeFiles/bfs.gpu.dir/build.make src/CMakeFiles/bfs.gpu.dir/depend
make[3]: Entering directory '/scratch/hpc-prf-agraph/simple-test/emu'
cd /scratch/hpc-prf-agraph/simple-test/emu && /opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -E cmake_depends "Unix Makefiles" /scratch/hpc-prf-agraph/simple-test /scratch/hpc-prf-agraph/simple-test/src /scratch/hpc-prf-agraph/simple-test/emu /scratch/hpc-prf-agraph/simple-test/emu/src /scratch/hpc-prf-agraph/simple-test/emu/src/CMakeFiles/bfs.gpu.dir/DependInfo.cmake "--color="
make[3]: Leaving directory '/scratch/hpc-prf-agraph/simple-test/emu'
make  -f src/CMakeFiles/bfs.gpu.dir/build.make src/CMakeFiles/bfs.gpu.dir/build
make[3]: Entering directory '/scratch/hpc-prf-agraph/simple-test/emu'
[ 50%] Linking CXX executable ../bfs.gpu
cd /scratch/hpc-prf-agraph/simple-test/emu/src && /opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -E cmake_link_script CMakeFiles/bfs.gpu.dir/link.txt --verbose=1
/opt/software/FPGA/IntelFPGA/oneapi/24.0.0/compiler/2024.0/bin/icpx -fsycl  "CMakeFiles/bfs.gpu.dir/simple-sycl-app.cpp.o" -o ../bfs.gpu 
icpx: warning: linked binaries do not contain expected 'spir64-unknown-unknown' target; found targets: 'nvptx64-nvidia-cuda-sm_50' [-Wsycl-target]
make[3]: Leaving directory '/scratch/hpc-prf-agraph/simple-test/emu'
[100%] Built target bfs.gpu
make  -f src/CMakeFiles/gpu.dir/build.make src/CMakeFiles/gpu.dir/depend
make[3]: Entering directory '/scratch/hpc-prf-agraph/simple-test/emu'
cd /scratch/hpc-prf-agraph/simple-test/emu && /opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -E cmake_depends "Unix Makefiles" /scratch/hpc-prf-agraph/simple-test /scratch/hpc-prf-agraph/simple-test/src /scratch/hpc-prf-agraph/simple-test/emu /scratch/hpc-prf-agraph/simple-test/emu/src /scratch/hpc-prf-agraph/simple-test/emu/src/CMakeFiles/gpu.dir/DependInfo.cmake "--color="
make[3]: Leaving directory '/scratch/hpc-prf-agraph/simple-test/emu'
make  -f src/CMakeFiles/gpu.dir/build.make src/CMakeFiles/gpu.dir/build
make[3]: Entering directory '/scratch/hpc-prf-agraph/simple-test/emu'
make[3]: Nothing to be done for 'src/CMakeFiles/gpu.dir/build'.
make[3]: Leaving directory '/scratch/hpc-prf-agraph/simple-test/emu'
[100%] Built target gpu
make[2]: Leaving directory '/scratch/hpc-prf-agraph/simple-test/emu'
/opt/software/pc2/EB-SW/software/CMake/3.27.6-GCCcore-13.2.0/bin/cmake -E cmake_progress_start /scratch/hpc-prf-agraph/simple-test/emu/CMakeFiles 0
make[1]: Leaving directory '/scratch/hpc-prf-agraph/simple-test/emu'

rbielski · 6 May 2024 15:10

Hi @br-ko,
indeed there is a compiler warning about a wrong target in the linking step:

br-ko:

icpx -fsycl  "CMakeFiles/bfs.gpu.dir/simple-sycl-app.cpp.o" -o ../bfs.gpu 
icpx: warning: linked binaries do not contain expected 'spir64-unknown-unknown' target; found targets: 'nvptx64-nvidia-cuda-sm_50' [-Wsycl-target]

I think all you need is to add -fsycl-targets=nvptx64-nvidia-cuda to HARDWARE_LINK_FLAGS_GPU.

The embedding of the device code into the executable happens in the linking stage. The compiler driver needs to know which device code should be embedded here. Since no flag was given, it was looking for the default spir64 device code in the .o file but there wasn’t any as it was compiled for nvptx64-nvidia-cuda. These flags need to be consistent for the two stages.

br-ko · 6 May 2024 15:22

Thank you, yes this solved the issue

Topic		Replies	Views
OneMKL / NVIDIA	10	928	28 November 2023
Native API Error with Shared Library oneAPI for NVIDIA GPUs	5	100	21 May 2024
Problem with host_task (old interop_task) to embed a CUDA kernel within SYCL code oneAPI for NVIDIA GPUs	4	569	7 April 2023
Compile a SYCL program for both INTEL and NVIDIA GPUs oneAPI for NVIDIA GPUs	2	269	13 March 2024
Runing Intel oneAPI (Intel C++ compiler + MKL) on Nvidia GPUs - no offload	4	294	16 March 2024

CMake Recipe for Nvidia GPU build

Related Topics