Hi,
I am following this Install oneAPI for NVIDIA GPUs - Guides - oneAPI for NVIDIA® GPUs - Products - Codeplay Developer tutorial here, it builds with command line but when I to build with CMake I get
terminate called after throwing an instance of 'sycl::_V1::runtime_error'
what(): Native API failed. Native API returns: -42 (PI_ERROR_INVALID_BINARY) -42 (PI_ERROR_INVALID_BINARY)
/var/spool/slurmd/job7492534/slurm_script: line 13: 1133566 Aborted (core dumped) ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./emu/bfs.gpu
Error.
So I did the following changes:
- created a
src
folder and placed the following files inside it :
simple-sycl-app
:
#include <sycl/sycl.hpp>
#include "simple-sycl-app.hpp"
int main() {
// Creating SYCL queue
sycl::queue Queue{};
// Submitting command group(work) to queue
calculate(Queue);
return 0;
}
Then single-sycl-app.hpp
:
void calculate( sycl::queue &Queue) {
// Creating buffer of 4 ints to be used inside the kernel code
sycl::buffer<int, 1> Buffer{4};
// Size of index space for kernel
sycl::range<1> NumOfWorkItems{Buffer.size()};
// Submitting command group(work) to queue
Queue.submit([&](sycl::handler &cgh) {
// Getting write only access to the buffer on a device
auto Accessor = Buffer.get_access<sycl::access::mode::write>(cgh);
// Executing kernel
cgh.parallel_for<class FillBuffer>(
NumOfWorkItems, [=](sycl::id<1> WIid) {
// Fill buffer with indexes
Accessor[WIid] = static_cast<int>(WIid.get(0));
});
});
// Getting read only access to the buffer on the host.
// Implicit barrier waiting for queue to complete the work.
auto HostAccessor = Buffer.get_host_access();
// Check the results
bool MismatchFound{false};
for (size_t I{0}; I < Buffer.size(); ++I) {
if (HostAccessor[I] != I) {
std::cout << "The result is incorrect for element: " << I
<< " , expected: " << I << " , got: " << HostAccessor[I]
<< std::endl;
MismatchFound = true;
}
}
if (!MismatchFound) {
std::cout << "The results are correct!" << std::endl;
}
}
src/CMakeLists.txt
:
set(SOURCE_FILE simple-sycl-app.cpp)
set(TARGET_NAME bfs)
set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
set(SIMULATOR_TARGET ${TARGET_NAME}.fpga_sim)
set(FPGA_TARGET ${TARGET_NAME}.fpga)
set(GPU_TARGET ${TARGET_NAME}.gpu)
if("${DEVICE}" STREQUAL "INTEL_MAX_GPU")
message(STATUS "Configuring the design to run on INTEL_MAX_GPU device ${DEVICE}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -fsycl-targets=intel_gpu_pvc -D INTEL_MAX_GPU")
elseif("${DEVICE}" STREQUAL "NVIDIA_GPU")
message(STATUS "Configuring the design to run on NVIDIA_GPU device ${DEVICE}")
else()
message(STATUS "Configuring the design to run on FPGA device ${DEVICE}")
endif()
# A SYCL ahead-of-time (AoT) compile processes the device code in two stages.
# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V).
# 2. The "link" stage invokes the compiler's FPGA backend before linking.
# For this reason, FPGA backend flags must be passed as link flags in CMake.
set(EMULATOR_COMPILE_FLAGS_GPU "-fsycl -fsycl-targets=nvptx64-nvidia-cuda ")
set(HARDWARE_LINK_FLAGS_GPU "-fsycl ${USER_HARDWARE_FLAGS}")
###############################################################################
### GPU (NVIDIA)
###############################################################################
add_executable(${GPU_TARGET} ${SOURCE_FILE})
target_include_directories(${GPU_TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/include)
target_include_directories(${GPU_TARGET} PRIVATE ${PROJECT_SOURCE_DIR}/src)
set_target_properties(${GPU_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS_GPU}")
set_target_properties(${GPU_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS_GPU}")
add_custom_target(gpu DEPENDS ${GPU_TARGET})
Then the top directory CMakeLists.txt
file :
if(UNIX)
# Direct CMake to use icpx rather than the default C++ compiler/linker
set(CMAKE_CXX_COMPILER icpx)
else() # Windows
# Force CMake to use icx-cl rather than the default C++ compiler/linker
# (needed on Windows only)
include (CMakeForceCompiler)
CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
include (Platform/Windows-Clang)
endif()
cmake_minimum_required (VERSION 3.5)
project(FPGACompile CXX)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
add_subdirectory (src)
I built via command line
and CMake
to cross test
mkdir emu; cd emu; cmake .. -DNVIDIA_GPU;cd ..
icpx -fsycl -fsycl-targets=nvptx64-nvidia-cuda src/simple-sycl-app.cpp -o simple-sycl-app
Then submitted the job to GPU node via
sycl-ls
cd ~/simple-test
ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./simple-sycl-app
ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./emu/bfs.gpu
I got the following output :
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, AMD EPYC 7742 64-Core Processor OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-SXM4-40GB 8.0 [CUDA 12.2]
SYCL_PI_TRACE[all]: Requested device_type: info::device_type::automatic
SYCL_PI_TRACE[all]: Selected device: -> final score = 1500
SYCL_PI_TRACE[all]: platform: NVIDIA CUDA BACKEND
SYCL_PI_TRACE[all]: device: NVIDIA A100-SXM4-40GB
The results are correct!
SYCL_PI_TRACE[all]: Requested device_type: info::device_type::automatic
SYCL_PI_TRACE[all]: Selected device: -> final score = 500
SYCL_PI_TRACE[all]: platform: NVIDIA CUDA BACKEND
SYCL_PI_TRACE[all]: device: NVIDIA A100-SXM4-40GB
And this error message :
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_cuda.so [ PluginVersion: 14.38.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_unified_runtime.so [ PluginVersion: 14.37.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_cuda.so [ PluginVersion: 14.38.1 ]
SYCL_PI_TRACE[basic]: Plugin found and successfully loaded: libpi_unified_runtime.so [ PluginVersion: 14.37.1 ]
terminate called after throwing an instance of 'sycl::_V1::runtime_error'
what(): Native API failed. Native API returns: -42 (PI_ERROR_INVALID_BINARY) -42 (PI_ERROR_INVALID_BINARY)
/var/spool/slurmd/job7492534/slurm_script: line 13: 1133566 Aborted (core dumped) ONEAPI_DEVICE_SELECTOR="ext_oneapi_cuda:*" SYCL_PI_TRACE=1 ./emu/bfs.gpu
What am I missing in the configuration of CMake ?