I was trying to run the cuda-gdb debugger on a simple program. The compile command used is icpx -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xs --offload-arch=sm_86 -g -O0 -o test test.cpp
. However, when I stop inside the kernel, all the pointers show up as NULL. The code runs fine outside the debugger (or if there are no breakpoints). I tried it with both malloc_shared and malloc_device, and I am attaching both programs to this post. Could someone please tell me what I am missing here?
GPU: NVIDIA RTX A2000
oneAPI Version: 2024.1.0
CUDA Toolkit version 11.7.0
OS: Rocky 9
malloc_shared:
#include <sycl/sycl.hpp>
#include <vector>
#include <iostream>
using namespace sycl;
constexpr size_t N = 1024; // number of doubles processed by the kernel

// Doubles N values held in a USM shared allocation with an nd_range kernel
// and verifies the result on the host.
//
// Returns 0 when every element matches the expected value, and -1 when the
// allocation fails or when a mismatch is found.
int main()
{
    // Work-group size used for the launch; an nd_range requires the global
    // size to be a multiple of the local size.
    constexpr size_t work_group_size = 64;
    static_assert(N % work_group_size == 0, "N must be a multiple of the work-group size");

    queue q(default_selector_v);
    std::cout<<"Device: "<<q.get_device().get_info<info::device::name>()<<std::endl;

    double *data = malloc_shared<double>(N, q);
    // USM allocation functions report failure by returning nullptr; bail out
    // before the host loop below dereferences the pointer.
    if(data == nullptr)
    {
        std::cerr << "Error: malloc_shared failed to allocate " << N << " doubles" << std::endl;
        return -1;
    }

    // Initialise on the host: the same pointer is used by host and kernel.
    for(size_t i = 0; i < N; i++)
    {
        data[i] = i;
    }

    // One work-item per element; wait() blocks until the kernel has finished
    // so the host can safely read the results afterwards.
    q.parallel_for(nd_range<1>(N, work_group_size), [=] (nd_item<1> itm)
    {
        size_t idx = itm.get_global_linear_id();
        data[idx] = data[idx] * 2;
    }).wait();

    // Verify on the host and stop at the first mismatch.
    int status = 0;
    for(size_t i = 0; i < N; i++)
    {
        if(data[i] != 2.0*i)
        {
            std::cout << "Error in index " << i << " : Value is " << data[i] << " instead of " << 2.0 * i << std::endl;
            status = -1;
            break;
        }
    }
    if(status == 0)
    {
        std::cout << " All values are correct !" << std::endl;
    }

    // Single cleanup point for both the success and the mismatch path.
    free(data, q);
    return status;
}
malloc_device:
#include <sycl/sycl.hpp>
#include <vector>
#include <iostream>
using namespace sycl;
constexpr size_t N = 1024; // number of doubles processed by the kernel

// Copies N values from a host vector into a USM device allocation, doubles
// them with an nd_range kernel, copies them back and verifies the result.
//
// Returns 0 when every element matches the expected value, and -1 when the
// allocation fails or when a mismatch is found.
int main()
{
    // Work-group size used for the launch; an nd_range requires the global
    // size to be a multiple of the local size.
    constexpr size_t work_group_size = 64;
    static_assert(N % work_group_size == 0, "N must be a multiple of the work-group size");

    queue q(default_selector_v);
    std::cout<<"Device: "<<q.get_device().get_info<info::device::name>()<<std::endl;

    std::vector<double> host_data(N);
    double *device_data = malloc_device<double>(N, q);
    // USM allocation functions report failure by returning nullptr; bail out
    // before the pointer is handed to memcpy or the kernel.
    if(device_data == nullptr)
    {
        std::cerr << "Error: malloc_device failed to allocate " << N << " doubles" << std::endl;
        return -1;
    }

    for(size_t i = 0; i < N; i++)
    {
        host_data[i] = i;
    }

    // Host -> device copy; wait() so the kernel sees the initialised data.
    q.memcpy(device_data, host_data.data(), N*sizeof(double)).wait();

    // One work-item per element.
    q.parallel_for(nd_range<1>(N, work_group_size), [=] (nd_item<1> itm)
    {
        size_t idx = itm.get_global_linear_id();
        device_data[idx] = device_data[idx] * 2;
    }).wait();

    // Device -> host copy; wait() so the verification reads final values.
    q.memcpy(host_data.data(), device_data, N*sizeof(double)).wait();

    // Verify on the host and stop at the first mismatch.
    int status = 0;
    for(size_t i = 0; i < N; i++)
    {
        if(host_data[i] != 2.0*i)
        {
            std::cout << "Error in index " << i << " : Value is " << host_data[i] << " instead of " << 2.0 * i << std::endl;
            status = -1;
            break;
        }
    }
    if(status == 0)
    {
        std::cout << " All values are correct !" << std::endl;
    }

    // Single cleanup point for both the success and the mismatch path.
    free(device_data, q);
    return status;
}
Debugger Output for malloc_shared:
snarayanan@rockyserver1:builds$ cuda-gdb ./nd_range_malloc_shared
NVIDIA (R) CUDA Debugger
11.7 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./nd_range_malloc_shared...
(cuda-gdb) b 23
Breakpoint 1 at 0x404b9c: file ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp, line 23.
(cuda-gdb) run
Starting program: /mounts/work/snarayanan/save/SYCL/debugging/builds/nd_range_malloc_shared
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: File "/usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
add-auto-load-safe-path /usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
To completely disable this security protection add
set auto-load safe-path /
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual. E.g., run from the shell:
info "(gdb)Auto-loading safe path"
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd8584640 (LWP 151252)]
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd71ff640 (LWP 151253)]
[Detaching after fork from child process 151254]
[New Thread 0x7fffb6fde640 (LWP 151263)]
[New Thread 0x7fffb57bc640 (LWP 151264)]
Device: NVIDIA RTX A2000 12GB
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
Thread 1 "nd_range_malloc" hit Breakpoint 1, main::{lambda(sycl::_V1::nd_item<1>)#1}::operator()(sycl::_V1::nd_item<1>) const (this=0x7fffd2fffd20,
itm=<error reading variable: Error: Failed to read local memory at address 0x10000200000000 on device 0 sm 0 warp 0 lane 0, error=CUDBG_ERROR_INVALID_ADDRESS(0x12).
>) at ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp:23
23 data[idx] = data[idx] * 2;
(cuda-gdb) p data
$1 = (double *) 0x0 <typeinfo name for main::{lambda(sycl::_V1::nd_item<1>)#1}(double * @global)>
(cuda-gdb) p idx
$2 = 0