Debugging SYCL code on NVIDIA GPU

I was trying to run the cuda-gdb debugger on a simple program. The compile command used is icpx -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xs --offload-arch=sm_86 -g -O0 -o test test.cpp. However, when I stop inside the kernel, all the pointers show up as NULL. The code runs fine outside the debugger (or if there are no breakpoints). I tried it with both malloc_shared and malloc_device and am attaching both versions of the code to this post. Could someone please tell me what I am missing here?

GPU: NVIDIA RTX A2000
oneAPI version: 2024.1.0
CUDA Toolkit version: 11.7.0
OS: Rocky Linux 9

malloc_shared:

#include <sycl/sycl.hpp>
#include <vector>
#include <iostream>

using namespace sycl;
constexpr size_t  N = 1024;

int main()
{
  queue q(default_selector_v);
  std::cout<<"Device: "<<q.get_device().get_info<info::device::name>()<<std::endl;

  double *data = malloc_shared<double>(N, q);

  for(size_t i = 0; i < N; i++)
  {
    data[i] = i;
  }
  
  q.parallel_for(nd_range<1>(N,64), [=] (nd_item<1> itm)
  {
    size_t idx = itm.get_global_linear_id();
    data[idx] = data[idx] * 2;
  }).wait();

  for(size_t i = 0; i < N; i++)
  {
    if(data[i] != 2.0*i)
    {
      std::cout << "Error in index " << i << " : Value is " << data[i] << " instead of " << 2.0 * i << std::endl;
      free(data, q);
      return -1;
    }
  }

  std::cout << " All values are correct !" << std::endl;
  
  free(data, q);
  
  return 0;
}

malloc_device:

#include <sycl/sycl.hpp>
#include <vector>
#include <iostream>

using namespace sycl;
constexpr size_t  N = 1024;

int main()
{
  queue q(default_selector_v);
  std::cout<<"Device: "<<q.get_device().get_info<info::device::name>()<<std::endl;
   
  std::vector<double> host_data(N);
  double *device_data = malloc_device<double>(N, q);

  for(size_t i = 0; i < N; i++)
  {
    host_data[i] = i;
  }

  q.memcpy(device_data, host_data.data(), N*sizeof(double)).wait();
  
  q.parallel_for(nd_range<1>(N,64), [=] (nd_item<1> itm)
  {
    size_t idx = itm.get_global_linear_id();
    device_data[idx] = device_data[idx] * 2;
  }).wait();

  q.memcpy(host_data.data(), device_data, N*sizeof(double)).wait();

  for(size_t i = 0; i < N; i++)
  {
    if(host_data[i] != 2.0*i)
    {
      std::cout << "Error in index " << i << " : Value is " << host_data[i] << " instead of " << 2.0 * i << std::endl;
      free(device_data, q);
      return -1;
    }
  }

  std::cout << " All values are correct !" << std::endl;
  
  free(device_data, q);
  
  return 0;
}

Debugger Output for malloc_shared:

snarayanan@rockyserver1:builds$ cuda-gdb ./nd_range_malloc_shared
NVIDIA (R) CUDA Debugger
11.7 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./nd_range_malloc_shared...
(cuda-gdb) b 23
Breakpoint 1 at 0x404b9c: file ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp, line 23.
(cuda-gdb) run
Starting program: /mounts/work/snarayanan/save/SYCL/debugging/builds/nd_range_malloc_shared
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: File "/usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
add-auto-load-safe-path /usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
To completely disable this security protection add
set auto-load safe-path /
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual.  E.g., run from the shell:
info "(gdb)Auto-loading safe path"
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd8584640 (LWP 151252)]
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd71ff640 (LWP 151253)]
[Detaching after fork from child process 151254]
[New Thread 0x7fffb6fde640 (LWP 151263)]
[New Thread 0x7fffb57bc640 (LWP 151264)]
Device: NVIDIA RTX A2000 12GB
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]

Thread 1 "nd_range_malloc" hit Breakpoint 1, main::{lambda(sycl::_V1::nd_item<1>)#1}::operator()(sycl::_V1::nd_item<1>) const (this=0x7fffd2fffd20,
itm=<error reading variable: Error: Failed to read local memory at address 0x10000200000000 on device 0 sm 0 warp 0 lane 0, error=CUDBG_ERROR_INVALID_ADDRESS(0x12).
>) at ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp:23
23          data[idx] = data[idx] * 2;
(cuda-gdb) p data
$1 = (double *) 0x0 <typeinfo name for main::{lambda(sycl::_V1::nd_item<1>)#1}(double * @global)>
(cuda-gdb) p idx
$2 = 0

Thanks for the report. We are going to investigate your issue with the debugger. In the meantime, could you try setting the environment variable CUDBG_USE_LEGACY_DEBUGGER=1? We have sometimes seen this help.
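
For reference, one way to set it just for the debugger session (assuming a bash-like shell and the executable name from your post) is:

CUDBG_USE_LEGACY_DEBUGGER=1 cuda-gdb ./nd_range_malloc_shared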


Hello @rod,

Thank you for looking into this issue. I tried setting CUDBG_USE_LEGACY_DEBUGGER=1 and still ran into the same issue.

@rod Could you please let me know if there are any workarounds, in case you were able to reproduce this issue?

Hello @sidarth,
We could reproduce the issue and have passed it on to our compiler engineers. We also found that this last worked with the icpx compiler driver in release 2024.0.2 and has been broken since 2024.1. However, the clang++ compiler driver provides the correct behaviour up to 2024.2.1. The open-source version also behaved correctly until a few months ago; more recent open-source versions don’t produce errors, but they mark the itm object as “optimized out”, so you still cannot see its value.

While we investigate the issue, could you try using the clang++ driver instead of icpx? When using the oneAPI toolkit release, you’d need to pass the --include-intel-llvm flag to the oneAPI setvars.sh script to make clang++ available in your PATH.
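
As a rough sketch of that setup (the installation path below is only a placeholder; substitute your own oneAPI install location):

source /path/to/intel/oneapi/setvars.sh --include-intel-llvm
clang++ --version   # the Intel LLVM-based clang++ should now be found in PATH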


Hello @rbielski,

Thank you for the reply.

We always include the --include-intel-llvm flag in our oneAPI setvars.sh command. We also tried it with clang++, and the data pointers still show up as null pointers, but the itm object is parsed correctly this time. Unfortunately, I couldn’t find a way to install an older version of the oneAPI toolkit to try it out. Please let me know if I am missing something here.

Compile Command:

clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xs --offload-arch=sm_86 -g -O0 -o nd_range_malloc_shared ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp

cuda-gdb output:

snarayanan@rockyserver1:builds$ cuda-gdb ./nd_range_malloc_shared
NVIDIA (R) CUDA Debugger
11.7 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./nd_range_malloc_shared...
(cuda-gdb) b 23
Breakpoint 1 at 0x404b58: file ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp, line 23.
(cuda-gdb) run
Starting program: /mounts/work/snarayanan/save/SYCL/debugging/builds/nd_range_malloc_shared
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: File "/usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
add-auto-load-safe-path /usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
To completely disable this security protection add
set auto-load safe-path /
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual.  E.g., run from the shell:
info "(gdb)Auto-loading safe path"
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd8584640 (LWP 3857811)]
[Detaching after fork from child process 3857812]
[New Thread 0x7fffd0a67640 (LWP 3857816)]
[New Thread 0x7fffb6fde640 (LWP 3857817)]
Device: NVIDIA RTX A2000 12GB
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]

Thread 1 "nd_range_malloc" hit Breakpoint 1, main::{lambda(sycl::_V1::nd_item<1>)#1}::operator()(sycl::_V1::nd_item<1>) const (this=0x7fffd3fffd20, itm=...)
at ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp:23
23          data[idx] = data[idx] * 2;
(cuda-gdb) p data
$1 = (double *) 0x0 <typeinfo name for main::{lambda(sycl::_V1::nd_item<1>)#1}(double * @global)>
(cuda-gdb)
$2 = (double *) 0x0 <typeinfo name for main::{lambda(sycl::_V1::nd_item<1>)#1}(double * @global)>
(cuda-gdb) p itm
$3 = {static dimensions = <optimized out>, globalItem = {static dimensions = <optimized out>, MImpl = {MExtent = {sycl::_V1::detail::array<1> = {common_array = {1024}},
static dimensions = <optimized out>}, MIndex = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}, MOffset = {
sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}}}, localItem = {static dimensions = <optimized out>, MImpl = {MExtent = {
sycl::_V1::detail::array<1> = {common_array = {64}}, static dimensions = <optimized out>}, MIndex = {sycl::_V1::detail::array<1> = {common_array = {0}},
static dimensions = <optimized out>}}}, Group = {static dimensions = <optimized out>, static fence_scope = <optimized out>, globalRange = {sycl::_V1::detail::array<1> = {
common_array = {1024}}, static dimensions = <optimized out>}, localRange = {sycl::_V1::detail::array<1> = {common_array = {64}}, static dimensions = <optimized out>},
groupRange = {sycl::_V1::detail::array<1> = {common_array = {16}}, static dimensions = <optimized out>}, index = {sycl::_V1::detail::array<1> = {common_array = {0}},
static dimensions = <optimized out>}}}

The data issue does not occur for me when using CUDA 12.6 or CUDA 12.5 with matching drivers:

(cuda-gdb) b 23
(cuda-gdb) r
Device: NVIDIA GeForce RTX 3060
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]

CUDA thread hit Breakpoint 1.2, main::{lambda(sycl::_V1::nd_item<1>)#1}::operator()(sycl::_V1::nd_item<1>) const (this=0x7fffaffffd20, itm=...) at main.cpp:23
23	    data[idx] = data[idx] * 2;

(cuda-gdb) p data
$1 = (@managed double *) 0x7fff9c000000

but I do see the same issue as you when using older CUDA toolkits (though still with a newer driver). Are you using a CUDA driver version that matches the CUDA toolkit? Would it be possible for you to try a newer CUDA release, 12.5 or 12.6?
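
As a quick check of how the two line up (the exact output format varies between releases):

nvidia-smi | head -n 3   # shows the driver version and the highest CUDA version that driver supports
cuda-gdb --version       # shows the version of the CUDA debugger/toolkit actually in use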


@rbielski Thank you for the response. I can confirm that I can stop inside the kernel at the breakpoint and look at the values when using a newer version of the CUDA toolkit (12.6).

snarayanan@crane:builds$ cuda-gdb ./nd_range_malloc_shared
NVIDIA (R) cuda-gdb 12.6
Portions Copyright (C) 2007-2024 NVIDIA Corporation
Based on GNU gdb 13.2
Copyright (C) 2023 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This CUDA-GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://forums.developer.nvidia.com/c/developer-tools/cuda-developer-tools/cuda-gdb>.
Find the CUDA-GDB manual and other documentation resources online at:
<https://docs.nvidia.com/cuda/cuda-gdb/index.html>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./nd_range_malloc_shared...
(cuda-gdb) b 23
Breakpoint 1 at 0x404b58: file ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp, line 23.
(cuda-gdb) run
Starting program: /mounts/work/snarayanan/save/SYCL/debugging/builds/nd_range_malloc_shared
warning: File "/usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
add-auto-load-safe-path /usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py
line to your configuration file "/home/snarayanan/.config/gdb/cuda-gdbinit".
To completely disable this security protection add
set auto-load safe-path /
line to your configuration file "/home/snarayanan/.config/gdb/cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual.  E.g., run from the shell:
info "(gdb)Auto-loading safe path"
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[New Thread 0x7fffd8584640 (LWP 1548630)]
[New Thread 0x7fffd71ff640 (LWP 1548631)]
[Detaching after fork from child process 1548632]
[New Thread 0x7fffb6fde640 (LWP 1548641)]
[New Thread 0x7fffb57bc640 (LWP 1548642)]
Device: NVIDIA GeForce GTX 1660 Ti
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]

CUDA thread hit Breakpoint 1.2, main::{lambda(sycl::_V1::nd_item<1>)#1}::operator()(sycl::_V1::nd_item<1>) const (this=0x7fffd2fffd20, itm=...)
at ../src/ndrange_vector_scalar_multiply_malloc_shared.cpp:23
23          data[idx] = data[idx] * 2;
(cuda-gdb) p data
$1 = (@managed double *) 0x7fff92000000
(cuda-gdb) p itm
$2 = {static dimensions = <optimized out>, globalItem = {static dimensions = <optimized out>, MImpl = {MExtent = {sycl::_V1::detail::array<1> = {common_array = {1024}},
static dimensions = <optimized out>}, MIndex = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}, MOffset = {
sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}}}, localItem = {static dimensions = <optimized out>, MImpl = {MExtent = {
sycl::_V1::detail::array<1> = {common_array = {64}}, static dimensions = <optimized out>}, MIndex = {sycl::_V1::detail::array<1> = {common_array = {0}},
static dimensions = <optimized out>}}}, Group = {static dimensions = <optimized out>, static fence_scope = <optimized out>, globalRange = {sycl::_V1::detail::array<1> = {
common_array = {1024}}, static dimensions = <optimized out>}, localRange = {sycl::_V1::detail::array<1> = {common_array = {64}}, static dimensions = <optimized out>},
groupRange = {sycl::_V1::detail::array<1> = {common_array = {16}}, static dimensions = <optimized out>}, index = {sycl::_V1::detail::array<1> = {common_array = {0}},
static dimensions = <optimized out>}}}
(cuda-gdb) p data[89]
$3 = 89 // Resident on GPU