cuda-gdb cannot read sycl::nd_item inside a kernel

Hello,

I am trying out cuda-gdb on a simple kernel to see how debugging works. cuda-gdb is unable to parse/dereference the sycl::nd_item argument, and so it also cannot evaluate member functions on it. Could someone please let me know whether this is expected behavior, or am I missing something here?

Here is the debugger output:

snarayanan@rockyserver1:builds$ cuda-gdb ./malloc_shared_gdb 
NVIDIA (R) CUDA Debugger
11.7 release
Portions Copyright (C) 2007-2022 NVIDIA Corporation
GNU gdb (GDB) 10.2
Copyright (C) 2021 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
    <http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./malloc_shared_gdb...
(cuda-gdb) b 17
Breakpoint 1 at 0x40377e: file ../src/malloc_shared_gdb.cpp, line 17.
(cuda-gdb) run
Starting program: /mounts/work/snarayanan/save/SYCL/common_tests/builds/malloc_shared_gdb
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: File "/usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py" auto-loading has been declined by your `auto-load safe-path' set to "$debugdir:$datadir/auto-load".
To enable execution of this file add
        add-auto-load-safe-path /usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
To completely disable this security protection add
        set auto-load safe-path /
line to your configuration file "/home/snarayanan/.cuda-gdbinit".
For more information about this security protection see the
"Auto-loading safe path" section in the GDB manual.  E.g., run from the shell:
        info "(gdb)Auto-loading safe path"
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd8584640 (LWP 3229748)]
warning: Cannot parse .gnu_debugdata section; LZMA support was disabled at compile time
[New Thread 0x7fffd71ff640 (LWP 3229749)]
[Detaching after fork from child process 3229750]
[New Thread 0x7fffb6fde640 (LWP 3229759)]
[New Thread 0x7fffb57bc640 (LWP 3229760)]
USING MALLOC_SHARED (dt value should be 0.125)
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]

Thread 1 "malloc_shared_g" hit Breakpoint 1, initialize_array_device (Q_cell=0x7fff92000000, Q_initial=2, number_of_cells=1000000,
    item=<error reading variable: Error: Failed to read local memory at address 0x6c2e0a3b30202c50 on device 0 sm 0 warp 0 lane 0, error=CUDBG_ERROR_INVALID_ADDRESS(0x12).
>) at ../src/malloc_shared_gdb.cpp:17
17            Q_cell[i] = Q_initial;
(cuda-gdb) p item
Error: Failed to read local memory at address 0x6c2e0a3b30202c50 on device 0 sm 0 warp 0 lane 0, error=CUDBG_ERROR_INVALID_ADDRESS(0x12).

(cuda-gdb) p item.get_global_index(0)
Couldn't find method sycl::_V1::nd_item<1>::get_global_index
(cuda-gdb) p Q_cell
$1 = (@managed double * @local) 0x7fff92000000

Here is the code:

#include <array>
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>    // std::vector is used in the MALLOC_DEVICE path
#include <cstdio>    // for printf in main()

//#define MALLOC_DEVICE

using namespace sycl;

void initialize_array_device (double* Q_cell, const double Q_initial,
                              const int number_of_cells, const nd_item<1> item)
{
   const int index = item.get_global_id(0);
   const int stride = item.get_local_range(0) * item.get_group_range(0);

   for(int i = index; i < number_of_cells; i += stride)
   {
      Q_cell[i] = Q_initial;
   }
}

int main()
{
   queue cur_que;

   range<1> local_range{512};
   int num_work_groups = 32 * cur_que.get_device().get_info<info::device::max_compute_units>();
   range<1> global_range{num_work_groups * local_range[0]};

   const int m_number_of_cells = 1000 * 1000;

   double Q_init = -1.0;
   
   #ifdef MALLOC_DEVICE
    double* data = malloc_device<double>(m_number_of_cells, cur_que);
    std::vector<double> host_data(m_number_of_cells, -1.0);
    Q_init = 0.5;
    std::cout<<"USING MALLOC_DEVICE (dt value should be 2.0)"<<std::endl;
   #else
    double* data = malloc_shared<double>(m_number_of_cells, cur_que);
    Q_init = 2.0;
    std::cout<<"USING MALLOC_SHARED (dt value should be 0.125)"<<std::endl;
   #endif

   cur_que.prefetch(data, m_number_of_cells * sizeof(double));

   cur_que.parallel_for(nd_range<1>(global_range, local_range), [=](nd_item<1> item)
   {
      initialize_array_device (data, Q_init, m_number_of_cells, item);
   });

   // This is the limit. (dt / dx2) + (dt / dy2) < 0.5
   cur_que.wait();

   double delta_t = 0.0;
   
   #ifdef MALLOC_DEVICE
    cur_que.memcpy(host_data.data(), data, m_number_of_cells*sizeof(double)).wait();
    delta_t = 1.0 / (host_data[0] * host_data[0] * 2.0);     // 2.0 added to reduce from limit.
   #else
    delta_t = 1.0 / (data[0] * data[0] * 2.0);     // 2.0 added to reduce from limit.
   #endif

   printf("GPU: Value of dt %e\n", delta_t);
   free (data, cur_que);
   return 0;
}

System Configuration:
oneAPI version: 2024.1.0 with the corresponding NVIDIA plugin
CUDA toolkit version: 11.7
CUDA card: NVIDIA RTX A2000 (12GB)
Driver Version : 550.90.07

Thank you

Hi @sidarth,

I think the warnings about being unable to parse certain parts of the debug data (the .gnu_debugdata / LZMA messages) are a bit of a red herring. Given that you're on a Fedora-family system (your paths suggest Rocky 9), we can just ignore them.
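
(Separately, the "auto-loading has been declined" warning just means gdb skipped the libsycl pretty-printer script. If you want it loaded, the warning already gives you the exact line to add to /home/snarayanan/.cuda-gdbinit; this is unrelated to the nd_item read error, though:)

        add-auto-load-safe-path /usr/people/shared/tools/rocky/9/intel_oneapi/2024.1.0/compiler/2024.1/lib/libsycl.so.7.1.0-gdb.py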

Can you try setting CUDBG_USE_LEGACY_DEBUGGER=1 in the environment before running the program through cuda-gdb?
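
For example, something along these lines (bash-style syntax assumed; same binary as in your session):

        CUDBG_USE_LEGACY_DEBUGGER=1 cuda-gdb ./malloc_shared_gdb

or export the variable first so that it applies to every run in that shell.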

Hello @duncan,

I tried the flag, and it still gives the exact same error when trying to read sycl::_V1::nd_item<1>. Is there something else I could try? I was also curious whether you are able to reproduce this error on your side.

Here is one interesting thing I noticed: if I pass the nd_item in as a pointer instead, cuda-gdb is able to dereference it and prints the following (the code change itself is sketched after the output):

USING MALLOC_SHARED (dt value should be 0.125)
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (352,0,0), device 0, sm 0, warp 11, lane 0]

Thread 1 "malloc_shared_g" hit Breakpoint 1, initialize_array_device (Q_cell=0x7fff92000000, Q_initial=2, number_of_cells=1000000, item=0x7fffd2fffc50) at ../src/malloc_shared_gdb.cpp:17
17            Q_cell[i] = Q_initial;
(cuda-gdb) p *item
$1 = {static dimensions = <optimized out>, globalItem = {static dimensions = <optimized out>, MImpl = {MExtent = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>},
      MIndex = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}, MOffset = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}}},
  localItem = {static dimensions = <optimized out>, MImpl = {MExtent = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}, MIndex = {sycl::_V1::detail::array<1> = {
          common_array = {0}}, static dimensions = <optimized out>}}}, Group = {static dimensions = <optimized out>, static fence_scope = <optimized out>, globalRange = {sycl::_V1::detail::array<1> = {
        common_array = {0}}, static dimensions = <optimized out>}, localRange = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}, groupRange = {
      sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}, index = {sycl::_V1::detail::array<1> = {common_array = {0}}, static dimensions = <optimized out>}}}
(cuda-gdb) p item->get_global_range(0)
Couldn't find method sycl::_V1::nd_item<1>::get_global_range
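
For completeness, this is roughly the change behind the output above (just a sketch of what I mean by passing the nd_item as a pointer; the device function takes const nd_item<1>* and the lambda passes &item):

void initialize_array_device (double* Q_cell, const double Q_initial,
                              const int number_of_cells, const nd_item<1>* item)
{
   const int index = item->get_global_id(0);
   const int stride = item->get_local_range(0) * item->get_group_range(0);

   for(int i = index; i < number_of_cells; i += stride)
   {
      Q_cell[i] = Q_initial;
   }
}

// ... and the submission in main() becomes:
cur_que.parallel_for(nd_range<1>(global_range, local_range), [=](nd_item<1> item)
{
   initialize_array_device (data, Q_init, m_number_of_cells, &item);
});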

Hi @sidarth,

This is a known issue and using the legacy debugger sometimes fixes it. We don’t have any other fixes for now, I’m afraid, though an earlier version of oneAPI might not be affected by it.
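
Not a fix, but one thing that might help in the meantime: with the pointer-based variant you showed, the raw data members appear to be readable even though method calls are not, so you may be able to get at the indices directly. Something like the following (field names taken from the struct dump in your output; they are internal to the SYCL runtime and may change between versions):

(cuda-gdb) p item->globalItem.MImpl.MIndex.common_array[0]
(cuda-gdb) p item->Group.globalRange.common_array[0]

rather than calling get_global_id()/get_global_range().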