PI CUDA kernels only support images with channel types int32, uint32, float, and half

Hi everyone!

I’m working with a code that creates a 2D int4 image, but when I execute the program, I get:

pi_die: PI CUDA kernels only support images with channel types int32, uint32, float, and half.

I’m using CUDA 10.1 and OneAPI 2021.3.0

Does anyone know what is happening here? Thank you so much !

So as you have discovered int4 types in images are not supported when using the CUDA backend of DPC++.
The simplest way to address this would be to convert your int4 types to int32 and do the processing as normal.
If this does not provide the performance you are looking for you could store 8 x int4 inside a int32 and modify your kernels to do the correct calculations across those variables. So you would need to pack and unpack the int4 values with the int32. However this really depends on your algorithm as to what would deliver the best performance.

Thank you Rod, you are right !

@rod Sorry, but I don’t understand.

SYCL only supports 4-CHANNEL but CUDA backend only support 1-CHANNEL. So it’s impossible to fix. If I change to int image instead of int4, I get this error:

pi_die: cuda_piMemImageCreate only supports RGBA channel order

And if I use int4, I get the error of the topic, so it’s not possible to use images using CUDA backend ??? Sorry.

Hi there,
Is this a different problem to the one you previously posted?
Can you provide a simple reproducer code snippet?
Thanks.

@rod Hi, here I have an example code where I get the error “only support images with channel types int32, uint32, float, and half”:

#include <CL/sycl.hpp>
#include <cstdio>
#include <dpct/dpct.hpp>
#define HEIGHT 7680
#define WIDTH 1812

typedef sycl::int4 it;
typedef sycl::char4 it2;

dpct::image_matrix *Array_Device;
dpct::image_wrapper<sycl::int4, 2> Image;
dpct::image_wrapper<sycl::char4, 2> Image2;

void k(int x, int y, const sycl::stream &out,
       dpct::image_accessor_ext<sycl::int4, 2> Image,
       dpct::image_accessor_ext<sycl::char4, 2> Image2) {

  out << "AAAA" << sycl::endl;
}

void p() {
  it *h = new it[WIDTH * HEIGHT];
  it2 *h2 = new it2[WIDTH * HEIGHT];
  for (int i = 0; i < HEIGHT; i++)
    for (int j = 0; j < WIDTH; j++) {
      h[i * WIDTH + j] =
          sycl::int4(i * 10000 + j * 4 + 0, i * 10000 + j * 4 + 1,
                     i * 10000 + j * 4 + 2, i * 10000 + j * 4 + 3);
      h2[i * WIDTH + j] =
          sycl::char4(i * 10000 + j * 4 + 0, i * 10000 + j * 4 + 1,
                      i * 10000 + j * 4 + 2, i * 10000 + j * 4 + 3);
    }
  /*
  DPCT1059:1: SYCL only supports 4-channel image format. Adjust the code.
  */
  // dpct::image_channel channelDesc = dpct::image_channel::create<it>();
  // Array_Device =
  //     new dpct::image_matrix(channelDesc, sycl::range<2>(WIDTH, HEIGHT));
  // Image.attach(Array_Device);
  // dpct::dpct_memcpy(
  //     Array_Device->to_pitched_data(), sycl::id<3>(0, 0, 0),
  //     dpct::pitched_data(h, WIDTH * sizeof(it), WIDTH * sizeof(it), 1),
  //     sycl::id<3>(0, 0, 0), sycl::range<3>(WIDTH * sizeof(it), HEIGHT, 1));

  dpct::image_matrix *d;
  dpct::image_channel channel = Image.get_channel();

  d = new dpct::image_matrix(channel, sycl::range<2>(WIDTH, HEIGHT));
  dpct::dpct_memcpy(d->to_pitched_data(), sycl::id<3>(0, 0, 0),
                    dpct::pitched_data(h, WIDTH * HEIGHT * sizeof(sycl::int4),
                                       WIDTH * HEIGHT * sizeof(sycl::int4), 1),
                    sycl::id<3>(0, 0, 0),
                    sycl::range<3>(WIDTH * HEIGHT * sizeof(sycl::int4), 1, 1));

  Image.attach(d);

  dpct::image_matrix *d2;
  dpct::image_channel channel2 = Image2.get_channel();

  d2 = new dpct::image_matrix(channel2, sycl::range<2>(WIDTH, HEIGHT));
  dpct::dpct_memcpy(d2->to_pitched_data(), sycl::id<3>(0, 0, 0),
                    dpct::pitched_data(h2, WIDTH * HEIGHT * sizeof(sycl::char4),
                                       WIDTH * HEIGHT * sizeof(sycl::char4), 1),
                    sycl::id<3>(0, 0, 0),
                    sycl::range<3>(WIDTH * HEIGHT * sizeof(sycl::char4), 1, 1));

  Image2.attach(d2);

  dpct::get_default_queue().submit([&](sycl::handler &cgh) {
    sycl::stream out(64 * 1024, 80, cgh);

    auto Image_acc = Image.get_access(cgh);

    auto Image_smpl = Image.get_sampler();

    auto Image2_acc = Image2.get_access(cgh);

    auto Image2_smpl = Image2.get_sampler();

    cgh.parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
        [=](sycl::nd_item<3> item_ct1) {
          k(3670, 2000, out,
            dpct::image_accessor_ext<sycl::int4, 2>(Image_smpl, Image_acc),
            dpct::image_accessor_ext<sycl::char4, 2>(Image2_smpl, Image2_acc));
        });
  });
  dpct::get_current_device().queues_wait_and_throw();
}

int main() { p(); }

If you comment de char4 type, you can see that works:

#include <CL/sycl.hpp>
#include <cstdio>
#include <dpct/dpct.hpp>
#define HEIGHT 7680
#define WIDTH 1812

typedef sycl::int4 it;
typedef sycl::char4 it2;

dpct::image_matrix *Array_Device;
dpct::image_wrapper<sycl::int4, 2> Image;
dpct::image_wrapper<sycl::char4, 2> Image2;

void k(int x, int y, const sycl::stream &out,
       dpct::image_accessor_ext<sycl::int4, 2> Image) {
  //  dpct::image_accessor_ext<sycl::char4, 2> Image2) {

  out << "AAAA" << sycl::endl;
}

void p() {
  it *h = new it[WIDTH * HEIGHT];
  it2 *h2 = new it2[WIDTH * HEIGHT];
  for (int i = 0; i < HEIGHT; i++)
    for (int j = 0; j < WIDTH; j++) {
      h[i * WIDTH + j] =
          sycl::int4(i * 10000 + j * 4 + 0, i * 10000 + j * 4 + 1,
                     i * 10000 + j * 4 + 2, i * 10000 + j * 4 + 3);
      h2[i * WIDTH + j] =
          sycl::char4(i * 10000 + j * 4 + 0, i * 10000 + j * 4 + 1,
                      i * 10000 + j * 4 + 2, i * 10000 + j * 4 + 3);
    }
  /*
  DPCT1059:1: SYCL only supports 4-channel image format. Adjust the code.
  */
  // dpct::image_channel channelDesc = dpct::image_channel::create<it>();
  // Array_Device =
  //     new dpct::image_matrix(channelDesc, sycl::range<2>(WIDTH, HEIGHT));
  // Image.attach(Array_Device);
  // dpct::dpct_memcpy(
  //     Array_Device->to_pitched_data(), sycl::id<3>(0, 0, 0),
  //     dpct::pitched_data(h, WIDTH * sizeof(it), WIDTH * sizeof(it), 1),
  //     sycl::id<3>(0, 0, 0), sycl::range<3>(WIDTH * sizeof(it), HEIGHT, 1));

  dpct::image_matrix *d;
  dpct::image_channel channel = Image.get_channel();

  d = new dpct::image_matrix(channel, sycl::range<2>(WIDTH, HEIGHT));
  dpct::dpct_memcpy(d->to_pitched_data(), sycl::id<3>(0, 0, 0),
                    dpct::pitched_data(h, WIDTH * HEIGHT * sizeof(sycl::int4),
                                       WIDTH * HEIGHT * sizeof(sycl::int4), 1),
                    sycl::id<3>(0, 0, 0),
                    sycl::range<3>(WIDTH * HEIGHT * sizeof(sycl::int4), 1, 1));

  Image.attach(d);

  dpct::image_matrix *d2;
  dpct::image_channel channel2 = Image2.get_channel();

  d2 = new dpct::image_matrix(channel2, sycl::range<2>(WIDTH, HEIGHT));
  dpct::dpct_memcpy(d2->to_pitched_data(), sycl::id<3>(0, 0, 0),
                    dpct::pitched_data(h2, WIDTH * HEIGHT * sizeof(sycl::char4),
                                       WIDTH * HEIGHT * sizeof(sycl::char4), 1),
                    sycl::id<3>(0, 0, 0),
                    sycl::range<3>(WIDTH * HEIGHT * sizeof(sycl::char4), 1, 1));

  Image2.attach(d2);

  dpct::get_default_queue().submit([&](sycl::handler &cgh) {
    sycl::stream out(64 * 1024, 80, cgh);

    auto Image_acc = Image.get_access(cgh);

    auto Image_smpl = Image.get_sampler();

    // auto Image2_acc = Image2.get_access(cgh);

    // auto Image2_smpl = Image2.get_sampler();

    cgh.parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, 1), sycl::range<3>(1, 1, 1)),
        [=](sycl::nd_item<3> item_ct1) {
          k(3670, 2000, out,
            dpct::image_accessor_ext<sycl::int4, 2>(Image_smpl, Image_acc));
          // dpct::image_accessor_ext<sycl::char4, 2>(Image2_smpl, Image2_acc));
        });
  });
  dpct::get_current_device().queues_wait_and_throw();
}

int main() { p(); }

I’m compiling with:

clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda  test.dp.cpp -o test

and executing with:

SYCL_DEVICE_FILTER=PI_CUDA ./test

So how can I use a char4 image? Or it’s not possible? Thank you so much for your help.

You can only use the types it suggests in the error message with image accessors in SYCL. Can you not use int32 types for your image data storage and access?