Creating a variable-length array in a SYCL kernel

Hello Everyone.

I have been migrating some code from CUDA to SYCL, and I need to create a float array inside the parallel_for region whose length is a function parameter.

I am given to understand that I cannot create such arrays in a parallel_for region.
Is there any way I can work around this?

The SYCL code is as follows -

        void matmul_reducedOps_sycl(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
                                    int TILE_SIZE, int VECTOR_SIZE, queue deviceQueue){

            auto local_range = range<2>(TILE_SIZE, VECTOR_SIZE);
            auto global_range = range<2>(K / (TILE_SIZE * VECTOR_SIZE), M / (TILE_SIZE)) * local_range;

            nd_range<2> launchParams = nd_range<2>(global_range, local_range);


            deviceQueue.submit([&MatA, &MatB, &result, M, N, K, TILE_SIZE, VECTOR_SIZE, launchParams](handler& cgh){

                auto MatA_accessor = MatA->get_access<access::mode::read>(cgh);
                auto MatB_accessor = MatB->get_access<access::mode::read>(cgh);
                auto result_accessor = result->get_access<access::mode::read_write>(cgh);

                accessor<float, 1, access::mode::read_write, access::target::local> A_shared{range<1>(TILE_SIZE * TILE_SIZE),
                        cgh};

                cgh.parallel_for<matmulreducedOps>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, A_shared, M, N, K,
                                                VECTOR_SIZE, TILE_SIZE](nd_item<2> item){

                    float result_vector[TILE_SIZE]; // TILE_SIZE is a run-time parameter here, so this array length is not a compile-time constant

#pragma unroll
                    for (int i = 0; i < TILE_SIZE; ++i) {
                        result_vector[i] = 0.0f;
                    }

                    unsigned int block_x = item.get_group(0); unsigned int thread_x = item.get_local_id(0);
                    unsigned int block_y = item.get_group(1); unsigned int thread_y = item.get_local_id(1);

                    auto a_start_index = K * TILE_SIZE * block_y;
                    auto a_end = a_start_index + K - 1;

                    auto a_step = TILE_SIZE;

                    auto b_start_index = TILE_SIZE * VECTOR_SIZE * block_x;
                    auto b_step = TILE_SIZE * N;

                    for (int a = a_start_index, b = b_start_index; a <= a_end;
                                                a += a_step, b += b_step) {
#pragma unroll
                        for(int i=0; i < TILE_SIZE / VECTOR_SIZE; i++){
                            A_shared[( i * VECTOR_SIZE + thread_y) + TILE_SIZE * thread_x] =
                                    MatA_accessor[a + K * (i * VECTOR_SIZE + thread_y) + thread_x];
                        }

                        item.barrier();

                        auto b_base = b + TILE_SIZE * thread_y + thread_x;

                        for(int i=0; i < TILE_SIZE; i++){

                            float b_value = MatB_accessor[b_base + i*N];
#pragma unroll
                            for(int j=0 ; j<TILE_SIZE; j++){
                                result_vector[j] += A_shared[i * TILE_SIZE + j] * b_value;
                            }

                        }

                        item.barrier();

                    }

                    auto c_ptr = N * TILE_SIZE * block_y + TILE_SIZE * VECTOR_SIZE * block_x;
                    c_ptr += TILE_SIZE * thread_y + thread_x;

#pragma unroll
                    for(int i=0; i < TILE_SIZE; i++){
                        result_accessor[c_ptr] = result_vector[i];
                        c_ptr += N;
                    }

                });
            });
            deviceQueue.wait();
        }


and the CUDA code is as follows -

        __global__ void
        MatrixMultiplication_reducedOps32(float *MatA, float *MatB, float *result, size_t M, size_t N, size_t K) {
            const int TILE_SIZE = 32;
            const int VECTOR_SIZE = 4;

            unsigned int block_x = blockIdx.x;
            unsigned int thread_x = threadIdx.x;
            unsigned int block_y = blockIdx.y;
            unsigned int thread_y = threadIdx.y;

            __shared__ float A_shared[TILE_SIZE * TILE_SIZE];
            float result_vector[TILE_SIZE] = {0};

            auto a_start_index = K * TILE_SIZE * block_y;
            auto a_end = a_start_index + K - 1;
            auto a_step = TILE_SIZE;

            auto b_start_index = TILE_SIZE * VECTOR_SIZE * block_x;
            auto b_step = TILE_SIZE * N;

            for (int a = a_start_index, b = b_start_index; a <= a_end; a += a_step, b += b_step) {

#pragma unroll
                for (int i = 0; i < TILE_SIZE / VECTOR_SIZE; ++i) {

                    A_shared[(i * VECTOR_SIZE + thread_y) + TILE_SIZE * thread_x] =
                            MatA[a + K * (i * VECTOR_SIZE + thread_y) + thread_x];
                }
                __syncthreads();

                float *a_shared_base = A_shared;
                float *b_base = MatB + b + TILE_SIZE * thread_y + thread_x;

#pragma unroll
                for (int i = 0; i < TILE_SIZE; i++) {
                    float b_value = *b_base;

                    result_vector[0] += a_shared_base[0] * b_value;
                    result_vector[1] += a_shared_base[1] * b_value;
                    result_vector[2] += a_shared_base[2] * b_value;
                    result_vector[3] += a_shared_base[3] * b_value;
                    result_vector[4] += a_shared_base[4] * b_value;
                    result_vector[5] += a_shared_base[5] * b_value;
                    result_vector[6] += a_shared_base[6] * b_value;
                    result_vector[7] += a_shared_base[7] * b_value;
                    result_vector[8] += a_shared_base[8] * b_value;
                    result_vector[9] += a_shared_base[9] * b_value;

                    result_vector[10] += a_shared_base[10] * b_value;
                    result_vector[11] += a_shared_base[11] * b_value;
                    result_vector[12] += a_shared_base[12] * b_value;
                    result_vector[13] += a_shared_base[13] * b_value;
                    result_vector[14] += a_shared_base[14] * b_value;
                    result_vector[15] += a_shared_base[15] * b_value;
                    result_vector[16] += a_shared_base[16] * b_value;
                    result_vector[17] += a_shared_base[17] * b_value;
                    result_vector[18] += a_shared_base[18] * b_value;
                    result_vector[19] += a_shared_base[19] * b_value;

                    result_vector[20] += a_shared_base[20] * b_value;
                    result_vector[21] += a_shared_base[21] * b_value;
                    result_vector[22] += a_shared_base[22] * b_value;
                    result_vector[23] += a_shared_base[23] * b_value;
                    result_vector[24] += a_shared_base[24] * b_value;
                    result_vector[25] += a_shared_base[25] * b_value;
                    result_vector[26] += a_shared_base[26] * b_value;
                    result_vector[27] += a_shared_base[27] * b_value;
                    result_vector[28] += a_shared_base[28] * b_value;
                    result_vector[29] += a_shared_base[29] * b_value;

                    result_vector[30] += a_shared_base[30] * b_value;
                    result_vector[31] += a_shared_base[31] * b_value;

                    a_shared_base += TILE_SIZE;
                    b_base += N;
                }
                __syncthreads();
            }

            auto c_ptr = N * TILE_SIZE * block_y + TILE_SIZE * VECTOR_SIZE * block_x;
            c_ptr += TILE_SIZE * thread_y + thread_x;

#pragma unroll
            for (int i = 0; i < TILE_SIZE; ++i) {
                result[c_ptr] = result_vector[i];
                c_ptr += N;
            }
        }

TIA

Hello @atharva362
I believe there is a notable difference between your SYCL and CUDA code. In CUDA, the size of the “VLA” is a const int that is known at compile time (and inlined by the compiler to create the array), while in your SYCL code you want to take it as a run-time parameter. In neither CUDA nor SYCL can you dynamically allocate memory inside a kernel. So I would propose the following options:

  1. make template kernels and pass the array size as a template parameter: this will allow the #pragma unroll to work.

  2. Try to allocate another accessor<float, 1, access::mode::read_write, access::target::local> that is large enough to hold the result vectors of all the work-items (see the sketch after this list). If there’s not enough local memory, try 3.

  3. Have a look at Codeplay Developer - ComputeCpp CE - Guides - Memory Model and make an accessor with sycl::access::target::global_buffer (global memory) in the same way you got the A_shared accessor. It won’t be ideal for performing the computation, but for storing results it seems reasonable.
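
For option 2, here is a rough sketch of what I mean, reusing the names from your code and assuming the work-group size is TILE_SIZE * VECTOR_SIZE as in your launch parameters (the kernel name is made up for the example):

// Sketch only: one local allocation holds a TILE_SIZE-long slice per work-item,
// so the per-thread "VLA" becomes an index into local memory whose size is a
// run-time value (local accessor ranges do not need to be compile-time constants).
accessor<float, 1, access::mode::read_write, access::target::local>
        result_vectors{range<1>(TILE_SIZE * VECTOR_SIZE * TILE_SIZE), cgh};

cgh.parallel_for<class matmulreducedOps_localvec>(launchParams,
        [=](nd_item<2> item) {

    // Each work-item owns the slice [slice, slice + TILE_SIZE) of the allocation.
    const size_t slice = item.get_local_linear_id() * TILE_SIZE;

    for (int i = 0; i < TILE_SIZE; ++i)
        result_vectors[slice + i] = 0.0f;

    // ... the rest of the kernel then uses result_vectors[slice + j]
    //     wherever it previously used result_vector[j].
});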

Thank you for replying @rod

Yeah, you got a point.

For the CUDA code, TILE_SIZE and VECTOR_SIZE are defined in a header file. That works in the CUDA case; however, since SYCL can target a broader range of devices, one needs to query the permissible work-group dimensions at run time.
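
(For reference, by the run-time query I mean the standard device info queries, something along these lines:)

// Illustrative only: query the device limits before picking the tile sizes.
auto dev = deviceQueue.get_device();
auto max_wg_size    = dev.get_info<info::device::max_work_group_size>();
auto max_item_sizes = dev.get_info<info::device::max_work_item_sizes>();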

Also, could you please elaborate on the first point?

  1. make template kernels and pass the array size as a template parameter: this will allow the #pragma unroll to work.

I didn’t really understand it.

Hello @atharva362,

The idea with the templated kernels is that you can generate kernels that are tailored/compiled specifically for your computation/tile size. You could write something like this (notice that the kernel name is also templated so you don’t get two kernels with the same name):

template<int TILE_SIZE, int VECTOR_SIZE>
struct matmulreducedOps;

template<int TILE_SIZE, int VECTOR_SIZE>
void matmul_reducedOps_sycl(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K, queue deviceQueue)
{
    auto local_range = range<2>(TILE_SIZE, VECTOR_SIZE);
    auto global_range = range<2>(K / (TILE_SIZE * VECTOR_SIZE), M / (TILE_SIZE)) * local_range;

    nd_range<2> launchParams = nd_range<2>(global_range, local_range);

    deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh) {

        auto MatA_accessor = MatA->get_access<access::mode::read>(cgh);
        auto MatB_accessor = MatB->get_access<access::mode::read>(cgh);
        auto result_accessor = result->get_access<access::mode::read_write>(cgh);

        accessor<float, 1, access::mode::read_write, access::target::local> A_shared{range<1>(TILE_SIZE * TILE_SIZE), cgh};

        cgh.parallel_for<matmulreducedOps<TILE_SIZE, VECTOR_SIZE>>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, A_shared, M, N, K](nd_item<2> item) {
            float result_vector[TILE_SIZE];
#pragma unroll // Will actually do the unrolling properly
            for (int i = 0; i < TILE_SIZE; ++i) {
                result_vector[i] = 0.0f;
            }
            // ... rest of the kernel body as in your original code
        });
    });
}

Then you could invoke your kernel with matmul_reducedOps_sycl<32, 4>(...); depending on your needs. This allows the SYCL backend to know the size of the array and to assign CUDA registers properly, etc. You will also be able to do the loop unrolling, since the size will be known at compile time… But personally I would remove the #pragma unroll (and let the compiler decide), because with bigger sizes you could generate too much kernel code and reduce performance (you want to keep the hot path short).

(Of course, that is not the right solution if you want to cover all possible kernel sizes, as it would instantiate too many kernels.)
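
If only a handful of tile sizes make sense for your target devices, one way to combine both points is a small run-time dispatch over the instantiations, e.g. (sketch only; the chosen sizes and the dispatch function name are just examples):

// Sketch: pick a templated instantiation based on the device limits at run time.
void matmul_dispatch(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result,
                     size_t M, size_t N, size_t K, queue deviceQueue) {
    const auto max_wg =
            deviceQueue.get_device().get_info<info::device::max_work_group_size>();

    if (max_wg >= 32 * 4)
        matmul_reducedOps_sycl<32, 4>(MatA, MatB, result, M, N, K, deviceQueue);
    else if (max_wg >= 16 * 4)
        matmul_reducedOps_sycl<16, 4>(MatA, MatB, result, M, N, K, deviceQueue);
    else
        matmul_reducedOps_sycl<8, 2>(MatA, MatB, result, M, N, K, deviceQueue);
}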

Thanks a lot @anon1658935,

I will try it out.
