Hello Everyone.
I have been migrating some code from CUDA to SYCL, and inside the parallel_for region I need to create a private float array whose length (TILE_SIZE) is a function parameter.
As I understand it, such a runtime-sized array cannot be declared inside a parallel_for kernel, because the array length has to be a compile-time constant.
Is there any way I can work around this?
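To make the problem concrete, the pattern boils down to this (a minimal placeholder sketch, not my real code, and assuming the same SYCL headers and using-declarations as the full listing below):
void foo(queue deviceQueue, int TILE_SIZE) {
    deviceQueue.submit([&](handler& cgh) {
        cgh.parallel_for<class vla_demo>(range<1>(64), [=](id<1>) {
            float result_vector[TILE_SIZE]; // rejected: TILE_SIZE is not a compile-time constant
        });
    });
}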
The SYCL code is as follows -
void matmul_reducedOps_sycl(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
int TILE_SIZE, int VECTOR_SIZE, queue deviceQueue){
auto local_range = range<2>(TILE_SIZE, VECTOR_SIZE);
auto global_range = range<2>(K / (TILE_SIZE * VECTOR_SIZE), M / (TILE_SIZE)) * local_range;
nd_range<2> launchParams = nd_range<2>(global_range, local_range);
deviceQueue.submit([&MatA, &MatB, &result, M, N, K, TILE_SIZE, VECTOR_SIZE, launchParams](handler& cgh){
auto MatA_accessor = MatA->get_access<access::mode::read>(cgh);
auto MatB_accessor = MatB->get_access<access::mode::read>(cgh);
auto result_accessor = result->get_access<access::mode::read_write>(cgh);
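// Work-group local memory, intended as the SYCL counterpart of the __shared__ float A_shared[TILE_SIZE * TILE_SIZE] in the CUDA kernel below.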
accessor<float, 1, access::mode::read_write, access::target::local> A_shared{range<1>(TILE_SIZE * TILE_SIZE), cgh};
cgh.parallel_for<matmulreducedOps>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, A_shared, M, N, K,
VECTOR_SIZE, TILE_SIZE](nd_item<2> item){
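// This is the array in question: TILE_SIZE is a runtime function parameter here, which as far as I understand is not allowed for an array length inside the kernel.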
float result_vector[TILE_SIZE];
#pragma unroll
for (int i = 0; i < TILE_SIZE; ++i) {
result_vector[i] = 0.0f;
}
unsigned int block_x = item.get_group(0); unsigned int thread_x = item.get_local_id(0);
unsigned int block_y = item.get_group(1); unsigned int thread_y = item.get_local_id(1);
auto a_start_index = K * TILE_SIZE * block_y;
auto a_end = a_start_index + K - 1;
auto a_step = TILE_SIZE;
auto b_start_index = TILE_SIZE * VECTOR_SIZE * block_x;
auto b_step = TILE_SIZE * N;
for (int a = a_start_index, b = b_start_index; a <= a_end;
a += a_step, b += b_step) {
#pragma unroll
for(int i=0; i < TILE_SIZE / VECTOR_SIZE; i++){
A_shared[( i * VECTOR_SIZE + thread_y) + TILE_SIZE * thread_x] =
MatA_accessor[a + K * (i * VECTOR_SIZE + thread_y) + thread_x];
}
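// Work-group barrier, counterpart of __syncthreads() in the CUDA kernel.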
item.barrier();
auto b_base = b + TILE_SIZE * thread_y + thread_x;
for(int i=0; i < TILE_SIZE; i++){
float b_value = MatB_accessor[b_base + i*N];
#pragma unroll
for(int j=0 ; j<TILE_SIZE; j++){
result_vector[j] += A_shared[i * TILE_SIZE + j] * b_value;
}
}
item.barrier();
}
auto c_ptr = N * TILE_SIZE * block_y + TILE_SIZE * VECTOR_SIZE * block_x;
c_ptr += TILE_SIZE * thread_y + thread_x;
#pragma unroll
for(int i=0; i < TILE_SIZE; i++){
result_accessor[c_ptr] = result_vector[i];
c_ptr += N;
}
});
});
deviceQueue.wait();
}
and the CUDA code is as follows -
__global__ void
MatrixMultiplication_reducedOps32(float *MatA, float *MatB, float *result, size_t M, size_t N, size_t K) {
const int TILE_SIZE = 32;
const int VECTOR_SIZE = 4;
unsigned int block_x = blockIdx.x;
unsigned int thread_x = threadIdx.x;
unsigned int block_y = blockIdx.y;
unsigned int thread_y = threadIdx.y;
__shared__ float A_shared[TILE_SIZE * TILE_SIZE];
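// Per-thread accumulator of TILE_SIZE partial results; here TILE_SIZE is a compile-time constant (32), unlike in my SYCL version.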
float result_vector[TILE_SIZE] = {0};
auto a_start_index = K * TILE_SIZE * block_y;
auto a_end = a_start_index + K - 1;
auto a_step = TILE_SIZE;
auto b_start_index = TILE_SIZE * VECTOR_SIZE * block_x;
auto b_step = TILE_SIZE * N;
for (int a = a_start_index, b = b_start_index; a <= a_end; a += a_step, b += b_step) {
#pragma unroll
for (int i = 0; i < TILE_SIZE / VECTOR_SIZE; ++i) {
A_shared[(i * VECTOR_SIZE + thread_y) + TILE_SIZE * thread_x] =
MatA[a + K * (i * VECTOR_SIZE + thread_y) + thread_x];
}
__syncthreads();
float *a_shared_base = A_shared;
float *b_base = MatB + b + TILE_SIZE * thread_y + thread_x;
#pragma unroll
for (int i = 0; i < TILE_SIZE; i++) {
float b_value = *b_base;
result_vector[0] += a_shared_base[0] * b_value;
result_vector[1] += a_shared_base[1] * b_value;
result_vector[2] += a_shared_base[2] * b_value;
result_vector[3] += a_shared_base[3] * b_value;
result_vector[4] += a_shared_base[4] * b_value;
result_vector[5] += a_shared_base[5] * b_value;
result_vector[6] += a_shared_base[6] * b_value;
result_vector[7] += a_shared_base[7] * b_value;
result_vector[8] += a_shared_base[8] * b_value;
result_vector[9] += a_shared_base[9] * b_value;
result_vector[10] += a_shared_base[10] * b_value;
result_vector[11] += a_shared_base[11] * b_value;
result_vector[12] += a_shared_base[12] * b_value;
result_vector[13] += a_shared_base[13] * b_value;
result_vector[14] += a_shared_base[14] * b_value;
result_vector[15] += a_shared_base[15] * b_value;
result_vector[16] += a_shared_base[16] * b_value;
result_vector[17] += a_shared_base[17] * b_value;
result_vector[18] += a_shared_base[18] * b_value;
result_vector[19] += a_shared_base[19] * b_value;
result_vector[20] += a_shared_base[20] * b_value;
result_vector[21] += a_shared_base[21] * b_value;
result_vector[22] += a_shared_base[22] * b_value;
result_vector[23] += a_shared_base[23] * b_value;
result_vector[24] += a_shared_base[24] * b_value;
result_vector[25] += a_shared_base[25] * b_value;
result_vector[26] += a_shared_base[26] * b_value;
result_vector[27] += a_shared_base[27] * b_value;
result_vector[28] += a_shared_base[28] * b_value;
result_vector[29] += a_shared_base[29] * b_value;
result_vector[30] += a_shared_base[30] * b_value;
result_vector[31] += a_shared_base[31] * b_value;
a_shared_base += TILE_SIZE;
b_base += N;
}
__syncthreads();
}
auto c_ptr = N * TILE_SIZE * block_y + TILE_SIZE * VECTOR_SIZE * block_x;
c_ptr += TILE_SIZE * thread_y + thread_x;
#pragma unroll
for (int i = 0; i < TILE_SIZE; ++i) {
result[c_ptr] = result_vector[i];
c_ptr += N;
}
}
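The only workaround I can think of so far is to make TILE_SIZE a compile-time value again on the SYCL side, for example by templating the kernel on it. A minimal sketch of what I mean (placeholder names, assuming the same headers and using-declarations as my code above, and that the tile size can be fixed at build time just like the constant 32 in the CUDA kernel):
template <int TileSize> class fixed_tile_kernel;

template <int TileSize>
void demo(queue deviceQueue) {
    deviceQueue.submit([&](handler& cgh) {
        cgh.parallel_for<fixed_tile_kernel<TileSize>>(range<1>(64), [=](id<1>) {
            float result_vector[TileSize]; // compile-time length, so this is allowed
            for (int i = 0; i < TileSize; ++i)
                result_vector[i] = 0.0f;
        });
    });
    deviceQueue.wait();
}

// e.g. demo<32>(deviceQueue); matching TILE_SIZE = 32 in the CUDA kernel
But that means instantiating the kernel for every tile size I want to support, so I would be happy to hear if there is a better way that keeps TILE_SIZE as a runtime argument.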
TIA