Poor performance on matrix multiplication

Hi! How are you?

I want to ask about a “problem” that I’m having with the CUDA matrix multiplication code: cuda-samples/matrixMul.cu at master · NVIDIA/cuda-samples · GitHub

I executed the CUDA and SYCL codes below for matrix multiplication on an RTX 2070, an RTX 3070, and an RTX 3090, and CUDA is 24% to 40% faster than SYCL (the largest gap, 40%, is on the RTX 3090) for a dimension of 16384. I believe the difference lies in how shared memory is handled, but I’m unsure how to improve it.
I’m using Intel LLVM with clang 17 and CUDA 11.7.

for compiling CUDA: nvcc program.cu -O3 -o program
for compiling SYCL: clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda program.cpp -O3 -o program

./program 16384 16384 16384 16384

Do you have some suggestions? Thank you very much!

CUDA CODE

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared memory
 * to ensure data reuse, the matrix multiplication is done using tiling
 * approach. It has been written for clarity of exposition to illustrate various
 * CUDA programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication. See also: V. Volkov and
 * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008
 * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008,
 * pp. Art. 31:1-11.
 */

// System includes
#include <fstream>
#include <iostream>

#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <sys/time.h>

double dwalltime() {
  double sec;
  struct timeval tv;

  gettimeofday(&tv, NULL);
  sec = tv.tv_sec + tv.tv_usec / 1000000.0;
  return sec;
}

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA,
                   const dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  cudaMallocHost(&h_A, mem_size_A);
  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  cudaMallocHost(&h_B, mem_size_B);
  cudaStream_t stream;

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C;
  cudaMallocHost(&h_C, mem_size_C);

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A);
  cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B);
  cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C);

  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  // copy host memory to device

  cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream);

  // Setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

  double start = dwalltime();

  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    MatrixMulCUDA<16>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  } else {
    MatrixMulCUDA<32>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  }

  cudaStreamSynchronize(stream);
  double stop = dwalltime();
  // printf("done %lf\n", stop - start);

  // Record the start event
  start = dwalltime();

  // Execute the kernel
  int nIter = 10;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      MatrixMulCUDA<16>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
      MatrixMulCUDA<32>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
  }

  cudaStreamSynchronize(stream);

  // cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream);
  // cudaStreamSynchronize(stream);

  stop = dwalltime();

  float msecTotal = (stop - start) * 1000;

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);

  std::string filename = "data_cuda.csv";
  std::ifstream file_check(filename);
  bool file_exists = file_check.good();
  file_check.close();

  // Create and open the file for writing, with or without headers
  std::ofstream file(filename, std::ios::app);
  if (!file_exists) {
    file << "N, time (S), GFLOPS, msecPerMatrixMu (S)l\n";
  }

  file << dimsA.x << ", " << msecTotal / 1000 << ", " << gigaFlops << ", "
       << msecPerMatrixMul << "\n";

  // bool correct = true;

  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  /*  double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
      double abs_err = fabs(h_C[i] - (dimsA.x * valB));
      double dot_length = dimsA.x;
      double abs_val = fabs(h_C[i]);
      double rel_err = abs_err / abs_val / dot_length;

      if (rel_err > eps) {
        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
               h_C[i], dimsA.x * valB, eps);
        correct = false;
      }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");*/

  // Clean up memory
  cudaFreeHost(h_A);
  cudaFreeHost(h_B);
  cudaFreeHost(h_C);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  return EXIT_SUCCESS;
}

/**
 * Program main
 */
int main(int argc, char **argv) {
  int device_id = 0;
  cudaDeviceProp device_prop;

  cudaGetDeviceProperties(&device_prop, device_id);
  std::cout << "Device Name: " << device_prop.name << std::endl;

  int block_size = 16;

  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

  // Matrix dimensions taken from argc / argv directly
  dimsA.x = argc > 1 ? atoi(argv[1]) : dimsA.x;
  dimsA.y = argc > 2 ? atoi(argv[2]) : dimsA.y;
  dimsB.x = argc > 3 ? atoi(argv[3]) : dimsB.x;
  dimsB.y = argc > 4 ? atoi(argv[4]) : dimsB.y;

  if (dimsA.x != dimsB.y) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
         dimsB.y);

  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

  exit(matrix_result);
}


SYCL CODE

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared memory
 * to ensure data reuse, the matrix multiplication is done using tiling
 * approach. It has been written for clarity of exposition to illustrate various
 * CUDA programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication. See also: V. Volkov and
 * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008
 * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008,
 * pp. Art. 31:1-11.
 */

// System includes
#include <assert.h>
#include <stdio.h>
#include <fstream>
#include <sycl/sycl.hpp>

// Timing utilities
#include <cmath>
#include <sys/time.h>

double dwalltime() {
  double sec;
  struct timeval tv;

  gettimeofday(&tv, NULL);
  sec = tv.tv_sec + tv.tv_usec / 1000000.0;
  return sec;
}

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE>
void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB,
                   sycl::nd_item<3> item_ct1,
                   sycl::accessor<float, 2, sycl::access::mode::read_write,
                                  sycl::access::target::local>
                       As,
                   sycl::accessor<float, 2, sycl::access::mode::read_write,
                                  sycl::access::target::local>
                       Bs) {
  // Block index
  int bx = item_ct1.get_group(2);
  int by = item_ct1.get_group(1);

  // Thread index
  int tx = item_ct1.get_local_id(2);
  int ty = item_ct1.get_local_id(1);

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    item_ct1.barrier(sycl::access::fence_space::local_space);

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    item_ct1.barrier(sycl::access::fence_space::local_space);
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size,
                   const sycl::range<3> &dimsA, const sycl::range<3> &dimsB) {
  sycl::device dev_ct1 = sycl::device::get_devices()[0];
  sycl::queue q_ct1(dev_ct1);
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA[2] * dimsA[1];
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  h_A = (float *)sycl::malloc_host(mem_size_A, q_ct1);
  unsigned int size_B = dimsB[2] * dimsB[1];
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  h_B = (float *)sycl::malloc_host(mem_size_B, q_ct1);

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  sycl::range<3> dimsC(1, dimsA[1], dimsB[2]);
  unsigned int mem_size_C = dimsC[2] * dimsC[1] * sizeof(float);
  float *h_C;
  h_C = (float *)sycl::malloc_host(mem_size_C, q_ct1);

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  d_A = (float *)sycl::malloc_device(mem_size_A, q_ct1);
  d_B = (float *)sycl::malloc_device(mem_size_B, q_ct1);
  d_C = (float *)sycl::malloc_device(mem_size_C, q_ct1);

  // copy host memory to device
  q_ct1.memcpy(d_A, h_A, mem_size_A).wait();
  q_ct1.memcpy(d_B, h_B, mem_size_B).wait();

  // Setup execution parameters
  sycl::range<3> threads(1, block_size, block_size);
  sycl::range<3> grid(1, dimsA[1] / threads[1], dimsB[2] / threads[2]);

  // Create and start timer
  // printf("Computing result using CUDA Kernel...\n");

  double start = dwalltime();
  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    /*
    DPCT1049:2: The work-group size passed to the SYCL kernel may exceed the
    limit. To get the device limit, query info::device::max_work_group_size.
    Adjust the work-group size if needed.
    */
    q_ct1.submit([&](sycl::handler &cgh) {
      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          As_acc_ct1(sycl::range<2>(16, 16), cgh);

      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          Bs_acc_ct1(sycl::range<2>(16, 16), cgh);

      cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                       [=](sycl::nd_item<3> item_ct1) {
                         MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                           item_ct1, As_acc_ct1, Bs_acc_ct1);
                       });
    });
  } else {
    /*
    DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the
    limit. To get the device limit, query info::device::max_work_group_size.
    Adjust the work-group size if needed.
    */
    q_ct1.submit([&](sycl::handler &cgh) {
      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          As_acc_ct1(sycl::range<2>(32, 32), cgh);

      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          Bs_acc_ct1(sycl::range<2>(32, 32), cgh);

      cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                       [=](sycl::nd_item<3> item_ct1) {
                         MatrixMulCUDA<32>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                           item_ct1, As_acc_ct1, Bs_acc_ct1);
                       });
    });
  }

  q_ct1.wait_and_throw();
  double stop = dwalltime();

  // printf("done %lf\n", stop - start);

  // Record the start event
  start = dwalltime();

  // Execute the kernel
  int nIter = 10;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      /*
      DPCT1049:4: The work-group size passed to the SYCL kernel may exceed the
      limit. To get the device limit, query info::device::max_work_group_size.
      Adjust the work-group size if needed.
      */
      q_ct1.submit([&](sycl::handler &cgh) {
        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            As_acc_ct1(sycl::range<2>(16, 16), cgh);

        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            Bs_acc_ct1(sycl::range<2>(16, 16), cgh);

        cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                         [=](sycl::nd_item<3> item_ct1) {
                           MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                             item_ct1, As_acc_ct1, Bs_acc_ct1);
                         });
      });
    } else {
      /*
      DPCT1049:5: The work-group size passed to the SYCL kernel may exceed the
      limit. To get the device limit, query info::device::max_work_group_size.
      Adjust the work-group size if needed.
      */
      q_ct1.submit([&](sycl::handler &cgh) {
        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            As_acc_ct1(sycl::range<2>(32, 32), cgh);

        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            Bs_acc_ct1(sycl::range<2>(32, 32), cgh);

        cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                         [=](sycl::nd_item<3> item_ct1) {
                           MatrixMulCUDA<32>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                             item_ct1, As_acc_ct1, Bs_acc_ct1);
                         });
      });
    }
  }

  q_ct1.wait_and_throw();

  stop = dwalltime();

  float msecTotal = (stop - start) * 1000;

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA[2]) *
                             static_cast<double>(dimsA[1]) *
                             static_cast<double>(dimsB[2]);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);

  std::string filename = "data_sycl.csv";
  std::ifstream file_check(filename);
  bool file_exists = file_check.good();
  file_check.close();

  // Create and open the file for writing, with or without headers
  std::ofstream file(filename, std::ios::app);
  if (!file_exists) {
    file << "N, time (S), GFLOPS, msecPerMatrixMu (S)l\n";
  }

  file << dimsA[1] << ", " << msecTotal / 1000 << ", " << gigaFlops << ", "
       << msecPerMatrixMul << "\n";

  // // Copy result from device to host
  // q_ct1.memcpy(h_C, d_C, mem_size_C).wait();

  // q_ct1.wait_and_throw();

  // // printf("Checking computed result for correctness: ");
  // bool correct = true;

  // // test relative error by the formula
  // //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  // double eps = 1.e-6; // machine zero

  // for (int i = 0; i < static_cast<int>(dimsC[2] * dimsC[1]); i++) {
  //   double abs_err = fabs(h_C[i] - (dimsA[2] * valB));
  //   double dot_length = dimsA[2];
  //   double abs_val = fabs(h_C[i]);
  //   double rel_err = abs_err / abs_val / dot_length;

  //   if (rel_err > eps) {
  //     printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
  //            h_C[i], dimsA[2] * valB, eps);
  //     correct = false;
  //   }
  // }

  // printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  sycl::free(h_A, q_ct1);
  sycl::free(h_B, q_ct1);
  sycl::free(h_C, q_ct1);
  sycl::free(d_A, q_ct1);
  sycl::free(d_B, q_ct1);
  sycl::free(d_C, q_ct1);
  // printf("\nNOTE: The CUDA Samples are not meant for performance "
  //        "measurements. Results may vary when GPU Boost is enabled.\n");

  // if (correct) {
  return EXIT_SUCCESS;
  // } else {
  //   return EXIT_FAILURE;
  // }
}

/**
 * Program main
 */
int main(int argc, char **argv) {
  // printf("[Matrix Multiply Using CUDA] - Starting...\n");

  sycl::queue q{sycl::default_selector{}};

  // Get the device name
  std::string device_name = q.get_device().get_info<sycl::info::device::name>();

  std::cout << "Device Name: " << device_name << std::endl;

  int block_size = 16;

  sycl::range<3> dimsA(1, 5 * 2 * block_size, 5 * 2 * block_size);
  sycl::range<3> dimsB(1, 5 * 2 * block_size, 5 * 4 * block_size);

  // Matrix dimensions taken from argc / argv directly
  dimsA[2] = argc > 1 ? atoi(argv[1]) : dimsA[2];
  dimsA[1] = argc > 2 ? atoi(argv[2]) : dimsA[1];
  dimsB[2] = argc > 3 ? atoi(argv[3]) : dimsB[2];
  dimsB[1] = argc > 4 ? atoi(argv[4]) : dimsB[1];

  if (dimsA[2] != dimsB[1]) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA[2], dimsB[1]);
    exit(EXIT_FAILURE);
  }

  // printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA[2], dimsA[1], dimsB[2],
  //        dimsB[1]);

  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

  exit(matrix_result);
}

Hi, thank you for posting a clear description with reproducers. We found there is one optimisation opportunity which nvcc took but clang couldn’t. In the generated PTX code, which can be produced with:

# CUDA
nvcc program.cu -O3 -o cuda
cuobjdump --dump-ptx ./cuda | cu++filt >cuda.ptx

# SYCL
clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda program.cpp -O3 -o sycl
SYCL_DUMP_IMAGES=1 ./sycl
cuobjdump --dump-ptx sycl_nvptx641.bin | cu++filt >sycl.ptx

we can see that the source code:

As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];

item_ct1.barrier(sycl::access::fence_space::local_space); // __syncthreads(); in CUDA

#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
    Csub += As[ty][k] * Bs[k][tx];
}

results in nvcc inlining the address calculation for Bs[k][tx] and using fixed offsets (with BLOCK_SIZE = 16, each row of Bs spans 16 * sizeof(float) = 64 bytes, hence the constant 64-byte stride between consecutive loads):

ld.shared.f32 %f11, [%r10+64];
ld.shared.f32 %f14, [%r10+128];
ld.shared.f32 %f17, [%r10+192];
ld.shared.f32 %f20, [%r10+256];

which was not possible in clang, so the SYCL version computes the address for each value at runtime and stores all of them in registers:

add.s64 %rd9, %rd8, %rd42;
add.s64 %rd10, %rd9, %rd42;
add.s64 %rd11, %rd10, %rd42;
add.s64 %rd12, %rd11, %rd42;

ld.shared.f32 %f12, [%rd9];
ld.shared.f32 %f15, [%rd10];
ld.shared.f32 %f18, [%rd11];
ld.shared.f32 %f21, [%rd12];

This extra computation, together with the use of more registers, seems to be what is causing the performance difference.

We will look into this issue.

Hi @rbielski, that’s great news that you’ve found the problem! Hopefully a way to solve it can be found. I will keep investigating, and if I discover anything, I will let you know. We’ll keep in touch. Regards!

Hi codeplay team, any update about this issue?

Hi,

We have continued to update the compiler over the last year or so, so I would recommend that @mcostanzo tries an updated release to see what the performance is looking like.

@kminemura we’ll get back to you separately, I think that’s the easier choice.

Duncan.

Hi Codeplay Team and @duncan ,

I recently tested the code on an A100 GPU, and the performance difference is still present. SYCL is approximately 20% slower than CUDA.

As stated by @rbielski, since the SYCL version computes and stores the address for shared memory at runtime, it results in higher register usage compared to the CUDA version. Specifically, SYCL uses 40 registers per thread, while the CUDA version uses 32 registers per thread (profiled using nsys nvprof --print-gpu-trace ./program-xxxx 16384 16384 16384 16384).

Although this difference is not significant enough to cause register spilling in this simple kernel, I have observed that for more register-intensive kernels that use significant shared memory (especially 2D arrays), the SYCL version results in register spills, whereas the CUDA version does not. This makes it challenging to achieve CUDA-level performance.

Addressing this issue would greatly help in improving the performance of SYCL.

Thank you.
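
As a cross-check on the register counts above, the per-kernel resource usage of the CUDA build can also be printed at compile time by passing -Xptxas -v to nvcc, e.g.:

$ nvcc -arch=sm_80 -O3 -Xptxas -v program.cu -o program-cuda

which makes ptxas report the registers and shared memory used by each kernel.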

SYCL compile and run:

$ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_80 program.cpp -O3 -o program-sycl
$ ./program-sycl 16384 16384 16384 16384

output:
N, time (s), GFLOPS, msecPerMatrixMul (ms)
16384, 23.472, 3747.48, 2347.2
...

CUDA compile and run:

$ nvcc -arch=sm_80 program.cu -O3 -o program-cuda
$ ./program-cuda 16384 16384 16384 16384

output:
N, time (s), GFLOPS, msecPerMatrixMul (ms)
16384, 18.5601, 4739.24, 1856.01
...

SYCL compiler info:

$ clang++ --version
clang version 19.0.0git (https://github.com/intel/llvm 5c6c2e0f0fc101b2c6d32d62188e68ac5d666ff9)
Target: x86_64-unknown-linux-gnu
Thread model: posix
Build config: +assertions

CUDA compiler version:

$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Apr_17_19:19:55_PDT_2024
Cuda compilation tools, release 12.5, V12.5.40
Build cuda_12.5.r12.5/compiler.34177558_0

Note that the SYCL code posted above is outdated and needs to be updated to compile with current compilers. Here is the updated code.

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared memory
 * to ensure data reuse, the matrix multiplication is done using tiling
 * approach. It has been written for clarity of exposition to illustrate various
 * CUDA programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication. See also: V. Volkov and
 * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008
 * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008,
 * pp. Art. 31:1-11.
 */

// System includes
#include <assert.h>
#include <stdio.h>
#include <fstream>
#include <sycl/sycl.hpp>

// Timing utilities
#include <cmath>
#include <sys/time.h>

double dwalltime() {
  double sec;
  struct timeval tv;

  gettimeofday(&tv, NULL);
  sec = tv.tv_sec + tv.tv_usec / 1000000.0;
  return sec;
}

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE>
void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB,
                   sycl::nd_item<3> item_ct1,
                   sycl::local_accessor<float, 2> As,
                   sycl::local_accessor<float, 2> Bs) {
  // Block index
  int bx = item_ct1.get_group(2);
  int by = item_ct1.get_group(1);

  // Thread index
  int tx = item_ct1.get_local_id(2);
  int ty = item_ct1.get_local_id(1);

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    item_ct1.barrier(sycl::access::fence_space::local_space);

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    item_ct1.barrier(sycl::access::fence_space::local_space);
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size,
                   const sycl::range<3> &dimsA, const sycl::range<3> &dimsB) {
  sycl::queue q_ct1{sycl::default_selector_v};

  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA[2] * dimsA[1];
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  h_A = (float *)sycl::malloc_host(mem_size_A, q_ct1);
  unsigned int size_B = dimsB[2] * dimsB[1];
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  h_B = (float *)sycl::malloc_host(mem_size_B, q_ct1);

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  sycl::range<3> dimsC(1, dimsA[1], dimsB[2]);
  unsigned int mem_size_C = dimsC[2] * dimsC[1] * sizeof(float);
  float *h_C;
  h_C = (float *)sycl::malloc_host(mem_size_C, q_ct1);

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  d_A = (float *)sycl::malloc_device(mem_size_A, q_ct1);
  d_B = (float *)sycl::malloc_device(mem_size_B, q_ct1);
  d_C = (float *)sycl::malloc_device(mem_size_C, q_ct1);

  // copy host memory to device
  q_ct1.memcpy(d_A, h_A, mem_size_A).wait();
  q_ct1.memcpy(d_B, h_B, mem_size_B).wait();

  // Setup execution parameters
  sycl::range<3> threads(1, block_size, block_size);
  sycl::range<3> grid(1, dimsA[1] / threads[1], dimsB[2] / threads[2]);

  // Create and start timer
  // printf("Computing result using CUDA Kernel...\n");

  double start = dwalltime();
  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    /*
    DPCT1049:2: The work-group size passed to the SYCL kernel may exceed the
    limit. To get the device limit, query info::device::max_work_group_size.
    Adjust the work-group size if needed.
    */
    q_ct1.submit([&](sycl::handler &cgh) {
      sycl::local_accessor<float, 2>
          As_acc_ct1(sycl::range<2>(16, 16), cgh);

      sycl::local_accessor<float, 2>
          Bs_acc_ct1(sycl::range<2>(16, 16), cgh);

      cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                       [=](sycl::nd_item<3> item_ct1) {
                         MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                           item_ct1, As_acc_ct1, Bs_acc_ct1);
                       });
    });
  } else {
    /*
    DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the
    limit. To get the device limit, query info::device::max_work_group_size.
    Adjust the work-group size if needed.
    */
    q_ct1.submit([&](sycl::handler &cgh) {
      sycl::local_accessor<float, 2>
          As_acc_ct1(sycl::range<2>(32, 32), cgh);

      sycl::local_accessor<float, 2>
          Bs_acc_ct1(sycl::range<2>(32, 32), cgh);

      cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                       [=](sycl::nd_item<3> item_ct1) {
                         MatrixMulCUDA<32>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                           item_ct1, As_acc_ct1, Bs_acc_ct1);
                       });
    });
  }

  q_ct1.wait_and_throw();
  double stop = dwalltime();

  // printf("done %lf\n", stop - start);

  // Record the start event
  start = dwalltime();

  // Execute the kernel
  int nIter = 10;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      /*
      DPCT1049:4: The work-group size passed to the SYCL kernel may exceed the
      limit. To get the device limit, query info::device::max_work_group_size.
      Adjust the work-group size if needed.
      */
      q_ct1.submit([&](sycl::handler &cgh) {
        sycl::local_accessor<float, 2>
            As_acc_ct1(sycl::range<2>(16, 16), cgh);

        sycl::local_accessor<float, 2>
            Bs_acc_ct1(sycl::range<2>(16, 16), cgh);

        cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                         [=](sycl::nd_item<3> item_ct1) {
                           MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                             item_ct1, As_acc_ct1, Bs_acc_ct1);
                         });
      });
    } else {
      /*
      DPCT1049:5: The work-group size passed to the SYCL kernel may exceed the
      limit. To get the device limit, query info::device::max_work_group_size.
      Adjust the work-group size if needed.
      */
      q_ct1.submit([&](sycl::handler &cgh) {
        sycl::local_accessor<float, 2>
            As_acc_ct1(sycl::range<2>(32, 32), cgh);

        sycl::local_accessor<float, 2>
            Bs_acc_ct1(sycl::range<2>(32, 32), cgh);

        cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                         [=](sycl::nd_item<3> item_ct1) {
                           MatrixMulCUDA<32>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                             item_ct1, As_acc_ct1, Bs_acc_ct1);
                         });
      });
    }
  }

  q_ct1.wait_and_throw();

  stop = dwalltime();

  float msecTotal = (stop - start) * 1000;

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA[2]) *
                             static_cast<double>(dimsA[1]) *
                             static_cast<double>(dimsB[2]);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);

  std::string filename = "data_sycl.csv";
  std::ifstream file_check(filename);
  bool file_exists = file_check.good();
  file_check.close();

  // Create and open the file for writing, with or without headers
  std::ofstream file(filename, std::ios::app);
  if (!file_exists) {
    file << "N, time (S), GFLOPS, msecPerMatrixMu (S)l\n";
  }

  file << dimsA[1] << ", " << msecTotal / 1000 << ", " << gigaFlops << ", "
       << msecPerMatrixMul << "\n";

  // // Copy result from device to host
  // q_ct1.memcpy(h_C, d_C, mem_size_C).wait();

  // q_ct1.wait_and_throw();

  // // printf("Checking computed result for correctness: ");
  // bool correct = true;

  // // test relative error by the formula
  // //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  // double eps = 1.e-6; // machine zero

  // for (int i = 0; i < static_cast<int>(dimsC[2] * dimsC[1]); i++) {
  //   double abs_err = fabs(h_C[i] - (dimsA[2] * valB));
  //   double dot_length = dimsA[2];
  //   double abs_val = fabs(h_C[i]);
  //   double rel_err = abs_err / abs_val / dot_length;

  //   if (rel_err > eps) {
  //     printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
  //            h_C[i], dimsA[2] * valB, eps);
  //     correct = false;
  //   }
  // }

  // printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  sycl::free(h_A, q_ct1);
  sycl::free(h_B, q_ct1);
  sycl::free(h_C, q_ct1);
  sycl::free(d_A, q_ct1);
  sycl::free(d_B, q_ct1);
  sycl::free(d_C, q_ct1);
  // printf("\nNOTE: The CUDA Samples are not meant for performance "
  //        "measurements. Results may vary when GPU Boost is enabled.\n");

  // if (correct) {
  return EXIT_SUCCESS;
  // } else {
  //   return EXIT_FAILURE;
  // }
}

/**
 * Program main
 */
int main(int argc, char **argv) {
  // printf("[Matrix Multiply Using CUDA] - Starting...\n");

  sycl::queue q{sycl::default_selector_v};

  // Get the device name
  std::string device_name = q.get_device().get_info<sycl::info::device::name>();

  std::cout << "Device Name: " << device_name << std::endl;

  int block_size = 16;

  sycl::range<3> dimsA(1, 5 * 2 * block_size, 5 * 2 * block_size);
  sycl::range<3> dimsB(1, 5 * 2 * block_size, 5 * 4 * block_size);

  // Matrix dimensions taken from argc / argv directly
  dimsA[2] = argc > 1 ? atoi(argv[1]) : dimsA[2];
  dimsA[1] = argc > 2 ? atoi(argv[2]) : dimsA[1];
  dimsB[2] = argc > 3 ? atoi(argv[3]) : dimsB[2];
  dimsB[1] = argc > 4 ? atoi(argv[4]) : dimsB[1];

  if (dimsA[2] != dimsB[1]) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA[2], dimsB[1]);
    exit(EXIT_FAILURE);
  }

  // printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA[2], dimsA[1], dimsB[2],
  //        dimsB[1]);

  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

  exit(matrix_result);
}

Hi @yencal ,

thank you for confirming the issue. We’re continuing to work on it; however, it isn’t an easy fix.

Many thanks,
Duncan.

Hi @mcostanzo / @yencal ,

I have a couple of suggestions which should help shift the performance in a happier direction!

I would recommend using the Intel extension for local memory over a local accessor. In this particular case it seems like knowing the boundaries of the local allocation is very important for the compiler to be able to generate more optimal code. I made a change somewhat like the following:

  using namespace sycl::ext::oneapi;
  auto& As = *group_local_memory_for_overwrite<float[BLOCK_SIZE][BLOCK_SIZE]>(item_ct1.get_group());
  auto& Bs = *group_local_memory_for_overwrite<float[BLOCK_SIZE][BLOCK_SIZE]>(item_ct1.get_group());

and removing the accessor parameters from the kernel function interface. This brings the performance roughly in line with the CUDA version of the code.
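
Putting it together, the whole kernel then looks roughly like this (an untested sketch based on the kernel posted above, with the two accessor parameters dropped):

template <int BLOCK_SIZE>
void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB,
                   sycl::nd_item<3> item_ct1) {
  using namespace sycl::ext::oneapi;

  // Work-group local (shared) memory tiles with compile-time known extents
  auto &As = *group_local_memory_for_overwrite<float[BLOCK_SIZE][BLOCK_SIZE]>(
      item_ct1.get_group());
  auto &Bs = *group_local_memory_for_overwrite<float[BLOCK_SIZE][BLOCK_SIZE]>(
      item_ct1.get_group());

  int bx = item_ct1.get_group(2);
  int by = item_ct1.get_group(1);
  int tx = item_ct1.get_local_id(2);
  int ty = item_ct1.get_local_id(1);

  int aBegin = wA * BLOCK_SIZE * by;
  int aEnd = aBegin + wA - 1;
  int aStep = BLOCK_SIZE;
  int bBegin = BLOCK_SIZE * bx;
  int bStep = BLOCK_SIZE * wB;

  float Csub = 0;

  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Each thread loads one element of each tile
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    item_ct1.barrier(sycl::access::fence_space::local_space);

#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    item_ct1.barrier(sycl::access::fence_space::local_space);
  }

  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

The parallel_for call sites then no longer declare the local accessors and simply drop those arguments, e.g.:

cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                 [=](sycl::nd_item<3> item_ct1) {
                   MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                     item_ct1);
                 });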

I hope this helps,
Duncan.

Further to all this, dpct can emit the extension variant directly if you use the flag --use-experimental-features=local-memory-kernel-scope-allocation (or turn all experimental features on with --use-experimental-features=all). However, it is not the default.
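
For example, an invocation along these lines (the source file name and any extra options, such as the CUDA include path, are just placeholders for whatever your setup needs) should migrate the shared-memory tiles to the extension-based allocation shown above:

$ dpct matrixMul.cu --use-experimental-features=local-memory-kernel-scope-allocation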