Poor performance on matrix multiplication

Hi! How are you?

I want to ask about a “problem” that I’m having with the CUDA matrix multiplication code: cuda-samples/matrixMul.cu at master · NVIDIA/cuda-samples · GitHub

I executed the CUDA and SYCL codes below for matrix multiplication on an RTX 2070, RTX 3070, and RTX 3090, and CUDA is 24% to 40% faster than SYCL (40% on the RTX 3090) for a matrix dimension of 16384. I believe the difference lies in the shared-memory part of the kernel, but I’m unsure how to improve it.
I’m using Intel LLVM with clang 17 and CUDA 11.7.

for compiling CUDA: nvcc program.cu -O3 -o program
for compiling SYCL: clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda program.cpp -O3 -o program

./program 16384 16384 16384 16384

Do you have any suggestions? Thank you very much!

CUDA CODE

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared memory
 * to ensure data reuse, the matrix multiplication is done using tiling
 * approach. It has been written for clarity of exposition to illustrate various
 * CUDA programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication. See also: V. Volkov and
 * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008
 * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008,
 * pp. Art. 31:1-11.
 */

// System includes
#include <fstream>
#include <iostream>
#include <string>

#include <assert.h>
#include <stdio.h>
#include <sys/time.h>

// CUDA runtime
#include <cuda_runtime.h>

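// Returns the current wall-clock time in seconds (via gettimeofday).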
double dwalltime() {
  double sec;
  struct timeval tv;

  gettimeofday(&tv, NULL);
  sec = tv.tv_sec + tv.tv_usec / 1000000.0;
  return sec;
}

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA,
                   const dim3 &dimsB) {
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA.x * dimsA.y;
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  cudaMallocHost(&h_A, mem_size_A);
  unsigned int size_B = dimsB.x * dimsB.y;
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  cudaMallocHost(&h_B, mem_size_B);
  cudaStream_t stream;

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  dim3 dimsC(dimsB.x, dimsA.y, 1);
  unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
  float *h_C;
  cudaMallocHost(&h_C, mem_size_C);

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A);
  cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B);
  cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C);

  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

  // copy host memory to device

  cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream);

  // Setup execution parameters
  dim3 threads(block_size, block_size);
  dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

  double start = dwalltime();

  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    MatrixMulCUDA<16>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  } else {
    MatrixMulCUDA<32>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  }

  cudaStreamSynchronize(stream);
  double stop = dwalltime();
  // printf("done %lf\n", stop - start);

  // Record the start event
  start = dwalltime();

  // Execute the kernel
  int nIter = 10;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      MatrixMulCUDA<16>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
      MatrixMulCUDA<32>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
  }

  cudaStreamSynchronize(stream);

  // cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream);
  // cudaStreamSynchronize(stream);

  stop = dwalltime();

  float msecTotal = (stop - start) * 1000;

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);

  std::string filename = "data_cuda.csv";
  std::ifstream file_check(filename);
  bool file_exists = file_check.good();
  file_check.close();

  // Create and open the file for writing, with or without headers
  std::ofstream file(filename, std::ios::app);
  if (!file_exists) {
    file << "N, time (S), GFLOPS, msecPerMatrixMu (S)l\n";
  }

  file << dimsA.x << ", " << msecTotal / 1000 << ", " << gigaFlops << ", "
       << msecPerMatrixMul << "\n";

  // bool correct = true;

  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  /*  double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
      double abs_err = fabs(h_C[i] - (dimsA.x * valB));
      double dot_length = dimsA.x;
      double abs_val = fabs(h_C[i]);
      double rel_err = abs_err / abs_val / dot_length;

      if (rel_err > eps) {
        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
               h_C[i], dimsA.x * valB, eps);
        correct = false;
      }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");*/

  // Clean up memory
  cudaFreeHost(h_A);
  cudaFreeHost(h_B);
  cudaFreeHost(h_C);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);

  return EXIT_SUCCESS;
}

/**
 * Program main
 */
int main(int argc, char **argv) {
  int device_id = 0;
  cudaDeviceProp device_prop;

  cudaGetDeviceProperties(&device_prop, device_id);
  std::cout << "Device Name: " << device_prop.name << std::endl;

  int block_size = 16;

  dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
  dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

  // width of Matrix A from argc / argv directly
  dimsA.x = argc > 1 ? atoi(argv[1]) : dimsA.x;
  dimsA.y = argc > 2 ? atoi(argv[2]) : dimsA.y;
  dimsB.x = argc > 3 ? atoi(argv[3]) : dimsB.x;
  dimsB.y = argc > 4 ? atoi(argv[4]) : dimsB.y;

  if (dimsA.x != dimsB.y) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA.x, dimsB.y);
    exit(EXIT_FAILURE);
  }

  printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
         dimsB.y);

  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

  exit(matrix_result);
}


SYCL CODE

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication which makes use of shared memory
 * to ensure data reuse, the matrix multiplication is done using tiling
 * approach. It has been written for clarity of exposition to illustrate various
 * CUDA programming principles, not with the goal of providing the most
 * performant generic kernel for matrix multiplication. See also: V. Volkov and
 * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008
 * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008,
 * pp. Art. 31:1-11.
 */

// System includes
#include <assert.h>
#include <stdio.h>

#include <cmath>
#include <fstream>
#include <iostream>
#include <string>

// SYCL runtime
#include <sycl/sycl.hpp>

#include <sys/time.h>

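// Returns the current wall-clock time in seconds (via gettimeofday).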
double dwalltime() {
  double sec;
  struct timeval tv;

  gettimeofday(&tv, NULL);
  sec = tv.tv_sec + tv.tv_usec / 1000000.0;
  return sec;
}

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE>
void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB,
                   sycl::nd_item<3> item_ct1,
                   sycl::accessor<float, 2, sycl::access::mode::read_write,
                                  sycl::access::target::local>
                       As,
                   sycl::accessor<float, 2, sycl::access::mode::read_write,
                                  sycl::access::target::local>
                       Bs) {
  // Block index
  int bx = item_ct1.get_group(2);
  int by = item_ct1.get_group(1);

  // Thread index
  int tx = item_ct1.get_local_id(2);
  int ty = item_ct1.get_local_id(1);

  // Index of the first sub-matrix of A processed by the block
  int aBegin = wA * BLOCK_SIZE * by;

  // Index of the last sub-matrix of A processed by the block
  int aEnd = aBegin + wA - 1;

  // Step size used to iterate through the sub-matrices of A
  int aStep = BLOCK_SIZE;

  // Index of the first sub-matrix of B processed by the block
  int bBegin = BLOCK_SIZE * bx;

  // Step size used to iterate through the sub-matrices of B
  int bStep = BLOCK_SIZE * wB;

  // Csub is used to store the element of the block sub-matrix
  // that is computed by the thread
  float Csub = 0;

  // Loop over all the sub-matrices of A and B
  // required to compute the block sub-matrix
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    // Declaration of the shared memory array As used to
    // store the sub-matrix of A

    // Declaration of the shared memory array Bs used to
    // store the sub-matrix of B

    // Load the matrices from device memory
    // to shared memory; each thread loads
    // one element of each matrix
    As[ty][tx] = A[a + wA * ty + tx];
    Bs[ty][tx] = B[b + wB * ty + tx];

    // Synchronize to make sure the matrices are loaded
    item_ct1.barrier(sycl::access::fence_space::local_space);

    // Multiply the two matrices together;
    // each thread computes one element
    // of the block sub-matrix
#pragma unroll

    for (int k = 0; k < BLOCK_SIZE; ++k) {
      Csub += As[ty][k] * Bs[k][tx];
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    item_ct1.barrier(sycl::access::fence_space::local_space);
  }

  // Write the block sub-matrix to device memory;
  // each thread writes one element
  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

void ConstantInit(float *data, int size, float val) {
  for (int i = 0; i < size; ++i) {
    data[i] = val;
  }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size,
                   const sycl::range<3> &dimsA, const sycl::range<3> &dimsB) {
  sycl::device dev_ct1 = sycl::device::get_devices()[0];
  sycl::queue q_ct1(dev_ct1);
  // Allocate host memory for matrices A and B
  unsigned int size_A = dimsA[2] * dimsA[1];
  unsigned int mem_size_A = sizeof(float) * size_A;
  float *h_A;
  h_A = (float *)sycl::malloc_host(mem_size_A, q_ct1);
  unsigned int size_B = dimsB[2] * dimsB[1];
  unsigned int mem_size_B = sizeof(float) * size_B;
  float *h_B;
  h_B = (float *)sycl::malloc_host(mem_size_B, q_ct1);

  // Initialize host memory
  const float valB = 0.01f;
  ConstantInit(h_A, size_A, 1.0f);
  ConstantInit(h_B, size_B, valB);

  // Allocate device memory
  float *d_A, *d_B, *d_C;

  // Allocate host matrix C
  sycl::range<3> dimsC(1, dimsA[1], dimsB[2]);
  unsigned int mem_size_C = dimsC[2] * dimsC[1] * sizeof(float);
  float *h_C;
  h_C = (float *)sycl::malloc_host(mem_size_C, q_ct1);

  if (h_C == NULL) {
    fprintf(stderr, "Failed to allocate host matrix C!\n");
    exit(EXIT_FAILURE);
  }

  d_A = (float *)sycl::malloc_device(mem_size_A, q_ct1);
  d_B = (float *)sycl::malloc_device(mem_size_B, q_ct1);
  d_C = (float *)sycl::malloc_device(mem_size_C, q_ct1);

  // copy host memory to device
  q_ct1.memcpy(d_A, h_A, mem_size_A).wait();
  q_ct1.memcpy(d_B, h_B, mem_size_B).wait();

  // Setup execution parameters
  sycl::range<3> threads(1, block_size, block_size);
  sycl::range<3> grid(1, dimsA[1] / threads[1], dimsB[2] / threads[2]);

  // Create and start timer
  // printf("Computing result using CUDA Kernel...\n");

  double start = dwalltime();
  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    /*
    DPCT1049:2: The work-group size passed to the SYCL kernel may exceed the
    limit. To get the device limit, query info::device::max_work_group_size.
    Adjust the work-group size if needed.
    */
    q_ct1.submit([&](sycl::handler &cgh) {
      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          As_acc_ct1(sycl::range<2>(16, 16), cgh);

      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          Bs_acc_ct1(sycl::range<2>(16, 16), cgh);

      cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                       [=](sycl::nd_item<3> item_ct1) {
                         MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                           item_ct1, As_acc_ct1, Bs_acc_ct1);
                       });
    });
  } else {
    /*
    DPCT1049:3: The work-group size passed to the SYCL kernel may exceed the
    limit. To get the device limit, query info::device::max_work_group_size.
    Adjust the work-group size if needed.
    */
    q_ct1.submit([&](sycl::handler &cgh) {
      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          As_acc_ct1(sycl::range<2>(32, 32), cgh);

      sycl::accessor<float, 2, sycl::access::mode::read_write,
                     sycl::access::target::local>
          Bs_acc_ct1(sycl::range<2>(32, 32), cgh);

      cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                       [=](sycl::nd_item<3> item_ct1) {
                         MatrixMulCUDA<32>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                           item_ct1, As_acc_ct1, Bs_acc_ct1);
                       });
    });
  }

  q_ct1.wait_and_throw();
  double stop = dwalltime();

  // printf("done %lf\n", stop - start);

  // Record the start event
  start = dwalltime();

  // Execute the kernel
  int nIter = 10;

  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      /*
      DPCT1049:4: The work-group size passed to the SYCL kernel may exceed the
      limit. To get the device limit, query info::device::max_work_group_size.
      Adjust the work-group size if needed.
      */
      q_ct1.submit([&](sycl::handler &cgh) {
        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            As_acc_ct1(sycl::range<2>(16, 16), cgh);

        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            Bs_acc_ct1(sycl::range<2>(16, 16), cgh);

        cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                         [=](sycl::nd_item<3> item_ct1) {
                           MatrixMulCUDA<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                             item_ct1, As_acc_ct1, Bs_acc_ct1);
                         });
      });
    } else {
      /*
      DPCT1049:5: The work-group size passed to the SYCL kernel may exceed the
      limit. To get the device limit, query info::device::max_work_group_size.
      Adjust the work-group size if needed.
      */
      q_ct1.submit([&](sycl::handler &cgh) {
        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            As_acc_ct1(sycl::range<2>(32, 32), cgh);

        sycl::accessor<float, 2, sycl::access::mode::read_write,
                       sycl::access::target::local>
            Bs_acc_ct1(sycl::range<2>(32, 32), cgh);

        cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                         [=](sycl::nd_item<3> item_ct1) {
                           MatrixMulCUDA<32>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                             item_ct1, As_acc_ct1, Bs_acc_ct1);
                         });
      });
    }
  }

  q_ct1.wait_and_throw();

  stop = dwalltime();

  float msecTotal = (stop - start) * 1000;

  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA[2]) *
                             static_cast<double>(dimsA[1]) *
                             static_cast<double>(dimsB[2]);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);

  std::string filename = "data_sycl.csv";
  std::ifstream file_check(filename);
  bool file_exists = file_check.good();
  file_check.close();

  // Create and open the file for writing, with or without headers
  std::ofstream file(filename, std::ios::app);
  if (!file_exists) {
    file << "N, time (S), GFLOPS, msecPerMatrixMu (S)l\n";
  }

  file << dimsA[1] << ", " << msecTotal / 1000 << ", " << gigaFlops << ", "
       << msecPerMatrixMul << "\n";

  // // Copy result from device to host
  // q_ct1.memcpy(h_C, d_C, mem_size_C).wait();

  // q_ct1.wait_and_throw();

  // // printf("Checking computed result for correctness: ");
  // bool correct = true;

  // // test relative error by the formula
  // //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  // double eps = 1.e-6; // machine zero

  // for (int i = 0; i < static_cast<int>(dimsC[2] * dimsC[1]); i++) {
  //   double abs_err = fabs(h_C[i] - (dimsA[2] * valB));
  //   double dot_length = dimsA[2];
  //   double abs_val = fabs(h_C[i]);
  //   double rel_err = abs_err / abs_val / dot_length;

  //   if (rel_err > eps) {
  //     printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
  //            h_C[i], dimsA[2] * valB, eps);
  //     correct = false;
  //   }
  // }

  // printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

  // Clean up memory
  sycl::free(h_A, q_ct1);
  sycl::free(h_B, q_ct1);
  sycl::free(h_C, q_ct1);
  sycl::free(d_A, q_ct1);
  sycl::free(d_B, q_ct1);
  sycl::free(d_C, q_ct1);
  // printf("\nNOTE: The CUDA Samples are not meant for performance "
  //        "measurements. Results may vary when GPU Boost is enabled.\n");

  // if (correct) {
  return EXIT_SUCCESS;
  // } else {
  //   return EXIT_FAILURE;
  // }
}

/**
 * Program main
 */
int main(int argc, char **argv) {
  // printf("[Matrix Multiply Using CUDA] - Starting...\n");

  sycl::queue q{sycl::default_selector_v};

  // Get the device name
  std::string device_name = q.get_device().get_info<sycl::info::device::name>();

  std::cout << "Device Name: " << device_name << std::endl;

  int block_size = 16;

  sycl::range<3> dimsA(1, 5 * 2 * block_size, 5 * 2 * block_size);
  sycl::range<3> dimsB(1, 5 * 2 * block_size, 5 * 4 * block_size);

  // width of Matrix A from argc / argv directly
  dimsA[2] = argc > 1 ? atoi(argv[1]) : dimsA[2];
  dimsA[1] = argc > 2 ? atoi(argv[2]) : dimsA[1];
  dimsB[2] = argc > 3 ? atoi(argv[3]) : dimsB[2];
  dimsB[1] = argc > 4 ? atoi(argv[4]) : dimsB[1];

  if (dimsA[2] != dimsB[1]) {
    printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
           dimsA[2], dimsB[1]);
    exit(EXIT_FAILURE);
  }

  // printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA[2], dimsA[1], dimsB[2],
  //        dimsB[1]);

  int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

  exit(matrix_result);
}

Hi, thank you for posting a clear description with reproducers. We found one optimisation opportunity that nvcc takes but clang currently does not. In the generated PTX code, which can be produced with:

# CUDA
nvcc program.cu -O3 -o cuda
cuobjdump --dump-ptx ./cuda | cu++filt >cuda.ptx

# SYCL
clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda program.cpp -O3 -o sycl
SYCL_DUMP_IMAGES=1 ./sycl
cuobjdump --dump-ptx sycl_nvptx641.bin | cu++filt >sycl.ptx

we can see that the source code:

As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];

item_ct1.barrier(sycl::access::fence_space::local_space); // __syncthreads(); in CUDA

#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
    Csub += As[ty][k] * Bs[k][tx];
}

results in nvcc inlining the address calculation for Bs[k][tx] and using fixed offsets:

ld.shared.f32 %f11, [%r10+64];
ld.shared.f32 %f14, [%r10+128];
ld.shared.f32 %f17, [%r10+192];
ld.shared.f32 %f20, [%r10+256];

Clang was not able to do the same, so the SYCL version computes each address at runtime and keeps all of them in registers:

add.s64 %rd9, %rd8, %rd42;
add.s64 %rd10, %rd9, %rd42;
add.s64 %rd11, %rd10, %rd42;
add.s64 %rd12, %rd11, %rd42;

ld.shared.f32 %f12, [%rd9];
ld.shared.f32 %f15, [%rd10];
ld.shared.f32 %f18, [%rd11];
ld.shared.f32 %f21, [%rd12];

This extra address arithmetic and the higher register usage appear to be what causes the performance difference.

We will look into this issue.
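
In the meantime, one experiment that might be worth trying on the SYCL side is to flatten the two shared tiles into 1D local accessors and index them with the compile-time BLOCK_SIZE stride, so the shared-memory addresses no longer depend on a pitch read from the accessor at runtime. This is only a sketch (the MatrixMulFlat name is made up and we have not verified that it actually recovers the fixed-offset PTX); the host allocation, copies and timing stay as in your code:

// Sketch only (untested): flatten the shared tiles to 1D so the stride in the
// k-loop is the compile-time constant BLOCK_SIZE rather than a runtime pitch
// taken from the accessor range.
template <int BLOCK_SIZE>
void MatrixMulFlat(float *C, float *A, float *B, int wA, int wB,
                   sycl::nd_item<3> item_ct1,
                   sycl::local_accessor<float, 1> As,  // BLOCK_SIZE * BLOCK_SIZE
                   sycl::local_accessor<float, 1> Bs) {
  int bx = item_ct1.get_group(2);
  int by = item_ct1.get_group(1);
  int tx = item_ct1.get_local_id(2);
  int ty = item_ct1.get_local_id(1);

  int aBegin = wA * BLOCK_SIZE * by;
  int aEnd = aBegin + wA - 1;
  int aStep = BLOCK_SIZE;
  int bBegin = BLOCK_SIZE * bx;
  int bStep = BLOCK_SIZE * wB;

  float Csub = 0;
  for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
    As[ty * BLOCK_SIZE + tx] = A[a + wA * ty + tx];
    Bs[ty * BLOCK_SIZE + tx] = B[b + wB * ty + tx];
    item_ct1.barrier(sycl::access::fence_space::local_space);

#pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k) {
      // Both offsets below are affine in k with a compile-time constant stride.
      Csub += As[ty * BLOCK_SIZE + k] * Bs[k * BLOCK_SIZE + tx];
    }
    item_ct1.barrier(sycl::access::fence_space::local_space);
  }

  int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}

// Submission changes accordingly (16x16 case shown):
q_ct1.submit([&](sycl::handler &cgh) {
  sycl::local_accessor<float, 1> As_acc(sycl::range<1>(16 * 16), cgh);
  sycl::local_accessor<float, 1> Bs_acc(sycl::range<1>(16 * 16), cgh);
  cgh.parallel_for(sycl::nd_range<3>(grid * threads, threads),
                   [=](sycl::nd_item<3> item_ct1) {
                     MatrixMulFlat<16>(d_C, d_A, d_B, dimsA[2], dimsB[2],
                                       item_ct1, As_acc, Bs_acc);
                   });
});

If that changes the ld.shared addressing in sycl.ptx, it would also confirm the diagnosis above.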

Hi @rbielski, that’s great news that you have found the problem! It would be wonderful to find a way to solve it. I will keep investigating, and if I discover anything, I will let you know. We’ll keep in touch. Regards!