Hi,
I got the error below when trying to run the portBLAS symm sample on the OCK RefSi target:
$ SYCL_CONFIG_FILE_NAME= OCL_ICD_FILENAMES=/home/ubuntu/projects/oneapi-construction-kit/build-riscv/install/lib/libCL.so ./sample_symm_refsi_g1
A:
-0.771 -3.306 -9.351 7.3741 7.9330
-1.794 9.7978 -2.765 -1.165 -4.879
2.0450 -7.735 9.9316 4.7545 2.9770
3.9586 4.8859 -9.780 -1.559 -0.966
1.7942 6.9131 -6.875 -9.630 1.5655
---
B:
6.6000 6.6908 5.5798 -0.947 7.6777 -6.158 5.1564
0.6324 4.8952 -1.468 -8.098 0.5424 -6.350 0.9360
4.3496 -5.643 9.7355 9.0681 -7.657 -3.574 3.7529
-0.888 1.8959 -7.865 -7.243 7.8407 -6.754 6.5119
-1.642 -7.007 8.7773 6.5821 2.0527 -6.051 1.6112
---
C (before):
-6.564 -0.856 -6.175 -4.128 -3.641 9.3869 -6.574
1.4169 3.1934 -2.722 -8.103 -8.011 -8.448 6.1707
-4.092 2.6603 7.6269 6.3407 7.0392 5.7516 -3.999
5.9839 1.8104 -9.540 7.0946 -3.428 0.4224 7.4235
2.9365 7.8797 6.4519 -0.509 -7.819 3.1578 -5.600
---
Copying A, B and C to device
Executing C = 1.5*A*B + 0.5*C
The program was built for 1 devices
Build program log for 'RefSi G1 RV64':
error: OpCapability FPFastMathModeINTEL (#5837) not supported by device
-11 (PI_ERROR_BUILD_PROGRAM_FAILURE)
Copying C to host
terminate called after throwing an instance of 'sycl::_V1::runtime_error'
what(): Enqueue process failed. -59 (PI_ERROR_INVALID_OPERATION)
Aborted (core dumped)
The symm sample code is shown below (the only change is replacing default_selector_v with refsi_g1_selector):
#include <exception>
#include <iostream>
#include "portblas.hpp"
#include <sycl/sycl.hpp>
#include "util.hpp"
/* Device selector that accepts exactly one device: the one named
 * "RefSi G1 RV64" on the platform named "ComputeAorta".
 * Returns 1 for that device and -1 for every other device; under SYCL 2020 a
 * negative score means the device is never selected. */
int refsi_g1_selector(const sycl::device& dev) {
  /* Reject early on a platform mismatch so the device name is only queried
   * for devices that belong to the expected platform. */
  const auto platform_name =
      dev.get_platform().get_info<sycl::info::platform::name>();
  if (!(platform_name == "ComputeAorta")) {
    return -1;
  }

  const auto device_name = dev.get_info<sycl::info::device::name>();
  return (device_name == "RefSi G1 RV64") ? 1 : -1;
}
/* SYMM sample entry point.
 *
 * Builds a SYCL queue on the RefSi G1 device, fills three host matrices,
 * copies them to the device, runs blas::_symm and prints the result.
 *
 * Returns 0 on success. Any std::exception raised along the way (for example
 * when no matching device exists or when a SYCL operation fails) is reported
 * on stderr and turned into exit code 1 instead of terminating the process.
 * argc and argv are unused. */
int main(int argc, char** argv) {
  try {
    /* Create a SYCL queue with the custom RefSi G1 device selector */
    sycl::queue q = sycl::queue(refsi_g1_selector);

    /* Create a portBLAS sb_handle and get the policy handler */
    blas::SB_Handle sb_handle(q);

    /* Arguments of the SYMM operation.
     * Note: these matrix dimensions are too small to get a performance gain by
     * using portBLAS, but they are convenient for this sample */
    const size_t m = 5;
    const size_t n = 7;
    /* The leading dimensions are deliberately larger than m and n */
    const size_t lda = 32;
    const size_t ldb = 32;
    const size_t ldc = 32;
    const float alpha = 1.5;
    const float beta = 0.5;
    const char side = 'l';
    const char uplo = 'l';
    /* A is a square k x k matrix whose order depends on the chosen side */
    const size_t k = side == 'l' ? m : n;

    /* Create the matrices */
    std::vector<float> A = std::vector<float>(lda * k);
    std::vector<float> B = std::vector<float>(ldb * n);
    std::vector<float> C = std::vector<float>(ldc * n);

    /* Fill the matrices with random values */
    fill_matrix(A, k, k, lda);
    fill_matrix(B, m, n, ldb);
    fill_matrix(C, m, n, ldc);

    /* Print the matrices before the SYMM operation */
    std::cout << "A:\n";
    print_matrix(A, k, k, lda);
    std::cout << "---\nB:\n";
    print_matrix(B, m, n, ldb);
    std::cout << "---\nC (before):\n";
    print_matrix(C, m, n, ldc);

    /* Create the buffers */
    auto a_gpu = blas::make_sycl_iterator_buffer<float>(lda * k);
    auto b_gpu = blas::make_sycl_iterator_buffer<float>(ldb * n);
    auto c_gpu = blas::make_sycl_iterator_buffer<float>(ldc * n);

    /* Copy the matrices to the device
     * Note: this sample uses explicit copy operations, see the GEMV sample for
     * an alternative way
     */
    std::cout << "---\nCopying A, B and C to device\n";
    blas::helper::copy_to_device(sb_handle.get_queue(), A.data(), a_gpu,
                                 lda * k);
    blas::helper::copy_to_device(sb_handle.get_queue(), B.data(), b_gpu,
                                 ldb * n);
    blas::helper::copy_to_device(sb_handle.get_queue(), C.data(), c_gpu,
                                 ldc * n);

    /* Execute the SYMM operation */
    if (side == 'l') {
      std::cout << "Executing C = " << alpha << "*A*B + " << beta << "*C\n";
    } else {
      std::cout << "Executing C = " << alpha << "*B*A + " << beta << "*C\n";
    }
    blas::_symm(sb_handle, side, uplo, m, n, alpha, a_gpu, lda, b_gpu, ldb,
                beta, c_gpu, ldc);

    /* Copy the result to the host and wait for the copy to complete */
    std::cout << "Copying C to host\n";
    auto event = blas::helper::copy_to_host(sb_handle.get_queue(), c_gpu,
                                            C.data(), ldc * n);
    sb_handle.wait(event);

    /* Print the result after the SYMM operation */
    std::cout << "---\nC (after):" << std::endl;
    print_matrix(C, m, n, ldc);

    return 0;
  } catch (const std::exception& e) {
    /* SYCL 2020 exceptions derive from std::exception, so this handler also
     * covers device-selection and enqueue failures; report them and exit with
     * a failure code rather than aborting via std::terminate. */
    std::cerr << "SYMM sample failed: " << e.what() << '\n';
    return 1;
  }
}
Any idea what is causing this error and how to fix it?