Hi,
occasionally when writing SYCL code I will get the warning:
C:\Program Files\Codeplay\ComputeCpp\include\SYCL\accessor\buffer_accessor.h(160,12): warning G645B5FF6: returning reference to local temporary object [-Wreturn-stack-address]
return this->get_device_ptr()[index];
with no other information.
Does anyone know what sort of thing prompts this?
I don’t see why it’s relevant, but it seems to occur when there are a few queue.submit in one function. I broke one large method with 9 submits into a couple smaller ones and it seems to have gone away.
I can provide code samples if it helps, but they’re a bit messy
Hi Finlay, apologies for the delay in responding through the summer holiday season.
We have not seen this warning before. If you can provide a simple example of code that shows this we could investigate it.
Hi Rod,
I’ve tried my best to simplify my code, but I understand it’s still a bit complex.
The code is supposed to be doing something very similar to this
// clang-format off
// THIS FILES CAUSES ERROR
// C:\Program Files\Codeplay\ComputeCpp\include\SYCL\accessor\accessor_ops.h(155,12): error GB19FFA8B: binding value of type '__global double' to reference to type 'double' drops <<ERROR>> qualifiers
// return m_accRef[m_index];
// ^~~~~~~~~~~~~~~~~
// clang-format on
#include <CL/sycl.hpp>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <vector>
#include <memory>
template <typename T> using vec = std::vector<T>;
template <typename Floating, std::enable_if_t<std::is_floating_point<Floating>::value, bool> = true>
using matrix = vec<vec<Floating>>;
template <typename Floating> void print_vec(const vec<Floating> &r) {
std::for_each(begin(r), end(r), [](auto v) { std::cout << std::setw(3) << v << " "; });
std::puts("");
}
template <typename Floating> void print_mat(const matrix<Floating> &m) {
for (auto &&r : m) {
print_vec(r);
}
}
class LoadLU;
class FillIndexes;
class MagnitudeRowReduce;
class RowSwap1;
class RowSwap2;
class CalcMultipliers;
class FillL;
class FillU;
template <typename Floating> struct SYCL_Gaussian_LU {
static void magnitude_row_swap(const size_t matrix_size, sycl::queue &q, sycl::buffer<Floating, 2> &data_buf,
sycl::buffer<Floating, 1> &rhs_buf, const size_t diagonal) {
constexpr size_t comparisons_per_work_item = 2;
auto items = matrix_size - diagonal;
auto extra_values = items % comparisons_per_work_item;
auto work_items = items / comparisons_per_work_item + (extra_values ? 1 : 0);
// create a buffer of indexs from diagonal to matrix_size-1
// compare all values at data[index][diagonal] by their magnitude // NB current just using value, not abs value
// reducing the list of indexes until we have the index of the greatest magnitude value
// swap the current row and the row with the greastest magnitude
sycl::buffer<size_t, 1> indexesA(sycl::range<1>{items});
sycl::buffer<size_t, 1> indexesB(sycl::range<1>{work_items});
q.submit([&](sycl::handler &h) {
auto acc_indexes = indexesA.template get_access<sycl::access::mode::discard_write>(h);
h.parallel_for<class FillIndexes>(sycl::range<1>{items}, [=](sycl::id<1> id) { acc_indexes[id] = id[0] + diagonal; });
});
auto partial_magnitude_reduce = [&q, &data_buf, comparisons_per_work_item,
diagonal](sycl::buffer<size_t, 1> &read, sycl::buffer<size_t, 1> &write,
const size_t items, const size_t work_items, const size_t extra_values) {
q.submit([&](sycl::handler &h) {
auto acc_data = data_buf.template get_access<sycl::access::mode::read>(h);
auto read_mem = read.template get_access<sycl::access::mode::read>(h);
auto write_mem = write.template get_access<sycl::access::mode::write>(h);
h.parallel_for<class MagnitudeRowReduce>(sycl::range<1>{work_items}, [=](sycl::id<1> id) {
const auto global_id = id[0];
auto greatest = read_mem[global_id * comparisons_per_work_item];
if (extra_values && global_id == work_items - 1) {
for (size_t i = 1; i < extra_values; ++i) {
auto idx = read_mem[global_id * comparisons_per_work_item + i];
bool other_greater = acc_data[greatest][diagonal] < acc_data[idx][diagonal];
greatest = greatest * (!other_greater) + idx * (other_greater);
}
} else {
for (size_t i = 1; i < comparisons_per_work_item; ++i) {
auto idx = read_mem[global_id * comparisons_per_work_item + i];
bool other_greater = acc_data[greatest][diagonal] < acc_data[idx][diagonal];
greatest = greatest * (!other_greater) + idx * (other_greater);
}
}
write_mem[global_id] = greatest;
});
});
};
bool write_to_A = false;
while (items != 1) {
if (write_to_A) {
partial_magnitude_reduce(indexesB, indexesA, items, work_items, extra_values);
} else {
partial_magnitude_reduce(indexesA, indexesB, items, work_items, extra_values);
}
write_to_A = !write_to_A;
items = work_items;
extra_values = items % comparisons_per_work_item;
work_items = items / comparisons_per_work_item + (extra_values ? 1 : 0);
}
auto swap_row = [&q, &data_buf, &rhs_buf, matrix_size, diagonal](sycl::buffer<size_t, 1> &answer) {
q.submit([&](sycl::handler &h) {
auto acc_data = data_buf.template get_access<sycl::access::mode::read_write>(h);
auto acc_answer = answer.template get_access<sycl::access::mode::read>(h, sycl::range<1>{1});
h.parallel_for<class RowSwap1>(sycl::range<1>{matrix_size}, [=](sycl::id<1> id) {
auto global_id = id[0];
// TODO might be faster with no storage, only bit shifting or addition and subtraction
auto tmp = acc_data[diagonal][global_id];
acc_data[diagonal][global_id] = acc_data[acc_answer[0]][global_id];
acc_data[acc_answer[0]][global_id] = tmp;
});
});
q.submit([&](sycl::handler &h) {
auto acc_rhs = rhs_buf.template get_access<sycl::access::mode::read_write>(h);
auto acc_answer = answer.template get_access<sycl::access::mode::read>(h, sycl::range<1>{1});
h.single_task<class RowSwap2>([=]() {
// TODO might be faster with no storage, only bit shifting or addition and subtraction
auto tmp = acc_rhs[diagonal];
acc_rhs[diagonal] = acc_rhs[acc_answer[0]];
acc_rhs[acc_answer[0]] = tmp;
});
});
};
// If write_to_A then the answer is in B[0]
swap_row(write_to_A ? indexesB : indexesA);
}
static void get_gaussian_LU(const size_t matrix_size, sycl::queue &q, sycl::buffer<Floating, 2> &data_buf,
sycl::buffer<Floating, 1> &rhs_buf) {
sycl::buffer<Floating, 1> multipliers{sycl::range<1>{matrix_size - 1}};
for (auto n = size_t{0}; n < matrix_size - 1; ++n) {
magnitude_row_swap(matrix_size, q, data_buf, rhs_buf, n);
// calculate the multipliers
const size_t num_of_multipliers = matrix_size - n - 1;
q.submit([&, n](sycl::handler &h) {
auto multiplier_range = sycl::range<1>{num_of_multipliers};
auto acc_multiplier = multipliers.template get_access<sycl::access::mode::discard_write>(h, multiplier_range);
auto acc_data = data_buf.template get_access<sycl::access::mode::read>(
h, sycl::range<2>{num_of_multipliers + 1, 1}, sycl::id<2>{n, n});
h.parallel_for<class CalcMultipliers>(multiplier_range, [=](sycl::id<1> id) {
auto row = n + 1 + id[0];
acc_multiplier[id] = acc_data[row][n] / acc_data[n][n];
});
});
// fill in the L values
q.submit([&, n](sycl::handler &h) {
auto multiplier_range = sycl::range<1>{num_of_multipliers};
auto acc_multiplier = multipliers.template get_access<sycl::access::mode::read>(h); //(multiplier_range);
auto acc_data = data_buf.template get_access<sycl::access::mode::write>(h); //(sycl::range<2>{num_of_multipliers,
// 1},
// sycl::id<2>{n + 1, n});
h.parallel_for<class FillL>(multiplier_range, [=](sycl::id<1> id) {
const auto row = n + 1 + id[0];
acc_data[row][n] = acc_multiplier[id];
});
});
// prepare U values
q.submit([&, n](sycl::handler &h) {
auto acc_multiplier =
multipliers.template get_access<sycl::access::mode::read>(h); //(sycl::range<1>{1}, sycl::id<1>{row - n - 1});
auto acc_data = data_buf.template get_access<sycl::access::mode::read_write>(h); // TODO range
h.parallel_for<class FillU>(sycl::range<2>{num_of_multipliers, num_of_multipliers}, [=](sycl::id<2> id) {
auto row = n + 1 + id[0];
auto column = n + 1 + id[1];
acc_data[row][column] = acc_data[row][column] - acc_multiplier[id[0]] * acc_data[n][column];
});
});
}
}
};
// data type we are testing
using datatype = double;
int main() {
using Solver = SYCL_Gaussian_LU<datatype>;
matrix<datatype> system({{6, 2, 8, 26}, {3, 5, 2, 8}, {0, 8, 2, -7}});
matrix<datatype> expected_lu({{6, 2, 8}, {0.5, 4, -2}, {0, 2, 6}});
vec<datatype> expected_solution{4, -1, 0.5};
// set up SYCL
#ifdef PREFER_INTEL
sycl::intel_selector selector; // TODO dependency injection
#else
sycl::default_selector selector;
#endif // PREFER_INTEL
const size_t matrix_size = system.size();
sycl::queue q{selector, [](sycl::exception_list el) {
for (auto &&ex : el) {
try {
std::rethrow_exception(ex);
} catch (sycl::exception const &e) {
std::cout << "Caught asynchronous SYCL exception: " << e.what() << '\n';
}
}
}};
sycl::buffer<datatype, 2> LU_buf(sycl::range<2>(matrix_size, matrix_size));
for (size_t i = 0; i < matrix_size; ++i) { // init lu buf
q.submit([&system, &LU_buf, i, matrix_size](sycl::handler &h) {
auto acc_row =
LU_buf.template get_access<sycl::access::mode::write>(h, sycl::range<2>{1, matrix_size}, sycl::id<2>{i, 0});
h.copy(system[i].data(), acc_row);
});
}
sycl::buffer<datatype, 1> b_buf{matrix_size};
auto b = std::make_unique<datatype[]>(matrix_size);
// init b buffer
for (size_t i = 0; i < matrix_size; ++i) {
b[i] = system[i][matrix_size];
}
q.submit([&](sycl::handler &h) {
auto acc_b = b_buf.template get_access<sycl::access::mode::discard_write>(h);
h.copy(b.get(), acc_b);
});
Solver::get_gaussian_LU(matrix_size, q, LU_buf, b_buf);
auto acc_lu_buff = LU_buf.template get_access<sycl::access::mode::read>();
for (size_t i = 0; i < matrix_size; ++i) {
for (size_t j = 0; j < matrix_size; ++j) {
if (acc_lu_buff[i][j] != expected_lu[i][j]) {
std::cout << acc_lu_buff[i][j] << "!=" << expected_lu[i][j] << '\n';
}
}
}
}
If there is any way I can help, please let me know.
Finlay