Unexpected warning - "returning reference to local temporary object"

Hi,
occasionally when writing SYCL code I will get the warning:

C:\Program Files\Codeplay\ComputeCpp\include\SYCL\accessor\buffer_accessor.h(160,12): warning G645B5FF6: returning reference to local temporary object [-Wreturn-stack-address]
      return this->get_device_ptr()[index];

with no other information.
Does anyone know what sort of thing prompts this?
I don’t see why it’s relevant, but it seems to occur when there are a few queue.submit in one function. I broke one large method with 9 submits into a couple smaller ones and it seems to have gone away.

I can provide code samples if it helps, but they’re a bit messy :sweat_smile:

Thanks,
Finlay

Hi Finlay, apologies for the delay in responding through the summer holiday season.
We have not seen this warning before. If you can provide a simple example of code that shows this we could investigate it.

Hi Rod,
I’ve tried my best to simplify my code, but I understand it’s still a bit complex.
The code is supposed to be doing something very similar to this

// clang-format off
// THIS FILES CAUSES ERROR 
// C:\Program Files\Codeplay\ComputeCpp\include\SYCL\accessor\accessor_ops.h(155,12): error GB19FFA8B: binding value of type '__global double' to reference to type 'double' drops <<ERROR>> qualifiers
// return m_accRef[m_index];
//        ^~~~~~~~~~~~~~~~~
// clang-format on

#include <CL/sycl.hpp>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <vector>
#include <memory>

template <typename T> using vec = std::vector<T>;

template <typename Floating, std::enable_if_t<std::is_floating_point<Floating>::value, bool> = true>
using matrix = vec<vec<Floating>>;

template <typename Floating> void print_vec(const vec<Floating> &r) {
    std::for_each(begin(r), end(r), [](auto v) { std::cout << std::setw(3) << v << " "; });
    std::puts("");
}

template <typename Floating> void print_mat(const matrix<Floating> &m) {
    for (auto &&r : m) {
        print_vec(r);
    }
}

class LoadLU;
class FillIndexes;
class MagnitudeRowReduce;
class RowSwap1;
class RowSwap2;
class CalcMultipliers;
class FillL;
class FillU;

template <typename Floating> struct SYCL_Gaussian_LU {

    static void magnitude_row_swap(const size_t matrix_size, sycl::queue &q, sycl::buffer<Floating, 2> &data_buf,
                                   sycl::buffer<Floating, 1> &rhs_buf, const size_t diagonal) {

        constexpr size_t comparisons_per_work_item = 2;
        auto items = matrix_size - diagonal;
        auto extra_values = items % comparisons_per_work_item;
        auto work_items = items / comparisons_per_work_item + (extra_values ? 1 : 0);
        // create a buffer of indexs from diagonal to matrix_size-1
        // compare all values at data[index][diagonal] by their magnitude // NB current just using value, not abs value
        // reducing the list of indexes until we have the index of the greatest magnitude value
        // swap the current row and the row with the greastest magnitude
        sycl::buffer<size_t, 1> indexesA(sycl::range<1>{items});
        sycl::buffer<size_t, 1> indexesB(sycl::range<1>{work_items});
        q.submit([&](sycl::handler &h) {
            auto acc_indexes = indexesA.template get_access<sycl::access::mode::discard_write>(h);
            h.parallel_for<class FillIndexes>(sycl::range<1>{items}, [=](sycl::id<1> id) { acc_indexes[id] = id[0] + diagonal; });
        });

        auto partial_magnitude_reduce = [&q, &data_buf, comparisons_per_work_item,
                                         diagonal](sycl::buffer<size_t, 1> &read, sycl::buffer<size_t, 1> &write,
                                                   const size_t items, const size_t work_items, const size_t extra_values) {
            q.submit([&](sycl::handler &h) {
                auto acc_data = data_buf.template get_access<sycl::access::mode::read>(h);
                auto read_mem = read.template get_access<sycl::access::mode::read>(h);
                auto write_mem = write.template get_access<sycl::access::mode::write>(h);

                h.parallel_for<class MagnitudeRowReduce>(sycl::range<1>{work_items}, [=](sycl::id<1> id) {
                    const auto global_id = id[0];
                    auto greatest = read_mem[global_id * comparisons_per_work_item];
                    if (extra_values && global_id == work_items - 1) {
                        for (size_t i = 1; i < extra_values; ++i) {
                            auto idx = read_mem[global_id * comparisons_per_work_item + i];
                            bool other_greater = acc_data[greatest][diagonal] < acc_data[idx][diagonal];
                            greatest = greatest * (!other_greater) + idx * (other_greater);
                        }
                    } else {
                        for (size_t i = 1; i < comparisons_per_work_item; ++i) {
                            auto idx = read_mem[global_id * comparisons_per_work_item + i];
                            bool other_greater = acc_data[greatest][diagonal] < acc_data[idx][diagonal];
                            greatest = greatest * (!other_greater) + idx * (other_greater);
                        }
                    }
                    write_mem[global_id] = greatest;
                });
            });
        };

        bool write_to_A = false;
        while (items != 1) {
            if (write_to_A) {
                partial_magnitude_reduce(indexesB, indexesA, items, work_items, extra_values);
            } else {
                partial_magnitude_reduce(indexesA, indexesB, items, work_items, extra_values);
            }
            write_to_A = !write_to_A;
            items = work_items;
            extra_values = items % comparisons_per_work_item;
            work_items = items / comparisons_per_work_item + (extra_values ? 1 : 0);
        }

        auto swap_row = [&q, &data_buf, &rhs_buf, matrix_size, diagonal](sycl::buffer<size_t, 1> &answer) {
            q.submit([&](sycl::handler &h) {
                auto acc_data = data_buf.template get_access<sycl::access::mode::read_write>(h);
                auto acc_answer = answer.template get_access<sycl::access::mode::read>(h, sycl::range<1>{1});
                h.parallel_for<class RowSwap1>(sycl::range<1>{matrix_size}, [=](sycl::id<1> id) {
                    auto global_id = id[0];
                    // TODO might be faster with no storage, only bit shifting or addition and subtraction
                    auto tmp = acc_data[diagonal][global_id];
                    acc_data[diagonal][global_id] = acc_data[acc_answer[0]][global_id];
                    acc_data[acc_answer[0]][global_id] = tmp;
                });
            });
            q.submit([&](sycl::handler &h) {
                auto acc_rhs = rhs_buf.template get_access<sycl::access::mode::read_write>(h);
                auto acc_answer = answer.template get_access<sycl::access::mode::read>(h, sycl::range<1>{1});
                h.single_task<class RowSwap2>([=]() {
                    // TODO might be faster with no storage, only bit shifting or addition and subtraction
                    auto tmp = acc_rhs[diagonal];
                    acc_rhs[diagonal] = acc_rhs[acc_answer[0]];
                    acc_rhs[acc_answer[0]] = tmp;
                });
            });
        };
        // If write_to_A then the answer is in B[0]
        swap_row(write_to_A ? indexesB : indexesA);
    }

    static void get_gaussian_LU(const size_t matrix_size, sycl::queue &q, sycl::buffer<Floating, 2> &data_buf,
                                sycl::buffer<Floating, 1> &rhs_buf) {
        sycl::buffer<Floating, 1> multipliers{sycl::range<1>{matrix_size - 1}};
        for (auto n = size_t{0}; n < matrix_size - 1; ++n) {

            magnitude_row_swap(matrix_size, q, data_buf, rhs_buf, n);

            // calculate the multipliers
            const size_t num_of_multipliers = matrix_size - n - 1;
            q.submit([&, n](sycl::handler &h) {
                auto multiplier_range = sycl::range<1>{num_of_multipliers};
                auto acc_multiplier = multipliers.template get_access<sycl::access::mode::discard_write>(h, multiplier_range);
                auto acc_data = data_buf.template get_access<sycl::access::mode::read>(
                    h, sycl::range<2>{num_of_multipliers + 1, 1}, sycl::id<2>{n, n});
                h.parallel_for<class CalcMultipliers>(multiplier_range, [=](sycl::id<1> id) {
                    auto row = n + 1 + id[0];
                    acc_multiplier[id] = acc_data[row][n] / acc_data[n][n];
                });
            });

            // fill in the L values
            q.submit([&, n](sycl::handler &h) {
                auto multiplier_range = sycl::range<1>{num_of_multipliers};
                auto acc_multiplier = multipliers.template get_access<sycl::access::mode::read>(h); //(multiplier_range);
                auto acc_data = data_buf.template get_access<sycl::access::mode::write>(h); //(sycl::range<2>{num_of_multipliers,
                                                                                            // 1},
                // sycl::id<2>{n + 1, n});
                h.parallel_for<class FillL>(multiplier_range, [=](sycl::id<1> id) {
                    const auto row = n + 1 + id[0];
                    acc_data[row][n] = acc_multiplier[id];
                });
            });

            // prepare U values
            q.submit([&, n](sycl::handler &h) {
                auto acc_multiplier =
                    multipliers.template get_access<sycl::access::mode::read>(h); //(sycl::range<1>{1}, sycl::id<1>{row - n - 1});
                auto acc_data = data_buf.template get_access<sycl::access::mode::read_write>(h); // TODO range
                h.parallel_for<class FillU>(sycl::range<2>{num_of_multipliers, num_of_multipliers}, [=](sycl::id<2> id) {
                    auto row = n + 1 + id[0];
                    auto column = n + 1 + id[1];
                    acc_data[row][column] = acc_data[row][column] - acc_multiplier[id[0]] * acc_data[n][column];
                });
            });
        }
    }
};

// data type we are testing
using datatype = double;

int main() {
    using Solver = SYCL_Gaussian_LU<datatype>;
    matrix<datatype> system({{6, 2, 8, 26}, {3, 5, 2, 8}, {0, 8, 2, -7}});
    matrix<datatype> expected_lu({{6, 2, 8}, {0.5, 4, -2}, {0, 2, 6}});
    vec<datatype> expected_solution{4, -1, 0.5};

    // set up SYCL
#ifdef PREFER_INTEL
    sycl::intel_selector selector; // TODO dependency injection
#else
    sycl::default_selector selector;
#endif // PREFER_INTEL

    const size_t matrix_size = system.size();

    sycl::queue q{selector, [](sycl::exception_list el) {
                      for (auto &&ex : el) {
                          try {
                              std::rethrow_exception(ex);
                          } catch (sycl::exception const &e) {
                              std::cout << "Caught asynchronous SYCL exception: " << e.what() << '\n';
                          }
                      }
                  }};

    sycl::buffer<datatype, 2> LU_buf(sycl::range<2>(matrix_size, matrix_size));
    for (size_t i = 0; i < matrix_size; ++i) { // init lu buf
        q.submit([&system, &LU_buf, i, matrix_size](sycl::handler &h) {
            auto acc_row =
                LU_buf.template get_access<sycl::access::mode::write>(h, sycl::range<2>{1, matrix_size}, sycl::id<2>{i, 0});
            h.copy(system[i].data(), acc_row);
        });
    }

    sycl::buffer<datatype, 1> b_buf{matrix_size};
    auto b = std::make_unique<datatype[]>(matrix_size);
    // init b buffer
    for (size_t i = 0; i < matrix_size; ++i) {
        b[i] = system[i][matrix_size];
    }
    q.submit([&](sycl::handler &h) {
        auto acc_b = b_buf.template get_access<sycl::access::mode::discard_write>(h);
        h.copy(b.get(), acc_b);
    });

    Solver::get_gaussian_LU(matrix_size, q, LU_buf, b_buf);

    auto acc_lu_buff = LU_buf.template get_access<sycl::access::mode::read>();
    for (size_t i = 0; i < matrix_size; ++i) {
        for (size_t j = 0; j < matrix_size; ++j) {
            if (acc_lu_buff[i][j] != expected_lu[i][j]) {
                std::cout << acc_lu_buff[i][j] << "!=" << expected_lu[i][j] << '\n';
            }
        }
    }
}

If there is any way I can help, please let me know.
Finlay

I forgot to mention:
I couldn’t reproduce the warning, but this causes an error from the same line in the header.

I’m compiling on with Windows with Microsoft Visual Studio Community 2019 (2) Version 16.10.3