/*******************************************************************************
* Copyright (C) 2021 Intel Corporation
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
*
*  Content:
*       This example demonstrates use of oneAPI Math Kernel Library (oneMKL)
*       DPCPP USM API oneapi::mkl::sparse::matmat to perform general
*       sparse matrix-sparse matrix multiplication on a SYCL device (CPU, GPU).
*       This example uses matrices in CSR format.
*
*           C = op(A) * op(B)
*
*       where op() is defined by one of
*           oneapi::mkl::transpose::{nontrans,trans,conjtrans}
*
*       It uses the full control API usage model where the user handles both memory
*       allocation for the final C Matrix, and allocation of any temporary
*       workspaces along the way. All matrices use USM device memory which
*       requires a copy of the data to host at the end for printing results.
*
*       The supported floating point data types for matmat matrix data are:
*           float
*           double
*           std::complex<float>
*           std::complex<double>
*
*       The supported matrix formats for matmat are:
*           CSR
*
*******************************************************************************/

// stl includes
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iterator>
#include <limits>
#include <list>
#include <vector>

#include "mkl.h"
#include "oneapi/mkl.hpp"
#include <sycl/sycl.hpp>

// local includes
#include "common_for_examples.hpp"
#include "./include/common_for_sparse_examples.hpp"

//
// Main example for Sparse Matrix-Sparse Matrix Multiply consisting of
// initialization of A and B matrices through process of creating C matrix as
// the product
//
// C = op(A) * op(B)
//
// In this case, we multiply a square symmetric A by itself to have C be the
// square of A.
//
// Template parameters:
//   dataType - type of the matrix values (float, double, std::complex<...>)
//   intType  - type of the CSR row pointer / column index arrays
//
// Parameters:
//   q - queue used for every USM allocation, copy and oneMKL call made here
//
// Returns:
//   0 when the whole flow completed, 1 when a sycl::exception or
//   std::exception was caught along the way.
//
// Memory handling: every USM allocation is recorded in one of the pointer
// vectors as soon as it succeeds, and those vectors are passed to
// cleanup_arrays() at the end of the function on both the normal and the
// exceptional path. Recording happens before the grouped null checks so that
// a failure of one allocation in a group does not leak the allocations of
// the same group that did succeed.
//
template <typename dataType, typename intType>
int run_sparse_blas_example(sycl::queue &q)
{
    bool good = true;

    oneapi::mkl::sparse::matrix_handle_t csrA = nullptr;
    oneapi::mkl::sparse::matrix_handle_t csrB = nullptr;
    oneapi::mkl::sparse::matrix_handle_t csrC = nullptr;
    oneapi::mkl::sparse::matmat_descr_t descr = nullptr;

    //
    // array memory management tools
    //
    std::vector<intType *> int_ptr_vec;
    std::vector<dataType *> data_ptr_vec;
    std::vector<std::int64_t *> i64_ptr_vec;
    std::vector<void *> void_ptr_vec;

    //
    // Allocation trackers: remember a freshly allocated pointer for the final
    // cleanup (null results are not recorded) and hand the pointer back so
    // they can wrap the allocation call directly.
    //
    auto track_int = [&int_ptr_vec](intType *ptr) {
        if (ptr) int_ptr_vec.push_back(ptr);
        return ptr;
    };
    auto track_data = [&data_ptr_vec](dataType *ptr) {
        if (ptr) data_ptr_vec.push_back(ptr);
        return ptr;
    };
    auto track_i64 = [&i64_ptr_vec](std::int64_t *ptr) {
        if (ptr) i64_ptr_vec.push_back(ptr);
        return ptr;
    };
    auto track_void = [&void_ptr_vec](void *ptr) {
        if (ptr) void_ptr_vec.push_back(ptr);
        return ptr;
    };

    try {

        // Initialize data for Sparse Matrix - Sparse Matrix Multiply
        auto opA = oneapi::mkl::transpose::trans;
        auto opB = oneapi::mkl::transpose::nontrans;

        auto viewA = oneapi::mkl::sparse::matrix_view_descr::general;
        auto viewB = oneapi::mkl::sparse::matrix_view_descr::general;
        auto viewC = oneapi::mkl::sparse::matrix_view_descr::general;

        auto a_index = oneapi::mkl::index_base::zero;
        auto b_index = oneapi::mkl::index_base::zero;
        auto c_index = oneapi::mkl::index_base::one;

        //
        // set up dimensions of matrix products
        //
        intType size = 4;

        intType a_nrows = size * size * size;
        intType a_ncols = a_nrows;
        intType a_nnz   = 27 * a_nrows; // upper bound, replaced by the generated count below
        intType b_nrows = size * size * size;
        intType b_ncols = b_nrows;
        intType b_nnz   = 27 * b_nrows; // upper bound, replaced by the generated count below
        intType c_nrows = size * size * size;
        intType c_ncols = c_nrows;
        // c_nnz is unknown at this point


        //
        // setup A data locally in CSR format
        //
        intType *a_rowptr_host = track_int(sycl::malloc_host<intType>(a_nrows + 1, q));
        intType *a_colind_host = track_int(sycl::malloc_host<intType>(27 * a_nrows, q));
        dataType *a_values_host = track_data(sycl::malloc_host<dataType>(27 * a_nrows, q));
        if (!a_rowptr_host || !a_colind_host || !a_values_host) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for CSR A matrix: a_rowptr(" + std::to_string((a_nrows+1)*sizeof(intType)) + " bytes)\n"
                "                   a_colind(" + std::to_string((a_nrows * 27)*sizeof(intType)) + " bytes)\n"
                "                   a_values(" + std::to_string((a_nrows * 27)*sizeof(dataType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        intType a_ind = a_index == oneapi::mkl::index_base::zero ? 0 : 1;
        generate_sparse_matrix<dataType, intType>(size, a_rowptr_host, a_colind_host, a_values_host, a_ind);
        // last row pointer entry minus the index base gives the actual nnz
        a_nnz = a_rowptr_host[a_nrows] - a_ind;

        intType *a_rowptr = track_int(sycl::malloc_device<intType>(a_nrows + 1, q));
        intType *a_colind = track_int(sycl::malloc_device<intType>(a_nnz, q));
        dataType *a_values = track_data(sycl::malloc_device<dataType>(a_nnz, q));

        if (!a_rowptr || !a_colind || !a_values) {
           std::string errorMessage =
               "Failed to allocate USM device memory arrays \n"
               " for CSR A matrix: a_rowptr(" + std::to_string((a_nrows+1)*sizeof(intType)) + " bytes)\n"
               "                   a_colind(" + std::to_string((a_nnz)*sizeof(intType)) + " bytes)\n"
               "                   a_values(" + std::to_string((a_nnz)*sizeof(dataType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        // copy A matrix USM data from host to device
        auto ev_cpy_ia = q.copy<intType>(a_rowptr_host, a_rowptr, a_nrows + 1);
        auto ev_cpy_ja = q.copy<intType>(a_colind_host, a_colind, a_nnz);
        auto ev_cpy_a  = q.copy<dataType>(a_values_host, a_values, a_nnz);


        //
        // setup B data locally in CSR format
        //
        intType *b_rowptr_host = track_int(sycl::malloc_host<intType>(b_nrows + 1, q));
        intType *b_colind_host = track_int(sycl::malloc_host<intType>(27 * b_nrows, q));
        dataType *b_values_host  = track_data(sycl::malloc_host<dataType>(27 * b_nrows, q));
        if (!b_rowptr_host || !b_colind_host || !b_values_host) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for CSR B matrix: b_rowptr(" + std::to_string((b_nrows+1)*sizeof(intType)) + " bytes)\n"
                "                   b_colind(" + std::to_string((b_nrows * 27)*sizeof(intType)) + " bytes)\n"
                "                   b_values(" + std::to_string((b_nrows * 27)*sizeof(dataType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        intType b_ind = b_index == oneapi::mkl::index_base::zero ? 0 : 1;
        generate_sparse_matrix<dataType, intType>(size, b_rowptr_host, b_colind_host, b_values_host, b_ind);
        // last row pointer entry minus the index base gives the actual nnz
        b_nnz = b_rowptr_host[b_nrows] - b_ind;

        intType *b_rowptr = track_int(sycl::malloc_device<intType>(b_nrows + 1, q));
        intType *b_colind = track_int(sycl::malloc_device<intType>(b_nnz, q));
        dataType *b_values  = track_data(sycl::malloc_device<dataType>(b_nnz, q));

        if (!b_rowptr || !b_colind || !b_values) {
            std::string errorMessage =
                "Failed to allocate USM device memory arrays \n"
                " for CSR B matrix: b_rowptr(" + std::to_string((b_nrows+1)*sizeof(intType)) + " bytes)\n"
                "                   b_colind(" + std::to_string((b_nnz)*sizeof(intType)) + " bytes)\n"
                "                   b_values(" + std::to_string((b_nnz)*sizeof(dataType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        // copy B matrix USM data from host to device
        auto ev_cpy_ib = q.copy<intType>(b_rowptr_host, b_rowptr, b_nrows + 1);
        auto ev_cpy_jb = q.copy<intType>(b_colind_host, b_colind, b_nnz);
        auto ev_cpy_b  = q.copy<dataType>(b_values_host, b_values, b_nnz);



        //
        // setup C data locally in CSR format
        // note: we don't know c_nnz so can only prepare c_rowptr at this point
        //
        intType c_ind = c_index == oneapi::mkl::index_base::zero ? 0 : 1;
        intType *c_rowptr = track_int(sycl::malloc_device<intType>(c_nrows + 1, q));
        intType *c_colind = nullptr;
        dataType *c_values = nullptr;

        if (!c_rowptr) {
            std::string errorMessage =
                "Failed to allocate USM device memory arrays \n"
                " for CSR C matrix: c_rowptr(" + std::to_string((c_nrows+1)*sizeof(intType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        //
        // other workspaces and arrays for matmat process
        //
        std::int64_t *sizeTempBuffer = nullptr, *sizeTempBuffer2 = nullptr, *c_nnz = nullptr;
        void *tempBuffer = nullptr, *tempBuffer2 = nullptr;

        //
        // Execute Matrix Multiply
        //

        std::cout << "\n\t\tsparse::matmat parameters:\n";
        std::cout << "\t\t\topA = " << opA << std::endl;
        std::cout << "\t\t\topB = " << opB << std::endl;

        std::cout << "\t\t\tviewA = " << viewA << std::endl;
        std::cout << "\t\t\tviewB = " << viewB << std::endl;
        std::cout << "\t\t\tviewC = " << viewC << std::endl;

        std::cout << "\t\t\tA_nrows = A_ncols = " << a_nrows << std::endl;
        std::cout << "\t\t\tB_nrows = B_ncols = " << b_nrows << std::endl;
        std::cout << "\t\t\tC_nrows = C_ncols = " << c_nrows << std::endl;

        std::cout << "\t\t\tA_index = " << a_index << std::endl;
        std::cout << "\t\t\tB_index = " << b_index << std::endl;
        std::cout << "\t\t\tC_index = " << c_index << std::endl;

        oneapi::mkl::sparse::matmat_request req;

        oneapi::mkl::sparse::init_matrix_handle(&csrA);
        oneapi::mkl::sparse::init_matrix_handle(&csrB);
        oneapi::mkl::sparse::init_matrix_handle(&csrC);

        // the handles are populated only after the host-to-device copies finish
        auto ev_setA = oneapi::mkl::sparse::set_csr_data(q, csrA, a_nrows, a_ncols, a_index,
                a_rowptr, a_colind, a_values, {ev_cpy_ia, ev_cpy_ja, ev_cpy_a});
        auto ev_setB = oneapi::mkl::sparse::set_csr_data(q, csrB, b_nrows, b_ncols, b_index,
                b_rowptr, b_colind, b_values, {ev_cpy_ib, ev_cpy_jb, ev_cpy_b});

        //
        // only c_rowptr exists at this point in process so pass in nullptrs
        //
        auto ev_setC = oneapi::mkl::sparse::set_csr_data(q, csrC, c_nrows, c_ncols, c_index, c_rowptr,
                                                        (intType *)nullptr, (dataType *)nullptr, {});

        //
        // initialize the matmat descriptor
        //
        oneapi::mkl::sparse::init_matmat_descr(&descr);
        oneapi::mkl::sparse::set_matmat_data(descr, viewA, opA, viewB, opB, viewC);

        //
        // Stage 1:  work estimation
        //

        // Step 1.1
        //   query for size of work_estimation temp buffer
        req = oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size;
        sizeTempBuffer = track_i64(sycl::malloc_host<std::int64_t>(1, q));
        if (!sizeTempBuffer) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for temporary arrays:  sizeTempBuffer(" + std::to_string((1)*sizeof(std::int64_t)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        auto ev1_1 = oneapi::mkl::sparse::matmat(q, csrA, csrB, csrC, req, descr, sizeTempBuffer,
                                                 nullptr, {ev_setA, ev_setB, ev_setC});

        // Step 1.2
        //   allocate temp buffer for work_estimation
        ev1_1.wait(); // the size is read on the host, so the query must be complete
        tempBuffer = track_void(sycl::malloc_device(sizeTempBuffer[0] * sizeof(std::uint8_t), q));
        if (!tempBuffer) {
            std::string errorMessage =
                "Failed to allocate USM device memory arrays \n"
                " for temporary arrays:  tempBuffer(" + std::to_string((sizeTempBuffer[0])*sizeof(std::uint8_t)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        // Step 1.3  do work_estimation
        req = oneapi::mkl::sparse::matmat_request::work_estimation;
        auto ev1_3 = oneapi::mkl::sparse::matmat(q, csrA, csrB, csrC, req, descr, sizeTempBuffer,
                                                 tempBuffer, {ev1_1});

        //
        // Stage 2:  compute
        //

        // Step 2.1 query size of compute temp buffer
        req = oneapi::mkl::sparse::matmat_request::get_compute_buf_size;
        sizeTempBuffer2 = track_i64(sycl::malloc_host<std::int64_t>(1, q));
        if (!sizeTempBuffer2) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for temporary arrays:  sizeTempBuffer2(" + std::to_string((1)*sizeof(std::int64_t)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }
        auto ev2_1 = oneapi::mkl::sparse::matmat(q, csrA, csrB, csrC, req, descr, sizeTempBuffer2,
                                                 nullptr, {ev1_3});

        // Step 2.2 allocate temp buffer for compute
        ev2_1.wait(); // the size is read on the host, so the query must be complete
        tempBuffer2 = track_void(sycl::malloc_device(sizeTempBuffer2[0] * sizeof(std::uint8_t), q));
        if (!tempBuffer2) {
            std::string errorMessage =
                "Failed to allocate USM device memory arrays \n"
                " for temporary arrays:  tempBuffer2(" + std::to_string((sizeTempBuffer2[0])*sizeof(std::uint8_t)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        // Step 2.3 do compute
        req = oneapi::mkl::sparse::matmat_request::compute;
        auto ev2_3 = oneapi::mkl::sparse::matmat(q, csrA, csrB, csrC, req, descr, sizeTempBuffer2,
                                                 tempBuffer2, {ev2_1});

        //
        // Stage 3:  finalize
        //

        // Step 3.1  get nnz
        req = oneapi::mkl::sparse::matmat_request::get_nnz;
        c_nnz = track_i64(sycl::malloc_host<std::int64_t>(1, q));
        if (!c_nnz) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for c_nnz(" + std::to_string((1)*sizeof(std::int64_t)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }
        auto ev3_1 = oneapi::mkl::sparse::matmat(q, csrA, csrB, csrC, req, descr, c_nnz, nullptr,
                                                 {ev2_3});

        // Step 3.2  allocate final c matrix arrays
        ev3_1.wait(); // c_nnz[0] is read on the host, so the query must be complete
        c_colind = track_int(sycl::malloc_device<intType>(c_nnz[0], q));
        c_values = track_data(sycl::malloc_device<dataType>(c_nnz[0], q));
        if (!c_colind || !c_values) {
            std::string errorMessage =
                "Failed to allocate USM device memory arrays \n"
                " for CSR C matrix: c_colind(" + std::to_string((c_nnz[0])*sizeof(intType)) + " bytes)\n"
                "                   c_values(" + std::to_string((c_nnz[0])*sizeof(dataType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }
        // give the C handle its now-allocated column index and value arrays
        ev_setC = oneapi::mkl::sparse::set_csr_data(q, csrC, c_nrows, c_ncols, c_index, c_rowptr, c_colind, c_values, {ev3_1});

        // Step 3.3  finalize into C matrix
        req = oneapi::mkl::sparse::matmat_request::finalize;
        auto ev3_3 = oneapi::mkl::sparse::matmat(q, csrA, csrB, csrC, req, descr, nullptr, nullptr,
                                                 {ev_setC});

        // Sort C matrix output if desired
        auto ev_sort = oneapi::mkl::sparse::sort_matrix(q, csrC, {ev3_3});

        //
        // Post Processing
        //

        // Copy first set of rows of C to host for printing
        const intType c_nrows_copy = std::min<intType>(2, c_nrows); // only copy over this many rows of C to host

        intType *c_rowptr_host = track_int(sycl::malloc_host<intType>(c_nrows_copy+1, q));
        if (!c_rowptr_host) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for CSR C matrix: c_rowptr(" + std::to_string((c_nrows_copy+1)*sizeof(intType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        auto ev_cpy_ic = q.copy<intType>(c_rowptr, c_rowptr_host,
                c_nrows_copy+1, {ev_sort}); // copy part of c_rowptr device to host

        ev_cpy_ic.wait(); // make sure copy is done before reading from it
        // number of entries contained in the first c_nrows_copy rows of C
        const intType c_host_nnz = c_rowptr_host[c_nrows_copy] - c_ind;
        intType *c_colind_host = track_int(sycl::malloc_host<intType>(c_host_nnz, q));
        dataType *c_values_host  = track_data(sycl::malloc_host<dataType>(c_host_nnz, q));
        if (!c_colind_host || !c_values_host) {
            std::string errorMessage =
                "Failed to allocate USM host memory arrays \n"
                " for CSR C matrix: c_colind(" + std::to_string((c_host_nnz)*sizeof(intType)) + " bytes)\n"
                "                   c_values(" + std::to_string((c_host_nnz)*sizeof(dataType)) + " bytes)";

            throw std::runtime_error(errorMessage);
        }

        auto ev_cpy_jc = q.copy<intType>(c_colind, c_colind_host,
                c_host_nnz, {ev_sort, ev_cpy_ic}); // copy part of c_colind device to host
        auto ev_cpy_c  = q.copy<dataType>(c_values, c_values_host,
                c_host_nnz, {ev_sort, ev_cpy_ic}); // copy part of c_values device to host

        // print out a portion of C solution
        sycl::event ev_print = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on({ev_cpy_ic, ev_cpy_jc, ev_cpy_c});
            // captures by value: the task only needs the host pointers and scalars
            auto kernel = [=]() {
                std::cout << "C matrix [first " << c_nrows_copy << " rows]:" << std::endl;
                for (intType row = 0; row < c_nrows_copy; ++row) {
                    // row pointers carry the index base of C; subtract it to index the host arrays
                    for (intType j = c_rowptr_host[row] - c_ind; j < c_rowptr_host[row + 1] - c_ind; ++j) {
                        intType col = c_colind_host[j];
                        dataType val  = c_values_host[j];
                        std::cout << "C(" << row + c_ind << ", " << col << ") = " << val
                                  << std::endl;
                    }
                }
            };
            cgh.host_task(kernel);
        });

        // clean up
        oneapi::mkl::sparse::release_matmat_descr(&descr);
        auto ev_relA = oneapi::mkl::sparse::release_matrix_handle(q, &csrA, {ev3_3});
        auto ev_relB = oneapi::mkl::sparse::release_matrix_handle(q, &csrB, {ev3_3});
        auto ev_relC = oneapi::mkl::sparse::release_matrix_handle(q, &csrC, {ev_print});

        q.wait_and_throw();

    }
    catch (sycl::exception const &e) {
        std::cout << "\t\tCaught synchronous SYCL exception:\n" << e.what() << std::endl;
        good = false;
    }
    catch (std::exception const &e) {
        std::cout << "\t\tCaught std exception:\n" << e.what() << std::endl;
        good = false;
    }

    // let any work still in flight finish before handles and arrays go away
    q.wait();

    // backup cleaning of matrix handle and others for if exceptions happened
    if(descr) oneapi::mkl::sparse::release_matmat_descr(&descr);
    if(csrA) oneapi::mkl::sparse::release_matrix_handle(q, &csrA, {}).wait();
    if(csrB) oneapi::mkl::sparse::release_matrix_handle(q, &csrB, {}).wait();
    if(csrC) oneapi::mkl::sparse::release_matrix_handle(q, &csrC, {}).wait();

    cleanup_arrays<dataType, intType>(data_ptr_vec, int_ptr_vec, i64_ptr_vec, void_ptr_vec, q);

    q.wait();

    return good ? 0 : 1;
}

//
// Prints the example banner to std::cout: the operation being demonstrated,
// the oneMKL sparse APIs it exercises and the supported floating point type
// precisions.
//
void print_example_banner()
{
    // horizontal rule that frames the banner at the top and at the bottom
    const char *const frame = "###############################################################"
                              "#########";

    // one entry per output statement; each entry is followed by std::endl
    const char *const banner_text[] = {
        "",
        frame,
        "# Sparse Matrix-Sparse Matrix Multiply Example: ",
        "# ",
        "#    C = op(A) * op(B)",
        "# ",
        "# where A and B are sparse matrices in CSR format, and C is the\n"
        "# sparse matrix product in CSR format",
        "# ",
        "# Using apis:",
        "#   sparse::matmat",
        "#   sparse::init_matmat_descr",
        "#   sparse::set_matmat_data",
        "#   sparse::release_matmat_descr",
        "# ",
        "#   sparse::init_matrix_handle",
        "#   sparse::set_csr_data",
        "#   sparse::release_matrix_handle",
        "# ",
        "# Supported floating point type precisions:",
        "#   float",
        "#   double",
        "#   std::complex<float>",
        "#   std::complex<double>",
        "# ",
        frame,
        "",
    };

    for (const char *text : banner_text) {
        std::cout << text << std::endl;
    }
}

//
// Main entry point for example.
//
// Dispatches to appropriate device types as set at build time with flag:
// -DSYCL_DEVICES_cpu -- only runs SYCL CPU implementation
// -DSYCL_DEVICES_gpu -- only runs SYCL GPU implementation
// -DSYCL_DEVICES_all (default) -- runs on all: cpu and gpu devices
//
//  For each device selected and each supported data type,
//  run_sparse_blas_example() is run with all supported data types,
//  if any fail, we move on to the next device.
//

int main(int argc, char **argv)
{
    print_example_banner();

    std::list<my_sycl_device_types> list_of_devices;
    set_list_of_devices(list_of_devices);

    int status = 0;
    for (auto it = list_of_devices.begin(); it != list_of_devices.end(); ++it) {
        try {
            sycl::device my_dev;
            bool my_dev_is_found = false;
            get_sycl_device(my_dev, my_dev_is_found, *it);

            if (my_dev_is_found) {
                std::cout << "Running tests on " << sycl_device_names[*it] << ".\n";

                // Catch asynchronous exceptions
                auto exception_handler = [](sycl::exception_list exceptions) {
                    for (std::exception_ptr const &e : exceptions) {
                        try {
                            std::rethrow_exception(e);
                        }
                        catch (sycl::exception const &e) {
                            std::cout << "Caught asynchronous SYCL exception: \n"
                                << e.what() << std::endl;
                        }
                    }
                };

                sycl::queue q(my_dev, exception_handler);

                std::cout << "\tRunning with single precision real data type:" << std::endl;
                status |= run_sparse_blas_example<float, std::int32_t>(q);

                if (my_dev.get_info<sycl::info::device::double_fp_config>().size() != 0) {
                    std::cout << "\tRunning with double precision real data type:" << std::endl;
                    status |= run_sparse_blas_example<double, std::int32_t>(q);
                }

                std::cout << "\tRunning with single precision complex data type:" << std::endl;
                status |= run_sparse_blas_example<std::complex<float>, std::int32_t>(q);

                if (my_dev.get_info<sycl::info::device::double_fp_config>().size() != 0) {
                    std::cout << "\tRunning with double precision complex data type:" << std::endl;
                    status |= run_sparse_blas_example<std::complex<double>, std::int32_t>(q);
                }

            }
            else {
#ifdef FAIL_ON_MISSING_DEVICES
                std::cout << "No " << sycl_device_names[*it]
                    << " devices found; Fail on missing devices "
                    "is enabled.\n";
                return 1;
#else
                std::cout << "No " << sycl_device_names[*it] << " devices found; skipping "
                    << sycl_device_names[*it] << " tests.\n";
#endif
            }
        }
        catch (sycl::exception const &e) {
            std::cout << "\t\tCaught SYCL exception at driver level: \n" << e.what() << std::endl;
            continue; // stop with device, but move on to other devices
        }
        catch (std::exception const &e) {
            std::cout << "\t\tCaught std exception at driver level: \n" << e.what() << std::endl;
            continue; // stop with device, but move on to other devices
        }

    } // for device

    mkl_free_buffers();
    return status;
}
