33#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
34#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
49#include <ginkgo/core/base/device.hpp>
50#include <ginkgo/core/base/fwd_decls.hpp>
51#include <ginkgo/core/base/machine_topology.hpp>
52#include <ginkgo/core/base/memory.hpp>
53#include <ginkgo/core/base/scoped_device_id_guard.hpp>
54#include <ginkgo/core/base/types.hpp>
55#include <ginkgo/core/log/logger.hpp>
56#include <ginkgo/core/synthesizer/containers.hpp>
97constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
99constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
105 allocation_mode::unified_global;
107#if (GINKGO_HIP_PLATFORM_HCC == 1)
110constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
116 allocation_mode::unified_global;
130enum class dpcpp_queue_property {
142GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
143 dpcpp_queue_property b)
145 return static_cast<dpcpp_queue_property
>(
static_cast<int>(a) |
146 static_cast<int>(b));
153#define GKO_FORWARD_DECLARE(_type, ...) class _type
155GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
157#undef GKO_FORWARD_DECLARE
160class ReferenceExecutor;
289#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
290 virtual void run(std::shared_ptr<const _type>) const
292 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
294#undef GKO_DECLARE_RUN_OVERLOAD
297 virtual void run(std::shared_ptr<const ReferenceExecutor>
executor)
const;
320template <
typename Closure>
321class RegisteredOperation :
public Operation {
329 RegisteredOperation(
const char* name, Closure op)
330 : name_(name), op_(std::move(op))
333 const char* get_name()
const noexcept override {
return name_; }
335 void run(std::shared_ptr<const ReferenceExecutor> exec)
const override
340 void run(std::shared_ptr<const OmpExecutor> exec)
const override
345 void run(std::shared_ptr<const CudaExecutor> exec)
const override
350 void run(std::shared_ptr<const HipExecutor> exec)
const override
355 void run(std::shared_ptr<const DpcppExecutor> exec)
const override
366template <
typename Closure>
448#define GKO_REGISTER_OPERATION(_name, _kernel) \
449 template <typename... Args> \
450 auto make_##_name(Args&&... args) \
452 return ::gko::detail::make_register_operation( \
453 #_kernel, [&args...](auto exec) { \
454 using exec_type = decltype(exec); \
457 std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
459 ::gko::kernels::reference::_kernel( \
460 std::dynamic_pointer_cast< \
461 const ::gko::ReferenceExecutor>(exec), \
462 std::forward<Args>(args)...); \
463 } else if (std::is_same< \
465 std::shared_ptr<const ::gko::OmpExecutor>>:: \
467 ::gko::kernels::omp::_kernel( \
468 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
470 std::forward<Args>(args)...); \
471 } else if (std::is_same< \
473 std::shared_ptr<const ::gko::CudaExecutor>>:: \
475 ::gko::kernels::cuda::_kernel( \
476 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
478 std::forward<Args>(args)...); \
479 } else if (std::is_same< \
481 std::shared_ptr<const ::gko::HipExecutor>>:: \
483 ::gko::kernels::hip::_kernel( \
484 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
486 std::forward<Args>(args)...); \
487 } else if (std::is_same< \
489 std::shared_ptr<const ::gko::DpcppExecutor>>:: \
491 ::gko::kernels::dpcpp::_kernel( \
492 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
494 std::forward<Args>(args)...); \
496 GKO_NOT_IMPLEMENTED; \
500 static_assert(true, \
501 "This assert is used to counter the false positive extra " \
502 "semi-colon warnings")
542#define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
543 template <typename... Args> \
544 auto make_##_name(Args&&... args) \
546 return ::gko::detail::make_register_operation( \
548 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
550 static_assert(true, \
551 "This assert is used to counter the false positive extra " \
552 "semi-colon warnings")
// Grants one concrete executor type friend access to the enclosing class.
#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
645 template <
typename T>
646 friend class detail::ExecutorBase;
648 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
681 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
682 typename ClosureDpcpp>
683 void run(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
684 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
const
686 LambdaOperation<ClosureOmp, ClosureCuda, ClosureHip, ClosureDpcpp> op(
687 op_omp, op_cuda, op_hip, op_dpcpp);
702 template <
typename T>
705 this->
template log<log::Logger::allocation_started>(
706 this, num_elems *
sizeof(T));
707 T* allocated =
static_cast<T*
>(this->raw_alloc(num_elems *
sizeof(T)));
708 this->
template log<log::Logger::allocation_completed>(
709 this, num_elems *
sizeof(T),
reinterpret_cast<uintptr>(allocated));
720 void free(
void* ptr)
const noexcept
722 this->
template log<log::Logger::free_started>(
723 this,
reinterpret_cast<uintptr>(ptr));
725 this->
template log<log::Logger::free_completed>(
726 this,
reinterpret_cast<uintptr>(ptr));
741 template <
typename T>
743 const T* src_ptr, T* dest_ptr)
const
745 const auto src_loc =
reinterpret_cast<uintptr>(src_ptr);
746 const auto dest_loc =
reinterpret_cast<uintptr>(dest_ptr);
747 this->
template log<log::Logger::copy_started>(
748 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
749 if (
this != src_exec.
get()) {
750 src_exec->template log<log::Logger::copy_started>(
751 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
754 this->raw_copy_from(src_exec.
get(), num_elems *
sizeof(T), src_ptr,
757#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
760 std::clog <<
"Not direct copy. Try to copy data from the masters."
763 auto src_master = src_exec->get_master().
get();
764 if (num_elems > 0 && src_master != src_exec.
get()) {
765 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
766 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
768 this->copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
769 src_master->free(master_ptr);
772 this->
template log<log::Logger::copy_completed>(
773 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
774 if (
this != src_exec.
get()) {
775 src_exec->template log<log::Logger::copy_completed>(
776 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
791 template <
typename T>
794 this->copy_from(
this, num_elems, src_ptr, dest_ptr);
806 template <
typename T>
810 this->get_master()->copy_from(
this, 1, ptr, &out);
823 virtual std::shared_ptr<const
Executor> get_master() const noexcept = 0;
828 virtual
void synchronize() const = 0;
836 void add_logger(std::shared_ptr<const log::Logger> logger)
override
838 this->propagating_logger_refcount_.fetch_add(
839 logger->needs_propagation() ? 1 : 0);
840 this->EnableLogging<Executor>::add_logger(logger);
851 this->propagating_logger_refcount_.fetch_sub(
853 this->EnableLogging<Executor>::remove_logger(logger);
856 using EnableLogging<
Executor>::remove_logger;
867 log_propagation_mode_ = mode;
879 return this->propagating_logger_refcount_.load() > 0 &&
880 log_propagation_mode_ == log_propagation_mode::automatic;
892 return this->verify_memory_from(other.get());
911 std::string device_type;
926 int num_computing_units = -1;
939 int num_pu_per_cu = -1;
949 std::vector<int> subgroup_sizes{};
959 int max_subgroup_size = -1;
971 std::vector<int> max_workitem_sizes{};
982 int max_workgroup_size;
999 std::string pci_bus_id = std::string(13,
'x');
1011 std::vector<int> closest_pu_ids{};
1019 const exec_info& get_exec_info()
const {
return this->exec_info_; }
1030 virtual void* raw_alloc(size_type size)
const = 0;
1039 virtual void raw_free(
void* ptr)
const noexcept = 0;
1051 virtual void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
1052 const void* src_ptr,
void* dest_ptr)
const = 0;
1063#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1064 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1065 const void* src_ptr, void* dest_ptr) const = 0
1067 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1069#undef GKO_ENABLE_RAW_COPY_TO
1078 virtual bool verify_memory_from(
const Executor* src_exec)
const = 0;
1089#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1090 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1092 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1094 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1096#undef GKO_ENABLE_VERIFY_MEMORY_TO
1104 virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
1111 exec_info& get_exec_info() {
return this->exec_info_; }
1113 exec_info exec_info_;
1117 std::atomic<int> propagating_logger_refcount_{};
1134 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
1135 typename ClosureDpcpp>
1136 class LambdaOperation :
public Operation {
1148 LambdaOperation(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1149 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1156 void run(std::shared_ptr<const OmpExecutor>)
const override
1161 void run(std::shared_ptr<const ReferenceExecutor>)
const override
1166 void run(std::shared_ptr<const CudaExecutor>)
const override
1171 void run(std::shared_ptr<const HipExecutor>)
const override
1176 void run(std::shared_ptr<const DpcppExecutor>)
const override
1183 ClosureCuda op_cuda_;
1185 ClosureDpcpp op_dpcpp_;
1198template <
typename T>
1225 std::shared_ptr<const Executor> exec_;
1229template <
typename T>
1232 using pointer = T[];
1238 void operator()(pointer ptr)
const
1246 std::shared_ptr<const Executor> exec_;
1253template <
typename ConcreteExecutor>
1254class ExecutorBase :
public Executor {
1255 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
1259 using Executor::run;
1261 void run(
const Operation& op)
const override
1263 this->
template log<log::Logger::operation_launched>(
this, &op);
1264 auto scope_guard = get_scoped_device_id_guard();
1265 op.run(self()->shared_from_this());
1266 this->
template log<log::Logger::operation_completed>(
this, &op);
1270 void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
1271 const void* src_ptr,
void* dest_ptr)
const override
1273 src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1276 virtual bool verify_memory_from(
const Executor* src_exec)
const override
1278 return src_exec->verify_memory_to(self());
1282 ConcreteExecutor* self() noexcept
1284 return static_cast<ConcreteExecutor*
>(
this);
1287 const ConcreteExecutor* self() const noexcept
1289 return static_cast<const ConcreteExecutor*
>(
this);
1293#undef GKO_DECLARE_EXECUTOR_FRIEND
1303class EnableDeviceReset {
1311 "device_reset is no longer supported, call "
1312 "cudaDeviceReset/hipDeviceReset manually")
1313 void set_device_reset(
bool device_reset) {}
1321 "device_reset is no longer supported, call "
1322 "cudaDeviceReset/hipDeviceReset manually")
1323 bool get_device_reset() {
return false; }
1331 EnableDeviceReset() {}
1334 "device_reset is no longer supported, call "
1335 "cudaDeviceReset/hipDeviceReset manually")
1336 EnableDeviceReset(
bool device_reset) {}
1343#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1344 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1345 const void* src_ptr, void* dest_ptr) const override
1348#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1349 virtual bool verify_memory_to(const dest_* other) const override \
1353 static_assert(true, \
1354 "This assert is used to counter the false positive extra " \
1355 "semi-colon warnings")
1366 public std::enable_shared_from_this<OmpExecutor> {
1367 friend class detail::ExecutorBase<OmpExecutor>;
1374 std::shared_ptr<CpuAllocatorBase> alloc =
1375 std::make_shared<CpuAllocator>())
1377 return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(alloc)));
1386 int get_num_cores()
const
1388 return this->get_exec_info().num_computing_units;
1391 int get_num_threads_per_core()
const
1393 return this->get_exec_info().num_pu_per_cu;
1396 static int get_num_omp_threads();
1398 scoped_device_id_guard get_scoped_device_id_guard()
const override;
1401 OmpExecutor(std::shared_ptr<CpuAllocatorBase> alloc)
1402 : alloc_{std::
move(alloc)}
1407 void populate_exec_info(
const machine_topology*
mach_topo)
override;
1409 void* raw_alloc(
size_type size)
const override;
1411 void raw_free(
void* ptr)
const noexcept override;
1413 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1415 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
true);
1417 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1419 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1421 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1423 bool verify_memory_to(
const DpcppExecutor*
dest_exec)
const override;
1425 std::shared_ptr<CpuAllocatorBase> alloc_;
1431using DefaultExecutor = OmpExecutor;
1445 static std::shared_ptr<ReferenceExecutor> create(
1446 std::shared_ptr<CpuAllocatorBase> alloc =
1447 std::make_shared<CpuAllocator>())
1449 return std::shared_ptr<ReferenceExecutor>(
1461 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1470 this->ReferenceExecutor::populate_exec_info(
1474 void populate_exec_info(
const machine_topology*)
override
1476 this->get_exec_info().device_id = -1;
1477 this->get_exec_info().num_computing_units = 1;
1478 this->get_exec_info().num_pu_per_cu = 1;
1481 bool verify_memory_from(
const Executor*
src_exec)
const override
1483 return src_exec->verify_memory_to(
this);
1486 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
true);
1488 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1490 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1492 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1494 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1499namespace reference {
1500using DefaultExecutor = ReferenceExecutor;
1512 public std::enable_shared_from_this<CudaExecutor>,
1513 public detail::EnableDeviceReset {
1514 friend class detail::ExecutorBase<CudaExecutor>;
1529 "device_reset is deprecated entirely, call cudaDeviceReset directly. "
1530 "alloc_mode was replaced by the Allocator type "
1565 return this->get_exec_info().device_id;
1578 return this->get_exec_info().num_pu_per_cu;
1586 return this->get_exec_info().num_computing_units;
1594 return this->get_exec_info().num_computing_units *
1595 this->get_exec_info().num_pu_per_cu;
1603 return this->get_exec_info().max_subgroup_size;
1611 return this->get_exec_info().major;
1619 return this->get_exec_info().minor;
1636 return cusparse_handle_.get();
1646 return this->get_exec_info().closest_pu_ids;
1665 void set_gpu_property();
1667 void init_handles();
1673 this->get_exec_info().device_id = device_id;
1674 this->get_exec_info().num_computing_units = 0;
1675 this->get_exec_info().num_pu_per_cu = 0;
1676 this->CudaExecutor::populate_exec_info(
1678 this->set_gpu_property();
1679 this->init_handles();
1682 void* raw_alloc(
size_type size)
const override;
1684 void raw_free(
void* ptr)
const noexcept override;
1686 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1688 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1690 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1692 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1694 bool verify_memory_to(
const HipExecutor*
dest_exec)
const override;
1696 bool verify_memory_to(
const CudaExecutor*
dest_exec)
const override;
1698 void populate_exec_info(
const machine_topology*
mach_topo)
override;
1701 std::shared_ptr<Executor> master_;
1703 template <
typename T>
1704 using handle_manager = std::unique_ptr<
T, std::function<
void(
T*)>>;
1707 std::shared_ptr<CudaAllocatorBase> alloc_;
1714using DefaultExecutor = CudaExecutor;
1726 public std::enable_shared_from_this<HipExecutor>,
1727 public detail::EnableDeviceReset {
1743 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1744 "alloc_mode was replaced by the Allocator type "
1747 int device_id, std::shared_ptr<
Executor> master,
bool device_reset,
1749 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1752 int device_id, std::shared_ptr<
Executor> master,
1755 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1757 std::shared_ptr<
Executor> get_master() noexcept override;
1759 std::shared_ptr<const
Executor> get_master() const noexcept override;
1761 void synchronize() const override;
1768 int get_device_id() const noexcept
1770 return this->get_exec_info().device_id;
1783 return this->get_exec_info().num_pu_per_cu;
1791 return this->get_exec_info().num_computing_units;
1799 return this->get_exec_info().major;
1807 return this->get_exec_info().minor;
1815 return this->get_exec_info().num_computing_units *
1816 this->get_exec_info().num_pu_per_cu;
1824 return this->get_exec_info().max_subgroup_size;
1841 return hipsparse_handle_.get();
1858 return this->get_exec_info().closest_pu_ids;
1861 GKO_HIP_STREAM_STRUCT* get_stream()
const {
return stream_; }
1864 void set_gpu_property();
1866 void init_handles();
1868 HipExecutor(
int device_id, std::shared_ptr<Executor> master,
1869 std::shared_ptr<HipAllocatorBase> alloc,
1870 GKO_HIP_STREAM_STRUCT* stream)
1871 : master_{std::move(master)}, alloc_{std::move(alloc)}, stream_{stream}
1873 this->get_exec_info().device_id = device_id;
1874 this->get_exec_info().num_computing_units = 0;
1875 this->get_exec_info().num_pu_per_cu = 0;
1876 this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1877 this->set_gpu_property();
1878 this->init_handles();
1881 void* raw_alloc(size_type size)
const override;
1883 void raw_free(
void* ptr)
const noexcept override;
1885 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1887 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1889 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1891 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1893 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1895 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1897 void populate_exec_info(
const machine_topology* mach_topo)
override;
1900 std::shared_ptr<Executor> master_;
1902 template <
typename T>
1903 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1904 handle_manager<hipblasContext> hipblas_handle_;
1905 handle_manager<hipsparseContext> hipsparse_handle_;
1906 std::shared_ptr<HipAllocatorBase> alloc_;
1907 GKO_HIP_STREAM_STRUCT* stream_;
1913using DefaultExecutor = HipExecutor;
1925 public std::enable_shared_from_this<DpcppExecutor> {
1938 static std::shared_ptr<DpcppExecutor>
create(
1939 int device_id, std::shared_ptr<Executor> master,
1940 std::string device_type =
"all",
1941 dpcpp_queue_property property = dpcpp_queue_property::in_order);
1945 std::shared_ptr<const
Executor> get_master() const noexcept override;
1947 void synchronize() const override;
1956 int get_device_id() const noexcept
1958 return this->get_exec_info().device_id;
1961 sycl::queue* get_queue()
const {
return queue_.get(); }
1979 return this->get_exec_info().subgroup_sizes;
1989 return this->get_exec_info().num_computing_units;
1997 return this->get_exec_info().num_computing_units *
1998 this->get_exec_info().num_pu_per_cu;
2008 return this->get_exec_info().max_workitem_sizes;
2018 return this->get_exec_info().max_workgroup_size;
2028 return this->get_exec_info().max_subgroup_size;
2038 return this->get_exec_info().device_type;
2042 void set_device_property(
2043 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2046 int device_id, std::shared_ptr<Executor> master,
2047 std::string device_type =
"all",
2048 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2051 std::for_each(device_type.begin(), device_type.end(),
2052 [](
char& c) { c = std::tolower(c); });
2053 this->get_exec_info().device_type = std::string(device_type);
2054 this->get_exec_info().device_id = device_id;
2055 this->set_device_property(property);
2058 void populate_exec_info(
const machine_topology* mach_topo)
override;
2060 void* raw_alloc(size_type size)
const override;
2062 void raw_free(
void* ptr)
const noexcept override;
2064 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2066 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
2068 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
2070 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
2072 bool verify_memory_to(
const OmpExecutor* dest_exec)
const override;
2074 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
2077 std::shared_ptr<Executor> master_;
2079 template <
typename T>
2080 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2081 queue_manager<sycl::queue> queue_;
2087using DefaultExecutor = DpcppExecutor;
2092#undef GKO_OVERRIDE_RAW_COPY_TO
Implement this interface to provide an allocator for CudaExecutor.
Definition memory.hpp:68
Allocator using cudaMalloc.
Definition memory.hpp:130
This is the Executor subclass which represents the CUDA device.
Definition executor.hpp:1513
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1644
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1654
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1576
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1609
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1584
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1601
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1634
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition executor.hpp:1662
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1617
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1592
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1627
static int get_num_devices()
Get the number of devices present on the system.
This is the Executor subclass which represents a DPC++ enhanced device.
Definition executor.hpp:1925
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition executor.hpp:1977
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition executor.hpp:1987
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition executor.hpp:2016
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition executor.hpp:1995
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition executor.hpp:2006
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition executor.hpp:2026
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition executor.hpp:2036
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:644
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition executor.hpp:720
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
void run(const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition executor.hpp:683
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagating loggers attached to this executor.
Definition executor.hpp:877
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition executor.hpp:890
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition executor.hpp:792
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition executor.hpp:742
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition executor.hpp:865
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition executor.hpp:703
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition executor.hpp:807
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:849
Implement this interface to provide an allocator for HipExecutor.
Definition memory.hpp:93
Definition memory.hpp:200
This is the Executor subclass which represents the HIP enhanced device.
Definition executor.hpp:1727
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1781
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1797
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1856
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1805
static int get_num_devices()
Get the number of devices present on the system.
int get_num_multiprocessor() const noexcept
Get the number of multiprocessor of this executor.
Definition executor.hpp:1789
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1813
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1839
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1849
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1822
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1832
NotSupported is thrown in case it is not possible to perform the requested operation on the given object.
Definition exception.hpp:156
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition executor.hpp:1366
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1373
Operations can be used to define functionalities whose implementations differ among devices.
Definition executor.hpp:287
virtual const char * get_name() const noexcept
Returns the operation's name.
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels.
Definition executor.hpp:1443
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition executor.hpp:1458
This is a deleter that uses an executor's free method to deallocate the data.
Definition executor.hpp:1199
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition executor.hpp:1208
void operator()(pointer ptr) const
Deletes the object.
Definition executor.hpp:1217
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition logger.hpp:777
Definition logger.hpp:104
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from objects created on this executor.
Definition logger.hpp:671
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition machine_topology.hpp:211
This class is used for function parameters in the place of raw pointers.
Definition utils_helper.hpp:71
T * get() const
Definition utils_helper.hpp:105
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition scoped_device_id_guard.hpp:104
The Ginkgo namespace.
Definition abstract_factory.hpp:48
constexpr T one()
Returns the multiplicative identity for T.
Definition math.hpp:803
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:172
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:120
log_propagation_mode
How Logger events are propagated to their Executor.
Definition executor.hpp:63
@ automatic
Events get reported to loggers attached to the triggering object and to propagating loggers (Logger::needs_propagation() returns true) attached to its executor.
@ never
Events only get reported at loggers attached to the triggering object.
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition executor.hpp:91