diff --git a/benchmark/main.cpp b/benchmark/main.cpp index 7a630e461..9c9cb0633 100644 --- a/benchmark/main.cpp +++ b/benchmark/main.cpp @@ -9,8 +9,11 @@ * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ -#include "xsimd_benchmark.hpp" +#include #include +#include + +#include "xsimd_benchmark.hpp" void benchmark_operation() { diff --git a/benchmark/xsimd_benchmark.hpp b/benchmark/xsimd_benchmark.hpp index 6f6b91bf2..022b6672d 100644 --- a/benchmark/xsimd_benchmark.hpp +++ b/benchmark/xsimd_benchmark.hpp @@ -15,7 +15,6 @@ #include "xsimd/arch/xsimd_scalar.hpp" #include "xsimd/xsimd.hpp" #include -#include #include #include diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp index 4af2225cd..b5f82d53f 100644 --- a/include/xsimd/arch/common/xsimd_common_swizzle.hpp +++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp @@ -16,7 +16,7 @@ #include #include -#include "../../config/xsimd_inline.hpp" +#include "../../config/xsimd_macros.hpp" namespace xsimd { diff --git a/include/xsimd/arch/utils/shifts.hpp b/include/xsimd/arch/utils/shifts.hpp index dec16feff..719ecfb7a 100644 --- a/include/xsimd/arch/utils/shifts.hpp +++ b/include/xsimd/arch/utils/shifts.hpp @@ -13,7 +13,7 @@ #ifndef XSIMD_UTILS_SHIFTS_HPP #define XSIMD_UTILS_SHIFTS_HPP -#include "../../config/xsimd_inline.hpp" +#include "../../config/xsimd_macros.hpp" #include "../../types/xsimd_batch.hpp" #include "../../types/xsimd_batch_constant.hpp" #include "../../types/xsimd_traits.hpp" diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 7439c8037..79bdc6d9d 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -369,12 +369,12 @@ namespace xsimd using as_float_relaxed_t = typename as_float_relaxed::type; template - rvv_reg_t rvvreinterpret(U const& arg) noexcept + XSIMD_INLINE rvv_reg_t rvvreinterpret(U const& arg) noexcept { return rvv_reg_t(arg, types::detail::XSIMD_RVV_BITCAST); } template - rvv_reg_t rvvreinterpret(batch const& arg) noexcept + XSIMD_INLINE rvv_reg_t rvvreinterpret(batch const& arg) noexcept { typename batch::register_type r = arg; return rvvreinterpret(r); @@ -519,23 +519,23 @@ namespace xsimd XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1) template = types::detail::rvv_width_m1, int> = 0> - rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept + XSIMD_INLINE rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = rvvget_lo_(T {}, vv); return tmp; } template = types::detail::rvv_width_m1, int> = 0> - rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept + XSIMD_INLINE rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = rvvget_hi_(T {}, vv); return tmp; } - template = 0> rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept + template = 0> XSIMD_INLINE rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = vv; return tmp; } - template = 0> rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept + template = 0> XSIMD_INLINE rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept { return __riscv_vslidedown(vv, vv.vl / 2, vv.vl); } diff --git a/include/xsimd/arch/xsimd_scalar.hpp b/include/xsimd/arch/xsimd_scalar.hpp index 0b13f06e1..5cf38b71c 100644 --- a/include/xsimd/arch/xsimd_scalar.hpp +++ b/include/xsimd/arch/xsimd_scalar.hpp @@ -20,7 +20,7 @@ #include #include -#include "xsimd/config/xsimd_inline.hpp" +#include "xsimd/config/xsimd_macros.hpp" #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index 8c9073f19..5be471e93 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -16,8 +16,15 @@ #include #include +#include "../config/xsimd_macros.hpp" #include "../types/xsimd_sve_register.hpp" +// Define a inline namespace with the explicit SVE vector size to avoid ODR violation +// When dynamically dispatching between different SVE sizes. +// While most code is safe from ODR violation as the size is already encoded in the +// register (and hence batch) types, utilities can quickly fall prone to this issue. +#define XSIMD_SVE_NAMESPACE XSIMD_CONCAT(sve, XSIMD_SVE_BITS) + namespace xsimd { template @@ -25,1240 +32,1243 @@ namespace xsimd namespace kernel { - namespace detail - { - using xsimd::index; - using xsimd::types::detail::sve_vector_type; - - // predicate creation - XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } - XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } - - template - svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } - - // predicate loading - template - svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } - template - svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } - template - svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } - template - svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } - - // count active lanes in a predicate - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } - XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } - - template - XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } - - // enable for signed integers - template - using sve_enable_signed_int_t = std::enable_if_t::value && std::is_signed::value, int>; - - // enable for unsigned integers - template - using sve_enable_unsigned_int_t = std::enable_if_t::value && !std::is_signed::value, int>; - - // enable for floating points - template - using sve_enable_floating_point_t = std::enable_if_t::value, int>; - - // enable for signed integers or floating points - template - using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; - - // enable for all SVE supported types - template - using sve_enable_all_t = std::enable_if_t::value, int>; - - // Trait describing the SVE types that correspond to a scalar, - // parameterised by (byte size, signedness, floating-point-ness). - // - // `scalar` is the matching fixed-width scalar (int8_t, ..., float, - // double). SVE load/store intrinsics are overloaded on these - // pointer types, so remapping integers through `scalar` avoids - // platform quirks such as darwin arm64's `long` vs `long long` - // distinction and rejects `char` as an element type. - // - // `sizeless` is the matching sizeless SVE type. xsimd stores SVE - // vectors as fixed-size attributed types (arm_sve_vector_bits), - // which clang treats as implicitly convertible to every sizeless - // SVE type — including multi-vector tuples — making the overloaded - // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting - // to `sizeless` first collapses the overload set to the single - // 1-vector candidate. - template - struct sve_type; - template <> - struct sve_type<1, true, false> - { - using scalar = int8_t; - using sizeless = svint8_t; - }; - template <> - struct sve_type<1, false, false> - { - using scalar = uint8_t; - using sizeless = svuint8_t; - }; - template <> - struct sve_type<2, true, false> - { - using scalar = int16_t; - using sizeless = svint16_t; - }; - template <> - struct sve_type<2, false, false> - { - using scalar = uint16_t; - using sizeless = svuint16_t; - }; - template <> - struct sve_type<4, true, false> - { - using scalar = int32_t; - using sizeless = svint32_t; - }; - template <> - struct sve_type<4, false, false> - { - using scalar = uint32_t; - using sizeless = svuint32_t; - }; - template <> - struct sve_type<8, true, false> - { - using scalar = int64_t; - using sizeless = svint64_t; - }; - template <> - struct sve_type<8, false, false> - { - using scalar = uint64_t; - using sizeless = svuint64_t; - }; - template <> - struct sve_type<4, true, true> - { - using scalar = float; - using sizeless = svfloat32_t; - }; - template <> - struct sve_type<8, true, true> - { - using scalar = double; - using sizeless = svfloat64_t; - }; - - template - using sve_type_for = sve_type::value, std::is_floating_point::value>; - - template - using sve_sizeless_t = typename sve_type_for::sizeless; - - // Remap integer Ts to their matching fixed-width counterpart (via - // sve_type::scalar) so svld1/svst1 see the pointer type their - // overload set expects; pass non-integer Ts through unchanged. - template >::value> - struct sve_fix_integer_impl - { - using type = T; - }; - template - struct sve_fix_integer_impl - { - using type = typename sve_type_for>::scalar; - }; - - template - using sve_fix_char_t = typename sve_fix_integer_impl::type; - } // namespace detail - - /********* - * Load * - *********/ - - template = 0> - XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept - { - return svld1(detail::sve_ptrue(), reinterpret_cast const*>(src)); - } - - template = 0> - XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept + inline namespace XSIMD_SVE_NAMESPACE { - return load_aligned(src, convert(), sve {}); - } + namespace detail_sve + { + using xsimd::index; + using xsimd::types::detail::sve_vector_type; + + // predicate creation + XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); } + XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); } + + template + XSIMD_INLINE svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index {}); } + + // predicate loading + template + XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); } + template + XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } + template + XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } + template + XSIMD_INLINE svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } + + // count active lanes in a predicate + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } + XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } + + template + XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index {}); } + + // enable for signed integers + template + using sve_enable_signed_int_t = std::enable_if_t::value && std::is_signed::value, int>; + + // enable for unsigned integers + template + using sve_enable_unsigned_int_t = std::enable_if_t::value && !std::is_signed::value, int>; + + // enable for floating points + template + using sve_enable_floating_point_t = std::enable_if_t::value, int>; + + // enable for signed integers or floating points + template + using sve_enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; + + // enable for all SVE supported types + template + using sve_enable_all_t = std::enable_if_t::value, int>; + + // Trait describing the SVE types that correspond to a scalar, + // parameterised by (byte size, signedness, floating-point-ness). + // + // `scalar` is the matching fixed-width scalar (int8_t, ..., float, + // double). SVE load/store intrinsics are overloaded on these + // pointer types, so remapping integers through `scalar` avoids + // platform quirks such as darwin arm64's `long` vs `long long` + // distinction and rejects `char` as an element type. + // + // `sizeless` is the matching sizeless SVE type. xsimd stores SVE + // vectors as fixed-size attributed types (arm_sve_vector_bits), + // which clang treats as implicitly convertible to every sizeless + // SVE type — including multi-vector tuples — making the overloaded + // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting + // to `sizeless` first collapses the overload set to the single + // 1-vector candidate. + template + struct sve_type; + template <> + struct sve_type<1, true, false> + { + using scalar = int8_t; + using sizeless = svint8_t; + }; + template <> + struct sve_type<1, false, false> + { + using scalar = uint8_t; + using sizeless = svuint8_t; + }; + template <> + struct sve_type<2, true, false> + { + using scalar = int16_t; + using sizeless = svint16_t; + }; + template <> + struct sve_type<2, false, false> + { + using scalar = uint16_t; + using sizeless = svuint16_t; + }; + template <> + struct sve_type<4, true, false> + { + using scalar = int32_t; + using sizeless = svint32_t; + }; + template <> + struct sve_type<4, false, false> + { + using scalar = uint32_t; + using sizeless = svuint32_t; + }; + template <> + struct sve_type<8, true, false> + { + using scalar = int64_t; + using sizeless = svint64_t; + }; + template <> + struct sve_type<8, false, false> + { + using scalar = uint64_t; + using sizeless = svuint64_t; + }; + template <> + struct sve_type<4, true, true> + { + using scalar = float; + using sizeless = svfloat32_t; + }; + template <> + struct sve_type<8, true, true> + { + using scalar = double; + using sizeless = svfloat64_t; + }; - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept - { - return svld1(detail::sve_pmask(), reinterpret_cast const*>(mem)); - } + template + using sve_type_for = sve_type::value, std::is_floating_point::value>; - // load_complex - template = 0> - XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept - { - const T* buf = reinterpret_cast(mem); - const auto tmp = svld2(detail::sve_ptrue(), buf); - const auto real = svget2(tmp, 0); - const auto imag = svget2(tmp, 1); - return batch, A> { real, imag }; - } - - template = 0> - XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept - { - return load_complex_aligned(mem, convert> {}, sve {}); - } + template + using sve_sizeless_t = typename sve_type_for::sizeless; - /********* - * Store * - *********/ + // Remap integer Ts to their matching fixed-width counterpart (via + // sve_type::scalar) so svld1/svst1 see the pointer type their + // overload set expects; pass non-integer Ts through unchanged. + template >::value> + struct sve_fix_integer_impl + { + using type = T; + }; + template + struct sve_fix_integer_impl + { + using type = typename sve_type_for>::scalar; + }; - template = 0> - XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept - { - svst1(detail::sve_ptrue(), reinterpret_cast*>(dst), src); - } + template + using sve_fix_char_t = typename sve_fix_integer_impl::type; + } // namespace detail_sve - template = 0> - XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept - { - store_aligned(dst, src, sve {}); - } + /********* + * Load * + *********/ - // store_complex - template = 0> - XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept - { - using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; - v2type tmp {}; - tmp = svset2(tmp, 0, src.real()); - tmp = svset2(tmp, 1, src.imag()); - T* buf = reinterpret_cast(dst); - svst2(detail::sve_ptrue(), buf, tmp); - } - - template = 0> - XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept - { - store_complex_aligned(dst, src, sve {}); - } + template = 0> + XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept + { + return svld1(detail_sve::sve_ptrue(), reinterpret_cast const*>(src)); + } - /****************** - * scatter/gather * - ******************/ + template = 0> + XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept + { + return load_aligned(src, convert(), sve {}); + } - namespace detail - { - template - using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; - } + // load_masked + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + { + return svld1(detail_sve::sve_pmask(), reinterpret_cast const*>(mem)); + } - // scatter - template = 0> - XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept - { - svst1_scatter_index(detail::sve_ptrue(), dst, index.data, src.data); - } + // load_complex + template = 0> + XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept + { + const T* buf = reinterpret_cast(mem); + const auto tmp = svld2(detail_sve::sve_ptrue(), buf); + const auto real = svget2(tmp, 0); + const auto imag = svget2(tmp, 1); + return batch, A> { real, imag }; + } - // gather - template = 0> - XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept - { - return svld1_gather_index(detail::sve_ptrue(), src, index.data); - } + template = 0> + XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept + { + return load_complex_aligned(mem, convert> {}, sve {}); + } - /******************** - * Scalar to vector * - ********************/ + /********* + * Store * + *********/ - // broadcast - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u8(uint8_t(arg)); - } + template = 0> + XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + svst1(detail_sve::sve_ptrue(), reinterpret_cast*>(dst), src); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s8(int8_t(arg)); - } + template = 0> + XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, sve {}); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u16(uint16_t(arg)); - } + // store_complex + template = 0> + XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; + v2type tmp {}; + tmp = svset2(tmp, 0, src.real()); + tmp = svset2(tmp, 1, src.imag()); + T* buf = reinterpret_cast(dst); + svst2(detail_sve::sve_ptrue(), buf, tmp); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s16(int16_t(arg)); - } + template = 0> + XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, sve {}); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u32(uint32_t(arg)); - } + /****************** + * scatter/gather * + ******************/ - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s32(int32_t(arg)); - } + namespace detail_sve + { + template + using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_u64(uint64_t(arg)); - } + // scatter + template = 0> + XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept + { + svst1_scatter_index(detail_sve::sve_ptrue(), dst, index.data, src.data); + } - template = 0> - XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept - { - return svdup_n_s64(int64_t(arg)); - } + // gather + template = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept + { + return svld1_gather_index(detail_sve::sve_ptrue(), src, index.data); + } - template - XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept - { - return svdup_n_f32(arg); - } + /******************** + * Scalar to vector * + ********************/ - template - XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept - { - return svdup_n_f64(arg); - } + // broadcast + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u8(uint8_t(arg)); + } - template = 0> - XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept - { - return broadcast(val, sve {}); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s8(int8_t(arg)); + } - /************** - * Arithmetic * - **************/ + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u16(uint16_t(arg)); + } - // add - template = 0> - XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svadd_x(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s16(int16_t(arg)); + } - // sadd - template = 0> - XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svqadd(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u32(uint32_t(arg)); + } - // sub - template = 0> - XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svsub_x(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s32(int32_t(arg)); + } - // ssub - template = 0> - XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svqsub(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_u64(uint64_t(arg)); + } - // mul - template = 0> - XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmul_x(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept + { + return svdup_n_s64(int64_t(arg)); + } - // div - template = 4, int> = 0> - XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svdiv_x(detail::sve_ptrue(), lhs, rhs); - } + template + XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept + { + return svdup_n_f32(arg); + } - // max - template = 0> - XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmax_x(detail::sve_ptrue(), lhs, rhs); - } + template + XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept + { + return svdup_n_f64(arg); + } - // min - template = 0> - XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svmin_x(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return broadcast(val, sve {}); + } - // neg - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u8(svneg_x(detail::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); - } + /************** + * Arithmetic * + **************/ - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u16(svneg_x(detail::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); - } + // add + template = 0> + XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svadd_x(detail_sve::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u32(svneg_x(detail::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); - } + // sadd + template = 0> + XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svqadd(lhs, rhs); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svreinterpret_u64(svneg_x(detail::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); - } + // sub + template = 0> + XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svsub_x(detail_sve::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept - { - return svneg_x(detail::sve_ptrue(), arg); - } + // ssub + template = 0> + XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svqsub(lhs, rhs); + } - // abs - template = 0> - XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept - { - return arg; - } + // mul + template = 0> + XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmul_x(detail_sve::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept - { - return svabs_x(detail::sve_ptrue(), arg); - } + // div + template = 4, int> = 0> + XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svdiv_x(detail_sve::sve_ptrue(), lhs, rhs); + } - // fma: x * y + z - template = 0> - XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return svmad_x(detail::sve_ptrue(), x, y, z); - } + // max + template = 0> + XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmax_x(detail_sve::sve_ptrue(), lhs, rhs); + } - // fnma: z - x * y - template = 0> - XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return svmsb_x(detail::sve_ptrue(), x, y, z); - } + // min + template = 0> + XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmin_x(detail_sve::sve_ptrue(), lhs, rhs); + } - // fms: x * y - z - template = 0> - XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return -fnma(x, y, z, sve {}); - } + // neg + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u8(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s8(static_cast>(arg)))); + } - // fnms: - x * y - z - template = 0> - XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept - { - return -fma(x, y, z, sve {}); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u16(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s16(static_cast>(arg)))); + } - /********************** - * Logical operations * - **********************/ + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u32(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s32(static_cast>(arg)))); + } - // bitwise_and - template = 0> - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svand_x(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svreinterpret_u64(svneg_x(detail_sve::sve_ptrue(), svreinterpret_s64(static_cast>(arg)))); + } - template - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } - - template - XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svand_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } - - template = 0> - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svand_z(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept + { + return svneg_x(detail_sve::sve_ptrue(), arg); + } - // bitwise_andnot - template = 0> - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svbic_x(detail::sve_ptrue(), lhs, rhs); - } + // abs + template = 0> + XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept + { + return arg; + } - template - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } - - template - XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svbic_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } - - template = 0> - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svbic_z(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept + { + return svabs_x(detail_sve::sve_ptrue(), arg); + } - // bitwise_or - template = 0> - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svorr_x(detail::sve_ptrue(), lhs, rhs); - } + // fma: x * y + z + template = 0> + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return svmad_x(detail_sve::sve_ptrue(), x, y, z); + } - template - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } - - template - XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = svorr_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } - - template = 0> - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return svorr_z(detail::sve_ptrue(), lhs, rhs); - } + // fnma: z - x * y + template = 0> + XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return svmsb_x(detail_sve::sve_ptrue(), x, y, z); + } - // bitwise_xor - template = 0> - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return sveor_x(detail::sve_ptrue(), lhs, rhs); - } + // fms: x * y - z + template = 0> + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -fnma(x, y, z, sve {}); + } - template - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); - const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f32(result_bits); - } - - template - XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); - const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); - const auto result_bits = sveor_x(detail::sve_ptrue(), lhs_bits, rhs_bits); - return svreinterpret_f64(result_bits); - } - - template = 0> - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return sveor_z(detail::sve_ptrue(), lhs, rhs); - } + // fnms: - x * y - z + template = 0> + XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return -fma(x, y, z, sve {}); + } - // bitwise_not - template = 0> - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - return svnot_x(detail::sve_ptrue(), arg); - } + /********************** + * Logical operations * + **********************/ - template - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - const auto arg_bits = svreinterpret_u32(static_cast>(arg)); - const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); - return svreinterpret_f32(result_bits); - } + // bitwise_and + template = 0> + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svand_x(detail_sve::sve_ptrue(), lhs, rhs); + } - template - XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept - { - const auto arg_bits = svreinterpret_u64(static_cast>(arg)); - const auto result_bits = svnot_x(detail::sve_ptrue(), arg_bits); - return svreinterpret_f64(result_bits); - } + template + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svand_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } - template = 0> - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept - { - return svnot_z(detail::sve_ptrue(), arg); - } + template + XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svand_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } - /********** - * Shifts * - **********/ + template = 0> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svand_z(detail_sve::sve_ptrue(), lhs, rhs); + } - namespace detail - { - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept + // bitwise_andnot + template = 0> + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svreinterpret_u8(static_cast>(arg)); + return svbic_x(detail_sve::sve_ptrue(), lhs, rhs); } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept + template + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svreinterpret_u16(static_cast>(arg)); + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svbic_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept + template + XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svreinterpret_u32(static_cast>(arg)); + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svbic_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); } - template - XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept + template = 0> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { - return svreinterpret_u64(static_cast>(arg)); + return svbic_z(detail_sve::sve_ptrue(), lhs, rhs); } - template > - XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept + // bitwise_or + template = 0> + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return sve_to_unsigned_batch_impl(arg, index {}); + return svorr_x(detail_sve::sve_ptrue(), lhs, rhs); } - } // namespace detail - // bitwise_lshift - template = 0> - XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsl_x(detail::sve_ptrue(), arg, n); - } + template + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = svorr_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } - template = 0> - XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svlsl_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); - } + template + XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = svorr_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } - // bitwise_rshift - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svlsr_x(detail::sve_ptrue(), arg, static_cast(n)); - } + template = 0> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return svorr_z(detail_sve::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svlsr_x(detail::sve_ptrue(), lhs, rhs); - } + // bitwise_xor + template = 0> + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return sveor_x(detail_sve::sve_ptrue(), lhs, rhs); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept - { - constexpr std::size_t size = sizeof(typename batch::value_type) * 8; - assert(0 <= n && static_cast(n) < size && "index in bounds"); - return svasr_x(detail::sve_ptrue(), arg, static_cast>(n)); - } + template + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); + const auto result_bits = sveor_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f32(result_bits); + } - template = 0> - XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svasr_x(detail::sve_ptrue(), lhs, detail::sve_to_unsigned_batch(rhs)); - } + template + XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); + const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); + const auto result_bits = sveor_x(detail_sve::sve_ptrue(), lhs_bits, rhs_bits); + return svreinterpret_f64(result_bits); + } - /************** - * Reductions * - **************/ + template = 0> + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return sveor_z(detail_sve::sve_ptrue(), lhs, rhs); + } - // reduce_add - template ::value_type, detail::sve_enable_all_t = 0> - XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept - { - // sve integer reduction results are promoted to 64 bits - return static_cast(svaddv(detail::sve_ptrue(), arg)); - } + // bitwise_not + template = 0> + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + return svnot_x(detail_sve::sve_ptrue(), arg); + } - // reduce_max - template = 0> - XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept - { - return svmaxv(detail::sve_ptrue(), arg); - } + template + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = svreinterpret_u32(static_cast>(arg)); + const auto result_bits = svnot_x(detail_sve::sve_ptrue(), arg_bits); + return svreinterpret_f32(result_bits); + } - // reduce_min - template = 0> - XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept - { - return svminv(detail::sve_ptrue(), arg); - } + template + XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = svreinterpret_u64(static_cast>(arg)); + const auto result_bits = svnot_x(detail_sve::sve_ptrue(), arg_bits); + return svreinterpret_f64(result_bits); + } - // haddp - template = 0> - XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - T sums[size]; - for (std::size_t i = 0; i < size; ++i) + template = 0> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { - sums[i] = reduce_add(row[i], sve {}); + return svnot_z(detail_sve::sve_ptrue(), arg); } - return svld1(detail::sve_ptrue(), sums); - } - /*************** - * Comparisons * - ***************/ + /********** + * Shifts * + **********/ - // eq - template = 0> - XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpeq(detail::sve_ptrue(), lhs, rhs); - } + namespace detail_sve + { + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<1>) noexcept + { + return svreinterpret_u8(static_cast>(arg)); + } - template = 0> - XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - const auto neq_result = sveor_z(detail::sve_ptrue(), lhs, rhs); - return svnot_z(detail::sve_ptrue(), neq_result); - } + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<2>) noexcept + { + return svreinterpret_u16(static_cast>(arg)); + } - // neq - template = 0> - XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpne(detail::sve_ptrue(), lhs, rhs); - } + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<4>) noexcept + { + return svreinterpret_u32(static_cast>(arg)); + } - template = 0> - XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept - { - return sveor_z(detail::sve_ptrue(), lhs, rhs); - } + template + XSIMD_INLINE batch sve_to_unsigned_batch_impl(batch const& arg, index<8>) noexcept + { + return svreinterpret_u64(static_cast>(arg)); + } - // lt - template = 0> - XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmplt(detail::sve_ptrue(), lhs, rhs); - } + template > + XSIMD_INLINE batch sve_to_unsigned_batch(batch const& arg) noexcept + { + return sve_to_unsigned_batch_impl(arg, index {}); + } + } // namespace detail_sve - // le - template = 0> - XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmple(detail::sve_ptrue(), lhs, rhs); - } + // bitwise_lshift + template = 0> + XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svlsl_x(detail_sve::sve_ptrue(), arg, n); + } - // gt - template = 0> - XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpgt(detail::sve_ptrue(), lhs, rhs); - } + template = 0> + XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svlsl_x(detail_sve::sve_ptrue(), lhs, detail_sve::sve_to_unsigned_batch(rhs)); + } - // ge - template = 0> - XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svcmpge(detail::sve_ptrue(), lhs, rhs); - } + // bitwise_rshift + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svlsr_x(detail_sve::sve_ptrue(), arg, static_cast(n)); + } - /*************** - * Permutation * - ***************/ + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svlsr_x(detail_sve::sve_ptrue(), lhs, rhs); + } - // rotate_left - template = 0> - XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept - { - return svext(a, a, N); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr std::size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return svasr_x(detail_sve::sve_ptrue(), arg, static_cast>(n)); + } - // swizzle (dynamic) - template - XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept - { - return svtbl(arg, indices); - } + template = 0> + XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svasr_x(detail_sve::sve_ptrue(), lhs, detail_sve::sve_to_unsigned_batch(rhs)); + } - template - XSIMD_INLINE batch, A> swizzle(batch, A> const& self, - batch indices, - requires_arch) noexcept - { - const auto real = swizzle(self.real(), indices, sve {}); - const auto imag = swizzle(self.imag(), indices, sve {}); - return batch>(real, imag); - } - - // swizzle (static) - template - XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept - { - static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, indices.as_batch(), sve {}); - } - - template - XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, - batch_constant indices, - requires_arch) noexcept - { - static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, indices.as_batch(), sve {}); - } + /************** + * Reductions * + **************/ - /************* - * Selection * - *************/ + // reduce_add + template ::value_type, detail_sve::sve_enable_all_t = 0> + XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept + { + // sve integer reduction results are promoted to 64 bits + return static_cast(svaddv(detail_sve::sve_ptrue(), arg)); + } - // extract_pair - namespace detail - { - template - XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept + // reduce_max + template = 0> + XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept { - assert(false && "extract_pair out of bounds"); - return batch {}; + return svmaxv(detail_sve::sve_ptrue(), arg); } - template - XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept + // reduce_min + template = 0> + XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept { - if (n == I) - { - return svext(rhs, lhs, I); - } - else - { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); - } + return svminv(detail_sve::sve_ptrue(), arg); } - template - XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept + // haddp + template = 0> + XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept { - if (n == 0) - { - return rhs; - } - else + constexpr std::size_t size = batch::size; + T sums[size]; + for (std::size_t i = 0; i < size; ++i) { - return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + sums[i] = reduce_add(row[i], sve {}); } + return svld1(detail_sve::sve_ptrue(), sums); } - } - template = 0> - XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - assert(n < size && "index in bounds"); - return detail::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); - } - - // select - template = 0> - XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept - { - return svsel(cond, static_cast>(a), static_cast>(b)); - } - - template - XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return select(batch_bool { b... }, true_br, false_br, sve {}); - } + /*************** + * Comparisons * + ***************/ - // zip_lo - template = 0> - XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svzip1(lhs, rhs); - } + // eq + template = 0> + XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpeq(detail_sve::sve_ptrue(), lhs, rhs); + } - // zip_hi - template = 0> - XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept - { - return svzip2(lhs, rhs); - } + template = 0> + XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + const auto neq_result = sveor_z(detail_sve::sve_ptrue(), lhs, rhs); + return svnot_z(detail_sve::sve_ptrue(), neq_result); + } - /***************************** - * Floating-point arithmetic * - *****************************/ + // neq + template = 0> + XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpne(detail_sve::sve_ptrue(), lhs, rhs); + } - // rsqrt - template = 0> - XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept - { - return svrsqrte(arg); - } + template = 0> + XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return sveor_z(detail_sve::sve_ptrue(), lhs, rhs); + } - // sqrt - template = 0> - XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept - { - return svsqrt_x(detail::sve_ptrue(), arg); - } + // lt + template = 0> + XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmplt(detail_sve::sve_ptrue(), lhs, rhs); + } - // reciprocal - template = 0> - XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept - { - return svrecpe(arg); - } + // le + template = 0> + XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmple(detail_sve::sve_ptrue(), lhs, rhs); + } - /****************************** - * Floating-point conversions * - ******************************/ + // gt + template = 0> + XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svcmpgt(detail_sve::sve_ptrue(), lhs, rhs); + } - // fast_cast - namespace detail - { - template = 0> - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + // ge + template = 0> + XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { - return svcvt_f32_x(detail::sve_ptrue(), arg); + return svcmpge(detail_sve::sve_ptrue(), lhs, rhs); } - template = 0> - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + /*************** + * Permutation * + ***************/ + + // rotate_left + template = 0> + XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { - return svcvt_f64_x(detail::sve_ptrue(), arg); + return svext(a, a, N); } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + // swizzle (dynamic) + template + XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept { - return svcvt_s32_x(detail::sve_ptrue(), arg); + return svtbl(arg, indices); } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& self, + batch indices, + requires_arch) noexcept { - return svcvt_u32_x(detail::sve_ptrue(), arg); + const auto real = swizzle(self.real(), indices, sve {}); + const auto imag = swizzle(self.imag(), indices, sve {}); + return batch>(real, imag); } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + // swizzle (static) + template + XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept { - return svcvt_s64_x(detail::sve_ptrue(), arg); + static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); + return swizzle(arg, indices.as_batch(), sve {}); } - template - XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + template + XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, + batch_constant indices, + requires_arch) noexcept { - return svcvt_u64_x(detail::sve_ptrue(), arg); + static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); + return swizzle(arg, indices.as_batch(), sve {}); } - } - /********* - * Miscs * - *********/ + /************* + * Selection * + *************/ - // set - template - XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept - { - return detail::sve_vector_type { args... }; - } + // extract_pair + namespace detail_sve + { + template + XSIMD_INLINE batch sve_extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept + { + assert(false && "extract_pair out of bounds"); + return batch {}; + } - template - XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, - Args... args_complex) noexcept - { - return batch>(detail::sve_vector_type { args_complex.real()... }, - detail::sve_vector_type { args_complex.imag()... }); - } + template + XSIMD_INLINE batch sve_extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept + { + if (n == I) + { + return svext(rhs, lhs, I); + } + else + { + return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + } + } - template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept - { - using U = as_unsigned_integer_t; - const auto values = detail::sve_vector_type { static_cast(args)... }; - const auto zero = broadcast(static_cast(0), sve {}); - return svcmpne(detail::sve_ptrue(), values, zero); - } - - // insert - namespace detail - { - // generate index sequence (iota) - XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } - XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } - XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } - XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } - - template >> - XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index {}); } - } // namespace detail - - template = 0> - XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept - { - // create a predicate with only the I-th lane activated - const auto iota = detail::sve_iota(); - const auto index_predicate = svcmpeq(detail::sve_ptrue(), iota, static_cast>(I)); - return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); - } - - // first - template = 0> - XSIMD_INLINE T first(batch const& self, requires_arch) noexcept - { - return self.data[0]; - } + template + XSIMD_INLINE batch sve_extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept + { + if (n == 0) + { + return rhs; + } + else + { + return sve_extract_pair(lhs, rhs, n, std::index_sequence()); + } + } + } - // all - template = 0> - XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept - { - return detail::sve_pcount(arg) == batch_bool::size; - } + template = 0> + XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + assert(n < size && "index in bounds"); + return detail_sve::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); + } - // any - template = 0> - XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept - { - return svptest_any(arg, arg); - } + // select + template = 0> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept + { + return svsel(cond, static_cast>(a), static_cast>(b)); + } - // bitwise_cast - template = 0, detail::enable_sized_unsigned_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_u8(static_cast>(arg)); - } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { b... }, true_br, false_br, sve {}); + } - template = 0, detail::enable_sized_signed_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_s8(static_cast>(arg)); - } + // zip_lo + template = 0> + XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svzip1(lhs, rhs); + } - template = 0, detail::enable_sized_unsigned_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_u16(static_cast>(arg)); - } + // zip_hi + template = 0> + XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svzip2(lhs, rhs); + } - template = 0, detail::enable_sized_signed_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_s16(static_cast>(arg)); - } + /***************************** + * Floating-point arithmetic * + *****************************/ - template = 0, detail::enable_sized_unsigned_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_u32(static_cast>(arg)); - } + // rsqrt + template = 0> + XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept + { + return svrsqrte(arg); + } - template = 0, detail::enable_sized_signed_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_s32(static_cast>(arg)); - } + // sqrt + template = 0> + XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept + { + return svsqrt_x(detail_sve::sve_ptrue(), arg); + } - template = 0, detail::enable_sized_unsigned_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_u64(static_cast>(arg)); - } + // reciprocal + template = 0> + XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept + { + return svrecpe(arg); + } - template = 0, detail::enable_sized_signed_t = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_s64(static_cast>(arg)); - } + /****************************** + * Floating-point conversions * + ******************************/ - template = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_f32(static_cast>(arg)); - } + // fast_cast + namespace detail_sve + { + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_f32_x(detail_sve::sve_ptrue(), arg); + } - template = 0> - XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept - { - return svreinterpret_f64(static_cast>(arg)); - } + template = 0> + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_f64_x(detail_sve::sve_ptrue(), arg); + } - // batch_bool_cast - template = 0> - XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept - { - return arg.data; - } + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_s32_x(detail_sve::sve_ptrue(), arg); + } - // from_bool - template = 0> - XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept - { - return select(arg, batch(1), batch(0)); - } + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svcvt_u32_x(detail_sve::sve_ptrue(), arg); + } - // slide_left - namespace detail - { - template - struct sve_slider_left - { - template - XSIMD_INLINE batch operator()(batch const& arg) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - using u8_vector = batch; - const auto left = svdup_n_u8(0); - const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data; - const u8_vector result(svext(left, right, u8_vector::size - N)); - return bitwise_cast(result, batch {}, sve {}); + return svcvt_s64_x(detail_sve::sve_ptrue(), arg); } - }; - template <> - struct sve_slider_left<0> - { - template - XSIMD_INLINE batch operator()(batch const& arg) noexcept + template + XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { - return arg; + return svcvt_u64_x(detail_sve::sve_ptrue(), arg); } - }; - } // namespace detail + } - template = 0> - XSIMD_INLINE batch slide_left(batch const& arg, requires_arch) noexcept - { - return detail::sve_slider_left()(arg); - } + /********* + * Miscs * + *********/ - // slide_right - namespace detail - { - template - struct sve_slider_right + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept { - template - XSIMD_INLINE batch operator()(batch const& arg) noexcept - { - using u8_vector = batch; - const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data; - const auto right = svdup_n_u8(0); - const u8_vector result(svext(left, right, N)); - return bitwise_cast(result, batch {}, sve {}); - } - }; + return detail_sve::sve_vector_type { args... }; + } - template <> - struct sve_slider_right::size> + template + XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, + Args... args_complex) noexcept { - template - XSIMD_INLINE batch operator()(batch const&) noexcept + return batch>(detail_sve::sve_vector_type { args_complex.real()... }, + detail_sve::sve_vector_type { args_complex.imag()... }); + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept + { + using U = as_unsigned_integer_t; + const auto values = detail_sve::sve_vector_type { static_cast(args)... }; + const auto zero = broadcast(static_cast(0), sve {}); + return svcmpne(detail_sve::sve_ptrue(), values, zero); + } + + // insert + namespace detail_sve + { + // generate index sequence (iota) + XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } + XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } + XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } + XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } + + template >> + XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index {}); } + } // namespace detail_sve + + template = 0> + XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept + { + // create a predicate with only the I-th lane activated + const auto iota = detail_sve::sve_iota(); + const auto index_predicate = svcmpeq(detail_sve::sve_ptrue(), iota, static_cast>(I)); + return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); + } + + // first + template = 0> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + return self.data[0]; + } + + // all + template = 0> + XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept + { + return detail_sve::sve_pcount(arg) == batch_bool::size; + } + + // any + template = 0> + XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept + { + return svptest_any(arg, arg); + } + + // bitwise_cast + template = 0, detail::enable_sized_unsigned_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_u8(static_cast>(arg)); + } + + template = 0, detail::enable_sized_signed_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_s8(static_cast>(arg)); + } + + template = 0, detail::enable_sized_unsigned_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_u16(static_cast>(arg)); + } + + template = 0, detail::enable_sized_signed_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_s16(static_cast>(arg)); + } + + template = 0, detail::enable_sized_unsigned_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_u32(static_cast>(arg)); + } + + template = 0, detail::enable_sized_signed_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_s32(static_cast>(arg)); + } + + template = 0, detail::enable_sized_unsigned_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_u64(static_cast>(arg)); + } + + template = 0, detail::enable_sized_signed_t = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_s64(static_cast>(arg)); + } + + template = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_f32(static_cast>(arg)); + } + + template = 0> + XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return svreinterpret_f64(static_cast>(arg)); + } + + // batch_bool_cast + template = 0> + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept + { + return arg.data; + } + + // from_bool + template = 0> + XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept + { + return select(arg, batch(1), batch(0)); + } + + // slide_left + namespace detail_sve + { + template + struct sve_slider_left { - return batch {}; - } - }; - } // namespace detail + template + XSIMD_INLINE batch operator()(batch const& arg) noexcept + { + using u8_vector = batch; + const auto left = svdup_n_u8(0); + const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data; + const u8_vector result(svext(left, right, u8_vector::size - N)); + return bitwise_cast(result, batch {}, sve {}); + } + }; + + template <> + struct sve_slider_left<0> + { + template + XSIMD_INLINE batch operator()(batch const& arg) noexcept + { + return arg; + } + }; + } // namespace detail_sve + + template = 0> + XSIMD_INLINE batch slide_left(batch const& arg, requires_arch) noexcept + { + return detail_sve::sve_slider_left()(arg); + } - template = 0> - XSIMD_INLINE batch slide_right(batch const& arg, requires_arch) noexcept - { - return detail::sve_slider_right()(arg); - } + // slide_right + namespace detail_sve + { + template + struct sve_slider_right + { + template + XSIMD_INLINE batch operator()(batch const& arg) noexcept + { + using u8_vector = batch; + const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data; + const auto right = svdup_n_u8(0); + const u8_vector result(svext(left, right, N)); + return bitwise_cast(result, batch {}, sve {}); + } + }; + + template <> + struct sve_slider_right::size> + { + template + XSIMD_INLINE batch operator()(batch const&) noexcept + { + return batch {}; + } + }; + } // namespace detail_sve + + template = 0> + XSIMD_INLINE batch slide_right(batch const& arg, requires_arch) noexcept + { + return detail_sve::sve_slider_right()(arg); + } - // isnan - template = 0> - XSIMD_INLINE batch_bool isnan(batch const& arg, requires_arch) noexcept - { - return !(arg == arg); - } + // isnan + template = 0> + XSIMD_INLINE batch_bool isnan(batch const& arg, requires_arch) noexcept + { + return !(arg == arg); + } - // nearbyint - template = 0> - XSIMD_INLINE batch nearbyint(batch const& arg, requires_arch) noexcept - { - return svrintx_x(detail::sve_ptrue(), arg); - } + // nearbyint + template = 0> + XSIMD_INLINE batch nearbyint(batch const& arg, requires_arch) noexcept + { + return svrintx_x(detail_sve::sve_ptrue(), arg); + } - // nearbyint_as_int - template - XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept - { - const auto nearest = svrintx_x(detail::sve_ptrue(), arg); - return svcvt_s32_x(detail::sve_ptrue(), nearest); - } + // nearbyint_as_int + template + XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept + { + const auto nearest = svrintx_x(detail_sve::sve_ptrue(), arg); + return svcvt_s32_x(detail_sve::sve_ptrue(), nearest); + } - template - XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept - { - const auto nearest = svrintx_x(detail::sve_ptrue(), arg); - return svcvt_s64_x(detail::sve_ptrue(), nearest); - } + template + XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept + { + const auto nearest = svrintx_x(detail_sve::sve_ptrue(), arg); + return svcvt_s64_x(detail_sve::sve_ptrue(), nearest); + } - // ldexp - template = 0> - XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& exp, requires_arch) noexcept - { - return svscale_x(detail::sve_ptrue(), x, exp); - } + // ldexp + template = 0> + XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& exp, requires_arch) noexcept + { + return svscale_x(detail_sve::sve_ptrue(), x, exp); + } + } // namespace XSIMD_SVE_NAMESPACE } // namespace kernel } // namespace xsimd diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 5e19e74c5..e2e90f438 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -14,7 +14,7 @@ #include "../types/xsimd_all_registers.hpp" #include "./xsimd_cpu_features.hpp" -#include "./xsimd_inline.hpp" +#include "./xsimd_macros.hpp" namespace xsimd { diff --git a/include/xsimd/config/xsimd_inline.hpp b/include/xsimd/config/xsimd_macros.hpp similarity index 88% rename from include/xsimd/config/xsimd_inline.hpp rename to include/xsimd/config/xsimd_macros.hpp index f3becaf12..8bd702978 100644 --- a/include/xsimd/config/xsimd_inline.hpp +++ b/include/xsimd/config/xsimd_macros.hpp @@ -9,8 +9,8 @@ * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ -#ifndef XSIMD_INLINE_HPP -#define XSIMD_INLINE_HPP +#ifndef XSIMD_MACROS_HPP +#define XSIMD_MACROS_HPP #if defined(__VEC__) #define XSIMD_INLINE inline @@ -26,4 +26,7 @@ #define XSIMD_INLINE inline #endif +#define XSIMD_CONCAT_INNER(a, b) a##b +#define XSIMD_CONCAT(a, b) XSIMD_CONCAT_INNER(a, b) + #endif diff --git a/include/xsimd/types/xsimd_register.hpp b/include/xsimd/types/xsimd_register.hpp index b14962e5b..bb58b9304 100644 --- a/include/xsimd/types/xsimd_register.hpp +++ b/include/xsimd/types/xsimd_register.hpp @@ -14,7 +14,7 @@ #include -#include "../config/xsimd_inline.hpp" +#include "../config/xsimd_macros.hpp" namespace xsimd { diff --git a/include/xsimd/xsimd.hpp b/include/xsimd/xsimd.hpp index e50dc3bd1..f6d2b1e7b 100644 --- a/include/xsimd/xsimd.hpp +++ b/include/xsimd/xsimd.hpp @@ -57,7 +57,7 @@ #endif #include "config/xsimd_config.hpp" -#include "config/xsimd_inline.hpp" +#include "config/xsimd_macros.hpp" #include "arch/xsimd_scalar.hpp" #include "memory/xsimd_aligned_allocator.hpp" diff --git a/test/test_power.cpp b/test/test_power.cpp index 6fa2ef396..cdf8c146a 100644 --- a/test/test_power.cpp +++ b/test/test_power.cpp @@ -13,7 +13,6 @@ #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" -#include template struct power_test