diff --git a/AK/SIMDExtras.h b/AK/SIMDExtras.h index 1d4865e9800..d8568cf6f9c 100644 --- a/AK/SIMDExtras.h +++ b/AK/SIMDExtras.h @@ -6,6 +6,7 @@ #pragma once +#include #include #include @@ -183,30 +184,148 @@ ALWAYS_INLINE static void store4_masked(VectorType v, UnderlyingType* a, Underly } // Shuffle - -template T> -ALWAYS_INLINE static T shuffle(T a, T control) +namespace Detail { +template +ALWAYS_INLINE static T shuffle_impl(T a, Control control, IndexSequence) { - // FIXME: This is probably not the fastest way to do this. + // FIXME: Maybe make the VERIFYs optional, e.g. on SIMD-DEBUG, to avoid the overhead in performance-oriented users, like LibWasm::SIMD + // Note: - instead of _ to make the linter happy, as SIMD-DEBUG does not (yet) exist + constexpr Conditional>, ssize_t, size_t> N = vector_length; + // If you hit this verify and want a 0 in these cases instead, use shuffle_or_0 + (([control] { VERIFY(control[Idx] < N); })(), ...); + + // __builtin_shuffle is only available with GCC, and has quite good codegen + if constexpr (__has_builtin(__builtin_shuffle)) + return __builtin_shuffle(a, control); + return T { - a[control[0] & 0xf], - a[control[1] & 0xf], - a[control[2] & 0xf], - a[control[3] & 0xf], - a[control[4] & 0xf], - a[control[5] & 0xf], - a[control[6] & 0xf], - a[control[7] & 0xf], - a[control[8] & 0xf], - a[control[9] & 0xf], - a[control[10] & 0xf], - a[control[11] & 0xf], - a[control[12] & 0xf], - a[control[13] & 0xf], - a[control[14] & 0xf], - a[control[15] & 0xf], + a[control[Idx]]... 
}; } + +// FIXME: AppleClang somehow unconditionally executes the `a[control[Idx]]` path, +// even if it's in the false branch of the ternary +// This leads to a presumably out of bounds access, which is UB +// Reenable the sanitizer once this is fixed (NOTE(review): re-check whether this still reproduces, since out of bounds indices are rewritten to ~0 before the ternary below) +// As a side note UBsan makes a total mess of the codegen anyway +template +#ifdef AK_COMPILER_CLANG +[[clang::no_sanitize("undefined")]] +#endif +ALWAYS_INLINE static T shuffle_or_0_impl(T a, Control control, IndexSequence) +{ + constexpr Conditional>, ssize_t, size_t> N = vector_length; + using E = ElementOf; + + if constexpr (__has_builtin(__builtin_shuffle)) { + // GCC does a very bad job at optimizing the masking, while not recognizing the shuffle idiom + // So we jinx its __builtin_shuffle to work with out of bounds indices: mask is all 1s on in-bounds elements and all 0s otherwise, so out of bounds elements shuffle from index 0 and are then cleared by the final `& mask` + auto mask = (control >= 0) & (control < N); + return __builtin_shuffle(a, control & mask) & mask; + } + // 1. Set all out of bounds values to ~0 + // Note: This is done so that the optimization mentioned down below works + // Note: Vector compares result in bitmasks, aka all 1s or all 0s per element, so the negated in-bounds mask is all 1s exactly on the out of bounds elements + control |= ~((control >= 0) & (control < N)); + // 2. 
Selectively set out of bounds values to 0 (NOTE(review): `control[Idx] != ~0` compares after integer promotion, so this presumably needs control elements that are signed or at least int-sized - confirm against IndexVectorFor) + // Note: Clang successfully optimizes this to a few instructions on x86-ssse3, GCC does not + // Vector Optimizations/Instruction-Selection on ArmV8 seem to not be as powerful as of Clang18 + // FIXME: We could recreate the bit mask Clang uses for the select for u32 and u16 + // control = control * explode_byte(sizeof(E)) + 0x03020100; + // return (T)shuffle_unchecked(Bytes(a), Bytes(control)); + // Note: On x86-ssse3, `pshufb` inserts a zero if the control byte has the highest bit set + // On ArmV8, `tbl` inserts a zero if the control byte is out of bounds in general + // On RiscV `vrgather.vv` inserts a 0 if the control index is out of bounds + // and is more powerful than the other two as it is able to use bigger item widths than a byte + // Note: For u64x2 Clang seems to always unroll the compare instead of doing the fancy `pshufb` + + return T { + ((E)(control[Idx] != ~0 ? a[control[Idx]] : 0))... + }; +} + +template +ALWAYS_INLINE static T item_reverse_impl(T a, IndexSequence) +{ + constexpr size_t N = vector_length; + return __builtin_shufflevector(a, a, N - 1 - Idx...); +} + +template +ALWAYS_INLINE static T byte_reverse_impl(T a, IndexSequence) +{ + static_assert(sizeof...(Idx) == sizeof(T)); + constexpr size_t N = sizeof(T); + // FIXME: GCC silently ignores the dependent vector_size attribute, this seems to be a bug + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68703 + // Hence this giant conditional + using BytesVector = Conditional>>>>; + static_assert(sizeof(BytesVector) == sizeof(T)); + // Note: Using __builtin_bit_cast instead of bit_cast to avoid a psabi warning from bit_cast + auto tmp = __builtin_shufflevector( + __builtin_bit_cast(BytesVector, a), + __builtin_bit_cast(BytesVector, a), + N - 1 - Idx...); + return __builtin_bit_cast(T, tmp); +} + +template +ALWAYS_INLINE static T elementwise_byte_reverse_impl(T a, IndexSequence) +{ + static_assert(sizeof...(Idx) == vector_length); + using Element = 
ElementOf; + if constexpr (sizeof(Element) == 1) { + return a; + } else if constexpr (sizeof(Element) == 2) { + return T { + static_cast(__builtin_bswap16(static_cast(a[Idx])))... + }; + } else if constexpr (sizeof(Element) == 4) { + return T { + static_cast(__builtin_bswap32(static_cast(a[Idx])))... + }; + } else if constexpr (sizeof(Element) == 8) { + return T { + static_cast(__builtin_bswap64(static_cast(a[Idx])))... + }; + } else { + static_assert(DependentFalse); + } +} + +} + +// FIXME: Shuffles only work with integral types for now +template +ALWAYS_INLINE static T shuffle(T a, IndexVectorFor control) +{ + return Detail::shuffle_impl(a, control, MakeIndexSequence>()); +} + +template +ALWAYS_INLINE static T shuffle_or_0(T a, IndexVectorFor control) +{ + return Detail::shuffle_or_0_impl(a, control, MakeIndexSequence>()); +} + +template +ALWAYS_INLINE static T item_reverse(T a) +{ + return Detail::item_reverse_impl(a, MakeIndexSequence>()); +} + +template +ALWAYS_INLINE static T byte_reverse(T a) +{ + return Detail::byte_reverse_impl(a, MakeIndexSequence()); +} + +template +ALWAYS_INLINE static T elementwise_byte_reverse(T a) +{ + return Detail::elementwise_byte_reverse_impl(a, MakeIndexSequence>()); +} + } #pragma GCC diagnostic pop