/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE2_HPP
#define XSIMD_SSE2_HPP

#include <cassert>
#include <complex>
#include <cstring>
#include <limits>
#include <type_traits>

#include "../types/xsimd_sse2_register.hpp"

namespace xsimd
{
    template <class batch_type, bool... Values>
    struct batch_bool_constant;

    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {
        using namespace types;

        // fwd
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;

        // abs
        template <class A>
        inline batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            __m128d sign_mask = _mm_set1_pd(-0.); // -0. = 1 << 63
            return _mm_andnot_pd(sign_mask, self);
        }
        template <class A>
        inline batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
            return _mm_andnot_ps(sign_mask, self);
        }

        // add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
                return _mm_add_epi8(self, other);
            case 2:
                return _mm_add_epi16(self, other);
            case 4:
                return _mm_add_epi32(self, other);
            case 8:
                return _mm_add_epi64(self, other);
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        template <class A>
        inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_add_ps(self, other);
        }

        template <class A>
        inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_add_pd(self, other);
        }

        // all
        template <class A>
        inline bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_movemask_ps(self) == 0x0F;
        }
        template <class A>
        inline bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_movemask_pd(self) == 0x03;
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_movemask_epi8(self) == 0xFFFF;
        }

        // any
        template <class A>
        inline bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_movemask_ps(self) != 0;
        }
        template <class A>
        inline bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_movemask_pd(self) != 0;
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_movemask_epi8(self) != 0;
        }

        // bitwise_and
        template <class A>
        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_and_ps(self, other);
        }
        template <class A>
        inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_and_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_and_si128(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_and_si128(self, other);
        }

        template <class A>
        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_and_pd(self, other);
        }

        template <class A>
        inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_and_pd(self, other);
        }

        // bitwise_andnot
        template <class A>
        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_andnot_ps(self, other);
        }

        template <class A>
        inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_andnot_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_andnot_si128(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_andnot_si128(self, other);
        }

        template <class A>
        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_andnot_pd(self, other);
        }

        template <class A>
        inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_andnot_pd(self, other);
        }

        // bitwise_lshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
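                // SSE2 has no 8-bit shift: shift 32-bit lanes, then clear the low
                // bits of each byte that were filled from its lower neighbour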
                return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other));
            case 2:
                return _mm_slli_epi16(self, other);
            case 4:
                return _mm_slli_epi32(self, other);
            case 8:
                return _mm_slli_epi64(self, other);
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // bitwise_not
        template <class A>
        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
        }
        template <class A>
        inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_si128(self, _mm_set1_epi32(-1));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_si128(self, _mm_set1_epi32(-1));
        }
        template <class A>
        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
        }
        template <class A>
        inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
        }

        // bitwise_or
        template <class A>
        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_or_ps(self, other);
        }
        template <class A>
        inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_or_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_or_si128(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_or_si128(self, other);
        }

        template <class A>
        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_or_pd(self, other);
        }

        template <class A>
        inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_or_pd(self, other);
        }

        // bitwise_rshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                switch (sizeof(T))
                {
                case 1:
                {
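                    // SSE2 has no 8-bit arithmetic shift: shift 16-bit lanes, then repair
                    // the top bits of each low byte (filled from the neighbouring byte)
                    // by re-inserting that byte's own sign, taken from cmp_is_negative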
                    __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
                    __m128i res = _mm_srai_epi16(self, other);
                    return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res));
                }
                case 2:
                    return _mm_srai_epi16(self, other);
                case 4:
                    return _mm_srai_epi32(self, other);
                case 8:
                {
                    // from https://github.com/samyvilar/vect/blob/master/vect_128.h
                    return _mm_or_si128(
                        _mm_srli_epi64(self, other),
                        _mm_slli_epi64(
                            _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32),
                            64 - other));
                }
                default:
                    assert(false && "unsupported arch/op combination");
                    return {};
                }
            }
            else
            {
                switch (sizeof(T))
                {
                case 1:
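                    // no 8-bit logical shift either: shift 32-bit lanes and clear the
                    // high bits of each byte that leaked in from its upper neighbour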
                    return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other));
                case 2:
                    return _mm_srli_epi16(self, other);
                case 4:
                    return _mm_srli_epi32(self, other);
                case 8:
                    return _mm_srli_epi64(self, other);
                default:
                    assert(false && "unsupported arch/op combination");
                    return {};
                }
            }
        }

        // bitwise_xor
        template <class A>
        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_xor_ps(self, other);
        }
        template <class A>
        inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_xor_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_xor_si128(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_xor_pd(self, other);
        }
        template <class A>
        inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_xor_pd(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_xor_si128(self, other);
        }

        // bitwise_cast
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
        {
            return _mm_castsi128_ps(self);
        }
        template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
        inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
        {
            return batch<Tp, A>(self.data);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
        {
            return _mm_castps_si128(self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
        {
            return _mm_castsi128_pd(self);
        }
        template <class A>
        inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
        {
            return _mm_castps_pd(self);
        }
        template <class A>
        inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
        {
            return _mm_castpd_ps(self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
        {
            return _mm_castpd_si128(self);
        }

        // bool_cast
        template <class A>
        inline batch_bool<int32_t, A> bool_cast(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_castps_si128(self);
        }
        template <class A>
        inline batch_bool<float, A> bool_cast(batch_bool<int32_t, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_castsi128_ps(self);
        }
        template <class A>
        inline batch_bool<int64_t, A> bool_cast(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_castpd_si128(self);
        }
        template <class A>
        inline batch_bool<double, A> bool_cast(batch_bool<int64_t, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_castsi128_pd(self);
        }

        // broadcast
        template <class A>
        inline batch<float, A> broadcast(float val, requires_arch<sse2>) noexcept
        {
            return _mm_set1_ps(val);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
                return _mm_set1_epi8(val);
            case 2:
                return _mm_set1_epi16(val);
            case 4:
                return _mm_set1_epi32(val);
            case 8:
                return _mm_set1_epi64x(val);
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
        {
            return _mm_set1_pd(val);
        }

        // store_complex
        namespace detail
        {
            // Override these methods in SSE-based archs; there is no need to override store_aligned / store_unaligned
            // complex_low
            template <class A>
            inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
            {
                return _mm_unpacklo_ps(self.real(), self.imag());
            }
            // complex_high
            template <class A>
            inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
            {
                return _mm_unpackhi_ps(self.real(), self.imag());
            }
            template <class A>
            inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
            {
                return _mm_unpacklo_pd(self.real(), self.imag());
            }
            template <class A>
            inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
            {
                return _mm_unpackhi_pd(self.real(), self.imag());
            }
        }

        // div
        template <class A>
        inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_div_ps(self, other);
        }
        template <class A>
        inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_div_pd(self, other);
        }

        // fast_cast
        namespace detail
        {
            template <class A>
            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
            {
                return _mm_cvtepi32_ps(self);
            }

            template <class A>
            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>) noexcept
            {
                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
                __m128i msk_lo = _mm_set1_epi32(0xFFFF);
                __m128 cnst65536f = _mm_set1_ps(65536.0f);

                __m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 least significant bits of v                                    */
                __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v                                                 */
                __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding                                                                   */
                __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding                                                                   */
                v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding                                                                   */
                return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here; mul and add may fuse to FMA on Haswell and newer      */
            }

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to sse2
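                // the high 32 bits are packed into the mantissa of a double biased by 2^84
                // and the low 32 bits into one biased by 2^52; subtracting the combined
                // bias and summing the two parts reconstructs the exact value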
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); //  2^84
                __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to sse2
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); //  3*2^67
                __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
            {
                return _mm_cvttps_epi32(self);
            }

            template <class A>
            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
            {
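                // values below 2^31 convert directly with cvttps; larger values are reduced
                // by 2^31, converted, and the high bit is restored with a XOR, then the
                // mask selects the applicable result per lane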
                __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
                __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
                __m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
                    _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
                    _mm_set1_epi32(1u << 31)));
                return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
            }

        }

        // eq
        template <class A>
        inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpeq_ps(self, other);
        }
        template <class A>
        inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other)));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
                return _mm_cmpeq_epi8(self, other);
            case 2:
                return _mm_cmpeq_epi16(self, other);
            case 4:
                return _mm_cmpeq_epi32(self, other);
            case 8:
            {
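                // SSE2 has no 64-bit compare: compare the 32-bit halves, AND each half
                // with its partner so a lane is true only when both halves match, then
                // broadcast the result of the upper half across the whole 64-bit lane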
                __m128i tmp1 = _mm_cmpeq_epi32(self, other);
                __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1);
                __m128i tmp3 = _mm_and_si128(tmp1, tmp2);
                __m128i tmp4 = _mm_srai_epi32(tmp3, 31);
                return _mm_shuffle_epi32(tmp4, 0xF5);
            }
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return eq(batch<T, A>(self.data), batch<T, A>(other.data));
        }
        template <class A>
        inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpeq_pd(self, other);
        }
        template <class A>
        inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
        }

        // ge
        template <class A>
        inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpge_ps(self, other);
        }
        template <class A>
        inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpge_pd(self, other);
        }

        // gt
        template <class A>
        inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpgt_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                switch (sizeof(T))
                {
                case 1:
                    return _mm_cmpgt_epi8(self, other);
                case 2:
                    return _mm_cmpgt_epi16(self, other);
                case 4:
                    return _mm_cmpgt_epi32(self, other);
                default:
                    return gt(self, other, generic {});
                }
            }
            else
            {
                return gt(self, other, generic {});
            }
        }

        template <class A>
        inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpgt_pd(self, other);
        }

        // hadd
        template <class A>
        inline float hadd(batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
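            // reduce: add the upper two lanes onto the lower two, then add lane 1 onto lane 0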
            __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self));
            __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
            return _mm_cvtss_f32(tmp1);
        }
        // TODO: move this to xsimd_generic
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline T hadd_default(batch<T, A> const& self, requires_arch<sse2>) noexcept
            {
                alignas(A::alignment()) T buffer[batch<T, A>::size];
                self.store_aligned(buffer);
                T res = 0;
                for (T val : buffer)
                {
                    res += val;
                }
                return res;
            }
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T hadd(batch<T, A> const& self, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 4:
            {
                __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
                __m128i tmp2 = _mm_add_epi32(self, tmp1);
                __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
                __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
                return _mm_cvtsi128_si32(tmp4);
            }
            case 8:
            {
                __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
                __m128i tmp2 = _mm_add_epi64(self, tmp1);
#if defined(__x86_64__)
                return _mm_cvtsi128_si64(tmp2);
#else
                __m128i m;
                _mm_storel_epi64(&m, tmp2);
                int64_t i;
                std::memcpy(&i, &m, sizeof(i));
                return i;
#endif
            }
            default:
                return detail::hadd_default(self, A {});
            }
        }
        template <class A>
        inline double hadd(batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
        }

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
        {
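            // 4x4 transpose built from unpack/movelh/movehl, summing as we go:
            // lane i of the result is the horizontal sum of row[i]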
            __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
            __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
            __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
            tmp0 = _mm_add_ps(tmp0, tmp1);
            tmp1 = _mm_unpacklo_ps(row[2], row[3]);
            tmp1 = _mm_add_ps(tmp1, tmp2);
            tmp2 = _mm_movehl_ps(tmp1, tmp0);
            tmp0 = _mm_movelh_ps(tmp0, tmp1);
            return _mm_add_ps(tmp0, tmp2);
        }
        template <class A>
        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
        {
            return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]),
                              _mm_unpackhi_pd(row[0], row[1]));
        }

        // insert
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 2:
                return _mm_insert_epi16(self, val, I);
            default:
                return insert(self, val, pos, generic {});
            }
        }

        // isnan
        template <class A>
        inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_cmpunord_ps(self, self);
        }
        template <class A>
        inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_cmpunord_pd(self, self);
        }

        // load_aligned
        template <class A>
        inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
        {
            return _mm_load_ps(mem);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
        {
            return _mm_load_si128((__m128i const*)mem);
        }
        template <class A>
        inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
        {
            return _mm_load_pd(mem);
        }

        // load_unaligned
        template <class A>
        inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
        {
            return _mm_loadu_ps(mem);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
        {
            return _mm_loadu_si128((__m128i const*)mem);
        }
        template <class A>
        inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
        {
            return _mm_loadu_pd(mem);
        }

        // load_complex
        namespace detail
        {
            // Redefine these methods in the SSE-based archs if required
            template <class A>
            inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
            {
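                // deinterleave the two loaded registers: even lanes carry the real parts,
                // odd lanes the imaginary parts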
                return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) };
            }
            template <class A>
            inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
            {
                return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) };
            }
        }

        // le
        template <class A>
        inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmple_ps(self, other);
        }
        template <class A>
        inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmple_pd(self, other);
        }

        // lt
        template <class A>
        inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmplt_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                switch (sizeof(T))
                {
                case 1:
                    return _mm_cmplt_epi8(self, other);
                case 2:
                    return _mm_cmplt_epi16(self, other);
                case 4:
                    return _mm_cmplt_epi32(self, other);
                case 8:
                {
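                    // 64-bit signed compare emulated with 32-bit ops: when the operands share
                    // a sign, (self - other) cannot overflow and its sign bit is the answer;
                    // when the signs differ, self < other exactly when self is negative, which
                    // is the sign of self & ~other. The sign of the OR of both terms, taken
                    // from the upper half, is broadcast across each 64-bit lane.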
                    __m128i tmp1 = _mm_sub_epi64(self, other);
                    __m128i tmp2 = _mm_xor_si128(self, other);
                    __m128i tmp3 = _mm_andnot_si128(other, self);
                    __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
                    __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
                    __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
                    return _mm_shuffle_epi32(tmp6, 0xF5);
                }
                default:
                    assert(false && "unsupported arch/op combination");
                    return {};
                }
            }
            else
            {
                switch (sizeof(T))
                {
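                // unsigned compare via signed compare: XOR-ing both operands with the
                // smallest signed value flips the sign bit and maps unsigned order onto
                // signed order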
                case 1:
                    return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())));
                case 2:
                    return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())));
                case 4:
                    return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())));
                case 8:
                {
                    auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
                    auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
                    __m128i tmp1 = _mm_sub_epi64(xself, xother);
                    __m128i tmp2 = _mm_xor_si128(xself, xother);
                    __m128i tmp3 = _mm_andnot_si128(xother, xself);
                    __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
                    __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
                    __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
                    return _mm_shuffle_epi32(tmp6, 0xF5);
                }
                default:
                    assert(false && "unsupported arch/op combination");
                    return {};
                }
            }
        }

        template <class A>
        inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmplt_pd(self, other);
        }

        // max
        template <class A>
        inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_max_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return select(self > other, self, other);
        }
        template <class A>
        inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_max_pd(self, other);
        }

        // min
        template <class A>
        inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_min_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return select(self <= other, self, other);
        }
        template <class A>
        inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_min_pd(self, other);
        }

        // mul
        template <class A>
        inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_mul_ps(self, other);
        }
        template <class A>
        inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_mul_pd(self, other);
        }

        // nearbyint_as_int
        template <class A>
        inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
                                                  requires_arch<sse2>) noexcept
        {
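            // _mm_cvtps_epi32 rounds according to the current MXCSR rounding mode
            // (round-to-nearest-even by default), which is the behavior nearbyint requires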
            return _mm_cvtps_epi32(self);
        }

        // neg
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return 0 - self;
        }
        template <class A>
        inline batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
        }

        template <class A>
        inline batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_xor_pd(
                self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000)));
        }

        // neq
        template <class A>
        inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpneq_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return ~(self == other);
        }
        template <class A>
        inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpneq_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
        {
            return ~(self == other);
        }

        template <class A>
        inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpneq_pd(self, other);
        }
        template <class A>
        inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_cmpneq_pd(self, other);
        }

        // reciprocal
        template <class A>
        inline batch<float, A> reciprocal(batch<float, A> const& self,
                                          requires_arch<sse2>) noexcept
        {
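            // fast approximation (about 12 bits of precision), not an exact division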
            return _mm_rcp_ps(self);
        }

        // rsqrt
        template <class A>
        inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
        {
            return _mm_rsqrt_ps(val);
        }
        template <class A>
        inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
        {
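            // goes through the single-precision rsqrt approximation, so the result
            // only carries about 12 bits of precision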
            return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val)));
        }

        // select
        template <class A>
        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
        {
            return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br));
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
        {
            return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
        }
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
        {
            return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
        }
        template <class A>
        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
        {
            return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br));
        }

        // sqrt
        template <class A>
        inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
        {
            return _mm_sqrt_ps(val);
        }
        template <class A>
        inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
        {
            return _mm_sqrt_pd(val);
        }

        // sadd
        template <class A>
        inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_add_ps(self, other); // no saturated arithmetic on floating point numbers
        }
        // TODO: move this to xsimd_generic
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline batch<T, A> sadd_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
            {
                if (std::is_signed<T>::value)
                {
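                    // other's sign (mask) selects whether self is clamped from above or from
                    // below, so the final sum saturates instead of wrapping around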
                    auto mask = (other >> (8 * sizeof(T) - 1));
                    auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
                    auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
                    return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
                }
                else
                {
                    const auto diffmax = std::numeric_limits<T>::max() - self;
                    const auto mindiff = min(diffmax, other);
                    return self + mindiff;
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                switch (sizeof(T))
                {
                case 1:
                    return _mm_adds_epi8(self, other);
                case 2:
                    return _mm_adds_epi16(self, other);
                default:
                    return detail::sadd_default(self, other, A {});
                }
            }
            else
            {
                switch (sizeof(T))
                {
                case 1:
                    return _mm_adds_epu8(self, other);
                case 2:
                    return _mm_adds_epu16(self, other);
                default:
                    return detail::sadd_default(self, other, A {});
                }
            }
        }
        template <class A>
        inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_add_pd(self, other); // no saturated arithmetic on floating point numbers
        }

        // set
        template <class A, class... Values>
        inline batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
        {
            static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
            return _mm_setr_ps(values...);
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
        {
            return _mm_set_epi64x(v1, v0);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
        {
            return _mm_setr_epi32(v0, v1, v2, v3);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
        {
            return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
        {
            return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
        }

        template <class A, class... Values>
        inline batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
        {
            static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
            return _mm_setr_pd(values...);
        }

        template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
        {
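            // expand each boolean into an all-ones / all-zeros lane of the matching width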
            return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
        }

        template <class A, class... Values>
        inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
        {
            static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
            return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
        }

        template <class A, class... Values>
        inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
        {
            static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
            return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
        }

        // ssub
        template <class A>
        inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_sub_ps(self, other); // no saturated arithmetic on floating point numbers
        }
        // TODO: move this to xsimd_generic
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline batch<T, A> ssub_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
            {
                if (std::is_signed<T>::value)
                {
                    return sadd(self, -other);
                }
                else
                {
                    const auto diff = min(self, other);
                    return self - diff;
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                switch (sizeof(T))
                {
                case 1:
                    return _mm_subs_epi8(self, other);
                case 2:
                    return _mm_subs_epi16(self, other);
                default:
                    return detail::ssub_default(self, other, A {});
                }
            }
            else
            {
                switch (sizeof(T))
                {
                case 1:
                    return _mm_subs_epu8(self, other);
                case 2:
                    return _mm_subs_epu16(self, other);
                default:
                    return detail::ssub_default(self, other, A {});
                }
            }
        }

        template <class A>
        inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_sub_pd(self, other); // no saturated arithmetic on floating point numbers
        }

        // store_aligned
        template <class A>
        inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_store_ps(mem, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_store_si128((__m128i*)mem, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_store_si128((__m128i*)mem, self);
        }
        template <class A>
        inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_store_pd(mem, self);
        }

        // store_unaligned
        template <class A>
        inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_ps(mem, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_si128((__m128i*)mem, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_si128((__m128i*)mem, self);
        }
        template <class A>
        inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_pd(mem, self);
        }

        // sub
        template <class A>
        inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_sub_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
                return _mm_sub_epi8(self, other);
            case 2:
                return _mm_sub_epi16(self, other);
            case 4:
                return _mm_sub_epi32(self, other);
            case 8:
                return _mm_sub_epi64(self, other);
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_sub_pd(self, other);
        }

        // swizzle

        namespace detail
        {
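            // pack per-lane source indices (lowest destination lane first) into the
            // immediate expected by _mm_shuffle_ps / _mm_shuffle_epi32 and _mm_shuffle_pd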
            constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
            {
                return (z << 6) | (y << 4) | (x << 2) | w;
            }
            constexpr uint32_t shuffle(uint32_t x, uint32_t y)
            {
                return (y << 1) | x;
            }
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
            return _mm_shuffle_ps(self, self, index);
        }

        template <class A, uint64_t V0, uint64_t V1>
        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(V0, V1);
            return _mm_shuffle_pd(self, self, index);
        }

        template <class A, uint64_t V0, uint64_t V1>
        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
        {
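            // expand each 64-bit lane index into the pair of 32-bit lane indices it covers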
            constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
            return _mm_shuffle_epi32(self, index);
        }

        template <class A, uint64_t V0, uint64_t V1>
        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
        {
            return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, sse2 {}));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
            return _mm_shuffle_epi32(self, index);
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
        {
            return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, sse2 {}));
        }

        // zip_hi
        template <class A>
        inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpackhi_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
                return _mm_unpackhi_epi8(self, other);
            case 2:
                return _mm_unpackhi_epi16(self, other);
            case 4:
                return _mm_unpackhi_epi32(self, other);
            case 8:
                return _mm_unpackhi_epi64(self, other);
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpackhi_pd(self, other);
        }

        // zip_lo
        template <class A>
        inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpacklo_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            switch (sizeof(T))
            {
            case 1:
                return _mm_unpacklo_epi8(self, other);
            case 2:
                return _mm_unpacklo_epi16(self, other);
            case 4:
                return _mm_unpacklo_epi32(self, other);
            case 8:
                return _mm_unpacklo_epi64(self, other);
            default:
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpacklo_pd(self, other);
        }
    }

}

#endif