/***
*   arm64_neon.h - declarations/definitions for ARM64 NEON specific intrinsics
*
*       Copyright (c) Microsoft Corporation. All rights reserved.
*
*Purpose:
*       This include file contains the declarations for ARM64 NEON intrinsic functions
*
****/

#pragma once

#include <stdint.h>
#include <sal.h>

/* Guard: stop the build with an error unless at least one of the ARM64 target
   macros (_M_ARM64, _M_HYBRID_X86_ARM64, _M_ARM64EC) is defined. */
#if !defined (_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC)
#error This header is specific to ARM64 targets
#endif  /* !defined (_M_ARM64) && !defined(_M_HYBRID_X86_ARM64) && !defined(_M_ARM64EC) */

// Save the current warning state and disable the warnings named by
// _VCRUNTIME_DISABLED_WARNINGS.
// NOTE(review): that macro is not defined in this file - presumably it comes
// from the VC runtime headers; the matching pop is not in this part of the file.
#pragma warning(push)
#pragma warning(disable: _VCRUNTIME_DISABLED_WARNINGS)

// When compiled as C++, give the declarations that follow C linkage.
#if defined (__cplusplus)
extern "C" {
#endif  /* defined (__cplusplus) */

// _ARM64_EXTENDED_INTRINSICS is defined by default. It is left undefined when
// the includer defines _ARM64_NO_EXTENDED_INTRINSICS or
// _ARM64_DISTINCT_NEON_TYPES before including this header.
#if !defined(_ARM64_NO_EXTENDED_INTRINSICS) && !defined(_ARM64_DISTINCT_NEON_TYPES)
#  define _ARM64_EXTENDED_INTRINSICS
#endif  /* !_ARM64_NO_EXTENDED_INTRINSICS && !_ARM64_DISTINCT_NEON_TYPES */

///////////////////////////////////////////////////////////////////////////////
//
// Helper macros used by the NEON type definitions.
//
// _ADVSIMD_ALIGN(x) expands to __declspec(align(x)); under MIDL (__midl
// defined) it expands to nothing. If the includer has already defined
// _ADVSIMD_ALIGN, that definition is kept and this one is skipped.
//
#if !defined (_ADVSIMD_ALIGN)
#if defined (__midl)
#define _ADVSIMD_ALIGN(x)
#else  /* defined (__midl) */
#define _ADVSIMD_ALIGN(x) __declspec(align(x))
#endif  /* defined (__midl) */
#endif  /* !defined (_ADVSIMD_ALIGN) */

// DUMMYNEONSTRUCT is the member name given to the { low64, high64 } struct
// inside __n128. It defaults to `s` unless the includer defines it first.
#ifndef DUMMYNEONSTRUCT
#define DUMMYNEONSTRUCT s
#endif  /* DUMMYNEONSTRUCT */


///////////////////////////////////////////////////////////////////////////////
//
// Scalar element types.
//
// The polyN_t types are plain unsigned integers of the matching width;
// float32_t and float64_t are aliases for float and double.
//
typedef unsigned __int8  poly8_t;
typedef unsigned __int16 poly16_t;
typedef unsigned __int32 poly32_t;
typedef unsigned __int64 poly64_t;
typedef float float32_t;
typedef double float64_t;

///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 32bit type
//
// Union of overlapping views of the same 4 bytes, declared with 4-byte
// alignment: one 32-bit lane, two 16-bit lanes or four 8-bit lanes, in
// unsigned (u), signed (i), polynomial (p) and float (f) flavours.
// n32_u32 is the first member, so it is the one a brace initializer targets.
//
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(4) __n32
{
    unsigned __int32    n32_u32[1];
    unsigned __int16    n32_u16[2];
    unsigned __int8     n32_u8[4];
    __int32             n32_i32[1];
    __int16             n32_i16[2];
    __int8              n32_i8[4];
    poly32_t            n32_p32[1];
    poly16_t            n32_p16[2];
    poly8_t             n32_p8[4];
    float               n32_f32[1];
} __n32;


///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 16bit type
//
// Union of overlapping views of the same 2 bytes, declared with 2-byte
// alignment: one 16-bit lane or two 8-bit lanes, in unsigned (u), signed (i)
// and polynomial (p) flavours. There is no floating-point view.
//
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(2) __n16
{
    unsigned __int16    n16_u16[1];
    unsigned __int8     n16_u8[2];
    __int16             n16_i16[1];
    __int8              n16_i8[2];
    poly16_t            n16_p16[1];
    poly8_t             n16_p8[2];
} __n16;


///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 8bit type
//
// Union of three views of a single byte: unsigned (u), signed (i) and
// polynomial (p). Unlike the wider types, no _ADVSIMD_ALIGN is applied.
//
typedef union __declspec(intrin_type) __n8
{
    unsigned __int8     n8_u8[1];
    __int8              n8_i8[1];
    poly8_t             n8_p8[1];
} __n8;


///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 64bit type
//
// Union of overlapping views of the same 8 bytes, declared with 8-byte
// alignment: 1 x 64-bit, 2 x 32-bit, 4 x 16-bit or 8 x 8-bit lanes in
// unsigned (u), signed (i) and polynomial (p) flavours, plus 2 x float and
// 1 x double. n64_u64 is the first member, so it is the one a brace
// initializer targets.
//
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __n64
{
    unsigned __int64    n64_u64[1];
    unsigned __int32    n64_u32[2];
    unsigned __int16    n64_u16[4];
    unsigned __int8     n64_u8[8];
    __int64             n64_i64[1];
    __int32             n64_i32[2];
    __int16             n64_i16[4];
    __int8              n64_i8[8];
    poly64_t            n64_p64[1];
    poly32_t            n64_p32[2];
    poly16_t            n64_p16[4];
    poly8_t             n64_p8[8];
    float               n64_f32[2];
    double              n64_f64[1];
} __n64;


///////////////////////////////////////////////////////////////////////////////
//
// ARM64 Advanced SIMD 128bit type
//
// Union of overlapping views of the same 16 bytes, declared with 16-byte
// alignment: 2 x 64-bit, 4 x 32-bit, 8 x 16-bit or 16 x 8-bit lanes in
// unsigned (u), signed (i) and polynomial (p) flavours, plus 4 x float and
// 2 x double. The trailing struct exposes the value as two __n64 halves
// (low64 first, then high64); its member name is DUMMYNEONSTRUCT.
//
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __n128
{
     unsigned __int64   n128_u64[2];
     unsigned __int32   n128_u32[4];
     unsigned __int16   n128_u16[8];
     unsigned __int8    n128_u8[16];
     __int64            n128_i64[2];
     __int32            n128_i32[4];
     __int16            n128_i16[8];
     __int8             n128_i8[16];
     poly64_t           n128_p64[2];
     poly32_t           n128_p32[4];
     poly16_t           n128_p16[8];
     poly8_t            n128_p8[16];
     float              n128_f32[4];
    double              n128_f64[2];

    // The same 16 bytes viewed as two consecutive 64-bit vectors.
    struct
    {
        __n64  low64;
        __n64  high64;
    } DUMMYNEONSTRUCT;

} __n128;

// Fixed-length aggregates of NEON values. Each struct has a single member,
// `val`, holding 2, 3 or 4 vectors of the named base type.

// Two 32-bit vectors.
typedef struct __n32x2  { __n32  val[2]; } __n32x2;

// Two, three and four 64-bit vectors.
typedef struct __n64x2  { __n64  val[2]; } __n64x2;
typedef struct __n64x3  { __n64  val[3]; } __n64x3;
typedef struct __n64x4  { __n64  val[4]; } __n64x4;

// Two, three and four 128-bit vectors.
typedef struct __n128x2 { __n128 val[2]; } __n128x2;
typedef struct __n128x3 { __n128 val[3]; } __n128x3;
typedef struct __n128x4 { __n128 val[4]; } __n128x4;

///////////////////////////////////////////////////////////////////////////////
//
// Pointer retyping helpers (mutable).
//
// Each function returns its argument unchanged, cast to a pointer to __n64
// (or to __n32 for __float32ToN32). Nothing is read or written through the
// pointer here.

__inline _Post_equal_to_(p)
__n64 *__int8ToN64(_In_ int8_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__int16ToN64(_In_ int16_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__int32ToN64(_In_ int32_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__int64ToN64(_In_ int64_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__uint8ToN64(_In_ uint8_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__uint16ToN64(_In_ uint16_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__uint32ToN64(_In_ uint32_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__uint64ToN64(_In_ uint64_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__poly8ToN64(_In_ poly8_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__poly16ToN64(_In_ poly16_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n64 *__float32ToN64(_In_ float32_t *p)
{
    return (__n64 *)p;
}

__inline _Post_equal_to_(p)
__n32 *__float32ToN32(_In_ float32_t *p)
{
    return (__n32 *)p;
}

// Pointer retyping helpers (const).
//
// Each function returns its argument unchanged, cast to a pointer to const
// __n64 (or const __n32 for __float32ToN32_c). Nothing is read through the
// pointer here.

__inline _Post_equal_to_(p)
const __n64 *__int8ToN64_c(_In_ const int8_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__int16ToN64_c(_In_ const int16_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__int32ToN64_c(_In_ const int32_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__int64ToN64_c(_In_ const int64_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__uint8ToN64_c(_In_ const uint8_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__uint16ToN64_c(_In_ const uint16_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__uint32ToN64_c(_In_ const uint32_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__uint64ToN64_c(_In_ const uint64_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__poly8ToN64_c(_In_ const poly8_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__poly16ToN64_c(_In_ const poly16_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n64 *__float32ToN64_c(_In_ const float32_t *p)
{
    return (const __n64 *)p;
}

__inline _Post_equal_to_(p)
const __n32 *__float32ToN32_c(_In_ const float32_t *p)
{
    return (const __n32 *)p;
}

// Wraps a signed 8-bit scalar in an __n8 by storing it in lane 0 of the
// n8_i8 view, and returns the result by value.
__inline __n8 __int8ToN8_v(__int8 i)
{
    __n8 wrapped;

    wrapped.n8_i8[0] = i;
    return wrapped;
}

// Wraps an unsigned 8-bit scalar in an __n8 by storing it in lane 0 of the
// n8_u8 view, and returns the result by value.
__inline __n8 __uint8ToN8_v(unsigned __int8 i)
{
    __n8 wrapped;

    wrapped.n8_u8[0] = i;
    return wrapped;
}

// Wraps a signed 16-bit scalar in an __n16 by storing it in lane 0 of the
// n16_i16 view, and returns the result by value.
__inline __n16 __int16ToN16_v(__int16 i)
{
    __n16 wrapped;

    wrapped.n16_i16[0] = i;
    return wrapped;
}

// Wraps an unsigned 16-bit scalar in an __n16 by storing it in lane 0 of the
// n16_u16 view, and returns the result by value.
__inline __n16 __uint16ToN16_v(unsigned __int16 i)
{
    __n16 wrapped;

    wrapped.n16_u16[0] = i;
    return wrapped;
}

// Wraps a signed 32-bit scalar in an __n32 by storing it in lane 0 of the
// n32_i32 view, and returns the result by value.
__inline __n32 __int32ToN32_v(__int32 i)
{
    __n32 wrapped;

    wrapped.n32_i32[0] = i;
    return wrapped;
}

// Wraps an unsigned 32-bit scalar in an __n32 by storing it in lane 0 of the
// n32_u32 view, and returns the result by value.
__inline __n32 __uint32ToN32_v(unsigned __int32 i)
{
    __n32 wrapped;

    wrapped.n32_u32[0] = i;
    return wrapped;
}

// Wraps a signed 64-bit scalar in an __n64 by storing it in lane 0 of the
// n64_i64 view, and returns the result by value.
__inline __n64 __int64ToN64_v(__int64 i)
{
    __n64 wrapped;

    wrapped.n64_i64[0] = i;
    return wrapped;
}

// Wraps an unsigned 64-bit scalar in an __n64 by storing it in lane 0 of the
// n64_u64 view, and returns the result by value.
__inline __n64 __uint64ToN64_v(unsigned __int64 i)
{
    __n64 wrapped;

    wrapped.n64_u64[0] = i;
    return wrapped;
}

__inline int32_t __int8ToInt32(int8_t i)      { return (int32_t)i; }
__inline int32_t __int16ToInt32(int16_t i)    { return (int32_t)i; }
__inline int32_t __int32ToInt32(int32_t i)    { return (int32_t)i; }
__inline int64_t __int64ToInt64(int64_t i)    { return (int64_t)i; }

__inline int32_t __uint8ToInt32(uint8_t i)    { return (int32_t)i; }
__inline int32_t __uint16ToInt32(uint16_t i)  { return (int32_t)i; }
__inline int32_t __uint32ToInt32(uint32_t i)  { return (int32_t)i; }
__inline int64_t __uint64ToInt64(uint64_t i)  { return (int64_t)i; }

// Polynomial sources: the 8- or 16-bit polynomial value is cast to int32_t.
__inline int32_t __poly8ToInt32(poly8_t i)
{
    return (int32_t)i;
}

__inline int32_t __poly16ToInt32(poly16_t i)
{
    return (int32_t)i;
}

// Copy primitives between integer and floating-point types of equal width.
// Only prototypes are given here; no body appears in this part of the file.
// NOTE(review): the names suggest a bit-for-bit copy implemented by the
// compiler - confirm against the compiler documentation.
double _CopyDoubleFromInt64(__int64);
float _CopyFloatFromInt32(__int32);
__int32 _CopyInt32FromFloat(float);
__int64 _CopyInt64FromDouble(double);

// Unsigned variants: each casts between the unsigned and signed integer type
// of the same width and forwards to the signed primitive declared above.
__inline float _CopyFloatFromUInt32(unsigned __int32 i)
{
    const __int32 as_signed = (__int32)i;
    return _CopyFloatFromInt32(as_signed);
}

__inline unsigned __int32 _CopyUInt32FromFloat(float f)
{
    const __int32 as_signed = _CopyInt32FromFloat(f);
    return (unsigned __int32)as_signed;
}

__inline double _CopyDoubleFromUInt64(unsigned __int64 i)
{
    const __int64 as_signed = (__int64)i;
    return _CopyDoubleFromInt64(as_signed);
}

__inline unsigned __int64 _CopyUInt64FromDouble(double f)
{
    const __int64 as_signed = _CopyInt64FromDouble(f);
    return (unsigned __int64)as_signed;
}

///////////////////////////////////////////////////////////////////////////////
// explicit types

#if !defined(_ARM64_DISTINCT_NEON_TYPES)

// Non-distinct mode: every element-typed vector name is a plain alias of the
// generic container of the same width (__n32, __n64 or __n128), and every
// xN array name is an alias of the matching __nWxN struct. Vector names that
// share a width are therefore the same type and are interchangeable.

// 32-bit and 64-bit vectors.
typedef __n32    float32x1_t;
typedef __n32x2  float32x1x2_t;
typedef __n64    float32x2_t;
typedef __n64x2  float32x2x2_t;
typedef __n64x3  float32x2x3_t;
typedef __n64x4  float32x2x4_t;
typedef __n64    float64x1_t;
typedef __n64x2  float64x1x2_t;
typedef __n64x3  float64x1x3_t;
typedef __n64x4  float64x1x4_t;
typedef __n64    int8x8_t;
typedef __n64x2  int8x8x2_t;
typedef __n64x3  int8x8x3_t;
typedef __n64x4  int8x8x4_t;
typedef __n64    int16x4_t;
typedef __n64x2  int16x4x2_t;
typedef __n64x3  int16x4x3_t;
typedef __n64x4  int16x4x4_t;
typedef __n64    int32x2_t;
typedef __n64x2  int32x2x2_t;
typedef __n64x3  int32x2x3_t;
typedef __n64x4  int32x2x4_t;
typedef __n64    int64x1_t;
typedef __n64x2  int64x1x2_t;
typedef __n64x3  int64x1x3_t;
typedef __n64x4  int64x1x4_t;
typedef __n64    poly8x8_t;
typedef __n64x2  poly8x8x2_t;
typedef __n64x3  poly8x8x3_t;
typedef __n64x4  poly8x8x4_t;
typedef __n64    poly16x4_t;
typedef __n64x2  poly16x4x2_t;
typedef __n64x3  poly16x4x3_t;
typedef __n64x4  poly16x4x4_t;
typedef __n64    poly64x1_t;
typedef __n64x2  poly64x1x2_t;
typedef __n64x3  poly64x1x3_t;
typedef __n64x4  poly64x1x4_t;
typedef __n64    uint8x8_t;
typedef __n64x2  uint8x8x2_t;
typedef __n64x3  uint8x8x3_t;
typedef __n64x4  uint8x8x4_t;
typedef __n64    uint16x4_t;
typedef __n64x2  uint16x4x2_t;
typedef __n64x3  uint16x4x3_t;
typedef __n64x4  uint16x4x4_t;
typedef __n64    uint32x2_t;
typedef __n64x2  uint32x2x2_t;
typedef __n64x3  uint32x2x3_t;
typedef __n64x4  uint32x2x4_t;
typedef __n64    uint64x1_t;
typedef __n64x2  uint64x1x2_t;
typedef __n64x3  uint64x1x3_t;
typedef __n64x4  uint64x1x4_t;
// 128-bit vectors.
typedef __n128   float32x4_t;
typedef __n128x2 float32x4x2_t;
typedef __n128x3 float32x4x3_t;
typedef __n128x4 float32x4x4_t;
typedef __n128   float64x2_t;
typedef __n128x2 float64x2x2_t;
typedef __n128x3 float64x2x3_t;
typedef __n128x4 float64x2x4_t;
typedef __n128   int8x16_t;
typedef __n128x2 int8x16x2_t;
typedef __n128x3 int8x16x3_t;
typedef __n128x4 int8x16x4_t;
typedef __n128   int16x8_t;
typedef __n128x2 int16x8x2_t;
typedef __n128x3 int16x8x3_t;
typedef __n128x4 int16x8x4_t;
typedef __n128   int32x4_t;
typedef __n128x2 int32x4x2_t;
typedef __n128x3 int32x4x3_t;
typedef __n128x4 int32x4x4_t;
typedef __n128   int64x2_t;
typedef __n128x2 int64x2x2_t;
typedef __n128x3 int64x2x3_t;
typedef __n128x4 int64x2x4_t;
typedef __n128   poly8x16_t;
typedef __n128x2 poly8x16x2_t;
typedef __n128x3 poly8x16x3_t;
typedef __n128x4 poly8x16x4_t;
typedef __n128   poly16x8_t;
typedef __n128x2 poly16x8x2_t;
typedef __n128x3 poly16x8x3_t;
typedef __n128x4 poly16x8x4_t;
typedef __n128   poly64x2_t;
typedef __n128x2 poly64x2x2_t;
typedef __n128x3 poly64x2x3_t;
typedef __n128x4 poly64x2x4_t;
typedef __n128   uint8x16_t;
typedef __n128x2 uint8x16x2_t;
typedef __n128x3 uint8x16x3_t;
typedef __n128x4 uint8x16x4_t;
typedef __n128   uint16x8_t;
typedef __n128x2 uint16x8x2_t;
typedef __n128x3 uint16x8x3_t;
typedef __n128x4 uint16x8x4_t;
typedef __n128   uint32x4_t;
typedef __n128x2 uint32x4x2_t;
typedef __n128x3 uint32x4x3_t;
typedef __n128x4 uint32x4x4_t;
typedef __n128   uint64x2_t;
typedef __n128x2 uint64x2x2_t;
typedef __n128x3 uint64x2x3_t;
typedef __n128x4 uint64x2x4_t;
// Half-precision vector names. The generic containers declare no 16-bit
// floating-point lane view, so these are aliases only.
typedef __n64 float16x4_t;
typedef __n64x2 float16x4x2_t;
typedef __n64x3 float16x4x3_t;
typedef __n64x4 float16x4x4_t;
typedef __n128 float16x8_t;
typedef __n128x2 float16x8x2_t;
typedef __n128x3 float16x8x3_t;
typedef __n128x4 float16x8x4_t;

////////////////////////////////////////////////////////////////////////////////
// neon intrin_type cast macros.
//
// In this (non-distinct) configuration the element-typed vector names are
// typedefs of __n64 / __n128 and their xN structs, so every conversion macro
// below is the identity: it expands to its argument in parentheses.
// Naming: __<from>_to_<to>(x).
#define __float32x2_t_to_n64(x)             (x)
#define __float32x2x2_t_to_n64x2(x)         (x)
#define __float32x2x3_t_to_n64x3(x)         (x)
#define __float32x2x4_t_to_n64x4(x)         (x)
#define __float32x4_t_to_n128(x)            (x)
#define __float32x4x2_t_to_n128x2(x)        (x)
#define __float32x4x3_t_to_n128x3(x)        (x)
#define __float32x4x4_t_to_n128x4(x)        (x)
#define __float64x1_t_to_n64(x)             (x)
#define __float64x1x2_t_to_n64x2(x)         (x)
#define __float64x1x3_t_to_n64x3(x)         (x)
#define __float64x1x4_t_to_n64x4(x)         (x)
#define __float64x2_t_to_n128(x)            (x)
#define __float64x2x2_t_to_n128x2(x)        (x)
#define __float64x2x3_t_to_n128x3(x)        (x)
#define __float64x2x4_t_to_n128x4(x)        (x)
#define __int16x4_t_to_n64(x)               (x)
#define __int16x4x2_t_to_n64x2(x)           (x)
#define __int16x4x3_t_to_n64x3(x)           (x)
#define __int16x4x4_t_to_n64x4(x)           (x)
#define __int16x8_t_to_n128(x)              (x)
#define __int16x8x2_t_to_n128x2(x)          (x)
#define __int16x8x3_t_to_n128x3(x)          (x)
#define __int16x8x4_t_to_n128x4(x)          (x)
#define __int32x2_t_to_n64(x)               (x)
#define __int32x2x2_t_to_n64x2(x)           (x)
#define __int32x2x3_t_to_n64x3(x)           (x)
#define __int32x2x4_t_to_n64x4(x)           (x)
#define __int32x4_t_to_n128(x)              (x)
#define __int32x4x2_t_to_n128x2(x)          (x)
#define __int32x4x3_t_to_n128x3(x)          (x)
#define __int32x4x4_t_to_n128x4(x)          (x)
#define __int64x1_t_to_n64(x)               (x)
#define __int64x1x2_t_to_n64x2(x)           (x)
#define __int64x1x3_t_to_n64x3(x)           (x)
#define __int64x1x4_t_to_n64x4(x)           (x)
#define __int64x2_t_to_n128(x)              (x)
#define __int64x2x2_t_to_n128x2(x)          (x)
#define __int64x2x3_t_to_n128x3(x)          (x)
#define __int64x2x4_t_to_n128x4(x)          (x)
#define __int8x16_t_to_n128(x)              (x)
#define __int8x16x2_t_to_n128x2(x)          (x)
#define __int8x16x3_t_to_n128x3(x)          (x)
#define __int8x16x4_t_to_n128x4(x)          (x)
#define __int8x8_t_to_n64(x)                (x)
#define __int8x8x2_t_to_n64x2(x)            (x)
#define __int8x8x3_t_to_n64x3(x)            (x)
#define __int8x8x4_t_to_n64x4(x)            (x)
#define __n128_to_float32x4_t(x)            (x)
#define __n128_to_float64x2_t(x)            (x)
#define __n128_to_int16x8_t(x)              (x)
#define __n128_to_int32x4_t(x)              (x)
#define __n128_to_int64x2_t(x)              (x)
#define __n128_to_int8x16_t(x)              (x)
#define __n128_to_poly16x8_t(x)             (x)
#define __n128_to_poly64x2_t(x)             (x)
#define __n128_to_poly8x16_t(x)             (x)
#define __n128_to_uint16x8_t(x)             (x)
#define __n128_to_uint32x4_t(x)             (x)
#define __n128_to_uint64x2_t(x)             (x)
#define __n128_to_uint8x16_t(x)             (x)
#define __n128x2_to_float32x4x2_t(x)        (x)
#define __n128x2_to_float64x2x2_t(x)        (x)
#define __n128x2_to_int16x8x2_t(x)          (x)
#define __n128x2_to_int32x4x2_t(x)          (x)
#define __n128x2_to_int64x2x2_t(x)          (x)
#define __n128x2_to_int8x16x2_t(x)          (x)
#define __n128x2_to_poly16x8x2_t(x)         (x)
#define __n128x2_to_poly64x2x2_t(x)         (x)
#define __n128x2_to_poly8x16x2_t(x)         (x)
#define __n128x2_to_uint16x8x2_t(x)         (x)
#define __n128x2_to_uint32x4x2_t(x)         (x)
#define __n128x2_to_uint64x2x2_t(x)         (x)
#define __n128x2_to_uint8x16x2_t(x)         (x)
#define __n128x3_to_float32x4x3_t(x)        (x)
#define __n128x3_to_float64x2x3_t(x)        (x)
#define __n128x3_to_int16x8x3_t(x)          (x)
#define __n128x3_to_int32x4x3_t(x)          (x)
#define __n128x3_to_int64x2x3_t(x)          (x)
#define __n128x3_to_int8x16x3_t(x)          (x)
#define __n128x3_to_poly16x8x3_t(x)         (x)
#define __n128x3_to_poly64x2x3_t(x)         (x)
#define __n128x3_to_poly8x16x3_t(x)         (x)
#define __n128x3_to_uint16x8x3_t(x)         (x)
#define __n128x3_to_uint32x4x3_t(x)         (x)
#define __n128x3_to_uint64x2x3_t(x)         (x)
#define __n128x3_to_uint8x16x3_t(x)         (x)
#define __n128x4_to_float32x4x4_t(x)        (x)
#define __n128x4_to_float64x2x4_t(x)        (x)
#define __n128x4_to_int16x8x4_t(x)          (x)
#define __n128x4_to_int32x4x4_t(x)          (x)
#define __n128x4_to_int64x2x4_t(x)          (x)
#define __n128x4_to_int8x16x4_t(x)          (x)
#define __n128x4_to_poly16x8x4_t(x)         (x)
#define __n128x4_to_poly64x2x4_t(x)         (x)
#define __n128x4_to_poly8x16x4_t(x)         (x)
#define __n128x4_to_uint16x8x4_t(x)         (x)
#define __n128x4_to_uint32x4x4_t(x)         (x)
#define __n128x4_to_uint64x2x4_t(x)         (x)
#define __n128x4_to_uint8x16x4_t(x)         (x)
#define __n64_to_float32x2_t(x)             (x)
#define __n64_to_float64x1_t(x)             (x)
#define __n64_to_int16x4_t(x)               (x)
#define __n64_to_int32x2_t(x)               (x)
#define __n64_to_int64x1_t(x)               (x)
#define __n64_to_int8x8_t(x)                (x)
#define __n64_to_poly16x4_t(x)              (x)
#define __n64_to_poly64x1_t(x)              (x)
#define __n64_to_poly8x8_t(x)               (x)
#define __n64_to_uint16x4_t(x)              (x)
#define __n64_to_uint32x2_t(x)              (x)
#define __n64_to_uint64x1_t(x)              (x)
#define __n64_to_uint8x8_t(x)               (x)
#define __n64x2_to_float32x2x2_t(x)         (x)
#define __n64x2_to_float64x1x2_t(x)         (x)
#define __n64x2_to_int16x4x2_t(x)           (x)
#define __n64x2_to_int32x2x2_t(x)           (x)
#define __n64x2_to_int64x1x2_t(x)           (x)
#define __n64x2_to_int8x8x2_t(x)            (x)
#define __n64x2_to_poly16x4x2_t(x)          (x)
#define __n64x2_to_poly64x1x2_t(x)          (x)
#define __n64x2_to_poly8x8x2_t(x)           (x)
#define __n64x2_to_uint16x4x2_t(x)          (x)
#define __n64x2_to_uint32x2x2_t(x)          (x)
#define __n64x2_to_uint64x1x2_t(x)          (x)
#define __n64x2_to_uint8x8x2_t(x)           (x)
#define __n64x3_to_float32x2x3_t(x)         (x)
#define __n64x3_to_float64x1x3_t(x)         (x)
#define __n64x3_to_int16x4x3_t(x)           (x)
#define __n64x3_to_int32x2x3_t(x)           (x)
#define __n64x3_to_int64x1x3_t(x)           (x)
#define __n64x3_to_int8x8x3_t(x)            (x)
#define __n64x3_to_poly16x4x3_t(x)          (x)
#define __n64x3_to_poly64x1x3_t(x)          (x)
#define __n64x3_to_poly8x8x3_t(x)           (x)
#define __n64x3_to_uint16x4x3_t(x)          (x)
#define __n64x3_to_uint32x2x3_t(x)          (x)
#define __n64x3_to_uint64x1x3_t(x)          (x)
#define __n64x3_to_uint8x8x3_t(x)           (x)
#define __n64x4_to_float32x2x4_t(x)         (x)
#define __n64x4_to_float64x1x4_t(x)         (x)
#define __n64x4_to_int16x4x4_t(x)           (x)
#define __n64x4_to_int32x2x4_t(x)           (x)
#define __n64x4_to_int64x1x4_t(x)           (x)
#define __n64x4_to_int8x8x4_t(x)            (x)
#define __n64x4_to_poly16x4x4_t(x)          (x)
#define __n64x4_to_poly64x1x4_t(x)          (x)
#define __n64x4_to_poly8x8x4_t(x)           (x)
#define __n64x4_to_uint16x4x4_t(x)          (x)
#define __n64x4_to_uint32x2x4_t(x)          (x)
#define __n64x4_to_uint64x1x4_t(x)          (x)
#define __n64x4_to_uint8x8x4_t(x)           (x)
#define __poly16x4_t_to_n64(x)              (x)
#define __poly16x4x2_t_to_n64x2(x)          (x)
#define __poly16x4x3_t_to_n64x3(x)          (x)
#define __poly16x4x4_t_to_n64x4(x)          (x)
#define __poly16x8_t_to_n128(x)             (x)
#define __poly16x8x2_t_to_n128x2(x)         (x)
#define __poly16x8x3_t_to_n128x3(x)         (x)
#define __poly16x8x4_t_to_n128x4(x)         (x)
#define __poly64x1_t_to_n64(x)              (x)
#define __poly64x1x2_t_to_n64x2(x)          (x)
#define __poly64x1x3_t_to_n64x3(x)          (x)
#define __poly64x1x4_t_to_n64x4(x)          (x)
#define __poly64x2_t_to_n128(x)             (x)
#define __poly64x2x2_t_to_n128x2(x)         (x)
#define __poly64x2x3_t_to_n128x3(x)         (x)
#define __poly64x2x4_t_to_n128x4(x)         (x)
#define __poly8x16_t_to_n128(x)             (x)
#define __poly8x16x2_t_to_n128x2(x)         (x)
#define __poly8x16x3_t_to_n128x3(x)         (x)
#define __poly8x16x4_t_to_n128x4(x)         (x)
#define __poly8x8_t_to_n64(x)               (x)
#define __poly8x8x2_t_to_n64x2(x)           (x)
#define __poly8x8x3_t_to_n64x3(x)           (x)
#define __poly8x8x4_t_to_n64x4(x)           (x)
#define __uint16x4_t_to_n64(x)              (x)
#define __uint16x4x2_t_to_n64x2(x)          (x)
#define __uint16x4x3_t_to_n64x3(x)          (x)
#define __uint16x4x4_t_to_n64x4(x)          (x)
#define __uint16x8_t_to_n128(x)             (x)
#define __uint16x8x2_t_to_n128x2(x)         (x)
#define __uint16x8x3_t_to_n128x3(x)         (x)
#define __uint16x8x4_t_to_n128x4(x)         (x)
#define __uint32x2_t_to_n64(x)              (x)
#define __uint32x2x2_t_to_n64x2(x)          (x)
#define __uint32x2x3_t_to_n64x3(x)          (x)
#define __uint32x2x4_t_to_n64x4(x)          (x)
#define __uint32x4_t_to_n128(x)             (x)
#define __uint32x4x2_t_to_n128x2(x)         (x)
#define __uint32x4x3_t_to_n128x3(x)         (x)
#define __uint32x4x4_t_to_n128x4(x)         (x)
#define __uint64x1_t_to_n64(x)              (x)
#define __uint64x1x2_t_to_n64x2(x)          (x)
#define __uint64x1x3_t_to_n64x3(x)          (x)
#define __uint64x1x4_t_to_n64x4(x)          (x)
#define __uint64x2_t_to_n128(x)             (x)
#define __uint64x2x2_t_to_n128x2(x)         (x)
#define __uint64x2x3_t_to_n128x3(x)         (x)
#define __uint64x2x4_t_to_n128x4(x)         (x)
#define __uint8x16_t_to_n128(x)             (x)
#define __uint8x16x2_t_to_n128x2(x)         (x)
#define __uint8x16x3_t_to_n128x3(x)         (x)
#define __uint8x16x4_t_to_n128x4(x)         (x)
#define __uint8x8_t_to_n64(x)               (x)
#define __uint8x8x2_t_to_n64x2(x)           (x)
#define __uint8x8x3_t_to_n64x3(x)           (x)
#define __uint8x8x4_t_to_n64x4(x)           (x)
#define __n128_to_float16x8_t(x)            (x)
#define __n128x2_to_float16x8x2_t(x)        (x)
#define __n128x3_to_float16x8x3_t(x)        (x)
#define __n128x4_to_float16x8x4_t(x)        (x)
#define __float16x8_t_to_n128(x)            (x)
#define __float16x8x2_t_to_n128x2(x)        (x)
#define __float16x8x3_t_to_n128x3(x)        (x)
#define __float16x8x4_t_to_n128x4(x)        (x)
#define __n64_to_float16x4_t(x)             (x)
#define __n64x2_to_float16x4x2_t(x)         (x)
#define __n64x3_to_float16x4x3_t(x)         (x)
#define __n64x4_to_float16x4x4_t(x)         (x)
#define __float16x4_t_to_n64(x)             (x)
#define __float16x4x2_t_to_n64x2(x)         (x)
#define __float16x4x3_t_to_n64x3(x)         (x)
#define __float16x4x4_t_to_n64x4(x)         (x)
#else

////////////////////////////////////////////////////////////////////////////////
// 32-bits neon short vector extended types
//
// Distinct-types mode (_ARM64_DISTINCT_NEON_TYPES defined): each element-typed
// vector is its own union rather than an alias of a generic container.
//
// float32x1_t: one float lane, 4-byte aligned. In C++ operator[] returns a
// reference to lane `idx`; the index is not range-checked.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(4) __Float32x1_t
{
    float n32_f32[1];

#ifdef __cplusplus
    __forceinline       float& operator[](size_t idx) noexcept       { return n32_f32[idx]; }
    __forceinline const float& operator[](size_t idx) const noexcept { return n32_f32[idx]; }
#endif
} __Float32x1_t, float32x1_t;

// C++ only: the distinct type must match __n32 in size and alignment.
#ifdef __cplusplus
static_assert(alignof(float32x1_t) == alignof(__n32), "alignof(float32x1_t) != alignof(__n32)");
static_assert(sizeof(float32x1_t) == sizeof(__n32), "sizeof(float32x1_t) != sizeof(__n32)");
#endif

// Array of two float32x1_t vectors.
typedef struct float32x1x2_t
{
    float32x1_t val[2];
} float32x1x2_t;

// C++ only: the array struct must be exactly two vectors in size.
#ifdef __cplusplus
static_assert(sizeof(float32x1x2_t) == (sizeof(float32x1_t) * 2), "sizeof(float32x1x2_t) != (sizeof(float32x1_t) * 2)");
#endif

// 64-bits neon short vector extended types
//
// float32x2_t: two float lanes, 8-byte aligned. In C++ operator[] returns a
// reference to lane `idx`; the index is not range-checked.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Float32x2_t
{
    float n64_f32[2];

#ifdef __cplusplus
    __forceinline       float& operator[](size_t idx) noexcept       { return n64_f32[idx]; }
    __forceinline const float& operator[](size_t idx) const noexcept { return n64_f32[idx]; }
#endif
} __Float32x2_t, float32x2_t;

// C++ only: the distinct type must match __n64 in size and alignment.
#ifdef __cplusplus
static_assert(alignof(float32x2_t) == alignof(__n64), "alignof(float32x2_t) != alignof(__n64)");
static_assert(sizeof(float32x2_t) == sizeof(__n64), "sizeof(float32x2_t) != sizeof(__n64)");
#endif

// Arrays of two, three and four float32x2_t vectors. In C++ each is checked
// to be exactly N vectors in size.
typedef struct float32x2x2_t
{
    float32x2_t val[2];
} float32x2x2_t;

#ifdef __cplusplus
static_assert(sizeof(float32x2x2_t) == (sizeof(float32x2_t) * 2), "sizeof(float32x2x2_t) != (sizeof(float32x2_t) * 2)");
#endif

typedef struct float32x2x3_t
{
    float32x2_t val[3];
} float32x2x3_t;

#ifdef __cplusplus
static_assert(sizeof(float32x2x3_t) == (sizeof(float32x2_t) * 3), "sizeof(float32x2x3_t) != (sizeof(float32x2_t) * 3)");
#endif

typedef struct float32x2x4_t
{
    float32x2_t val[4];
} float32x2x4_t;

#ifdef __cplusplus
static_assert(sizeof(float32x2x4_t) == (sizeof(float32x2_t) * 4), "sizeof(float32x2x4_t) != (sizeof(float32x2_t) * 4)");
#endif

// float64x1_t: one double lane, 8-byte aligned. In C++ operator[] returns a
// reference to lane `idx`; the index is not range-checked.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Float64x1_t
{
    double n64_f64[1];

#ifdef __cplusplus
    __forceinline       double& operator[](size_t idx) noexcept       { return n64_f64[idx]; }
    __forceinline const double& operator[](size_t idx) const noexcept { return n64_f64[idx]; }
#endif
} __Float64x1_t, float64x1_t;

// C++ only: the distinct type must match __n64 in size and alignment.
#ifdef __cplusplus
static_assert(alignof(float64x1_t) == alignof(__n64), "alignof(float64x1_t) != alignof(__n64)");
static_assert(sizeof(float64x1_t) == sizeof(__n64), "sizeof(float64x1_t) != sizeof(__n64)");
#endif

// Arrays of two, three and four float64x1_t vectors. In C++ each is checked
// to be exactly N vectors in size.
typedef struct float64x1x2_t
{
    float64x1_t val[2];
} float64x1x2_t;

#ifdef __cplusplus
static_assert(sizeof(float64x1x2_t) == (sizeof(float64x1_t) * 2), "sizeof(float64x1x2_t) != (sizeof(float64x1_t) * 2)");
#endif

typedef struct float64x1x3_t
{
    float64x1_t val[3];
} float64x1x3_t;

#ifdef __cplusplus
static_assert(sizeof(float64x1x3_t) == (sizeof(float64x1_t) * 3), "sizeof(float64x1x3_t) != (sizeof(float64x1_t) * 3)");
#endif

typedef struct float64x1x4_t
{
    float64x1_t val[4];
} float64x1x4_t;

#ifdef __cplusplus
static_assert(sizeof(float64x1x4_t) == (sizeof(float64x1_t) * 4), "sizeof(float64x1x4_t) != (sizeof(float64x1_t) * 4)");
#endif

// int8x8_t: eight int8_t lanes, 8-byte aligned. In C++ operator[] returns a
// reference to lane `idx`; the index is not range-checked.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Int8x8_t
{
    int8_t n64_i8[8];

#ifdef __cplusplus
    __forceinline       int8_t& operator[](size_t idx) noexcept       { return n64_i8[idx]; }
    __forceinline const int8_t& operator[](size_t idx) const noexcept { return n64_i8[idx]; }
#endif
} __Int8x8_t, int8x8_t;

// C++ only: the distinct type must match __n64 in size and alignment.
#ifdef __cplusplus
static_assert(alignof(int8x8_t) == alignof(__n64), "alignof(int8x8_t) != alignof(__n64)");
static_assert(sizeof(int8x8_t) == sizeof(__n64), "sizeof(int8x8_t) != sizeof(__n64)");
#endif

// Arrays of two, three and four int8x8_t vectors. In C++ each is checked to
// be exactly N vectors in size.
typedef struct int8x8x2_t
{
    int8x8_t val[2];
} int8x8x2_t;

#ifdef __cplusplus
static_assert(sizeof(int8x8x2_t) == (sizeof(int8x8_t) * 2), "sizeof(int8x8x2_t) != (sizeof(int8x8_t) * 2)");
#endif

typedef struct int8x8x3_t
{
    int8x8_t val[3];
} int8x8x3_t;

#ifdef __cplusplus
static_assert(sizeof(int8x8x3_t) == (sizeof(int8x8_t) * 3), "sizeof(int8x8x3_t) != (sizeof(int8x8_t) * 3)");
#endif

typedef struct int8x8x4_t
{
    int8x8_t val[4];
} int8x8x4_t;

#ifdef __cplusplus
static_assert(sizeof(int8x8x4_t) == (sizeof(int8x8_t) * 4), "sizeof(int8x8x4_t) != (sizeof(int8x8_t) * 4)");
#endif

// int16x4_t: four int16_t lanes, 8-byte aligned. In C++ operator[] returns a
// reference to lane `idx`; the index is not range-checked.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Int16x4_t
{
    int16_t n64_i16[4];

#ifdef __cplusplus
    __forceinline       int16_t& operator[](size_t idx) noexcept       { return n64_i16[idx]; }
    __forceinline const int16_t& operator[](size_t idx) const noexcept { return n64_i16[idx]; }
#endif
} __Int16x4_t, int16x4_t;

// C++ only: the distinct type must match __n64 in size and alignment.
#ifdef __cplusplus
static_assert(alignof(int16x4_t) == alignof(__n64), "alignof(int16x4_t) != alignof(__n64)");
static_assert(sizeof(int16x4_t) == sizeof(__n64), "sizeof(int16x4_t) != sizeof(__n64)");
#endif

// Arrays of two, three and four int16x4_t vectors. In C++ each is checked to
// be exactly N vectors in size.
typedef struct int16x4x2_t
{
    int16x4_t val[2];
} int16x4x2_t;

#ifdef __cplusplus
static_assert(sizeof(int16x4x2_t) == (sizeof(int16x4_t) * 2), "sizeof(int16x4x2_t) != (sizeof(int16x4_t) * 2)");
#endif

typedef struct int16x4x3_t
{
    int16x4_t val[3];
} int16x4x3_t;

#ifdef __cplusplus
static_assert(sizeof(int16x4x3_t) == (sizeof(int16x4_t) * 3), "sizeof(int16x4x3_t) != (sizeof(int16x4_t) * 3)");
#endif

typedef struct int16x4x4_t
{
    int16x4_t val[4];
} int16x4x4_t;

#ifdef __cplusplus
static_assert(sizeof(int16x4x4_t) == (sizeof(int16x4_t) * 4), "sizeof(int16x4x4_t) != (sizeof(int16x4_t) * 4)");
#endif

// int32x2_t: two int32_t lanes, 8-byte aligned. In C++ operator[] returns a
// reference to lane `idx`; the index is not range-checked.
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Int32x2_t
{
    int32_t n64_i32[2];

#ifdef __cplusplus
    __forceinline       int32_t& operator[](size_t idx) noexcept       { return n64_i32[idx]; }
    __forceinline const int32_t& operator[](size_t idx) const noexcept { return n64_i32[idx]; }
#endif
} __Int32x2_t, int32x2_t;

// C++ only: the distinct type must match __n64 in size and alignment.
#ifdef __cplusplus
static_assert(alignof(int32x2_t) == alignof(__n64), "alignof(int32x2_t) != alignof(__n64)");
static_assert(sizeof(int32x2_t) == sizeof(__n64), "sizeof(int32x2_t) != sizeof(__n64)");
#endif

// Arrays of two, three and four int32x2_t vectors. In C++ each is checked to
// be exactly N vectors in size.
typedef struct int32x2x2_t
{
    int32x2_t val[2];
} int32x2x2_t;

#ifdef __cplusplus
static_assert(sizeof(int32x2x2_t) == (sizeof(int32x2_t) * 2), "sizeof(int32x2x2_t) != (sizeof(int32x2_t) * 2)");
#endif

typedef struct int32x2x3_t
{
    int32x2_t val[3];
} int32x2x3_t;

#ifdef __cplusplus
static_assert(sizeof(int32x2x3_t) == (sizeof(int32x2_t) * 3), "sizeof(int32x2x3_t) != (sizeof(int32x2_t) * 3)");
#endif

typedef struct int32x2x4_t
{
    int32x2_t val[4];
} int32x2x4_t;

#ifdef __cplusplus
static_assert(sizeof(int32x2x4_t) == (sizeof(int32x2_t) * 4), "sizeof(int32x2x4_t) != (sizeof(int32x2_t) * 4)");
#endif

// int64x1_t: 8-byte-aligned vector type holding one int64_t lane
// (storage member n64_i64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Int64x1_t
{
    int64_t n64_i64[1];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       int64_t& operator[](size_t idx) noexcept       { return n64_i64[idx]; }
    __forceinline const int64_t& operator[](size_t idx) const noexcept { return n64_i64[idx]; }
#endif
} __Int64x1_t, int64x1_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(int64x1_t) == alignof(__n64), "alignof(int64x1_t) != alignof(__n64)");
static_assert(sizeof(int64x1_t) == sizeof(__n64), "sizeof(int64x1_t) != sizeof(__n64)");
#endif

// int64x1x2_t: aggregate of two int64x1_t vectors, addressed as val[0..1].
typedef struct int64x1x2_t
{
    int64x1_t val[2];
} int64x1x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(int64x1x2_t) == (sizeof(int64x1_t) * 2), "sizeof(int64x1x2_t) != (sizeof(int64x1_t) * 2)");
#endif

// int64x1x3_t: aggregate of three int64x1_t vectors, addressed as val[0..2].
typedef struct int64x1x3_t
{
    int64x1_t val[3];
} int64x1x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(int64x1x3_t) == (sizeof(int64x1_t) * 3), "sizeof(int64x1x3_t) != (sizeof(int64x1_t) * 3)");
#endif

// int64x1x4_t: aggregate of four int64x1_t vectors, addressed as val[0..3].
typedef struct int64x1x4_t
{
    int64x1_t val[4];
} int64x1x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(int64x1x4_t) == (sizeof(int64x1_t) * 4), "sizeof(int64x1x4_t) != (sizeof(int64x1_t) * 4)");
#endif

// poly8x8_t: 8-byte-aligned vector type holding eight poly8_t lanes
// (storage member n64_p8), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Poly8x8_t
{
    poly8_t n64_p8[8];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       poly8_t& operator[](size_t idx) noexcept       { return n64_p8[idx]; }
    __forceinline const poly8_t& operator[](size_t idx) const noexcept { return n64_p8[idx]; }
#endif
} __Poly8x8_t, poly8x8_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(poly8x8_t) == alignof(__n64), "alignof(poly8x8_t) != alignof(__n64)");
static_assert(sizeof(poly8x8_t) == sizeof(__n64), "sizeof(poly8x8_t) != sizeof(__n64)");
#endif

// poly8x8x2_t: aggregate of two poly8x8_t vectors, addressed as val[0..1].
typedef struct poly8x8x2_t
{
    poly8x8_t val[2];
} poly8x8x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(poly8x8x2_t) == (sizeof(poly8x8_t) * 2), "sizeof(poly8x8x2_t) != (sizeof(poly8x8_t) * 2)");
#endif

// poly8x8x3_t: aggregate of three poly8x8_t vectors, addressed as val[0..2].
typedef struct poly8x8x3_t
{
    poly8x8_t val[3];
} poly8x8x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(poly8x8x3_t) == (sizeof(poly8x8_t) * 3), "sizeof(poly8x8x3_t) != (sizeof(poly8x8_t) * 3)");
#endif

// poly8x8x4_t: aggregate of four poly8x8_t vectors, addressed as val[0..3].
typedef struct poly8x8x4_t
{
    poly8x8_t val[4];
} poly8x8x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(poly8x8x4_t) == (sizeof(poly8x8_t) * 4), "sizeof(poly8x8x4_t) != (sizeof(poly8x8_t) * 4)");
#endif

// poly16x4_t: 8-byte-aligned vector type holding four poly16_t lanes
// (storage member n64_p16), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Poly16x4_t
{
    poly16_t n64_p16[4];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       poly16_t& operator[](size_t idx) noexcept       { return n64_p16[idx]; }
    __forceinline const poly16_t& operator[](size_t idx) const noexcept { return n64_p16[idx]; }
#endif
} __Poly16x4_t, poly16x4_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(poly16x4_t) == alignof(__n64), "alignof(poly16x4_t) != alignof(__n64)");
static_assert(sizeof(poly16x4_t) == sizeof(__n64), "sizeof(poly16x4_t) != sizeof(__n64)");
#endif

// poly16x4x2_t: aggregate of two poly16x4_t vectors, addressed as val[0..1].
typedef struct poly16x4x2_t
{
    poly16x4_t val[2];
} poly16x4x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(poly16x4x2_t) == (sizeof(poly16x4_t) * 2), "sizeof(poly16x4x2_t) != (sizeof(poly16x4_t) * 2)");
#endif

// poly16x4x3_t: aggregate of three poly16x4_t vectors, addressed as val[0..2].
typedef struct poly16x4x3_t
{
    poly16x4_t val[3];
} poly16x4x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(poly16x4x3_t) == (sizeof(poly16x4_t) * 3), "sizeof(poly16x4x3_t) != (sizeof(poly16x4_t) * 3)");
#endif

// poly16x4x4_t: aggregate of four poly16x4_t vectors, addressed as val[0..3].
typedef struct poly16x4x4_t
{
    poly16x4_t val[4];
} poly16x4x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(poly16x4x4_t) == (sizeof(poly16x4_t) * 4), "sizeof(poly16x4x4_t) != (sizeof(poly16x4_t) * 4)");
#endif

// poly64x1_t: 8-byte-aligned vector type holding one poly64_t lane
// (storage member n64_p64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Poly64x1_t
{
    poly64_t n64_p64[1];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       poly64_t& operator[](size_t idx) noexcept       { return n64_p64[idx]; }
    __forceinline const poly64_t& operator[](size_t idx) const noexcept { return n64_p64[idx]; }
#endif
} __Poly64x1_t, poly64x1_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(poly64x1_t) == alignof(__n64), "alignof(poly64x1_t) != alignof(__n64)");
static_assert(sizeof(poly64x1_t) == sizeof(__n64), "sizeof(poly64x1_t) != sizeof(__n64)");
#endif

// poly64x1x2_t: aggregate of two poly64x1_t vectors, addressed as val[0..1].
typedef struct poly64x1x2_t
{
    poly64x1_t val[2];
} poly64x1x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(poly64x1x2_t) == (sizeof(poly64x1_t) * 2), "sizeof(poly64x1x2_t) != (sizeof(poly64x1_t) * 2)");
#endif

// poly64x1x3_t: aggregate of three poly64x1_t vectors, addressed as val[0..2].
typedef struct poly64x1x3_t
{
    poly64x1_t val[3];
} poly64x1x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(poly64x1x3_t) == (sizeof(poly64x1_t) * 3), "sizeof(poly64x1x3_t) != (sizeof(poly64x1_t) * 3)");
#endif

// poly64x1x4_t: aggregate of four poly64x1_t vectors, addressed as val[0..3].
typedef struct poly64x1x4_t
{
    poly64x1_t val[4];
} poly64x1x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(poly64x1x4_t) == (sizeof(poly64x1_t) * 4), "sizeof(poly64x1x4_t) != (sizeof(poly64x1_t) * 4)");
#endif

// uint8x8_t: 8-byte-aligned vector type holding eight uint8_t lanes
// (storage member n64_u8), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Uint8x8_t
{
    uint8_t n64_u8[8];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint8_t& operator[](size_t idx) noexcept       { return n64_u8[idx]; }
    __forceinline const uint8_t& operator[](size_t idx) const noexcept { return n64_u8[idx]; }
#endif
} __Uint8x8_t, uint8x8_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(uint8x8_t) == alignof(__n64), "alignof(uint8x8_t) != alignof(__n64)");
static_assert(sizeof(uint8x8_t) == sizeof(__n64), "sizeof(uint8x8_t) != sizeof(__n64)");
#endif

// uint8x8x2_t: aggregate of two uint8x8_t vectors, addressed as val[0..1].
typedef struct uint8x8x2_t
{
    uint8x8_t val[2];
} uint8x8x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint8x8x2_t) == (sizeof(uint8x8_t) * 2), "sizeof(uint8x8x2_t) != (sizeof(uint8x8_t) * 2)");
#endif

// uint8x8x3_t: aggregate of three uint8x8_t vectors, addressed as val[0..2].
typedef struct uint8x8x3_t
{
    uint8x8_t val[3];
} uint8x8x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint8x8x3_t) == (sizeof(uint8x8_t) * 3), "sizeof(uint8x8x3_t) != (sizeof(uint8x8_t) * 3)");
#endif

// uint8x8x4_t: aggregate of four uint8x8_t vectors, addressed as val[0..3].
typedef struct uint8x8x4_t
{
    uint8x8_t val[4];
} uint8x8x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint8x8x4_t) == (sizeof(uint8x8_t) * 4), "sizeof(uint8x8x4_t) != (sizeof(uint8x8_t) * 4)");
#endif

// uint16x4_t: 8-byte-aligned vector type holding four uint16_t lanes
// (storage member n64_u16), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Uint16x4_t
{
    uint16_t n64_u16[4];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint16_t& operator[](size_t idx) noexcept       { return n64_u16[idx]; }
    __forceinline const uint16_t& operator[](size_t idx) const noexcept { return n64_u16[idx]; }
#endif
} __Uint16x4_t, uint16x4_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(uint16x4_t) == alignof(__n64), "alignof(uint16x4_t) != alignof(__n64)");
static_assert(sizeof(uint16x4_t) == sizeof(__n64), "sizeof(uint16x4_t) != sizeof(__n64)");
#endif

// uint16x4x2_t: aggregate of two uint16x4_t vectors, addressed as val[0..1].
typedef struct uint16x4x2_t
{
    uint16x4_t val[2];
} uint16x4x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint16x4x2_t) == (sizeof(uint16x4_t) * 2), "sizeof(uint16x4x2_t) != (sizeof(uint16x4_t) * 2)");
#endif

// uint16x4x3_t: aggregate of three uint16x4_t vectors, addressed as val[0..2].
typedef struct uint16x4x3_t
{
    uint16x4_t val[3];
} uint16x4x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint16x4x3_t) == (sizeof(uint16x4_t) * 3), "sizeof(uint16x4x3_t) != (sizeof(uint16x4_t) * 3)");
#endif

// uint16x4x4_t: aggregate of four uint16x4_t vectors, addressed as val[0..3].
typedef struct uint16x4x4_t
{
    uint16x4_t val[4];
} uint16x4x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint16x4x4_t) == (sizeof(uint16x4_t) * 4), "sizeof(uint16x4x4_t) != (sizeof(uint16x4_t) * 4)");
#endif

// uint32x2_t: 8-byte-aligned vector type holding two uint32_t lanes
// (storage member n64_u32), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Uint32x2_t
{
    uint32_t n64_u32[2];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint32_t& operator[](size_t idx) noexcept       { return n64_u32[idx]; }
    __forceinline const uint32_t& operator[](size_t idx) const noexcept { return n64_u32[idx]; }
#endif
} __Uint32x2_t, uint32x2_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(uint32x2_t) == alignof(__n64), "alignof(uint32x2_t) != alignof(__n64)");
static_assert(sizeof(uint32x2_t) == sizeof(__n64), "sizeof(uint32x2_t) != sizeof(__n64)");
#endif

// uint32x2x2_t: aggregate of two uint32x2_t vectors, addressed as val[0..1].
typedef struct uint32x2x2_t
{
    uint32x2_t val[2];
} uint32x2x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint32x2x2_t) == (sizeof(uint32x2_t) * 2), "sizeof(uint32x2x2_t) != (sizeof(uint32x2_t) * 2)");
#endif

// uint32x2x3_t: aggregate of three uint32x2_t vectors, addressed as val[0..2].
typedef struct uint32x2x3_t
{
    uint32x2_t val[3];
} uint32x2x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint32x2x3_t) == (sizeof(uint32x2_t) * 3), "sizeof(uint32x2x3_t) != (sizeof(uint32x2_t) * 3)");
#endif

// uint32x2x4_t: aggregate of four uint32x2_t vectors, addressed as val[0..3].
typedef struct uint32x2x4_t
{
    uint32x2_t val[4];
} uint32x2x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint32x2x4_t) == (sizeof(uint32x2_t) * 4), "sizeof(uint32x2x4_t) != (sizeof(uint32x2_t) * 4)");
#endif

// uint64x1_t: 8-byte-aligned vector type holding one uint64_t lane
// (storage member n64_u64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(8) __Uint64x1_t
{
    uint64_t n64_u64[1];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint64_t& operator[](size_t idx) noexcept       { return n64_u64[idx]; }
    __forceinline const uint64_t& operator[](size_t idx) const noexcept { return n64_u64[idx]; }
#endif
} __Uint64x1_t, uint64x1_t;

#ifdef __cplusplus
// The typed vector must match __n64 in both alignment and size.
static_assert(alignof(uint64x1_t) == alignof(__n64), "alignof(uint64x1_t) != alignof(__n64)");
static_assert(sizeof(uint64x1_t) == sizeof(__n64), "sizeof(uint64x1_t) != sizeof(__n64)");
#endif

// uint64x1x2_t: aggregate of two uint64x1_t vectors, addressed as val[0..1].
typedef struct uint64x1x2_t
{
    uint64x1_t val[2];
} uint64x1x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint64x1x2_t) == (sizeof(uint64x1_t) * 2), "sizeof(uint64x1x2_t) != (sizeof(uint64x1_t) * 2)");
#endif

// uint64x1x3_t: aggregate of three uint64x1_t vectors, addressed as val[0..2].
typedef struct uint64x1x3_t
{
    uint64x1_t val[3];
} uint64x1x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint64x1x3_t) == (sizeof(uint64x1_t) * 3), "sizeof(uint64x1x3_t) != (sizeof(uint64x1_t) * 3)");
#endif

// uint64x1x4_t: aggregate of four uint64x1_t vectors, addressed as val[0..3].
typedef struct uint64x1x4_t
{
    uint64x1_t val[4];
} uint64x1x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint64x1x4_t) == (sizeof(uint64x1_t) * 4), "sizeof(uint64x1x4_t) != (sizeof(uint64x1_t) * 4)");
#endif

////////////////////////////////////////////////////////////////////////////////
// 128-bit NEON short vector extended types
// float32x4_t: 16-byte-aligned vector type holding four float lanes
// (storage member n128_f32), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Float32x4_t
{
    float n128_f32[4];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       float& operator[](size_t idx) noexcept       { return n128_f32[idx]; }
    __forceinline const float& operator[](size_t idx) const noexcept { return n128_f32[idx]; }
#endif
} __Float32x4_t, float32x4_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(float32x4_t) == alignof(__n128), "alignof(float32x4_t) != alignof(__n128)");
static_assert(sizeof(float32x4_t) == sizeof(__n128), "sizeof(float32x4_t) != sizeof(__n128)");
#endif

// float32x4x2_t: aggregate of two float32x4_t vectors, addressed as val[0..1].
typedef struct float32x4x2_t
{
    float32x4_t val[2];
} float32x4x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(float32x4x2_t) == (sizeof(float32x4_t) * 2), "sizeof(float32x4x2_t) != (sizeof(float32x4_t) * 2)");
#endif

// float32x4x3_t: aggregate of three float32x4_t vectors, addressed as val[0..2].
typedef struct float32x4x3_t
{
    float32x4_t val[3];
} float32x4x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(float32x4x3_t) == (sizeof(float32x4_t) * 3), "sizeof(float32x4x3_t) != (sizeof(float32x4_t) * 3)");
#endif

// float32x4x4_t: aggregate of four float32x4_t vectors, addressed as val[0..3].
typedef struct float32x4x4_t
{
    float32x4_t val[4];
} float32x4x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(float32x4x4_t) == (sizeof(float32x4_t) * 4), "sizeof(float32x4x4_t) != (sizeof(float32x4_t) * 4)");
#endif

// float64x2_t: 16-byte-aligned vector type holding two double lanes
// (storage member n128_f64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Float64x2_t
{
    double n128_f64[2];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       double& operator[](size_t idx) noexcept       { return n128_f64[idx]; }
    __forceinline const double& operator[](size_t idx) const noexcept { return n128_f64[idx]; }
#endif
} __Float64x2_t, float64x2_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(float64x2_t) == alignof(__n128), "alignof(float64x2_t) != alignof(__n128)");
static_assert(sizeof(float64x2_t) == sizeof(__n128), "sizeof(float64x2_t) != sizeof(__n128)");
#endif

// float64x2x2_t: aggregate of two float64x2_t vectors, addressed as val[0..1].
typedef struct float64x2x2_t
{
    float64x2_t val[2];
} float64x2x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(float64x2x2_t) == (sizeof(float64x2_t) * 2), "sizeof(float64x2x2_t) != (sizeof(float64x2_t) * 2)");
#endif

// float64x2x3_t: aggregate of three float64x2_t vectors, addressed as val[0..2].
typedef struct float64x2x3_t
{
    float64x2_t val[3];
} float64x2x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(float64x2x3_t) == (sizeof(float64x2_t) * 3), "sizeof(float64x2x3_t) != (sizeof(float64x2_t) * 3)");
#endif

// float64x2x4_t: aggregate of four float64x2_t vectors, addressed as val[0..3].
typedef struct float64x2x4_t
{
    float64x2_t val[4];
} float64x2x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(float64x2x4_t) == (sizeof(float64x2_t) * 4), "sizeof(float64x2x4_t) != (sizeof(float64x2_t) * 4)");
#endif

// int8x16_t: 16-byte-aligned vector type holding sixteen int8_t lanes
// (storage member n128_i8), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Int8x16_t
{
    int8_t n128_i8[16];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       int8_t& operator[](size_t idx) noexcept       { return n128_i8[idx]; }
    __forceinline const int8_t& operator[](size_t idx) const noexcept { return n128_i8[idx]; }
#endif
} __Int8x16_t, int8x16_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(int8x16_t) == alignof(__n128), "alignof(int8x16_t) != alignof(__n128)");
static_assert(sizeof(int8x16_t) == sizeof(__n128), "sizeof(int8x16_t) != sizeof(__n128)");
#endif

// int8x16x2_t: aggregate of two int8x16_t vectors, addressed as val[0..1].
typedef struct int8x16x2_t
{
    int8x16_t val[2];
} int8x16x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(int8x16x2_t) == (sizeof(int8x16_t) * 2), "sizeof(int8x16x2_t) != (sizeof(int8x16_t) * 2)");
#endif

// int8x16x3_t: aggregate of three int8x16_t vectors, addressed as val[0..2].
typedef struct int8x16x3_t
{
    int8x16_t val[3];
} int8x16x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(int8x16x3_t) == (sizeof(int8x16_t) * 3), "sizeof(int8x16x3_t) != (sizeof(int8x16_t) * 3)");
#endif

// int8x16x4_t: aggregate of four int8x16_t vectors, addressed as val[0..3].
typedef struct int8x16x4_t
{
    int8x16_t val[4];
} int8x16x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(int8x16x4_t) == (sizeof(int8x16_t) * 4), "sizeof(int8x16x4_t) != (sizeof(int8x16_t) * 4)");
#endif

// int16x8_t: 16-byte-aligned vector type holding eight int16_t lanes
// (storage member n128_i16), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Int16x8_t
{
    int16_t n128_i16[8];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       int16_t& operator[](size_t idx) noexcept       { return n128_i16[idx]; }
    __forceinline const int16_t& operator[](size_t idx) const noexcept { return n128_i16[idx]; }
#endif
} __Int16x8_t, int16x8_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(int16x8_t) == alignof(__n128), "alignof(int16x8_t) != alignof(__n128)");
static_assert(sizeof(int16x8_t) == sizeof(__n128), "sizeof(int16x8_t) != sizeof(__n128)");
#endif

// int16x8x2_t: aggregate of two int16x8_t vectors, addressed as val[0..1].
typedef struct int16x8x2_t
{
    int16x8_t val[2];
} int16x8x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(int16x8x2_t) == (sizeof(int16x8_t) * 2), "sizeof(int16x8x2_t) != (sizeof(int16x8_t) * 2)");
#endif

// int16x8x3_t: aggregate of three int16x8_t vectors, addressed as val[0..2].
typedef struct int16x8x3_t
{
    int16x8_t val[3];
} int16x8x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(int16x8x3_t) == (sizeof(int16x8_t) * 3), "sizeof(int16x8x3_t) != (sizeof(int16x8_t) * 3)");
#endif

// int16x8x4_t: aggregate of four int16x8_t vectors, addressed as val[0..3].
typedef struct int16x8x4_t
{
    int16x8_t val[4];
} int16x8x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(int16x8x4_t) == (sizeof(int16x8_t) * 4), "sizeof(int16x8x4_t) != (sizeof(int16x8_t) * 4)");
#endif

// int32x4_t: 16-byte-aligned vector type holding four int32_t lanes
// (storage member n128_i32), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Int32x4_t
{
    int32_t n128_i32[4];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       int32_t& operator[](size_t idx) noexcept       { return n128_i32[idx]; }
    __forceinline const int32_t& operator[](size_t idx) const noexcept { return n128_i32[idx]; }
#endif
} __Int32x4_t, int32x4_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(int32x4_t) == alignof(__n128), "alignof(int32x4_t) != alignof(__n128)");
static_assert(sizeof(int32x4_t) == sizeof(__n128), "sizeof(int32x4_t) != sizeof(__n128)");
#endif

// int32x4x2_t: aggregate of two int32x4_t vectors, addressed as val[0..1].
typedef struct int32x4x2_t
{
    int32x4_t val[2];
} int32x4x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(int32x4x2_t) == (sizeof(int32x4_t) * 2), "sizeof(int32x4x2_t) != (sizeof(int32x4_t) * 2)");
#endif

// int32x4x3_t: aggregate of three int32x4_t vectors, addressed as val[0..2].
typedef struct int32x4x3_t
{
    int32x4_t val[3];
} int32x4x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(int32x4x3_t) == (sizeof(int32x4_t) * 3), "sizeof(int32x4x3_t) != (sizeof(int32x4_t) * 3)");
#endif

// int32x4x4_t: aggregate of four int32x4_t vectors, addressed as val[0..3].
typedef struct int32x4x4_t
{
    int32x4_t val[4];
} int32x4x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(int32x4x4_t) == (sizeof(int32x4_t) * 4), "sizeof(int32x4x4_t) != (sizeof(int32x4_t) * 4)");
#endif

// int64x2_t: 16-byte-aligned vector type holding two int64_t lanes
// (storage member n128_i64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Int64x2_t
{
    int64_t n128_i64[2];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       int64_t& operator[](size_t idx) noexcept       { return n128_i64[idx]; }
    __forceinline const int64_t& operator[](size_t idx) const noexcept { return n128_i64[idx]; }
#endif
} __Int64x2_t, int64x2_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(int64x2_t) == alignof(__n128), "alignof(int64x2_t) != alignof(__n128)");
static_assert(sizeof(int64x2_t) == sizeof(__n128), "sizeof(int64x2_t) != sizeof(__n128)");
#endif

// int64x2x2_t: aggregate of two int64x2_t vectors, addressed as val[0..1].
typedef struct int64x2x2_t
{
    int64x2_t val[2];
} int64x2x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(int64x2x2_t) == (sizeof(int64x2_t) * 2), "sizeof(int64x2x2_t) != (sizeof(int64x2_t) * 2)");
#endif

// int64x2x3_t: aggregate of three int64x2_t vectors, addressed as val[0..2].
typedef struct int64x2x3_t
{
    int64x2_t val[3];
} int64x2x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(int64x2x3_t) == (sizeof(int64x2_t) * 3), "sizeof(int64x2x3_t) != (sizeof(int64x2_t) * 3)");
#endif

// int64x2x4_t: aggregate of four int64x2_t vectors, addressed as val[0..3].
typedef struct int64x2x4_t
{
    int64x2_t val[4];
} int64x2x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(int64x2x4_t) == (sizeof(int64x2_t) * 4), "sizeof(int64x2x4_t) != (sizeof(int64x2_t) * 4)");
#endif

// poly8x16_t: 16-byte-aligned vector type holding sixteen poly8_t lanes
// (storage member n128_p8), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Poly8x16_t
{
    poly8_t n128_p8[16];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       poly8_t& operator[](size_t idx) noexcept       { return n128_p8[idx]; }
    __forceinline const poly8_t& operator[](size_t idx) const noexcept { return n128_p8[idx]; }
#endif
} __Poly8x16_t, poly8x16_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(poly8x16_t) == alignof(__n128), "alignof(poly8x16_t) != alignof(__n128)");
static_assert(sizeof(poly8x16_t) == sizeof(__n128), "sizeof(poly8x16_t) != sizeof(__n128)");
#endif

// poly8x16x2_t: aggregate of two poly8x16_t vectors, addressed as val[0..1].
typedef struct poly8x16x2_t
{
    poly8x16_t val[2];
} poly8x16x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(poly8x16x2_t) == (sizeof(poly8x16_t) * 2), "sizeof(poly8x16x2_t) != (sizeof(poly8x16_t) * 2)");
#endif

// poly8x16x3_t: aggregate of three poly8x16_t vectors, addressed as val[0..2].
typedef struct poly8x16x3_t
{
    poly8x16_t val[3];
} poly8x16x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(poly8x16x3_t) == (sizeof(poly8x16_t) * 3), "sizeof(poly8x16x3_t) != (sizeof(poly8x16_t) * 3)");
#endif

// poly8x16x4_t: aggregate of four poly8x16_t vectors, addressed as val[0..3].
typedef struct poly8x16x4_t
{
    poly8x16_t val[4];
} poly8x16x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(poly8x16x4_t) == (sizeof(poly8x16_t) * 4), "sizeof(poly8x16x4_t) != (sizeof(poly8x16_t) * 4)");
#endif

// poly16x8_t: 16-byte-aligned vector type holding eight poly16_t lanes
// (storage member n128_p16), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Poly16x8_t
{
    poly16_t n128_p16[8];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       poly16_t& operator[](size_t idx) noexcept       { return n128_p16[idx]; }
    __forceinline const poly16_t& operator[](size_t idx) const noexcept { return n128_p16[idx]; }
#endif
} __Poly16x8_t, poly16x8_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(poly16x8_t) == alignof(__n128), "alignof(poly16x8_t) != alignof(__n128)");
static_assert(sizeof(poly16x8_t) == sizeof(__n128), "sizeof(poly16x8_t) != sizeof(__n128)");
#endif

// poly16x8x2_t: aggregate of two poly16x8_t vectors, addressed as val[0..1].
typedef struct poly16x8x2_t
{
    poly16x8_t val[2];
} poly16x8x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(poly16x8x2_t) == (sizeof(poly16x8_t) * 2), "sizeof(poly16x8x2_t) != (sizeof(poly16x8_t) * 2)");
#endif

// poly16x8x3_t: aggregate of three poly16x8_t vectors, addressed as val[0..2].
typedef struct poly16x8x3_t
{
    poly16x8_t val[3];
} poly16x8x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(poly16x8x3_t) == (sizeof(poly16x8_t) * 3), "sizeof(poly16x8x3_t) != (sizeof(poly16x8_t) * 3)");
#endif

// poly16x8x4_t: aggregate of four poly16x8_t vectors, addressed as val[0..3].
typedef struct poly16x8x4_t
{
    poly16x8_t val[4];
} poly16x8x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(poly16x8x4_t) == (sizeof(poly16x8_t) * 4), "sizeof(poly16x8x4_t) != (sizeof(poly16x8_t) * 4)");
#endif

// poly64x2_t: 16-byte-aligned vector type holding two poly64_t lanes
// (storage member n128_p64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Poly64x2_t
{
    poly64_t n128_p64[2];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       poly64_t& operator[](size_t idx) noexcept       { return n128_p64[idx]; }
    __forceinline const poly64_t& operator[](size_t idx) const noexcept { return n128_p64[idx]; }
#endif
} __Poly64x2_t, poly64x2_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(poly64x2_t) == alignof(__n128), "alignof(poly64x2_t) != alignof(__n128)");
static_assert(sizeof(poly64x2_t) == sizeof(__n128), "sizeof(poly64x2_t) != sizeof(__n128)");
#endif

// poly64x2x2_t: aggregate of two poly64x2_t vectors, addressed as val[0..1].
typedef struct poly64x2x2_t
{
    poly64x2_t val[2];
} poly64x2x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(poly64x2x2_t) == (sizeof(poly64x2_t) * 2), "sizeof(poly64x2x2_t) != (sizeof(poly64x2_t) * 2)");
#endif

// poly64x2x3_t: aggregate of three poly64x2_t vectors, addressed as val[0..2].
typedef struct poly64x2x3_t
{
    poly64x2_t val[3];
} poly64x2x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(poly64x2x3_t) == (sizeof(poly64x2_t) * 3), "sizeof(poly64x2x3_t) != (sizeof(poly64x2_t) * 3)");
#endif

// poly64x2x4_t: aggregate of four poly64x2_t vectors, addressed as val[0..3].
typedef struct poly64x2x4_t
{
    poly64x2_t val[4];
} poly64x2x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(poly64x2x4_t) == (sizeof(poly64x2_t) * 4), "sizeof(poly64x2x4_t) != (sizeof(poly64x2_t) * 4)");
#endif

// uint8x16_t: 16-byte-aligned vector type holding sixteen uint8_t lanes
// (storage member n128_u8), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Uint8x16_t
{
    uint8_t n128_u8[16];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint8_t& operator[](size_t idx) noexcept       { return n128_u8[idx]; }
    __forceinline const uint8_t& operator[](size_t idx) const noexcept { return n128_u8[idx]; }
#endif
} __Uint8x16_t, uint8x16_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(uint8x16_t) == alignof(__n128), "alignof(uint8x16_t) != alignof(__n128)");
static_assert(sizeof(uint8x16_t) == sizeof(__n128), "sizeof(uint8x16_t) != sizeof(__n128)");
#endif

// uint8x16x2_t: aggregate of two uint8x16_t vectors, addressed as val[0..1].
typedef struct uint8x16x2_t
{
    uint8x16_t val[2];
} uint8x16x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint8x16x2_t) == (sizeof(uint8x16_t) * 2), "sizeof(uint8x16x2_t) != (sizeof(uint8x16_t) * 2)");
#endif

// uint8x16x3_t: aggregate of three uint8x16_t vectors, addressed as val[0..2].
typedef struct uint8x16x3_t
{
    uint8x16_t val[3];
} uint8x16x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint8x16x3_t) == (sizeof(uint8x16_t) * 3), "sizeof(uint8x16x3_t) != (sizeof(uint8x16_t) * 3)");
#endif

// uint8x16x4_t: aggregate of four uint8x16_t vectors, addressed as val[0..3].
typedef struct uint8x16x4_t
{
    uint8x16_t val[4];
} uint8x16x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint8x16x4_t) == (sizeof(uint8x16_t) * 4), "sizeof(uint8x16x4_t) != (sizeof(uint8x16_t) * 4)");
#endif

// uint16x8_t: 16-byte-aligned vector type holding eight uint16_t lanes
// (storage member n128_u16), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Uint16x8_t
{
    uint16_t n128_u16[8];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint16_t& operator[](size_t idx) noexcept       { return n128_u16[idx]; }
    __forceinline const uint16_t& operator[](size_t idx) const noexcept { return n128_u16[idx]; }
#endif
} __Uint16x8_t, uint16x8_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(uint16x8_t) == alignof(__n128), "alignof(uint16x8_t) != alignof(__n128)");
static_assert(sizeof(uint16x8_t) == sizeof(__n128), "sizeof(uint16x8_t) != sizeof(__n128)");
#endif

// uint16x8x2_t: aggregate of two uint16x8_t vectors, addressed as val[0..1].
typedef struct uint16x8x2_t
{
    uint16x8_t val[2];
} uint16x8x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint16x8x2_t) == (sizeof(uint16x8_t) * 2), "sizeof(uint16x8x2_t) != (sizeof(uint16x8_t) * 2)");
#endif

// uint16x8x3_t: aggregate of three uint16x8_t vectors, addressed as val[0..2].
typedef struct uint16x8x3_t
{
    uint16x8_t val[3];
} uint16x8x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint16x8x3_t) == (sizeof(uint16x8_t) * 3), "sizeof(uint16x8x3_t) != (sizeof(uint16x8_t) * 3)");
#endif

// uint16x8x4_t: aggregate of four uint16x8_t vectors, addressed as val[0..3].
typedef struct uint16x8x4_t
{
    uint16x8_t val[4];
} uint16x8x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint16x8x4_t) == (sizeof(uint16x8_t) * 4), "sizeof(uint16x8x4_t) != (sizeof(uint16x8_t) * 4)");
#endif

// uint32x4_t: 16-byte-aligned vector type holding four uint32_t lanes
// (storage member n128_u32), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Uint32x4_t
{
    uint32_t n128_u32[4];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint32_t& operator[](size_t idx) noexcept       { return n128_u32[idx]; }
    __forceinline const uint32_t& operator[](size_t idx) const noexcept { return n128_u32[idx]; }
#endif
} __Uint32x4_t, uint32x4_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(uint32x4_t) == alignof(__n128), "alignof(uint32x4_t) != alignof(__n128)");
static_assert(sizeof(uint32x4_t) == sizeof(__n128), "sizeof(uint32x4_t) != sizeof(__n128)");
#endif

// uint32x4x2_t: aggregate of two uint32x4_t vectors, addressed as val[0..1].
typedef struct uint32x4x2_t
{
    uint32x4_t val[2];
} uint32x4x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint32x4x2_t) == (sizeof(uint32x4_t) * 2), "sizeof(uint32x4x2_t) != (sizeof(uint32x4_t) * 2)");
#endif

// uint32x4x3_t: aggregate of three uint32x4_t vectors, addressed as val[0..2].
typedef struct uint32x4x3_t
{
    uint32x4_t val[3];
} uint32x4x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint32x4x3_t) == (sizeof(uint32x4_t) * 3), "sizeof(uint32x4x3_t) != (sizeof(uint32x4_t) * 3)");
#endif

// uint32x4x4_t: aggregate of four uint32x4_t vectors, addressed as val[0..3].
typedef struct uint32x4x4_t
{
    uint32x4_t val[4];
} uint32x4x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint32x4x4_t) == (sizeof(uint32x4_t) * 4), "sizeof(uint32x4x4_t) != (sizeof(uint32x4_t) * 4)");
#endif

// uint64x2_t: 16-byte-aligned vector type holding two uint64_t lanes
// (storage member n128_u64), declared with __declspec(intrin_type).
typedef union __declspec(intrin_type) _ADVSIMD_ALIGN(16) __Uint64x2_t
{
    uint64_t n128_u64[2];

#ifdef __cplusplus
    // C++ only: lane access by index; idx is not range-checked.
    __forceinline       uint64_t& operator[](size_t idx) noexcept       { return n128_u64[idx]; }
    __forceinline const uint64_t& operator[](size_t idx) const noexcept { return n128_u64[idx]; }
#endif
} __Uint64x2_t, uint64x2_t;

#ifdef __cplusplus
// The typed vector must match __n128 in both alignment and size.
static_assert(alignof(uint64x2_t) == alignof(__n128), "alignof(uint64x2_t) != alignof(__n128)");
static_assert(sizeof(uint64x2_t) == sizeof(__n128), "sizeof(uint64x2_t) != sizeof(__n128)");
#endif

// uint64x2x2_t: aggregate of two uint64x2_t vectors, addressed as val[0..1].
typedef struct uint64x2x2_t
{
    uint64x2_t val[2];
} uint64x2x2_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly two vectors wide.
static_assert(sizeof(uint64x2x2_t) == (sizeof(uint64x2_t) * 2), "sizeof(uint64x2x2_t) != (sizeof(uint64x2_t) * 2)");
#endif

// uint64x2x3_t: aggregate of three uint64x2_t vectors, addressed as val[0..2].
typedef struct uint64x2x3_t
{
    uint64x2_t val[3];
} uint64x2x3_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly three vectors wide.
static_assert(sizeof(uint64x2x3_t) == (sizeof(uint64x2_t) * 3), "sizeof(uint64x2x3_t) != (sizeof(uint64x2_t) * 3)");
#endif

// uint64x2x4_t: aggregate of four uint64x2_t vectors, addressed as val[0..3].
typedef struct uint64x2x4_t
{
    uint64x2_t val[4];
} uint64x2x4_t;

#ifdef __cplusplus
// Padding guard: the aggregate must be exactly four vectors wide.
static_assert(sizeof(uint64x2x4_t) == (sizeof(uint64x2_t) * 4), "sizeof(uint64x2x4_t) != (sizeof(uint64x2_t) * 4)");
#endif

////////////////////////////////////////////////////////////////////////////////
// neon intrin_type cast functions.
// float32/float64 vector (and xN aggregate) -> __n64/__n128 family.
// Each helper takes its argument by value and returns that parameter's
// storage re-read as the destination type through a pointer cast; no per-lane
// value conversion is performed.
// NOTE(review): this relies on source and destination types having identical
// size and layout, which is not checked on these lines — confirm against the
// type definitions.
__forceinline __n64 __float32x2_t_to_n64(float32x2_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __float32x2x2_t_to_n64x2(float32x2x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __float32x2x3_t_to_n64x3(float32x2x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __float32x2x4_t_to_n64x4(float32x2x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __float32x4_t_to_n128(float32x4_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __float32x4x2_t_to_n128x2(float32x4x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __float32x4x3_t_to_n128x3(float32x4x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __float32x4x4_t_to_n128x4(float32x4x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __float64x1_t_to_n64(float64x1_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __float64x1x2_t_to_n64x2(float64x1x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __float64x1x3_t_to_n64x3(float64x1x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __float64x1x4_t_to_n64x4(float64x1x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __float64x2_t_to_n128(float64x2_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __float64x2x2_t_to_n128x2(float64x2x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __float64x2x3_t_to_n128x3(float64x2x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __float64x2x4_t_to_n128x4(float64x2x4_t x) { return *(__n128x4 *)(&x); }
// Signed-integer vector (int8/int16/int32/int64, plus xN aggregates)
// -> __n64/__n128 family. Each helper returns its by-value parameter's storage
// re-read as the destination type via a pointer cast; lane values are not
// converted.
__forceinline __n64 __int16x4_t_to_n64(int16x4_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __int16x4x2_t_to_n64x2(int16x4x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __int16x4x3_t_to_n64x3(int16x4x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __int16x4x4_t_to_n64x4(int16x4x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __int16x8_t_to_n128(int16x8_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __int16x8x2_t_to_n128x2(int16x8x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __int16x8x3_t_to_n128x3(int16x8x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __int16x8x4_t_to_n128x4(int16x8x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __int32x2_t_to_n64(int32x2_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __int32x2x2_t_to_n64x2(int32x2x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __int32x2x3_t_to_n64x3(int32x2x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __int32x2x4_t_to_n64x4(int32x2x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __int32x4_t_to_n128(int32x4_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __int32x4x2_t_to_n128x2(int32x4x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __int32x4x3_t_to_n128x3(int32x4x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __int32x4x4_t_to_n128x4(int32x4x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __int64x1_t_to_n64(int64x1_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __int64x1x2_t_to_n64x2(int64x1x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __int64x1x3_t_to_n64x3(int64x1x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __int64x1x4_t_to_n64x4(int64x1x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __int64x2_t_to_n128(int64x2_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __int64x2x2_t_to_n128x2(int64x2x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __int64x2x3_t_to_n128x3(int64x2x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __int64x2x4_t_to_n128x4(int64x2x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n128 __int8x16_t_to_n128(int8x16_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __int8x16x2_t_to_n128x2(int8x16x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __int8x16x3_t_to_n128x3(int8x16x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __int8x16x4_t_to_n128x4(int8x16x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __int8x8_t_to_n64(int8x8_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __int8x8x2_t_to_n64x2(int8x8x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __int8x8x3_t_to_n64x3(int8x8x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __int8x8x4_t_to_n64x4(int8x8x4_t x) { return *(__n64x4 *)(&x); }
// __n128 -> typed 128-bit vector family (float, signed, poly, unsigned).
// Each helper returns its by-value __n128 parameter re-read as the named
// vector type through a pointer cast; lane values are not converted.
__forceinline float32x4_t __n128_to_float32x4_t(__n128 x) { return *(float32x4_t *)(&x); }
__forceinline float64x2_t __n128_to_float64x2_t(__n128 x) { return *(float64x2_t *)(&x); }
__forceinline int16x8_t __n128_to_int16x8_t(__n128 x) { return *(int16x8_t *)(&x); }
__forceinline int32x4_t __n128_to_int32x4_t(__n128 x) { return *(int32x4_t *)(&x); }
__forceinline int64x2_t __n128_to_int64x2_t(__n128 x) { return *(int64x2_t *)(&x); }
__forceinline int8x16_t __n128_to_int8x16_t(__n128 x) { return *(int8x16_t *)(&x); }
__forceinline poly16x8_t __n128_to_poly16x8_t(__n128 x) { return *(poly16x8_t *)(&x); }
__forceinline poly64x2_t __n128_to_poly64x2_t(__n128 x) { return *(poly64x2_t *)(&x); }
__forceinline poly8x16_t __n128_to_poly8x16_t(__n128 x) { return *(poly8x16_t *)(&x); }
__forceinline uint16x8_t __n128_to_uint16x8_t(__n128 x) { return *(uint16x8_t *)(&x); }
__forceinline uint32x4_t __n128_to_uint32x4_t(__n128 x) { return *(uint32x4_t *)(&x); }
__forceinline uint64x2_t __n128_to_uint64x2_t(__n128 x) { return *(uint64x2_t *)(&x); }
__forceinline uint8x16_t __n128_to_uint8x16_t(__n128 x) { return *(uint8x16_t *)(&x); }
// __n128x2 -> typed two-vector aggregate family. Each helper returns its
// by-value __n128x2 parameter re-read as the named <type>x2_t aggregate through
// a pointer cast; lane values are not converted.
__forceinline float32x4x2_t __n128x2_to_float32x4x2_t(__n128x2 x) { return *(float32x4x2_t *)(&x); }
__forceinline float64x2x2_t __n128x2_to_float64x2x2_t(__n128x2 x) { return *(float64x2x2_t *)(&x); }
__forceinline int16x8x2_t __n128x2_to_int16x8x2_t(__n128x2 x) { return *(int16x8x2_t *)(&x); }
__forceinline int32x4x2_t __n128x2_to_int32x4x2_t(__n128x2 x) { return *(int32x4x2_t *)(&x); }
__forceinline int64x2x2_t __n128x2_to_int64x2x2_t(__n128x2 x) { return *(int64x2x2_t *)(&x); }
__forceinline int8x16x2_t __n128x2_to_int8x16x2_t(__n128x2 x) { return *(int8x16x2_t *)(&x); }
__forceinline poly16x8x2_t __n128x2_to_poly16x8x2_t(__n128x2 x) { return *(poly16x8x2_t *)(&x); }
__forceinline poly64x2x2_t __n128x2_to_poly64x2x2_t(__n128x2 x) { return *(poly64x2x2_t *)(&x); }
__forceinline poly8x16x2_t __n128x2_to_poly8x16x2_t(__n128x2 x) { return *(poly8x16x2_t *)(&x); }
__forceinline uint16x8x2_t __n128x2_to_uint16x8x2_t(__n128x2 x) { return *(uint16x8x2_t *)(&x); }
__forceinline uint32x4x2_t __n128x2_to_uint32x4x2_t(__n128x2 x) { return *(uint32x4x2_t *)(&x); }
__forceinline uint64x2x2_t __n128x2_to_uint64x2x2_t(__n128x2 x) { return *(uint64x2x2_t *)(&x); }
__forceinline uint8x16x2_t __n128x2_to_uint8x16x2_t(__n128x2 x) { return *(uint8x16x2_t *)(&x); }
// __n128x3 -> typed three-vector aggregate family. Each helper returns its
// by-value __n128x3 parameter re-read as the named <type>x3_t aggregate through
// a pointer cast; lane values are not converted.
__forceinline float32x4x3_t __n128x3_to_float32x4x3_t(__n128x3 x) { return *(float32x4x3_t *)(&x); }
__forceinline float64x2x3_t __n128x3_to_float64x2x3_t(__n128x3 x) { return *(float64x2x3_t *)(&x); }
__forceinline int16x8x3_t __n128x3_to_int16x8x3_t(__n128x3 x) { return *(int16x8x3_t *)(&x); }
__forceinline int32x4x3_t __n128x3_to_int32x4x3_t(__n128x3 x) { return *(int32x4x3_t *)(&x); }
__forceinline int64x2x3_t __n128x3_to_int64x2x3_t(__n128x3 x) { return *(int64x2x3_t *)(&x); }
__forceinline int8x16x3_t __n128x3_to_int8x16x3_t(__n128x3 x) { return *(int8x16x3_t *)(&x); }
__forceinline poly16x8x3_t __n128x3_to_poly16x8x3_t(__n128x3 x) { return *(poly16x8x3_t *)(&x); }
__forceinline poly64x2x3_t __n128x3_to_poly64x2x3_t(__n128x3 x) { return *(poly64x2x3_t *)(&x); }
__forceinline poly8x16x3_t __n128x3_to_poly8x16x3_t(__n128x3 x) { return *(poly8x16x3_t *)(&x); }
__forceinline uint16x8x3_t __n128x3_to_uint16x8x3_t(__n128x3 x) { return *(uint16x8x3_t *)(&x); }
__forceinline uint32x4x3_t __n128x3_to_uint32x4x3_t(__n128x3 x) { return *(uint32x4x3_t *)(&x); }
__forceinline uint64x2x3_t __n128x3_to_uint64x2x3_t(__n128x3 x) { return *(uint64x2x3_t *)(&x); }
__forceinline uint8x16x3_t __n128x3_to_uint8x16x3_t(__n128x3 x) { return *(uint8x16x3_t *)(&x); }
// __n128x4 -> typed four-vector aggregate family. Each helper returns its
// by-value __n128x4 parameter re-read as the named <type>x4_t aggregate through
// a pointer cast; lane values are not converted.
__forceinline float32x4x4_t __n128x4_to_float32x4x4_t(__n128x4 x) { return *(float32x4x4_t *)(&x); }
__forceinline float64x2x4_t __n128x4_to_float64x2x4_t(__n128x4 x) { return *(float64x2x4_t *)(&x); }
__forceinline int16x8x4_t __n128x4_to_int16x8x4_t(__n128x4 x) { return *(int16x8x4_t *)(&x); }
__forceinline int32x4x4_t __n128x4_to_int32x4x4_t(__n128x4 x) { return *(int32x4x4_t *)(&x); }
__forceinline int64x2x4_t __n128x4_to_int64x2x4_t(__n128x4 x) { return *(int64x2x4_t *)(&x); }
__forceinline int8x16x4_t __n128x4_to_int8x16x4_t(__n128x4 x) { return *(int8x16x4_t *)(&x); }
__forceinline poly16x8x4_t __n128x4_to_poly16x8x4_t(__n128x4 x) { return *(poly16x8x4_t *)(&x); }
__forceinline poly64x2x4_t __n128x4_to_poly64x2x4_t(__n128x4 x) { return *(poly64x2x4_t *)(&x); }
__forceinline poly8x16x4_t __n128x4_to_poly8x16x4_t(__n128x4 x) { return *(poly8x16x4_t *)(&x); }
__forceinline uint16x8x4_t __n128x4_to_uint16x8x4_t(__n128x4 x) { return *(uint16x8x4_t *)(&x); }
__forceinline uint32x4x4_t __n128x4_to_uint32x4x4_t(__n128x4 x) { return *(uint32x4x4_t *)(&x); }
__forceinline uint64x2x4_t __n128x4_to_uint64x2x4_t(__n128x4 x) { return *(uint64x2x4_t *)(&x); }
__forceinline uint8x16x4_t __n128x4_to_uint8x16x4_t(__n128x4 x) { return *(uint8x16x4_t *)(&x); }
// __n64 -> typed 64-bit vector family (float, signed, poly, unsigned).
// Each helper returns its by-value __n64 parameter re-read as the named vector
// type through a pointer cast; lane values are not converted.
__forceinline float32x2_t __n64_to_float32x2_t(__n64 x) { return *(float32x2_t *)(&x); }
__forceinline float64x1_t __n64_to_float64x1_t(__n64 x) { return *(float64x1_t *)(&x); }
__forceinline int16x4_t __n64_to_int16x4_t(__n64 x) { return *(int16x4_t *)(&x); }
__forceinline int32x2_t __n64_to_int32x2_t(__n64 x) { return *(int32x2_t *)(&x); }
__forceinline int64x1_t __n64_to_int64x1_t(__n64 x) { return *(int64x1_t *)(&x); }
__forceinline int8x8_t __n64_to_int8x8_t(__n64 x) { return *(int8x8_t *)(&x); }
__forceinline poly16x4_t __n64_to_poly16x4_t(__n64 x) { return *(poly16x4_t *)(&x); }
__forceinline poly64x1_t __n64_to_poly64x1_t(__n64 x) { return *(poly64x1_t *)(&x); }
__forceinline poly8x8_t __n64_to_poly8x8_t(__n64 x) { return *(poly8x8_t *)(&x); }
__forceinline uint16x4_t __n64_to_uint16x4_t(__n64 x) { return *(uint16x4_t *)(&x); }
__forceinline uint32x2_t __n64_to_uint32x2_t(__n64 x) { return *(uint32x2_t *)(&x); }
__forceinline uint64x1_t __n64_to_uint64x1_t(__n64 x) { return *(uint64x1_t *)(&x); }
__forceinline uint8x8_t __n64_to_uint8x8_t(__n64 x) { return *(uint8x8_t *)(&x); }
// __n64x2 -> typed two-vector aggregate family. Each helper returns its
// by-value __n64x2 parameter re-read as the named <type>x2_t aggregate through
// a pointer cast; lane values are not converted.
__forceinline float32x2x2_t __n64x2_to_float32x2x2_t(__n64x2 x) { return *(float32x2x2_t *)(&x); }
__forceinline float64x1x2_t __n64x2_to_float64x1x2_t(__n64x2 x) { return *(float64x1x2_t *)(&x); }
__forceinline int16x4x2_t __n64x2_to_int16x4x2_t(__n64x2 x) { return *(int16x4x2_t *)(&x); }
__forceinline int32x2x2_t __n64x2_to_int32x2x2_t(__n64x2 x) { return *(int32x2x2_t *)(&x); }
__forceinline int64x1x2_t __n64x2_to_int64x1x2_t(__n64x2 x) { return *(int64x1x2_t *)(&x); }
__forceinline int8x8x2_t __n64x2_to_int8x8x2_t(__n64x2 x) { return *(int8x8x2_t *)(&x); }
__forceinline poly16x4x2_t __n64x2_to_poly16x4x2_t(__n64x2 x) { return *(poly16x4x2_t *)(&x); }
__forceinline poly64x1x2_t __n64x2_to_poly64x1x2_t(__n64x2 x) { return *(poly64x1x2_t *)(&x); }
__forceinline poly8x8x2_t __n64x2_to_poly8x8x2_t(__n64x2 x) { return *(poly8x8x2_t *)(&x); }
__forceinline uint16x4x2_t __n64x2_to_uint16x4x2_t(__n64x2 x) { return *(uint16x4x2_t *)(&x); }
__forceinline uint32x2x2_t __n64x2_to_uint32x2x2_t(__n64x2 x) { return *(uint32x2x2_t *)(&x); }
__forceinline uint64x1x2_t __n64x2_to_uint64x1x2_t(__n64x2 x) { return *(uint64x1x2_t *)(&x); }
__forceinline uint8x8x2_t __n64x2_to_uint8x8x2_t(__n64x2 x) { return *(uint8x8x2_t *)(&x); }
// __n64x3 -> typed three-vector aggregate family. Each helper returns its
// by-value __n64x3 parameter re-read as the named <type>x3_t aggregate through
// a pointer cast; lane values are not converted.
__forceinline float32x2x3_t __n64x3_to_float32x2x3_t(__n64x3 x) { return *(float32x2x3_t *)(&x); }
__forceinline float64x1x3_t __n64x3_to_float64x1x3_t(__n64x3 x) { return *(float64x1x3_t *)(&x); }
__forceinline int16x4x3_t __n64x3_to_int16x4x3_t(__n64x3 x) { return *(int16x4x3_t *)(&x); }
__forceinline int32x2x3_t __n64x3_to_int32x2x3_t(__n64x3 x) { return *(int32x2x3_t *)(&x); }
__forceinline int64x1x3_t __n64x3_to_int64x1x3_t(__n64x3 x) { return *(int64x1x3_t *)(&x); }
__forceinline int8x8x3_t __n64x3_to_int8x8x3_t(__n64x3 x) { return *(int8x8x3_t *)(&x); }
__forceinline poly16x4x3_t __n64x3_to_poly16x4x3_t(__n64x3 x) { return *(poly16x4x3_t *)(&x); }
__forceinline poly64x1x3_t __n64x3_to_poly64x1x3_t(__n64x3 x) { return *(poly64x1x3_t *)(&x); }
__forceinline poly8x8x3_t __n64x3_to_poly8x8x3_t(__n64x3 x) { return *(poly8x8x3_t *)(&x); }
__forceinline uint16x4x3_t __n64x3_to_uint16x4x3_t(__n64x3 x) { return *(uint16x4x3_t *)(&x); }
__forceinline uint32x2x3_t __n64x3_to_uint32x2x3_t(__n64x3 x) { return *(uint32x2x3_t *)(&x); }
__forceinline uint64x1x3_t __n64x3_to_uint64x1x3_t(__n64x3 x) { return *(uint64x1x3_t *)(&x); }
__forceinline uint8x8x3_t __n64x3_to_uint8x8x3_t(__n64x3 x) { return *(uint8x8x3_t *)(&x); }
// __n64x4 -> typed four-vector aggregate family. Each helper returns its
// by-value __n64x4 parameter re-read as the named <type>x4_t aggregate through
// a pointer cast; lane values are not converted.
__forceinline float32x2x4_t __n64x4_to_float32x2x4_t(__n64x4 x) { return *(float32x2x4_t *)(&x); }
__forceinline float64x1x4_t __n64x4_to_float64x1x4_t(__n64x4 x) { return *(float64x1x4_t *)(&x); }
__forceinline int16x4x4_t __n64x4_to_int16x4x4_t(__n64x4 x) { return *(int16x4x4_t *)(&x); }
__forceinline int32x2x4_t __n64x4_to_int32x2x4_t(__n64x4 x) { return *(int32x2x4_t *)(&x); }
__forceinline int64x1x4_t __n64x4_to_int64x1x4_t(__n64x4 x) { return *(int64x1x4_t *)(&x); }
__forceinline int8x8x4_t __n64x4_to_int8x8x4_t(__n64x4 x) { return *(int8x8x4_t *)(&x); }
__forceinline poly16x4x4_t __n64x4_to_poly16x4x4_t(__n64x4 x) { return *(poly16x4x4_t *)(&x); }
__forceinline poly64x1x4_t __n64x4_to_poly64x1x4_t(__n64x4 x) { return *(poly64x1x4_t *)(&x); }
__forceinline poly8x8x4_t __n64x4_to_poly8x8x4_t(__n64x4 x) { return *(poly8x8x4_t *)(&x); }
__forceinline uint16x4x4_t __n64x4_to_uint16x4x4_t(__n64x4 x) { return *(uint16x4x4_t *)(&x); }
__forceinline uint32x2x4_t __n64x4_to_uint32x2x4_t(__n64x4 x) { return *(uint32x2x4_t *)(&x); }
__forceinline uint64x1x4_t __n64x4_to_uint64x1x4_t(__n64x4 x) { return *(uint64x1x4_t *)(&x); }
__forceinline uint8x8x4_t __n64x4_to_uint8x8x4_t(__n64x4 x) { return *(uint8x8x4_t *)(&x); }
// Polynomial vector (poly8/poly16/poly64, plus xN aggregates)
// -> __n64/__n128 family. Each helper returns its by-value parameter's storage
// re-read as the destination type via a pointer cast; lane values are not
// converted.
__forceinline __n64 __poly16x4_t_to_n64(poly16x4_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __poly16x4x2_t_to_n64x2(poly16x4x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __poly16x4x3_t_to_n64x3(poly16x4x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __poly16x4x4_t_to_n64x4(poly16x4x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __poly16x8_t_to_n128(poly16x8_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __poly16x8x2_t_to_n128x2(poly16x8x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __poly16x8x3_t_to_n128x3(poly16x8x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __poly16x8x4_t_to_n128x4(poly16x8x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __poly64x1_t_to_n64(poly64x1_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __poly64x1x2_t_to_n64x2(poly64x1x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __poly64x1x3_t_to_n64x3(poly64x1x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __poly64x1x4_t_to_n64x4(poly64x1x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __poly64x2_t_to_n128(poly64x2_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __poly64x2x2_t_to_n128x2(poly64x2x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __poly64x2x3_t_to_n128x3(poly64x2x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __poly64x2x4_t_to_n128x4(poly64x2x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n128 __poly8x16_t_to_n128(poly8x16_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __poly8x16x2_t_to_n128x2(poly8x16x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __poly8x16x3_t_to_n128x3(poly8x16x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __poly8x16x4_t_to_n128x4(poly8x16x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __poly8x8_t_to_n64(poly8x8_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __poly8x8x2_t_to_n64x2(poly8x8x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __poly8x8x3_t_to_n64x3(poly8x8x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __poly8x8x4_t_to_n64x4(poly8x8x4_t x) { return *(__n64x4 *)(&x); }
// Unsigned-integer vector (uint8/uint16/uint32/uint64, plus xN aggregates)
// -> __n64/__n128 family. Each helper returns its by-value parameter's storage
// re-read as the destination type via a pointer cast; lane values are not
// converted.
__forceinline __n64 __uint16x4_t_to_n64(uint16x4_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __uint16x4x2_t_to_n64x2(uint16x4x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __uint16x4x3_t_to_n64x3(uint16x4x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __uint16x4x4_t_to_n64x4(uint16x4x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __uint16x8_t_to_n128(uint16x8_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __uint16x8x2_t_to_n128x2(uint16x8x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __uint16x8x3_t_to_n128x3(uint16x8x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __uint16x8x4_t_to_n128x4(uint16x8x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __uint32x2_t_to_n64(uint32x2_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __uint32x2x2_t_to_n64x2(uint32x2x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __uint32x2x3_t_to_n64x3(uint32x2x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __uint32x2x4_t_to_n64x4(uint32x2x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __uint32x4_t_to_n128(uint32x4_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __uint32x4x2_t_to_n128x2(uint32x4x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __uint32x4x3_t_to_n128x3(uint32x4x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __uint32x4x4_t_to_n128x4(uint32x4x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __uint64x1_t_to_n64(uint64x1_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __uint64x1x2_t_to_n64x2(uint64x1x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __uint64x1x3_t_to_n64x3(uint64x1x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __uint64x1x4_t_to_n64x4(uint64x1x4_t x) { return *(__n64x4 *)(&x); }
__forceinline __n128 __uint64x2_t_to_n128(uint64x2_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __uint64x2x2_t_to_n128x2(uint64x2x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __uint64x2x3_t_to_n128x3(uint64x2x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __uint64x2x4_t_to_n128x4(uint64x2x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n128 __uint8x16_t_to_n128(uint8x16_t x) { return *(__n128 *)(&x); }
__forceinline __n128x2 __uint8x16x2_t_to_n128x2(uint8x16x2_t x) { return *(__n128x2 *)(&x); }
__forceinline __n128x3 __uint8x16x3_t_to_n128x3(uint8x16x3_t x) { return *(__n128x3 *)(&x); }
__forceinline __n128x4 __uint8x16x4_t_to_n128x4(uint8x16x4_t x) { return *(__n128x4 *)(&x); }
__forceinline __n64 __uint8x8_t_to_n64(uint8x8_t x) { return *(__n64 *)(&x); }
__forceinline __n64x2 __uint8x8x2_t_to_n64x2(uint8x8x2_t x) { return *(__n64x2 *)(&x); }
__forceinline __n64x3 __uint8x8x3_t_to_n64x3(uint8x8x3_t x) { return *(__n64x3 *)(&x); }
__forceinline __n64x4 __uint8x8x4_t_to_n64x4(uint8x8x4_t x) { return *(__n64x4 *)(&x); }

#endif  /* !_ARM64_DISTINCT_NEON_TYPES */

///////////////////////////////////////////////////////////////////////////////
// prototypes

// DUP - register (core register to Neon register)
// Prototypes for the scalar-source DUP intrinsics.
//   neon_dupr*   return __n64;  neon_dupqr*  return __n128.
// The 8-, 16- and 32-bit integer forms all take an __int32 argument, the
// 64-bit integer form takes __int64, and the f32/f64 forms take float/double.
// These are declarations only; no definitions appear in this header section.
__n64  neon_dupr8(__int32);
__n64  neon_dupr16(__int32);
__n64  neon_dupr32(__int32);
__n64  neon_duprf32(float);
__n64  neon_dupr64(__int64);
__n64  neon_duprf64(double);
__n128 neon_dupqr8(__int32);
__n128 neon_dupqr16(__int32);
__n128 neon_dupqr32(__int32);
__n128 neon_dupqrf32(float);
__n128 neon_dupqr64(__int64);
__n128 neon_dupqrf64(double);
// vdup_n_<type>(value): typed wrappers over the neon_dupr* intrinsics.
// Each macro forwards value to the intrinsic for its element width and converts
// the returned __n64 to the typed vector with the matching __n64_to_* helper.
#define vdup_n_f32(value)  __n64_to_float32x2_t(neon_duprf32(value))
#define vdup_n_f64(value)  __n64_to_float64x1_t(neon_duprf64(value))
#define vdup_n_p8(value)   __n64_to_poly8x8_t(neon_dupr8(value))
#define vdup_n_p16(value)  __n64_to_poly16x4_t(neon_dupr16(value))
#define vdup_n_p64(value)  __n64_to_poly64x1_t(neon_dupr64(value))
#define vdup_n_s8(value)   __n64_to_int8x8_t(neon_dupr8(value))
#define vdup_n_s16(value)  __n64_to_int16x4_t(neon_dupr16(value))
#define vdup_n_s32(value)  __n64_to_int32x2_t(neon_dupr32(value))
#define vdup_n_s64(value)  __n64_to_int64x1_t(neon_dupr64(value))
#define vdup_n_u8(value)   __n64_to_uint8x8_t(neon_dupr8(value))
#define vdup_n_u16(value)  __n64_to_uint16x4_t(neon_dupr16(value))
#define vdup_n_u32(value)  __n64_to_uint32x2_t(neon_dupr32(value))
#define vdup_n_u64(value)  __n64_to_uint64x1_t(neon_dupr64(value))
// vdupq_n_<type>(value): typed wrappers over the neon_dupqr* intrinsics.
// Each macro forwards value to the intrinsic for its element width and converts
// the returned __n128 to the typed vector with the matching __n128_to_* helper.
#define vdupq_n_f32(value)  __n128_to_float32x4_t(neon_dupqrf32(value))
#define vdupq_n_f64(value)  __n128_to_float64x2_t(neon_dupqrf64(value))
#define vdupq_n_p8(value)   __n128_to_poly8x16_t(neon_dupqr8(value))
#define vdupq_n_p16(value)  __n128_to_poly16x8_t(neon_dupqr16(value))
#define vdupq_n_p64(value)  __n128_to_poly64x2_t(neon_dupqr64(value))
#define vdupq_n_s8(value)   __n128_to_int8x16_t(neon_dupqr8(value))
#define vdupq_n_s16(value)  __n128_to_int16x8_t(neon_dupqr16(value))
#define vdupq_n_s32(value)  __n128_to_int32x4_t(neon_dupqr32(value))
#define vdupq_n_s64(value)  __n128_to_int64x2_t(neon_dupqr64(value))
#define vdupq_n_u8(value)   __n128_to_uint8x16_t(neon_dupqr8(value))
#define vdupq_n_u16(value)  __n128_to_uint16x8_t(neon_dupqr16(value))
#define vdupq_n_u32(value)  __n128_to_uint32x4_t(neon_dupqr32(value))
#define vdupq_n_u64(value)  __n128_to_uint64x2_t(neon_dupqr64(value))
// vmov_n_<type>(value): same expansions as the vdup_n_* wrappers — value is
// forwarded to the neon_dupr* intrinsic for the element width and the __n64
// result is converted with the matching __n64_to_* helper.
#define vmov_n_f32(value)  __n64_to_float32x2_t(neon_duprf32(value))
#define vmov_n_f64(value)  __n64_to_float64x1_t(neon_duprf64(value))
#define vmov_n_p8(value)   __n64_to_poly8x8_t(neon_dupr8(value))
#define vmov_n_p16(value)  __n64_to_poly16x4_t(neon_dupr16(value))
#define vmov_n_s8(value)   __n64_to_int8x8_t(neon_dupr8(value))
#define vmov_n_s16(value)  __n64_to_int16x4_t(neon_dupr16(value))
#define vmov_n_s32(value)  __n64_to_int32x2_t(neon_dupr32(value))
#define vmov_n_s64(value)  __n64_to_int64x1_t(neon_dupr64(value))
#define vmov_n_u8(value)   __n64_to_uint8x8_t(neon_dupr8(value))
#define vmov_n_u16(value)  __n64_to_uint16x4_t(neon_dupr16(value))
#define vmov_n_u32(value)  __n64_to_uint32x2_t(neon_dupr32(value))
#define vmov_n_u64(value)  __n64_to_uint64x1_t(neon_dupr64(value))
// vmovq_n_<type>(value): same expansions as the vdupq_n_* wrappers — value is
// forwarded to the neon_dupqr* intrinsic for the element width and the __n128
// result is converted with the matching __n128_to_* helper.
#define vmovq_n_f32(value)  __n128_to_float32x4_t(neon_dupqrf32(value))
#define vmovq_n_f64(value)  __n128_to_float64x2_t(neon_dupqrf64(value))
#define vmovq_n_p8(value)   __n128_to_poly8x16_t(neon_dupqr8(value))
#define vmovq_n_p16(value)  __n128_to_poly16x8_t(neon_dupqr16(value))
#define vmovq_n_s8(value)   __n128_to_int8x16_t(neon_dupqr8(value))
#define vmovq_n_s16(value)  __n128_to_int16x8_t(neon_dupqr16(value))
#define vmovq_n_s32(value)  __n128_to_int32x4_t(neon_dupqr32(value))
#define vmovq_n_s64(value)  __n128_to_int64x2_t(neon_dupqr64(value))
#define vmovq_n_u8(value)   __n128_to_uint8x16_t(neon_dupqr8(value))
#define vmovq_n_u16(value)  __n128_to_uint16x8_t(neon_dupqr16(value))
#define vmovq_n_u32(value)  __n128_to_uint32x4_t(neon_dupqr32(value))
#define vmovq_n_u64(value)  __n128_to_uint64x2_t(neon_dupqr64(value))

// DUP - element  (vector element into vector)
// Prototypes for the element-source DUP intrinsics.
//   neon_dupe<N>    : __n64  source -> __n64  result
//   neon_dupe<N>q   : __n128 source -> __n64  result
//   neon_dupqe<N>   : __n64  source -> __n128 result
//   neon_dupqe<N>q  : __n128 source -> __n128 result
// The second parameter is a const __int32; the vdup*_lane* wrapper macros in
// this header pass their lane argument there.
__n64  neon_dupe8(__n64, const __int32);
__n64  neon_dupe16(__n64, const __int32);
__n64  neon_dupe32(__n64, const __int32);
__n64  neon_dupe64(__n64, const __int32);
__n64  neon_dupe8q(__n128, const __int32);
__n64  neon_dupe16q(__n128, const __int32);
__n64  neon_dupe32q(__n128, const __int32);
__n64  neon_dupe64q(__n128, const __int32);
__n128  neon_dupqe8(__n64, const __int32);
__n128  neon_dupqe16(__n64, const __int32);
__n128  neon_dupqe32(__n64, const __int32);
__n128  neon_dupqe64(__n64, const __int32);
__n128  neon_dupqe8q(__n128, const __int32);
__n128  neon_dupqe16q(__n128, const __int32);
__n128  neon_dupqe32q(__n128, const __int32);
__n128  neon_dupqe64q(__n128, const __int32);
// vdup_lane_<type>(vec, lane): 64-bit source, 64-bit result.
// vec is converted to __n64, passed with lane to neon_dupe<width>, and the
// __n64 result is converted back to the same typed vector.
#define vdup_lane_f32(vec, lane)  __n64_to_float32x2_t(neon_dupe32(__float32x2_t_to_n64(vec), (lane)))
#define vdup_lane_f64(vec, lane)  __n64_to_float64x1_t(neon_dupe64(__float64x1_t_to_n64(vec), (lane)))
#define vdup_lane_p8(vec, lane)   __n64_to_poly8x8_t(neon_dupe8(__poly8x8_t_to_n64(vec), (lane)))
#define vdup_lane_p16(vec, lane)  __n64_to_poly16x4_t(neon_dupe16(__poly16x4_t_to_n64(vec), (lane)))
#define vdup_lane_p64(vec, lane)  __n64_to_poly64x1_t(neon_dupe64(__poly64x1_t_to_n64(vec), (lane)))
#define vdup_lane_s8(vec, lane)   __n64_to_int8x8_t(neon_dupe8(__int8x8_t_to_n64(vec), (lane)))
#define vdup_lane_s16(vec, lane)  __n64_to_int16x4_t(neon_dupe16(__int16x4_t_to_n64(vec), (lane)))
#define vdup_lane_s32(vec, lane)  __n64_to_int32x2_t(neon_dupe32(__int32x2_t_to_n64(vec), (lane)))
#define vdup_lane_s64(vec, lane)  __n64_to_int64x1_t(neon_dupe64(__int64x1_t_to_n64(vec), (lane)))
#define vdup_lane_u8(vec, lane)   __n64_to_uint8x8_t(neon_dupe8(__uint8x8_t_to_n64(vec), (lane)))
#define vdup_lane_u16(vec, lane)  __n64_to_uint16x4_t(neon_dupe16(__uint16x4_t_to_n64(vec), (lane)))
#define vdup_lane_u32(vec, lane)  __n64_to_uint32x2_t(neon_dupe32(__uint32x2_t_to_n64(vec), (lane)))
#define vdup_lane_u64(vec, lane)  __n64_to_uint64x1_t(neon_dupe64(__uint64x1_t_to_n64(vec), (lane)))
// vdupq_lane_<type>(vec, lane): 64-bit source, 128-bit result.
// vec is converted to __n64, passed with lane to neon_dupqe<width>, and the
// __n128 result is converted to the typed 128-bit vector.
#define vdupq_lane_f32(vec, lane)  __n128_to_float32x4_t(neon_dupqe32(__float32x2_t_to_n64(vec), (lane)))
#define vdupq_lane_f64(vec, lane)  __n128_to_float64x2_t(neon_dupqe64(__float64x1_t_to_n64(vec), (lane)))
#define vdupq_lane_p8(vec, lane)   __n128_to_poly8x16_t(neon_dupqe8(__poly8x8_t_to_n64(vec), (lane)))
#define vdupq_lane_p16(vec, lane)  __n128_to_poly16x8_t(neon_dupqe16(__poly16x4_t_to_n64(vec), (lane)))
#define vdupq_lane_p64(vec, lane)  __n128_to_poly64x2_t(neon_dupqe64(__poly64x1_t_to_n64(vec), (lane)))
#define vdupq_lane_s8(vec, lane)   __n128_to_int8x16_t(neon_dupqe8(__int8x8_t_to_n64(vec), (lane)))
#define vdupq_lane_s16(vec, lane)  __n128_to_int16x8_t(neon_dupqe16(__int16x4_t_to_n64(vec), (lane)))
#define vdupq_lane_s32(vec, lane)  __n128_to_int32x4_t(neon_dupqe32(__int32x2_t_to_n64(vec), (lane)))
#define vdupq_lane_s64(vec, lane)  __n128_to_int64x2_t(neon_dupqe64(__int64x1_t_to_n64(vec), (lane)))
#define vdupq_lane_u8(vec, lane)   __n128_to_uint8x16_t(neon_dupqe8(__uint8x8_t_to_n64(vec), (lane)))
#define vdupq_lane_u16(vec, lane)  __n128_to_uint16x8_t(neon_dupqe16(__uint16x4_t_to_n64(vec), (lane)))
#define vdupq_lane_u32(vec, lane)  __n128_to_uint32x4_t(neon_dupqe32(__uint32x2_t_to_n64(vec), (lane)))
#define vdupq_lane_u64(vec, lane)  __n128_to_uint64x2_t(neon_dupqe64(__uint64x1_t_to_n64(vec), (lane)))
// vdup_laneq_<type>(vec, lane): 128-bit source, 64-bit result.
// vec is converted to __n128, passed with lane to neon_dupe<width>q, and the
// __n64 result is converted to the typed 64-bit vector.
#define vdup_laneq_f32(vec, lane)  __n64_to_float32x2_t(neon_dupe32q(__float32x4_t_to_n128(vec), (lane)))
#define vdup_laneq_f64(vec, lane)  __n64_to_float64x1_t(neon_dupe64q(__float64x2_t_to_n128(vec), (lane)))
#define vdup_laneq_p8(vec, lane)   __n64_to_poly8x8_t(neon_dupe8q(__poly8x16_t_to_n128(vec), (lane)))
#define vdup_laneq_p16(vec, lane)  __n64_to_poly16x4_t(neon_dupe16q(__poly16x8_t_to_n128(vec), (lane)))
#define vdup_laneq_p64(vec, lane)  __n64_to_poly64x1_t(neon_dupe64q(__poly64x2_t_to_n128(vec), (lane)))
#define vdup_laneq_s8(vec, lane)   __n64_to_int8x8_t(neon_dupe8q(__int8x16_t_to_n128(vec), (lane)))
#define vdup_laneq_s16(vec, lane)  __n64_to_int16x4_t(neon_dupe16q(__int16x8_t_to_n128(vec), (lane)))
#define vdup_laneq_s32(vec, lane)  __n64_to_int32x2_t(neon_dupe32q(__int32x4_t_to_n128(vec), (lane)))
#define vdup_laneq_s64(vec, lane)  __n64_to_int64x1_t(neon_dupe64q(__int64x2_t_to_n128(vec), (lane)))
#define vdup_laneq_u8(vec, lane)   __n64_to_uint8x8_t(neon_dupe8q(__uint8x16_t_to_n128(vec), (lane)))
#define vdup_laneq_u16(vec, lane)  __n64_to_uint16x4_t(neon_dupe16q(__uint16x8_t_to_n128(vec), (lane)))
#define vdup_laneq_u32(vec, lane)  __n64_to_uint32x2_t(neon_dupe32q(__uint32x4_t_to_n128(vec), (lane)))
#define vdup_laneq_u64(vec, lane)  __n64_to_uint64x1_t(neon_dupe64q(__uint64x2_t_to_n128(vec), (lane)))
// vdupq_laneq_<type>(vec, lane): 128-bit source, 128-bit result.
// vec is converted to __n128, passed with lane to neon_dupqe<width>q, and the
// __n128 result is converted back to the same typed vector.
#define vdupq_laneq_f32(vec, lane)  __n128_to_float32x4_t(neon_dupqe32q(__float32x4_t_to_n128(vec), (lane)))
#define vdupq_laneq_f64(vec, lane)  __n128_to_float64x2_t(neon_dupqe64q(__float64x2_t_to_n128(vec), (lane)))
#define vdupq_laneq_p8(vec, lane)   __n128_to_poly8x16_t(neon_dupqe8q(__poly8x16_t_to_n128(vec), (lane)))
#define vdupq_laneq_p16(vec, lane)  __n128_to_poly16x8_t(neon_dupqe16q(__poly16x8_t_to_n128(vec), (lane)))
#define vdupq_laneq_p64(vec, lane)  __n128_to_poly64x2_t(neon_dupqe64q(__poly64x2_t_to_n128(vec), (lane)))
#define vdupq_laneq_s8(vec, lane)   __n128_to_int8x16_t(neon_dupqe8q(__int8x16_t_to_n128(vec), (lane)))
#define vdupq_laneq_s16(vec, lane)  __n128_to_int16x8_t(neon_dupqe16q(__int16x8_t_to_n128(vec), (lane)))
#define vdupq_laneq_s32(vec, lane)  __n128_to_int32x4_t(neon_dupqe32q(__int32x4_t_to_n128(vec), (lane)))
#define vdupq_laneq_s64(vec, lane)  __n128_to_int64x2_t(neon_dupqe64q(__int64x2_t_to_n128(vec), (lane)))
#define vdupq_laneq_u8(vec, lane)   __n128_to_uint8x16_t(neon_dupqe8q(__uint8x16_t_to_n128(vec), (lane)))
#define vdupq_laneq_u16(vec, lane)  __n128_to_uint16x8_t(neon_dupqe16q(__uint16x8_t_to_n128(vec), (lane)))
#define vdupq_laneq_u32(vec, lane)  __n128_to_uint32x4_t(neon_dupqe32q(__uint32x4_t_to_n128(vec), (lane)))
#define vdupq_laneq_u64(vec, lane)  __n128_to_uint64x2_t(neon_dupqe64q(__uint64x2_t_to_n128(vec), (lane)))

// DUP - scalar  (vector element into scalar)
// Prototypes for the element-to-scalar DUP intrinsics. The first parameter is
// the source vector (__n64, or __n128 for the names ending in q) and the second
// is a const __int32; the wrapper macros in this header pass their lane
// argument there.
// Return types by element width: 8 -> __n8, 16 -> __n16, 32 -> float,
// 64 -> __n64. The 32-bit forms return float for every element type; the
// integer wrappers in this header pass that result through
// _CopyInt32FromFloat / _CopyUInt32FromFloat.
__n8   neon_dups8 (__n64, const __int32);
__n16  neon_dups16(__n64, const __int32);
float  neon_dups32(__n64, const __int32);
__n64  neon_dups64(__n64, const __int32);
__n8   neon_dups8q (__n128, const __int32);
__n16  neon_dups16q(__n128, const __int32);
float  neon_dups32q(__n128, const __int32);
__n64  neon_dups64q(__n128, const __int32);
// vget_lane / vgetq_lane for float elements.
// The f32 forms yield the float returned by neon_dups32 / neon_dups32q directly;
// the f64 forms read element 0 of the n64_f64 member of the returned __n64.
#define vget_lane_f32(vec, lane)   neon_dups32(__float32x2_t_to_n64(vec), (lane))
#define vgetq_lane_f32(vec, lane)  neon_dups32q(__float32x4_t_to_n128(vec), (lane))
#define vget_lane_f64(vec, lane)   neon_dups64(__float64x1_t_to_n64(vec), (lane)).n64_f64[0]
#define vgetq_lane_f64(vec, lane)  neon_dups64q(__float64x2_t_to_n128(vec), (lane)).n64_f64[0]
// vdup{b,h,s,d}_lane_<type>(vec, lane): scalar extraction from a 64-bit vector.
// vec is converted to __n64 and passed with lane to neon_dups<width>.
//   8/16/64-bit forms read element 0 of the matching member of the result;
//   f32 yields the returned float as is;
//   s32/u32 pass the returned float through _CopyInt32FromFloat /
//   _CopyUInt32FromFloat.
#define vdups_lane_f32(vec, lane)  neon_dups32(__float32x2_t_to_n64(vec), (lane))
#define vdupd_lane_f64(vec, lane)  neon_dups64(__float64x1_t_to_n64(vec), (lane)).n64_f64[0]
#define vdupb_lane_p8(vec, lane)   neon_dups8(__poly8x8_t_to_n64(vec), (lane)).n8_p8[0]
#define vduph_lane_p16(vec, lane)  neon_dups16(__poly16x4_t_to_n64(vec), (lane)).n16_p16[0]
#define vdupb_lane_s8(vec, lane)   neon_dups8(__int8x8_t_to_n64(vec), (lane)).n8_i8[0]
#define vduph_lane_s16(vec, lane)  neon_dups16(__int16x4_t_to_n64(vec), (lane)).n16_i16[0]
#define vdups_lane_s32(vec, lane)  _CopyInt32FromFloat(neon_dups32(__int32x2_t_to_n64(vec), (lane)))
#define vdupd_lane_s64(vec, lane)  neon_dups64(__int64x1_t_to_n64(vec), (lane)).n64_i64[0]
#define vdupb_lane_u8(vec, lane)   neon_dups8(__uint8x8_t_to_n64(vec), (lane)).n8_u8[0]
#define vduph_lane_u16(vec, lane)  neon_dups16(__uint16x4_t_to_n64(vec), (lane)).n16_u16[0]
#define vdups_lane_u32(vec, lane)  _CopyUInt32FromFloat(neon_dups32(__uint32x2_t_to_n64(vec), (lane)))
#define vdupd_lane_u64(vec, lane)  neon_dups64(__uint64x1_t_to_n64(vec), (lane)).n64_u64[0]
// vdup{b,h,s,d}_laneq_<type>(vec, lane): scalar extraction from a 128-bit vector.
// vec is converted to __n128 and passed with lane to neon_dups<width>q.
//   8/16/64-bit forms read element 0 of the matching member of the result;
//   f32 yields the returned float as is;
//   s32/u32 pass the returned float through _CopyInt32FromFloat /
//   _CopyUInt32FromFloat.
#define vdups_laneq_f32(vec, lane)  neon_dups32q(__float32x4_t_to_n128(vec), (lane))
#define vdupd_laneq_f64(vec, lane)  neon_dups64q(__float64x2_t_to_n128(vec), (lane)).n64_f64[0]
#define vdupb_laneq_p8(vec, lane)   neon_dups8q(__poly8x16_t_to_n128(vec), (lane)).n8_p8[0]
#define vduph_laneq_p16(vec, lane)  neon_dups16q(__poly16x8_t_to_n128(vec), (lane)).n16_p16[0]
#define vdupb_laneq_s8(vec, lane)   neon_dups8q(__int8x16_t_to_n128(vec), (lane)).n8_i8[0]
#define vduph_laneq_s16(vec, lane)  neon_dups16q(__int16x8_t_to_n128(vec), (lane)).n16_i16[0]
#define vdups_laneq_s32(vec, lane)  _CopyInt32FromFloat(neon_dups32q(__int32x4_t_to_n128(vec), (lane)))
#define vdupd_laneq_s64(vec, lane)  neon_dups64q(__int64x2_t_to_n128(vec), (lane)).n64_i64[0]
#define vdupb_laneq_u8(vec, lane)   neon_dups8q(__uint8x16_t_to_n128(vec), (lane)).n8_u8[0]
#define vduph_laneq_u16(vec, lane)  neon_dups16q(__uint16x8_t_to_n128(vec), (lane)).n16_u16[0]
#define vdups_laneq_u32(vec, lane)  _CopyUInt32FromFloat(neon_dups32q(__uint32x4_t_to_n128(vec), (lane)))
#define vdupd_laneq_u64(vec, lane)  neon_dups64q(__uint64x2_t_to_n128(vec), (lane)).n64_u64[0]

// FMOV - to/from general, top half of 128 bits
// The only two forms are these:
//  FMOV <Vd>.D[1], <Xn>
//  FMOV <Xd>, <Vn>.D[1]
//
// fmov_top_half_core takes a 128-bit vector and a 64-bit integer and returns a
// 128-bit vector; fmov_core_top_half takes a 128-bit vector and returns a
// 64-bit integer.
// NOTE(review): presumably these map to the first and second instruction form
// listed above, respectively - confirm against the compiler documentation.
__n128 fmov_top_half_core(__n128, __int64);
__int64 fmov_core_top_half(__n128);

// FMOV - immediate
// Each intrinsic takes a single const floating-point argument and returns a
// vector: __n64 for the "2s" form, __n128 for the "4s" and "2d" forms.
// NOTE(review): neon_fmovi2d is declared with a float parameter although the
// "2d" suffix suggests double-precision lanes - confirm this matches the
// signature the compiler expects before changing it.
__n64  neon_fmovi2s(const float);
__n128 neon_fmovi4s(const float);
__n128 neon_fmovi2d(const float);

// MOVI, MVNI
// Vector-immediate constructors: every intrinsic in this group takes only
// const integer arguments and returns a vector. A "q" in the name marks the
// form that returns __n128; the matching form without it returns __n64.
// The trailing comment on each line describes the immediate layout.
// NOTE(review): the two-argument "shift" forms presumably take
// (immediate, shift amount) in that order - confirm against the compiler
// documentation.
__n64 neon_movidw(const __int64);          // bytemask one doubleword
__n128 neon_moviqdw(const __int64);        // bytemask per doubleword
__n64 neon_movib(const int);               // per byte
__n128 neon_moviqb(const int);             // per byte
__n64 neon_movi_shift1w(const int, const int);   // shift ones per word
__n128 neon_moviq_shift1w(const int, const int); // shift ones per word
__n64 neon_movih(const int);               // per halfword
__n128 neon_moviqh(const int);             // per halfword
__n64 neon_movi_shift0h(const int, const int);   // shift zeroes per halfword
__n128 neon_moviq_shift0h(const int, const int); // shift zeroes per halfword
__n64 neon_moviw(const int);               // per word
__n128 neon_moviqw(const int);             // per word
__n64 neon_movi_shift0w(const int, const int);   // shift zeroes per word
__n128 neon_moviq_shift0w(const int, const int); // shift zeroes per word
// MVNI counterparts of the MOVI forms above (no byte or doubleword variants
// are declared for MVNI).
__n64 neon_mvni_shift1w(const int, const int);   // shift ones per word
__n128 neon_mvniq_shift1w(const int, const int); // shift ones per word
__n64 neon_mvnih(const int);               // per halfword
__n128 neon_mvniqh(const int);             // per halfword
__n64 neon_mvni_shift0h(const int, const int);   // shift zeroes per halfword
__n128 neon_mvniq_shift0h(const int, const int); // shift zeroes per halfword
__n64 neon_mvniw(const int);               // per word
__n128 neon_mvniqw(const int);             // per word
__n64 neon_mvni_shift0w(const int, const int);   // shift zeroes per word
__n128 neon_mvniq_shift0w(const int, const int); // shift zeroes per word

// SMOV/UMOV - (move scalar into core)
// Each intrinsic takes a vector (__n64, or __n128 for the "q" forms) plus a
// const lane index and returns a single integer. The neon_smov* forms return
// signed types and the neon_umov* forms return unsigned types; the digits in
// the name give the element width in bits.
// The neon_smov64_<w> / neon_smov64_q<w> forms read a <w>-bit element but
// return __int64.
__int8  neon_smov8   (__n64, const __int32);
__int8  neon_smovq8  (__n128, const __int32);
__int64 neon_smov64_8   (__n64, const __int32);
__int64 neon_smov64_q8  (__n128, const __int32);
__int16 neon_smov16  (__n64, const __int32);
__int16 neon_smovq16 (__n128, const __int32);
__int64 neon_smov64_16  (__n64, const __int32);
__int64 neon_smov64_q16 (__n128, const __int32);
__int32 neon_smov32  (__n64, const __int32);
__int32 neon_smovq32 (__n128, const __int32);
__int64 neon_smov64_32  (__n64, const __int32);
__int64 neon_smov64_q32 (__n128, const __int32);
__int64 neon_smov64  (__n64, const __int32);
__int64 neon_smovq64 (__n128, const __int32);
// Unsigned counterparts; no widening-to-64 variants are declared for UMOV.
unsigned __int8  neon_umov8   (__n64, const __int32);
unsigned __int8  neon_umovq8  (__n128, const __int32);
unsigned __int16 neon_umov16  (__n64, const __int32);
unsigned __int16 neon_umovq16 (__n128, const __int32);
unsigned __int32 neon_umov32  (__n64, const __int32);
unsigned __int32 neon_umovq32 (__n128, const __int32);
unsigned __int64 neon_umov64  (__n64, const __int32);
unsigned __int64 neon_umovq64 (__n128, const __int32);
// Integer lane reads from a 64-bit vector. Polynomial and unsigned element
// types are routed to neon_umov*, signed element types to neon_smov*.
#define vget_lane_p8(vec, idx)    neon_umov8(__poly8x8_t_to_n64(vec), (idx))
#define vget_lane_s8(vec, idx)    neon_smov8(__int8x8_t_to_n64(vec), (idx))
#define vget_lane_u8(vec, idx)    neon_umov8(__uint8x8_t_to_n64(vec), (idx))
#define vget_lane_p16(vec, idx)   neon_umov16(__poly16x4_t_to_n64(vec), (idx))
#define vget_lane_s16(vec, idx)   neon_smov16(__int16x4_t_to_n64(vec), (idx))
#define vget_lane_u16(vec, idx)   neon_umov16(__uint16x4_t_to_n64(vec), (idx))
#define vget_lane_s32(vec, idx)   neon_smov32(__int32x2_t_to_n64(vec), (idx))
#define vget_lane_u32(vec, idx)   neon_umov32(__uint32x2_t_to_n64(vec), (idx))
#define vget_lane_p64(vec, idx)   neon_umov64(__poly64x1_t_to_n64(vec), (idx))
#define vget_lane_s64(vec, idx)   neon_smov64(__int64x1_t_to_n64(vec), (idx))
#define vget_lane_u64(vec, idx)   neon_umov64(__uint64x1_t_to_n64(vec), (idx))

// Integer lane reads from a 128-bit vector: same routing, "q" intrinsics.
#define vgetq_lane_p8(vec, idx)   neon_umovq8(__poly8x16_t_to_n128(vec), (idx))
#define vgetq_lane_s8(vec, idx)   neon_smovq8(__int8x16_t_to_n128(vec), (idx))
#define vgetq_lane_u8(vec, idx)   neon_umovq8(__uint8x16_t_to_n128(vec), (idx))
#define vgetq_lane_p16(vec, idx)  neon_umovq16(__poly16x8_t_to_n128(vec), (idx))
#define vgetq_lane_s16(vec, idx)  neon_smovq16(__int16x8_t_to_n128(vec), (idx))
#define vgetq_lane_u16(vec, idx)  neon_umovq16(__uint16x8_t_to_n128(vec), (idx))
#define vgetq_lane_s32(vec, idx)  neon_smovq32(__int32x4_t_to_n128(vec), (idx))
#define vgetq_lane_u32(vec, idx)  neon_umovq32(__uint32x4_t_to_n128(vec), (idx))
#define vgetq_lane_p64(vec, idx)  neon_umovq64(__poly64x2_t_to_n128(vec), (idx))
#define vgetq_lane_s64(vec, idx)  neon_smovq64(__int64x2_t_to_n128(vec), (idx))
#define vgetq_lane_u64(vec, idx)  neon_umovq64(__uint64x2_t_to_n128(vec), (idx))

// INS register
// Signature shape: (vector, const lane index, scalar) -> vector of the same
// width as the input vector. neon_insr* work on __n64, neon_insqr* on __n128.
// The 8/16/32-bit integer forms all take the scalar as __int32, the 64-bit
// form takes __int64, and the f32/f64 forms take float/double.
__n64  neon_insr8   (__n64, const __int32, __int32);
__n64  neon_insr16  (__n64, const __int32, __int32);
__n64  neon_insr32  (__n64, const __int32, __int32);
__n64  neon_insr64  (__n64, const __int32, __int64);
__n64  neon_insrf32 (__n64, const __int32, float);
__n64  neon_insrf64 (__n64, const __int32, double);
__n128 neon_insqr8  (__n128, const __int32, __int32);
__n128 neon_insqr16 (__n128, const __int32, __int32);
__n128 neon_insqr32 (__n128, const __int32, __int32);
__n128 neon_insqr64 (__n128, const __int32, __int64);
__n128 neon_insqrf32(__n128, const __int32, float);
__n128 neon_insqrf64(__n128, const __int32, double);
// vset_lane_<type>(value, vec, idx): convert vec to __n64, hand it to the
// matching neon_insr* intrinsic together with idx and value, then convert the
// result back to the typed vector. Note the argument reordering: the macro
// takes the scalar first, the intrinsic takes it last.
#define vset_lane_f32(value, vec, idx)   __n64_to_float32x2_t(neon_insrf32(__float32x2_t_to_n64(vec), (idx), (value)))
#define vset_lane_f64(value, vec, idx)   __n64_to_float64x1_t(neon_insrf64(__float64x1_t_to_n64(vec), (idx), (value)))
#define vset_lane_p16(value, vec, idx)   __n64_to_poly16x4_t(neon_insr16(__poly16x4_t_to_n64(vec), (idx), (value)))
#define vset_lane_p64(value, vec, idx)   __n64_to_poly64x1_t(neon_insr64(__poly64x1_t_to_n64(vec), (idx), (value)))
#define vset_lane_p8(value, vec, idx)    __n64_to_poly8x8_t(neon_insr8(__poly8x8_t_to_n64(vec), (idx), (value)))
#define vset_lane_s16(value, vec, idx)   __n64_to_int16x4_t(neon_insr16(__int16x4_t_to_n64(vec), (idx), (value)))
#define vset_lane_s32(value, vec, idx)   __n64_to_int32x2_t(neon_insr32(__int32x2_t_to_n64(vec), (idx), (value)))
#define vset_lane_s64(value, vec, idx)   __n64_to_int64x1_t(neon_insr64(__int64x1_t_to_n64(vec), (idx), (value)))
#define vset_lane_s8(value, vec, idx)    __n64_to_int8x8_t(neon_insr8(__int8x8_t_to_n64(vec), (idx), (value)))
#define vset_lane_u16(value, vec, idx)   __n64_to_uint16x4_t(neon_insr16(__uint16x4_t_to_n64(vec), (idx), (value)))
#define vset_lane_u32(value, vec, idx)   __n64_to_uint32x2_t(neon_insr32(__uint32x2_t_to_n64(vec), (idx), (value)))
#define vset_lane_u64(value, vec, idx)   __n64_to_uint64x1_t(neon_insr64(__uint64x1_t_to_n64(vec), (idx), (value)))
#define vset_lane_u8(value, vec, idx)    __n64_to_uint8x8_t(neon_insr8(__uint8x8_t_to_n64(vec), (idx), (value)))

// vsetq_lane_<type>: 128-bit counterparts, using neon_insqr* and __n128.
#define vsetq_lane_f32(value, vec, idx)  __n128_to_float32x4_t(neon_insqrf32(__float32x4_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_f64(value, vec, idx)  __n128_to_float64x2_t(neon_insqrf64(__float64x2_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_p16(value, vec, idx)  __n128_to_poly16x8_t(neon_insqr16(__poly16x8_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_p64(value, vec, idx)  __n128_to_poly64x2_t(neon_insqr64(__poly64x2_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_p8(value, vec, idx)   __n128_to_poly8x16_t(neon_insqr8(__poly8x16_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_s16(value, vec, idx)  __n128_to_int16x8_t(neon_insqr16(__int16x8_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_s32(value, vec, idx)  __n128_to_int32x4_t(neon_insqr32(__int32x4_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_s64(value, vec, idx)  __n128_to_int64x2_t(neon_insqr64(__int64x2_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_s8(value, vec, idx)   __n128_to_int8x16_t(neon_insqr8(__int8x16_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_u16(value, vec, idx)  __n128_to_uint16x8_t(neon_insqr16(__uint16x8_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_u32(value, vec, idx)  __n128_to_uint32x4_t(neon_insqr32(__uint32x4_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_u64(value, vec, idx)  __n128_to_uint64x2_t(neon_insqr64(__uint64x2_t_to_n128(vec), (idx), (value)))
#define vsetq_lane_u8(value, vec, idx)   __n128_to_uint8x16_t(neon_insqr8(__uint8x16_t_to_n128(vec), (idx), (value)))

// INS element
// Signature shape: (vector, const lane index, vector, const lane index) ->
// vector with the same width as the first argument. The digits in the name
// give the element width in bits. Width combinations by name:
//   neon_inse<w>    first __n64,  second __n64
//   neon_insqe<w>   first __n128, second __n64
//   neon_inse<w>q   first __n64,  second __n128
//   neon_insqe<w>q  first __n128, second __n128
// NOTE(review): presumably the first vector/lane pair is the destination and
// the second pair is the source element - confirm against the compiler docs.
__n64  neon_inse8    (__n64, const __int32, __n64, const __int32);
__n128 neon_insqe8   (__n128, const __int32, __n64, const __int32);
__n64  neon_inse8q   (__n64, const __int32, __n128, const __int32);
__n128 neon_insqe8q  (__n128, const __int32, __n128, const __int32);
__n64  neon_inse16   (__n64, const __int32, __n64, const __int32);
__n128 neon_insqe16  (__n128, const __int32, __n64, const __int32);
__n64  neon_inse16q  (__n64, const __int32, __n128, const __int32);
__n128 neon_insqe16q (__n128, const __int32, __n128, const __int32);
__n64  neon_inse32   (__n64, const __int32, __n64, const __int32);
__n128 neon_insqe32  (__n128, const __int32, __n64, const __int32);
__n64  neon_inse32q  (__n64, const __int32, __n128, const __int32);
__n128 neon_insqe32q (__n128, const __int32, __n128, const __int32);
__n64  neon_inse64   (__n64, const __int32, __n64, const __int32);
__n128 neon_insqe64  (__n128, const __int32, __n64, const __int32);
__n64  neon_inse64q  (__n64, const __int32, __n128, const __int32);
__n128 neon_insqe64q (__n128, const __int32, __n128, const __int32);
// vcopy_lane_<type>: both operands are 64-bit vectors (neon_inse<w>).
// Arguments are forwarded in the order given; the result has the type of the
// first vector operand.
#define vcopy_lane_s8(dst, dst_idx, src, src_idx) \
    __n64_to_int8x8_t(neon_inse8(__int8x8_t_to_n64(dst), (dst_idx), __int8x8_t_to_n64(src), (src_idx)))
#define vcopy_lane_s16(dst, dst_idx, src, src_idx) \
    __n64_to_int16x4_t(neon_inse16(__int16x4_t_to_n64(dst), (dst_idx), __int16x4_t_to_n64(src), (src_idx)))
#define vcopy_lane_s32(dst, dst_idx, src, src_idx) \
    __n64_to_int32x2_t(neon_inse32(__int32x2_t_to_n64(dst), (dst_idx), __int32x2_t_to_n64(src), (src_idx)))
#define vcopy_lane_s64(dst, dst_idx, src, src_idx) \
    __n64_to_int64x1_t(neon_inse64(__int64x1_t_to_n64(dst), (dst_idx), __int64x1_t_to_n64(src), (src_idx)))
#define vcopy_lane_u8(dst, dst_idx, src, src_idx) \
    __n64_to_uint8x8_t(neon_inse8(__uint8x8_t_to_n64(dst), (dst_idx), __uint8x8_t_to_n64(src), (src_idx)))
#define vcopy_lane_u16(dst, dst_idx, src, src_idx) \
    __n64_to_uint16x4_t(neon_inse16(__uint16x4_t_to_n64(dst), (dst_idx), __uint16x4_t_to_n64(src), (src_idx)))
#define vcopy_lane_u32(dst, dst_idx, src, src_idx) \
    __n64_to_uint32x2_t(neon_inse32(__uint32x2_t_to_n64(dst), (dst_idx), __uint32x2_t_to_n64(src), (src_idx)))
#define vcopy_lane_u64(dst, dst_idx, src, src_idx) \
    __n64_to_uint64x1_t(neon_inse64(__uint64x1_t_to_n64(dst), (dst_idx), __uint64x1_t_to_n64(src), (src_idx)))
#define vcopy_lane_p64(dst, dst_idx, src, src_idx) \
    __n64_to_poly64x1_t(neon_inse64(__poly64x1_t_to_n64(dst), (dst_idx), __poly64x1_t_to_n64(src), (src_idx)))
#define vcopy_lane_f32(dst, dst_idx, src, src_idx) \
    __n64_to_float32x2_t(neon_inse32(__float32x2_t_to_n64(dst), (dst_idx), __float32x2_t_to_n64(src), (src_idx)))
#define vcopy_lane_f64(dst, dst_idx, src, src_idx) \
    __n64_to_float64x1_t(neon_inse64(__float64x1_t_to_n64(dst), (dst_idx), __float64x1_t_to_n64(src), (src_idx)))
#define vcopy_lane_p8(dst, dst_idx, src, src_idx) \
    __n64_to_poly8x8_t(neon_inse8(__poly8x8_t_to_n64(dst), (dst_idx), __poly8x8_t_to_n64(src), (src_idx)))
#define vcopy_lane_p16(dst, dst_idx, src, src_idx) \
    __n64_to_poly16x4_t(neon_inse16(__poly16x4_t_to_n64(dst), (dst_idx), __poly16x4_t_to_n64(src), (src_idx)))

// vcopy_laneq_<type>: 64-bit first operand, 128-bit second operand
// (neon_inse<w>q).
#define vcopy_laneq_s8(dst, dst_idx, src, src_idx) \
    __n64_to_int8x8_t(neon_inse8q(__int8x8_t_to_n64(dst), (dst_idx), __int8x16_t_to_n128(src), (src_idx)))
#define vcopy_laneq_s16(dst, dst_idx, src, src_idx) \
    __n64_to_int16x4_t(neon_inse16q(__int16x4_t_to_n64(dst), (dst_idx), __int16x8_t_to_n128(src), (src_idx)))
#define vcopy_laneq_s32(dst, dst_idx, src, src_idx) \
    __n64_to_int32x2_t(neon_inse32q(__int32x2_t_to_n64(dst), (dst_idx), __int32x4_t_to_n128(src), (src_idx)))
#define vcopy_laneq_s64(dst, dst_idx, src, src_idx) \
    __n64_to_int64x1_t(neon_inse64q(__int64x1_t_to_n64(dst), (dst_idx), __int64x2_t_to_n128(src), (src_idx)))
#define vcopy_laneq_u8(dst, dst_idx, src, src_idx) \
    __n64_to_uint8x8_t(neon_inse8q(__uint8x8_t_to_n64(dst), (dst_idx), __uint8x16_t_to_n128(src), (src_idx)))
#define vcopy_laneq_u16(dst, dst_idx, src, src_idx) \
    __n64_to_uint16x4_t(neon_inse16q(__uint16x4_t_to_n64(dst), (dst_idx), __uint16x8_t_to_n128(src), (src_idx)))
#define vcopy_laneq_u32(dst, dst_idx, src, src_idx) \
    __n64_to_uint32x2_t(neon_inse32q(__uint32x2_t_to_n64(dst), (dst_idx), __uint32x4_t_to_n128(src), (src_idx)))
#define vcopy_laneq_u64(dst, dst_idx, src, src_idx) \
    __n64_to_uint64x1_t(neon_inse64q(__uint64x1_t_to_n64(dst), (dst_idx), __uint64x2_t_to_n128(src), (src_idx)))
#define vcopy_laneq_p64(dst, dst_idx, src, src_idx) \
    __n64_to_poly64x1_t(neon_inse64q(__poly64x1_t_to_n64(dst), (dst_idx), __poly64x2_t_to_n128(src), (src_idx)))
#define vcopy_laneq_f32(dst, dst_idx, src, src_idx) \
    __n64_to_float32x2_t(neon_inse32q(__float32x2_t_to_n64(dst), (dst_idx), __float32x4_t_to_n128(src), (src_idx)))
#define vcopy_laneq_f64(dst, dst_idx, src, src_idx) \
    __n64_to_float64x1_t(neon_inse64q(__float64x1_t_to_n64(dst), (dst_idx), __float64x2_t_to_n128(src), (src_idx)))
#define vcopy_laneq_p8(dst, dst_idx, src, src_idx) \
    __n64_to_poly8x8_t(neon_inse8q(__poly8x8_t_to_n64(dst), (dst_idx), __poly8x16_t_to_n128(src), (src_idx)))
#define vcopy_laneq_p16(dst, dst_idx, src, src_idx) \
    __n64_to_poly16x4_t(neon_inse16q(__poly16x4_t_to_n64(dst), (dst_idx), __poly16x8_t_to_n128(src), (src_idx)))

// vcopyq_lane_<type>: 128-bit first operand, 64-bit second operand
// (neon_insqe<w>).
#define vcopyq_lane_s8(dst, dst_idx, src, src_idx) \
    __n128_to_int8x16_t(neon_insqe8(__int8x16_t_to_n128(dst), (dst_idx), __int8x8_t_to_n64(src), (src_idx)))
#define vcopyq_lane_s16(dst, dst_idx, src, src_idx) \
    __n128_to_int16x8_t(neon_insqe16(__int16x8_t_to_n128(dst), (dst_idx), __int16x4_t_to_n64(src), (src_idx)))
#define vcopyq_lane_s32(dst, dst_idx, src, src_idx) \
    __n128_to_int32x4_t(neon_insqe32(__int32x4_t_to_n128(dst), (dst_idx), __int32x2_t_to_n64(src), (src_idx)))
#define vcopyq_lane_s64(dst, dst_idx, src, src_idx) \
    __n128_to_int64x2_t(neon_insqe64(__int64x2_t_to_n128(dst), (dst_idx), __int64x1_t_to_n64(src), (src_idx)))
#define vcopyq_lane_u8(dst, dst_idx, src, src_idx) \
    __n128_to_uint8x16_t(neon_insqe8(__uint8x16_t_to_n128(dst), (dst_idx), __uint8x8_t_to_n64(src), (src_idx)))
#define vcopyq_lane_u16(dst, dst_idx, src, src_idx) \
    __n128_to_uint16x8_t(neon_insqe16(__uint16x8_t_to_n128(dst), (dst_idx), __uint16x4_t_to_n64(src), (src_idx)))
#define vcopyq_lane_u32(dst, dst_idx, src, src_idx) \
    __n128_to_uint32x4_t(neon_insqe32(__uint32x4_t_to_n128(dst), (dst_idx), __uint32x2_t_to_n64(src), (src_idx)))
#define vcopyq_lane_u64(dst, dst_idx, src, src_idx) \
    __n128_to_uint64x2_t(neon_insqe64(__uint64x2_t_to_n128(dst), (dst_idx), __uint64x1_t_to_n64(src), (src_idx)))
#define vcopyq_lane_p64(dst, dst_idx, src, src_idx) \
    __n128_to_poly64x2_t(neon_insqe64(__poly64x2_t_to_n128(dst), (dst_idx), __poly64x1_t_to_n64(src), (src_idx)))
#define vcopyq_lane_f32(dst, dst_idx, src, src_idx) \
    __n128_to_float32x4_t(neon_insqe32(__float32x4_t_to_n128(dst), (dst_idx), __float32x2_t_to_n64(src), (src_idx)))
#define vcopyq_lane_f64(dst, dst_idx, src, src_idx) \
    __n128_to_float64x2_t(neon_insqe64(__float64x2_t_to_n128(dst), (dst_idx), __float64x1_t_to_n64(src), (src_idx)))
#define vcopyq_lane_p8(dst, dst_idx, src, src_idx) \
    __n128_to_poly8x16_t(neon_insqe8(__poly8x16_t_to_n128(dst), (dst_idx), __poly8x8_t_to_n64(src), (src_idx)))
#define vcopyq_lane_p16(dst, dst_idx, src, src_idx) \
    __n128_to_poly16x8_t(neon_insqe16(__poly16x8_t_to_n128(dst), (dst_idx), __poly16x4_t_to_n64(src), (src_idx)))

// vcopyq_laneq_<type>: both operands are 128-bit vectors (neon_insqe<w>q).
#define vcopyq_laneq_s8(dst, dst_idx, src, src_idx) \
    __n128_to_int8x16_t(neon_insqe8q(__int8x16_t_to_n128(dst), (dst_idx), __int8x16_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_s16(dst, dst_idx, src, src_idx) \
    __n128_to_int16x8_t(neon_insqe16q(__int16x8_t_to_n128(dst), (dst_idx), __int16x8_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_s32(dst, dst_idx, src, src_idx) \
    __n128_to_int32x4_t(neon_insqe32q(__int32x4_t_to_n128(dst), (dst_idx), __int32x4_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_s64(dst, dst_idx, src, src_idx) \
    __n128_to_int64x2_t(neon_insqe64q(__int64x2_t_to_n128(dst), (dst_idx), __int64x2_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_u8(dst, dst_idx, src, src_idx) \
    __n128_to_uint8x16_t(neon_insqe8q(__uint8x16_t_to_n128(dst), (dst_idx), __uint8x16_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_u16(dst, dst_idx, src, src_idx) \
    __n128_to_uint16x8_t(neon_insqe16q(__uint16x8_t_to_n128(dst), (dst_idx), __uint16x8_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_u32(dst, dst_idx, src, src_idx) \
    __n128_to_uint32x4_t(neon_insqe32q(__uint32x4_t_to_n128(dst), (dst_idx), __uint32x4_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_u64(dst, dst_idx, src, src_idx) \
    __n128_to_uint64x2_t(neon_insqe64q(__uint64x2_t_to_n128(dst), (dst_idx), __uint64x2_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_p64(dst, dst_idx, src, src_idx) \
    __n128_to_poly64x2_t(neon_insqe64q(__poly64x2_t_to_n128(dst), (dst_idx), __poly64x2_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_f32(dst, dst_idx, src, src_idx) \
    __n128_to_float32x4_t(neon_insqe32q(__float32x4_t_to_n128(dst), (dst_idx), __float32x4_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_f64(dst, dst_idx, src, src_idx) \
    __n128_to_float64x2_t(neon_insqe64q(__float64x2_t_to_n128(dst), (dst_idx), __float64x2_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_p8(dst, dst_idx, src, src_idx) \
    __n128_to_poly8x16_t(neon_insqe8q(__poly8x16_t_to_n128(dst), (dst_idx), __poly8x16_t_to_n128(src), (src_idx)))
#define vcopyq_laneq_p16(dst, dst_idx, src, src_idx) \
    __n128_to_poly16x8_t(neon_insqe16q(__poly16x8_t_to_n128(dst), (dst_idx), __poly16x8_t_to_n128(src), (src_idx)))

// NOT, MVN
// One untyped intrinsic per vector width; element size does not appear in the
// intrinsic name, so every vmvn* wrapper of a given width shares it.
__n64  neon_not  (__n64);
__n128 neon_notq (__n128);

// 64-bit vectors: convert to __n64, apply neon_not, convert back.
#define vmvn_p8(vec)    __n64_to_poly8x8_t(neon_not(__poly8x8_t_to_n64(vec)))
#define vmvn_s16(vec)   __n64_to_int16x4_t(neon_not(__int16x4_t_to_n64(vec)))
#define vmvn_s32(vec)   __n64_to_int32x2_t(neon_not(__int32x2_t_to_n64(vec)))
#define vmvn_s8(vec)    __n64_to_int8x8_t(neon_not(__int8x8_t_to_n64(vec)))
#define vmvn_u16(vec)   __n64_to_uint16x4_t(neon_not(__uint16x4_t_to_n64(vec)))
#define vmvn_u32(vec)   __n64_to_uint32x2_t(neon_not(__uint32x2_t_to_n64(vec)))
#define vmvn_u8(vec)    __n64_to_uint8x8_t(neon_not(__uint8x8_t_to_n64(vec)))

// 128-bit vectors: convert to __n128, apply neon_notq, convert back.
#define vmvnq_p8(vec)   __n128_to_poly8x16_t(neon_notq(__poly8x16_t_to_n128(vec)))
#define vmvnq_s16(vec)  __n128_to_int16x8_t(neon_notq(__int16x8_t_to_n128(vec)))
#define vmvnq_s32(vec)  __n128_to_int32x4_t(neon_notq(__int32x4_t_to_n128(vec)))
#define vmvnq_s8(vec)   __n128_to_int8x16_t(neon_notq(__int8x16_t_to_n128(vec)))
#define vmvnq_u16(vec)  __n128_to_uint16x8_t(neon_notq(__uint16x8_t_to_n128(vec)))
#define vmvnq_u32(vec)  __n128_to_uint32x4_t(neon_notq(__uint32x4_t_to_n128(vec)))
#define vmvnq_u8(vec)   __n128_to_uint8x16_t(neon_notq(__uint8x16_t_to_n128(vec)))

// FNEG/NEG/SQNEG
// Unary intrinsics: one operand in, one result of the same type out.
// Naming: neon_fneg* (FNEG), neon_neg* (NEG), neon_sqneg* (SQNEG); a "q" before
// the digits marks the __n128 form, and the digits give the element width.
__n64 neon_fneg16(__n64);
__n64 neon_fneg32(__n64);
__n64 neon_fneg64(__n64);
__n128 neon_fnegq16(__n128);
__n128 neon_fnegq32(__n128);
__n128 neon_fnegq64(__n128);
__n64 neon_neg8(__n64);
__n128 neon_negq8(__n128);
__n64 neon_neg16(__n64);
__n128 neon_negq16(__n128);
__n64 neon_neg32(__n64);
__n128 neon_negq32(__n128);
__n64 neon_neg64(__n64);
__n128 neon_negq64(__n128);
__n64 neon_sqneg8(__n64);
__n128 neon_sqnegq8(__n128);
__n64 neon_sqneg16(__n64);
__n128 neon_sqnegq16(__n128);
__n64 neon_sqneg32(__n64);
__n128 neon_sqnegq32(__n128);
__n64 neon_sqneg64(__n64);
__n128 neon_sqnegq64(__n128);
// Scalar forms ("s" before the digits). The 8- and 16-bit forms use the
// __n8/__n16 wrappers, the 64-bit forms use __n64, and the 32-bit form is
// declared on float; the vqnegs_s32 wrapper bit-copies its __int32 argument
// through _CopyFloatFromInt32 / _CopyInt32FromFloat around the call.
__n8  neon_sqnegs8(__n8);
__n16 neon_sqnegs16(__n16);
float neon_sqnegs32(float);
__n64 neon_sqnegs64(__n64);
__n64 neon_negs64(__n64);
// Floating-point negate wrappers (neon_fneg*).
#define vneg_f32(vec)    __n64_to_float32x2_t(neon_fneg32(__float32x2_t_to_n64(vec)))
#define vnegq_f32(vec)   __n128_to_float32x4_t(neon_fnegq32(__float32x4_t_to_n128(vec)))
#define vneg_f64(vec)    __n64_to_float64x1_t(neon_fneg64(__float64x1_t_to_n64(vec)))
#define vnegq_f64(vec)   __n128_to_float64x2_t(neon_fnegq64(__float64x2_t_to_n128(vec)))

// Signed-integer wrappers, grouped by element width: vneg* -> neon_neg*,
// vqneg* -> neon_sqneg*.
#define vneg_s8(vec)     __n64_to_int8x8_t(neon_neg8(__int8x8_t_to_n64(vec)))
#define vnegq_s8(vec)    __n128_to_int8x16_t(neon_negq8(__int8x16_t_to_n128(vec)))
#define vqneg_s8(vec)    __n64_to_int8x8_t(neon_sqneg8(__int8x8_t_to_n64(vec)))
#define vqnegq_s8(vec)   __n128_to_int8x16_t(neon_sqnegq8(__int8x16_t_to_n128(vec)))
#define vneg_s16(vec)    __n64_to_int16x4_t(neon_neg16(__int16x4_t_to_n64(vec)))
#define vnegq_s16(vec)   __n128_to_int16x8_t(neon_negq16(__int16x8_t_to_n128(vec)))
#define vqneg_s16(vec)   __n64_to_int16x4_t(neon_sqneg16(__int16x4_t_to_n64(vec)))
#define vqnegq_s16(vec)  __n128_to_int16x8_t(neon_sqnegq16(__int16x8_t_to_n128(vec)))
#define vneg_s32(vec)    __n64_to_int32x2_t(neon_neg32(__int32x2_t_to_n64(vec)))
#define vnegq_s32(vec)   __n128_to_int32x4_t(neon_negq32(__int32x4_t_to_n128(vec)))
#define vqneg_s32(vec)   __n64_to_int32x2_t(neon_sqneg32(__int32x2_t_to_n64(vec)))
#define vqnegq_s32(vec)  __n128_to_int32x4_t(neon_sqnegq32(__int32x4_t_to_n128(vec)))
#define vneg_s64(vec)    __n64_to_int64x1_t(neon_neg64(__int64x1_t_to_n64(vec)))
#define vnegq_s64(vec)   __n128_to_int64x2_t(neon_negq64(__int64x2_t_to_n128(vec)))
#define vqneg_s64(vec)   __n64_to_int64x1_t(neon_sqneg64(__int64x1_t_to_n64(vec)))
#define vqnegq_s64(vec)  __n128_to_int64x2_t(neon_sqnegq64(__int64x2_t_to_n128(vec)))

// Scalar wrappers: wrap the integer, call the scalar intrinsic, read element 0
// (or bit-copy through float for the 32-bit form).
#define vqnegb_s8(val)   neon_sqnegs8(__int8ToN8_v(val)).n8_i8[0]
#define vqnegh_s16(val)  neon_sqnegs16(__int16ToN16_v(val)).n16_i16[0]
#define vqnegs_s32(val)  _CopyInt32FromFloat(neon_sqnegs32(_CopyFloatFromInt32(val)))
#define vnegd_s64(val)   neon_negs64(__int64ToN64_v(val)).n64_i64[0]
#define vqnegd_s64(val)  neon_sqnegs64(__int64ToN64_v(val)).n64_i64[0]

// FABS/ABS/SQABS
// Unary intrinsics: one operand in, one result of the same type out.
// Naming: neon_fabs* (FABS), neon_abs* (ABS), neon_sqabs* (SQABS); a "q" before
// the digits marks the __n128 form, and the digits give the element width.
__n64 neon_fabs16(__n64);
__n128 neon_fabsq16(__n128);
__n64 neon_fabs32(__n64);
__n128 neon_fabsq32(__n128);
__n64 neon_fabs64(__n64);
__n128 neon_fabsq64(__n128);
__n64 neon_abs8(__n64);
__n128 neon_absq8(__n128);
__n64 neon_abs16(__n64);
__n128 neon_absq16(__n128);
__n64 neon_abs32(__n64);
__n128 neon_absq32(__n128);
__n64 neon_abs64(__n64);
__n128 neon_absq64(__n128);
__n64 neon_sqabs8(__n64);
__n128 neon_sqabsq8(__n128);
__n64 neon_sqabs16(__n64);
__n128 neon_sqabsq16(__n128);
__n64 neon_sqabs32(__n64);
__n64 neon_sqabs64(__n64);
__n128 neon_sqabsq32(__n128);
__n128 neon_sqabsq64(__n128);
// Scalar forms ("s" before the digits). The 8- and 16-bit forms use the
// __n8/__n16 wrappers, the 64-bit forms use __n64, and the 32-bit form is
// declared on float; the vqabss_s32 wrapper bit-copies its __int32 argument
// through _CopyFloatFromInt32 / _CopyInt32FromFloat around the call.
__n8  neon_sqabss8(__n8);
__n16 neon_sqabss16(__n16);
float neon_sqabss32(float);
__n64 neon_sqabss64(__n64);
__n64 neon_abss64(__n64);
// Floating-point absolute-value wrappers (neon_fabs*).
#define vabs_f32(vec)    __n64_to_float32x2_t(neon_fabs32(__float32x2_t_to_n64(vec)))
#define vabs_f64(vec)    __n64_to_float64x1_t(neon_fabs64(__float64x1_t_to_n64(vec)))
#define vabsq_f32(vec)   __n128_to_float32x4_t(neon_fabsq32(__float32x4_t_to_n128(vec)))
#define vabsq_f64(vec)   __n128_to_float64x2_t(neon_fabsq64(__float64x2_t_to_n128(vec)))

// Signed-integer wrappers, grouped by element width: vabs* -> neon_abs*,
// vqabs* -> neon_sqabs*.
#define vabs_s8(vec)     __n64_to_int8x8_t(neon_abs8(__int8x8_t_to_n64(vec)))
#define vabsq_s8(vec)    __n128_to_int8x16_t(neon_absq8(__int8x16_t_to_n128(vec)))
#define vqabs_s8(vec)    __n64_to_int8x8_t(neon_sqabs8(__int8x8_t_to_n64(vec)))
#define vqabsq_s8(vec)   __n128_to_int8x16_t(neon_sqabsq8(__int8x16_t_to_n128(vec)))
#define vabs_s16(vec)    __n64_to_int16x4_t(neon_abs16(__int16x4_t_to_n64(vec)))
#define vabsq_s16(vec)   __n128_to_int16x8_t(neon_absq16(__int16x8_t_to_n128(vec)))
#define vqabs_s16(vec)   __n64_to_int16x4_t(neon_sqabs16(__int16x4_t_to_n64(vec)))
#define vqabsq_s16(vec)  __n128_to_int16x8_t(neon_sqabsq16(__int16x8_t_to_n128(vec)))
#define vabs_s32(vec)    __n64_to_int32x2_t(neon_abs32(__int32x2_t_to_n64(vec)))
#define vabsq_s32(vec)   __n128_to_int32x4_t(neon_absq32(__int32x4_t_to_n128(vec)))
#define vqabs_s32(vec)   __n64_to_int32x2_t(neon_sqabs32(__int32x2_t_to_n64(vec)))
#define vqabsq_s32(vec)  __n128_to_int32x4_t(neon_sqabsq32(__int32x4_t_to_n128(vec)))
#define vabs_s64(vec)    __n64_to_int64x1_t(neon_abs64(__int64x1_t_to_n64(vec)))
#define vabsq_s64(vec)   __n128_to_int64x2_t(neon_absq64(__int64x2_t_to_n128(vec)))
#define vqabs_s64(vec)   __n64_to_int64x1_t(neon_sqabs64(__int64x1_t_to_n64(vec)))
#define vqabsq_s64(vec)  __n128_to_int64x2_t(neon_sqabsq64(__int64x2_t_to_n128(vec)))

// Scalar wrappers: wrap the integer, call the scalar intrinsic, read element 0
// (or bit-copy through float for the 32-bit form).
#define vqabsb_s8(val)   neon_sqabss8(__int8ToN8_v(val)).n8_i8[0]
#define vqabsh_s16(val)  neon_sqabss16(__int16ToN16_v(val)).n16_i16[0]
#define vqabss_s32(val)  _CopyInt32FromFloat(neon_sqabss32(_CopyFloatFromInt32(val)))
#define vabsd_s64(val)   neon_abss64(__int64ToN64_v(val)).n64_i64[0]
#define vqabsd_s64(val)  neon_sqabss64(__int64ToN64_v(val)).n64_i64[0]

// ADD, FADD, SQADD, UQADD, SUQADD, USQADD
// Binary intrinsics: two operands of the same type in, one result of that type
// out. A "q" before the digits marks the __n128 form; the digits give the
// element width in bits.
//
// Floating-point add.
__n64  neon_fadd16(__n64, __n64);
__n64  neon_fadd32(__n64, __n64);
__n64  neon_fadd64(__n64, __n64);
__n128 neon_faddq16(__n128, __n128);
__n128 neon_faddq32(__n128, __n128);
__n128 neon_faddq64(__n128, __n128);
// Integer add. No 64-bit __n64 vector form is declared in this group; the
// vadd_s64 / vadd_u64 wrappers use the scalar neon_adds64 instead.
__n64  neon_add8(__n64, __n64);
__n128 neon_addq8(__n128, __n128);
__n64  neon_add16(__n64, __n64);
__n128 neon_addq16(__n128, __n128);
__n64  neon_add32(__n64, __n64);
__n128 neon_addq32(__n128, __n128);
__n128 neon_addq64(__n128, __n128);
// SQADD (vector forms).
__n64  neon_sqadd8(__n64, __n64);
__n128 neon_sqaddq8(__n128, __n128);
__n64  neon_sqadd16(__n64, __n64);
__n128 neon_sqaddq16(__n128, __n128);
__n64  neon_sqadd32(__n64, __n64);
__n128 neon_sqaddq32(__n128, __n128);
__n128 neon_sqaddq64(__n128, __n128);
// UQADD (vector forms).
__n64  neon_uqadd8(__n64, __n64);
__n128 neon_uqaddq8(__n128, __n128);
__n64  neon_uqadd16(__n64, __n64);
__n128 neon_uqaddq16(__n128, __n128);
__n64  neon_uqadd32(__n64, __n64);
__n128 neon_uqaddq32(__n128, __n128);
__n128 neon_uqaddq64(__n128, __n128);
// SUQADD (vector forms).
__n64  neon_suqadd8(__n64, __n64);
__n128 neon_suqaddq8(__n128, __n128);
__n64  neon_suqadd16(__n64, __n64);
__n128 neon_suqaddq16(__n128, __n128);
__n64  neon_suqadd32(__n64, __n64);
__n64  neon_suqadd64(__n64, __n64);
__n128 neon_suqaddq32(__n128, __n128);
__n128 neon_suqaddq64(__n128, __n128);
// USQADD (vector forms).
__n64  neon_usqadd8(__n64, __n64);
__n128 neon_usqaddq8(__n128, __n128);
__n64  neon_usqadd16(__n64, __n64);
__n128 neon_usqaddq16(__n128, __n128);
__n64  neon_usqadd32(__n64, __n64);
__n64  neon_usqadd64(__n64, __n64);
__n128 neon_usqaddq32(__n128, __n128);
__n128 neon_usqaddq64(__n128, __n128);
// Scalar forms ("s" before the digits). The 8- and 16-bit forms use the
// __n8/__n16 wrappers, the 64-bit forms use __n64, and the 32-bit forms are
// declared on float; wrappers such as vqadds_s32 bit-copy their integer
// arguments through _CopyFloatFromInt32 / _CopyInt32FromFloat around the call.
__n64 neon_adds64(__n64, __n64);
__n64 neon_sqadds64(__n64, __n64);
float neon_sqadds32(float, float);
__n16 neon_sqadds16(__n16, __n16);
__n8  neon_sqadds8(__n8, __n8);
__n64 neon_uqadds64(__n64, __n64);
float neon_uqadds32(float, float);
__n16 neon_uqadds16(__n16, __n16);
__n8  neon_uqadds8(__n8, __n8);
__n8  neon_suqadds8(__n8, __n8);
__n16 neon_suqadds16(__n16, __n16);
float neon_suqadds32(float, float);
__n64 neon_suqadds64(__n64, __n64);
__n8  neon_usqadds8(__n8, __n8);
__n16 neon_usqadds16(__n16, __n16);
float neon_usqadds32(float, float);
__n64 neon_usqadds64(__n64, __n64);
#define vadd_s8(src1, src2)    __n64_to_int8x8_t(neon_add8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vadd_u8(src1, src2)    __n64_to_uint8x8_t(neon_add8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vadd_p8(src1, src2)    __n64_to_poly8x8_t(neon_add8(__poly8x8_t_to_n64(src1), __poly8x8_t_to_n64(src2)))
#define vadd_s16(src1, src2)   __n64_to_int16x4_t(neon_add16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vadd_u16(src1, src2)   __n64_to_uint16x4_t(neon_add16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vadd_p16(src1, src2)   __n64_to_poly16x4_t(neon_add16(__poly16x4_t_to_n64(src1), __poly16x4_t_to_n64(src2)))
#define vadd_s32(src1, src2)   __n64_to_int32x2_t(neon_add32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vadd_u32(src1, src2)   __n64_to_uint32x2_t(neon_add32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vadd_f32(src1, src2)   __n64_to_float32x2_t(neon_fadd32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vadd_f64(src1, src2)   __n64_to_float64x1_t(neon_fadd64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
#define vadd_s64(src1, src2)   __n64_to_int64x1_t(neon_adds64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vadd_u64(src1, src2)   __n64_to_uint64x1_t(neon_adds64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define vadd_p64(src1, src2)   __n64_to_poly64x1_t(neon_adds64(__poly64x1_t_to_n64(src1), __poly64x1_t_to_n64(src2)))
#define vaddd_s64(src1, src2)  neon_adds64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vaddd_u64(src1, src2)  neon_adds64(__uint64ToN64_v(src1), __uint64ToN64_v(src2)).n64_u64[0]
#define vaddq_s8(src1, src2)   __n128_to_int8x16_t(neon_addq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vaddq_u8(src1, src2)   __n128_to_uint8x16_t(neon_addq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vaddq_p8(src1, src2)   __n128_to_poly8x16_t(neon_addq8(__poly8x16_t_to_n128(src1), __poly8x16_t_to_n128(src2)))
#define vaddq_s16(src1, src2)  __n128_to_int16x8_t(neon_addq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vaddq_u16(src1, src2)  __n128_to_uint16x8_t(neon_addq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vaddq_p16(src1, src2)  __n128_to_poly16x8_t(neon_addq16(__poly16x8_t_to_n128(src1), __poly16x8_t_to_n128(src2)))
#define vaddq_s32(src1, src2)  __n128_to_int32x4_t(neon_addq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vaddq_u32(src1, src2)  __n128_to_uint32x4_t(neon_addq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vaddq_f32(src1, src2)  __n128_to_float32x4_t(neon_faddq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vaddq_f64(src1, src2)  __n128_to_float64x2_t(neon_faddq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))
#define vaddq_s64(src1, src2)  __n128_to_int64x2_t(neon_addq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vaddq_u64(src1, src2)  __n128_to_uint64x2_t(neon_addq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
#define vaddq_p64(src1, src2)  __n128_to_poly64x2_t(neon_addq64(__poly64x2_t_to_n128(src1), __poly64x2_t_to_n128(src2)))
// vqadd* wrappers around the neon_sqadd* (signed) / neon_uqadd* (unsigned)
// intrinsics. NOTE(review): the names indicate saturating addition
// (SQADD/UQADD) -- confirm against the ACLE reference.
//   - vqadd_<t>  : 64-bit vectors, operands/result carried as __n64.
//   - vqaddq_<t> : 128-bit vectors, operands/result carried as __n128.
//   - vqadd{b,h,s,d}_<t> : scalar forms (see below).
// The single-lane 64-bit vector forms (vqadd_s64 / vqadd_u64) call the scalar
// neon_sqadds64 / neon_uqadds64 intrinsics.
#define vqadd_s8(src1, src2)   __n64_to_int8x8_t(neon_sqadd8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vqadd_u8(src1, src2)   __n64_to_uint8x8_t(neon_uqadd8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vqadd_s16(src1, src2)  __n64_to_int16x4_t(neon_sqadd16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqadd_u16(src1, src2)  __n64_to_uint16x4_t(neon_uqadd16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vqadd_s32(src1, src2)  __n64_to_int32x2_t(neon_sqadd32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqadd_u32(src1, src2)  __n64_to_uint32x2_t(neon_uqadd32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vqadd_s64(src1, src2)  __n64_to_int64x1_t(neon_sqadds64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vqadd_u64(src1, src2)  __n64_to_uint64x1_t(neon_uqadds64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define vqaddq_s8(src1, src2)  __n128_to_int8x16_t(neon_sqaddq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vqaddq_u8(src1, src2)  __n128_to_uint8x16_t(neon_uqaddq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vqaddq_s16(src1, src2) __n128_to_int16x8_t(neon_sqaddq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqaddq_u16(src1, src2) __n128_to_uint16x8_t(neon_uqaddq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vqaddq_s32(src1, src2) __n128_to_int32x4_t(neon_sqaddq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqaddq_u32(src1, src2) __n128_to_uint32x4_t(neon_uqaddq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vqaddq_s64(src1, src2) __n128_to_int64x2_t(neon_sqaddq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vqaddq_u64(src1, src2) __n128_to_uint64x2_t(neon_uqaddq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
// Scalar forms. The 8/16/64-bit variants box each operand with the matching
// __*ToN*_v helper and read element 0 of the signed/unsigned member of the
// returned value. The 32-bit variants move the integers through `float`
// with the _CopyFloatFrom* / _Copy*FromFloat helpers (presumably bit-preserving
// copies -- confirm against the helper definitions).
#define vqaddb_s8(src1, src2) neon_sqadds8(__int8ToN8_v(src1), __int8ToN8_v(src2)).n8_i8[0]
#define vqaddh_s16(src1, src2) neon_sqadds16(__int16ToN16_v(src1), __int16ToN16_v(src2)).n16_i16[0]
#define vqadds_s32(src1, src2) _CopyInt32FromFloat(neon_sqadds32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2)))
#define vqaddd_s64(src1, src2) neon_sqadds64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vqaddb_u8(src1, src2) neon_uqadds8(__uint8ToN8_v(src1), __uint8ToN8_v(src2)).n8_u8[0]
#define vqaddh_u16(src1, src2) neon_uqadds16(__uint16ToN16_v(src1), __uint16ToN16_v(src2)).n16_u16[0]
#define vqadds_u32(src1, src2) _CopyUInt32FromFloat(neon_uqadds32(_CopyFloatFromUInt32(src1), _CopyFloatFromUInt32(src2)))
#define vqaddd_u64(src1, src2) neon_uqadds64(__uint64ToN64_v(src1), __uint64ToN64_v(src2)).n64_u64[0]
// Mixed-signedness add wrappers. The two operands deliberately use different
// conversion helpers:
//   - vuqadd*_s<N> : src1 signed, src2 unsigned, result signed   (neon_suqadd*)
//   - vsqadd*_u<N> : src1 unsigned, src2 signed, result unsigned (neon_usqadd*)
// NOTE(review): the intrinsic names suggest the SUQADD / USQADD saturating
// accumulate instructions -- confirm against the ACLE reference.
// NOTE(review): the 64-bit single-lane vector forms call neon_suqadd64 /
// neon_usqadd64, while the scalar forms call neon_suqadds64 / neon_usqadds64;
// the prototypes are outside this part of the file -- verify both exist.
#define vuqadd_s8(src1, src2) __n64_to_int8x8_t(neon_suqadd8(__int8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vuqadd_s16(src1, src2) __n64_to_int16x4_t(neon_suqadd16(__int16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vuqadd_s32(src1, src2) __n64_to_int32x2_t(neon_suqadd32(__int32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vuqadd_s64(src1, src2) __n64_to_int64x1_t(neon_suqadd64(__int64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define vuqaddq_s8(src1, src2) __n128_to_int8x16_t(neon_suqaddq8(__int8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vuqaddq_s16(src1, src2) __n128_to_int16x8_t(neon_suqaddq16(__int16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vuqaddq_s32(src1, src2) __n128_to_int32x4_t(neon_suqaddq32(__int32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vuqaddq_s64(src1, src2) __n128_to_int64x2_t(neon_suqaddq64(__int64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
#define vsqadd_u8(src1, src2) __n64_to_uint8x8_t(neon_usqadd8(__uint8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vsqadd_u16(src1, src2) __n64_to_uint16x4_t(neon_usqadd16(__uint16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vsqadd_u32(src1, src2) __n64_to_uint32x2_t(neon_usqadd32(__uint32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vsqadd_u64(src1, src2) __n64_to_uint64x1_t(neon_usqadd64(__uint64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vsqaddq_u8(src1, src2) __n128_to_uint8x16_t(neon_usqaddq8(__uint8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vsqaddq_u16(src1, src2) __n128_to_uint16x8_t(neon_usqaddq16(__uint16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vsqaddq_u32(src1, src2) __n128_to_uint32x4_t(neon_usqaddq32(__uint32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vsqaddq_u64(src1, src2) __n128_to_uint64x2_t(neon_usqaddq64(__uint64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
// Scalar forms: operands are boxed (or, for 32-bit, passed through `float`)
// with the helper matching each operand's own signedness; element 0 of the
// result member matching the result signedness is returned.
#define vsqaddb_u8(src1, src2) neon_usqadds8(__uint8ToN8_v(src1), __int8ToN8_v(src2)).n8_u8[0]
#define vsqaddh_u16(src1, src2) neon_usqadds16(__uint16ToN16_v(src1), __int16ToN16_v(src2)).n16_u16[0]
#define vsqadds_u32(src1, src2) _CopyUInt32FromFloat(neon_usqadds32(_CopyFloatFromUInt32(src1), _CopyFloatFromInt32(src2)))
#define vsqaddd_u64(src1, src2) neon_usqadds64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0]
#define vuqaddb_s8(src1, src2) neon_suqadds8(__int8ToN8_v(src1), __uint8ToN8_v(src2)).n8_i8[0]
#define vuqaddh_s16(src1, src2) neon_suqadds16(__int16ToN16_v(src1), __uint16ToN16_v(src2)).n16_i16[0]
#define vuqadds_s32(src1, src2) _CopyInt32FromFloat(neon_suqadds32(_CopyFloatFromInt32(src1), _CopyFloatFromUInt32(src2)))
#define vuqaddd_s64(src1, src2) neon_suqadds64(__int64ToN64_v(src1), __uint64ToN64_v(src2)).n64_i64[0]

// SUB, FSUB, SQSUB, UQSUB
// Untyped prototypes for the subtraction intrinsics used by the vsub* / vqsub*
// macros. Naming as visible in the declarations below:
//   - neon_<op><width>   takes and returns __n64  (64-bit vector form)
//   - neon_<op>q<width>  takes and returns __n128 (128-bit vector form)
//   - neon_<op>s<width>  scalar form: __n8 / __n16 / __n64 carriers, and
//                        `float` as the carrier for the 32-bit variants
// There is no 64-bit-lane __n64 vector prototype for sub/sqsub/uqsub; the
// single-lane 64-bit wrappers use the scalar neon_*s64 intrinsics instead.
// Floating-point subtraction (neon_fsub*):
__n64  neon_fsub16(__n64, __n64);
__n128 neon_fsubq16(__n128, __n128);
__n64  neon_fsub32(__n64, __n64);
__n128 neon_fsubq32(__n128, __n128);
__n64  neon_fsub64(__n64, __n64);
__n128 neon_fsubq64(__n128, __n128);
// Integer subtraction (neon_sub*):
__n64  neon_sub8(__n64, __n64);
__n128 neon_subq8(__n128, __n128);
__n64  neon_sub16(__n64, __n64);
__n128 neon_subq16(__n128, __n128);
__n64  neon_sub32(__n64, __n64);
__n128 neon_subq32(__n128, __n128);
__n128 neon_subq64(__n128, __n128);
// neon_sqsub*: used by the signed vqsub* wrappers.
__n64  neon_sqsub8(__n64, __n64);
__n128 neon_sqsubq8(__n128, __n128);
__n64  neon_sqsub16(__n64, __n64);
__n128 neon_sqsubq16(__n128, __n128);
__n64  neon_sqsub32(__n64, __n64);
__n128 neon_sqsubq32(__n128, __n128);
__n128 neon_sqsubq64(__n128, __n128);
// neon_uqsub*: used by the unsigned vqsub* wrappers.
__n64  neon_uqsub8(__n64, __n64);
__n128 neon_uqsubq8(__n128, __n128);
__n64  neon_uqsub16(__n64, __n64);
__n128 neon_uqsubq16(__n128, __n128);
__n64  neon_uqsub32(__n64, __n64);
__n128 neon_uqsubq32(__n128, __n128);
__n128 neon_uqsubq64(__n128, __n128);
// Scalar forms:
__n64 neon_subs64(__n64, __n64);
__n64 neon_sqsubs64(__n64, __n64);
float neon_sqsubs32(float, float);
__n16 neon_sqsubs16(__n16, __n16);
__n8  neon_sqsubs8(__n8, __n8);
__n64 neon_uqsubs64(__n64, __n64);
float neon_uqsubs32(float, float);
__n16 neon_uqsubs16(__n16, __n16);
__n8  neon_uqsubs8(__n8, __n8);
// vsub_<t> / vsubq_<t> / vsubd_<t>: typed subtraction wrappers.
// Each macro converts both operands to the untyped carrier (__n64 for the
// 64-bit forms, __n128 for the `q` forms), calls neon_sub* for integer lanes
// or neon_fsub* for floating-point lanes, and converts the result back.
// Signed and unsigned variants share the same integer intrinsic; only the
// conversion helpers differ.
#define vsub_s8(src1, src2)    __n64_to_int8x8_t(neon_sub8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vsub_u8(src1, src2)    __n64_to_uint8x8_t(neon_sub8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vsub_s16(src1, src2)   __n64_to_int16x4_t(neon_sub16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vsub_u16(src1, src2)   __n64_to_uint16x4_t(neon_sub16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vsub_s32(src1, src2)   __n64_to_int32x2_t(neon_sub32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vsub_u32(src1, src2)   __n64_to_uint32x2_t(neon_sub32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vsub_f32(src1, src2)   __n64_to_float32x2_t(neon_fsub32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
// The single-lane 64-bit vector forms use the scalar neon_subs64 intrinsic.
#define vsub_s64(src1, src2)   __n64_to_int64x1_t(neon_subs64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vsub_u64(src1, src2)   __n64_to_uint64x1_t(neon_subs64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
// Scalar 64-bit forms: box the integers with __*ToN64_v and read element 0 of
// the matching signed/unsigned member of the result.
#define vsubd_s64(src1, src2)  neon_subs64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vsubd_u64(src1, src2)  neon_subs64(__uint64ToN64_v(src1), __uint64ToN64_v(src2)).n64_u64[0]
#define vsub_f64(src1, src2)   __n64_to_float64x1_t(neon_fsub64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
// 128-bit forms.
#define vsubq_s8(src1, src2)   __n128_to_int8x16_t(neon_subq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vsubq_u8(src1, src2)   __n128_to_uint8x16_t(neon_subq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vsubq_s16(src1, src2)  __n128_to_int16x8_t(neon_subq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vsubq_u16(src1, src2)  __n128_to_uint16x8_t(neon_subq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vsubq_s32(src1, src2)  __n128_to_int32x4_t(neon_subq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vsubq_u32(src1, src2)  __n128_to_uint32x4_t(neon_subq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vsubq_f32(src1, src2)  __n128_to_float32x4_t(neon_fsubq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vsubq_s64(src1, src2)  __n128_to_int64x2_t(neon_subq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vsubq_u64(src1, src2)  __n128_to_uint64x2_t(neon_subq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
#define vsubq_f64(src1, src2)  __n128_to_float64x2_t(neon_fsubq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))
// vqsub* wrappers around the neon_sqsub* (signed) / neon_uqsub* (unsigned)
// intrinsics. NOTE(review): the names indicate saturating subtraction
// (SQSUB/UQSUB) -- confirm against the ACLE reference.
//   - vqsub_<t>  : 64-bit vectors carried as __n64.
//   - vqsubq_<t> : 128-bit vectors carried as __n128.
//   - vqsub{b,h,s,d}_<t> : scalar forms (see below).
// The single-lane 64-bit vector forms (vqsub_s64 / vqsub_u64) call the scalar
// neon_sqsubs64 / neon_uqsubs64 intrinsics.
#define vqsub_s8(src1, src2)   __n64_to_int8x8_t(neon_sqsub8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vqsub_u8(src1, src2)   __n64_to_uint8x8_t(neon_uqsub8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vqsub_s16(src1, src2)  __n64_to_int16x4_t(neon_sqsub16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqsub_u16(src1, src2)  __n64_to_uint16x4_t(neon_uqsub16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vqsub_s32(src1, src2)  __n64_to_int32x2_t(neon_sqsub32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqsub_u32(src1, src2)  __n64_to_uint32x2_t(neon_uqsub32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vqsub_s64(src1, src2)  __n64_to_int64x1_t(neon_sqsubs64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vqsub_u64(src1, src2)  __n64_to_uint64x1_t(neon_uqsubs64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define vqsubq_s8(src1, src2)  __n128_to_int8x16_t(neon_sqsubq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vqsubq_u8(src1, src2)  __n128_to_uint8x16_t(neon_uqsubq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vqsubq_s16(src1, src2) __n128_to_int16x8_t(neon_sqsubq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqsubq_u16(src1, src2) __n128_to_uint16x8_t(neon_uqsubq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vqsubq_s32(src1, src2) __n128_to_int32x4_t(neon_sqsubq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqsubq_u32(src1, src2) __n128_to_uint32x4_t(neon_uqsubq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vqsubq_s64(src1, src2) __n128_to_int64x2_t(neon_sqsubq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vqsubq_u64(src1, src2) __n128_to_uint64x2_t(neon_uqsubq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
// Scalar forms. The 8/16/64-bit variants box the operands with __*ToN*_v and
// read element 0 of the result; the 32-bit variants use `float` as the
// carrier (matching the float-typed neon_sqsubs32 / neon_uqsubs32 prototypes)
// via the _CopyFloatFrom* / _Copy*FromFloat helpers.
#define vqsubb_s8(src1, src2) neon_sqsubs8(__int8ToN8_v(src1), __int8ToN8_v(src2)).n8_i8[0]
#define vqsubh_s16(src1, src2) neon_sqsubs16(__int16ToN16_v(src1), __int16ToN16_v(src2)).n16_i16[0]
#define vqsubs_s32(src1, src2) _CopyInt32FromFloat(neon_sqsubs32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2)))
#define vqsubd_s64(src1, src2) neon_sqsubs64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vqsubb_u8(src1, src2) neon_uqsubs8(__uint8ToN8_v(src1), __uint8ToN8_v(src2)).n8_u8[0]
#define vqsubh_u16(src1, src2) neon_uqsubs16(__uint16ToN16_v(src1), __uint16ToN16_v(src2)).n16_u16[0]
#define vqsubs_u32(src1, src2) _CopyUInt32FromFloat(neon_uqsubs32(_CopyFloatFromUInt32(src1), _CopyFloatFromUInt32(src2)))
#define vqsubd_u64(src1, src2) neon_uqsubs64(__uint64ToN64_v(src1), __uint64ToN64_v(src2)).n64_u64[0]

// SH(R)ADD, UH(R)ADD and SUB
// i.e. the prototypes below cover SHADD/SRHADD, UHADD/URHADD and SHSUB/UHSUB.
// Each intrinsic exists for 8/16/32-bit lanes in a 64-bit (__n64) and a
// 128-bit (__n128, `q`) form; no 64-bit-lane variants are declared here.
// They back the vhadd* / vrhadd* / vhsub* wrapper macros.
// neon_shadd*: used by the signed vhadd* wrappers.
__n64  neon_shadd8(__n64, __n64);
__n64  neon_shadd16(__n64, __n64);
__n64  neon_shadd32(__n64, __n64);
__n128 neon_shaddq8(__n128, __n128);
__n128 neon_shaddq16(__n128, __n128);
__n128 neon_shaddq32(__n128, __n128);
// neon_srhadd*: used by the signed vrhadd* wrappers.
__n64  neon_srhadd8(__n64, __n64);
__n64  neon_srhadd16(__n64, __n64);
__n64  neon_srhadd32(__n64, __n64);
__n128 neon_srhaddq8(__n128, __n128);
__n128 neon_srhaddq16(__n128, __n128);
__n128 neon_srhaddq32(__n128, __n128);
// neon_uhadd*: used by the unsigned vhadd* wrappers.
__n64  neon_uhadd8(__n64, __n64);
__n64  neon_uhadd16(__n64, __n64);
__n64  neon_uhadd32(__n64, __n64);
__n128 neon_uhaddq8(__n128, __n128);
__n128 neon_uhaddq16(__n128, __n128);
__n128 neon_uhaddq32(__n128, __n128);
// neon_urhadd*: used by the unsigned vrhadd* wrappers.
__n64  neon_urhadd8(__n64, __n64);
__n64  neon_urhadd16(__n64, __n64);
__n64  neon_urhadd32(__n64, __n64);
__n128 neon_urhaddq8(__n128, __n128);
__n128 neon_urhaddq16(__n128, __n128);
__n128 neon_urhaddq32(__n128, __n128);
// neon_shsub*: used by the signed vhsub* wrappers.
__n64  neon_shsub8(__n64, __n64);
__n64  neon_shsub16(__n64, __n64);
__n64  neon_shsub32(__n64, __n64);
__n128 neon_shsubq8(__n128, __n128);
__n128 neon_shsubq16(__n128, __n128);
__n128 neon_shsubq32(__n128, __n128);
// neon_uhsub*: used by the unsigned vhsub* wrappers.
__n64  neon_uhsub8(__n64, __n64);
__n64  neon_uhsub16(__n64, __n64);
__n64  neon_uhsub32(__n64, __n64);
__n128 neon_uhsubq8(__n128, __n128);
__n128 neon_uhsubq16(__n128, __n128);
__n128 neon_uhsubq32(__n128, __n128);
// Typed wrappers for the halving add/sub intrinsics:
//   vhadd*_s / vhadd*_u   -> neon_shadd*  / neon_uhadd*
//   vrhadd*_s / vrhadd*_u -> neon_srhadd* / neon_urhadd*
//   vhsub*_s / vhsub*_u   -> neon_shsub*  / neon_uhsub*
// Unlike plain add/sub, signed and unsigned element types map to different
// intrinsics. Operands are converted to __n64 / __n128, the intrinsic is
// called, and the result is converted back to the operand type.
// NOTE(review): "halving" / "rounding halving" semantics are inferred from the
// instruction names -- confirm against the ACLE reference.
#define vhadd_s8(src1, src2)    __n64_to_int8x8_t(neon_shadd8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vhadd_s16(src1, src2)   __n64_to_int16x4_t(neon_shadd16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vhadd_s32(src1, src2)   __n64_to_int32x2_t(neon_shadd32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vhaddq_s8(src1, src2)   __n128_to_int8x16_t(neon_shaddq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vhaddq_s16(src1, src2)  __n128_to_int16x8_t(neon_shaddq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vhaddq_s32(src1, src2)  __n128_to_int32x4_t(neon_shaddq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vrhadd_s8(src1, src2)   __n64_to_int8x8_t(neon_srhadd8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vrhadd_s16(src1, src2)  __n64_to_int16x4_t(neon_srhadd16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vrhadd_s32(src1, src2)  __n64_to_int32x2_t(neon_srhadd32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vrhaddq_s8(src1, src2)  __n128_to_int8x16_t(neon_srhaddq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vrhaddq_s16(src1, src2) __n128_to_int16x8_t(neon_srhaddq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vrhaddq_s32(src1, src2) __n128_to_int32x4_t(neon_srhaddq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vhadd_u8(src1, src2)    __n64_to_uint8x8_t(neon_uhadd8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vhadd_u16(src1, src2)   __n64_to_uint16x4_t(neon_uhadd16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vhadd_u32(src1, src2)   __n64_to_uint32x2_t(neon_uhadd32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vhaddq_u8(src1, src2)   __n128_to_uint8x16_t(neon_uhaddq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vhaddq_u16(src1, src2)  __n128_to_uint16x8_t(neon_uhaddq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vhaddq_u32(src1, src2)  __n128_to_uint32x4_t(neon_uhaddq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vrhadd_u8(src1, src2)   __n64_to_uint8x8_t(neon_urhadd8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vrhadd_u16(src1, src2)  __n64_to_uint16x4_t(neon_urhadd16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vrhadd_u32(src1, src2)  __n64_to_uint32x2_t(neon_urhadd32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vrhaddq_u8(src1, src2)  __n128_to_uint8x16_t(neon_urhaddq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vrhaddq_u16(src1, src2) __n128_to_uint16x8_t(neon_urhaddq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vrhaddq_u32(src1, src2) __n128_to_uint32x4_t(neon_urhaddq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vhsub_s8(src1, src2)    __n64_to_int8x8_t(neon_shsub8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vhsub_s16(src1, src2)   __n64_to_int16x4_t(neon_shsub16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vhsub_s32(src1, src2)   __n64_to_int32x2_t(neon_shsub32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vhsubq_s8(src1, src2)   __n128_to_int8x16_t(neon_shsubq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vhsubq_s16(src1, src2)  __n128_to_int16x8_t(neon_shsubq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vhsubq_s32(src1, src2)  __n128_to_int32x4_t(neon_shsubq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vhsub_u8(src1, src2)    __n64_to_uint8x8_t(neon_uhsub8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vhsub_u16(src1, src2)   __n64_to_uint16x4_t(neon_uhsub16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vhsub_u32(src1, src2)   __n64_to_uint32x2_t(neon_uhsub32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vhsubq_u8(src1, src2)   __n128_to_uint8x16_t(neon_uhsubq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vhsubq_u16(src1, src2)  __n128_to_uint16x8_t(neon_uhsubq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vhsubq_u32(src1, src2)  __n128_to_uint32x4_t(neon_uhsubq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))

// ADDP/FADDP
// Prototypes for the pairwise-add intrinsics (integer neon_addp*, floating
// point neon_faddp*). Shapes visible in the declarations:
//   - two-operand vector forms: (__n64, __n64) -> __n64 and
//     (__n128, __n128) -> __n128
//   - forms with an `s` after "addp" return a single value: neon_addps64 and
//     neon_faddpsq64 take one __n128 and return __n64; neon_faddps32 takes one
//     __n64 and returns float; neon_faddpsq32 takes two __n128 and returns float.
__n64  neon_addp8  (__n64, __n64);
__n64  neon_addp16 (__n64, __n64);
__n64  neon_addp32 (__n64, __n64);
__n64  neon_addps64(__n128);
__n128 neon_addpq8 (__n128, __n128);
__n128 neon_addpq16(__n128, __n128);
__n128 neon_addpq32(__n128, __n128);
__n128 neon_addpq64(__n128, __n128);
__n64  neon_faddp16(__n64, __n64);
__n64  neon_faddp32(__n64, __n64);
float  neon_faddps32(__n64);
float  neon_faddpsq32(__n128, __n128);
__n128 neon_faddpq16 (__n128, __n128);
__n128 neon_faddpq32 (__n128, __n128);
__n128 neon_faddpq64 (__n128, __n128);
__n64 neon_faddpsq64(__n128);
// vpadd_<t> / vpaddq_<t>: typed wrappers for the pairwise-add intrinsics.
// Integer element types use neon_addp* / neon_addpq* (shared between signed
// and unsigned), floating-point types use neon_faddp* / neon_faddpq*.
// Operands are converted to __n64 / __n128 and the result converted back to
// the operand type.
#define vpadd_s8(src1, src2)  __n64_to_int8x8_t(neon_addp8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vpadd_u8(src1, src2)  __n64_to_uint8x8_t(neon_addp8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vpadd_s16(src1, src2) __n64_to_int16x4_t(neon_addp16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vpadd_u16(src1, src2) __n64_to_uint16x4_t(neon_addp16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vpadd_s32(src1, src2) __n64_to_int32x2_t(neon_addp32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vpadd_u32(src1, src2) __n64_to_uint32x2_t(neon_addp32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vpadd_f32(src1, src2) __n64_to_float32x2_t(neon_faddp32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vpaddq_s8(src1, src2) __n128_to_int8x16_t(neon_addpq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vpaddq_u8(src1, src2) __n128_to_uint8x16_t(neon_addpq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vpaddq_s16(src1, src2) __n128_to_int16x8_t(neon_addpq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vpaddq_u16(src1, src2) __n128_to_uint16x8_t(neon_addpq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vpaddq_s32(src1, src2) __n128_to_int32x4_t(neon_addpq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vpaddq_u32(src1, src2) __n128_to_uint32x4_t(neon_addpq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vpaddq_s64(src1, src2) __n128_to_int64x2_t(neon_addpq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vpaddq_u64(src1, src2) __n128_to_uint64x2_t(neon_addpq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
#define vpaddq_f32(src1, src2) __n128_to_float32x4_t(neon_faddpq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vpaddq_f64(src1, src2) __n128_to_float64x2_t(neon_faddpq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))

// ADDV/SADDLV/UADDLV
// Prototypes for the across-vector add intrinsics. Each takes one vector
// (__n64, or __n128 for the `q` forms) and returns a single value in a scalar
// carrier: __n8 / __n16 / __n64, or `float` when the result is 32 bits wide.
// The saddlv* / uaddlv* forms return a carrier twice as wide as the lane
// width in their name (8 -> __n16, 16 -> float, 32 -> __n64).
// No 32-bit-lane __n64 form (addv32 / saddlv32 / uaddlv32) is declared here;
// the corresponding wrapper macros use pairwise intrinsics instead.
__n8  neon_addv8(__n64);
__n8  neon_addvq8(__n128);
__n16 neon_addv16(__n64);
__n16 neon_addvq16(__n128);
float neon_addvq32(__n128);
__n16 neon_saddlv8(__n64);
__n16 neon_saddlvq8(__n128);
float neon_saddlv16(__n64);
float neon_saddlvq16(__n128);
__n64 neon_saddlvq32(__n128);
__n16 neon_uaddlv8(__n64);
__n16 neon_uaddlvq8(__n128);
float neon_uaddlv16(__n64);
float neon_uaddlvq16(__n128);
__n64 neon_uaddlvq32(__n128);
// vaddv* / vpadd{d,s}_* / vaddlv*: wrappers that reduce one vector to a
// single scalar. The vector is converted to __n64 / __n128, the intrinsic is
// called, and the scalar is extracted either by reading element 0 of the
// matching member of the returned carrier or, for 32-bit integer results
// carried in `float`, through the _Copy*FromFloat helpers.
#define vaddv_s8(src1) neon_addv8(__int8x8_t_to_n64(src1)).n8_i8[0]
#define vaddvq_s8(src1) neon_addvq8(__int8x16_t_to_n128(src1)).n8_i8[0]
#define vaddv_s16(src1) neon_addv16(__int16x4_t_to_n64(src1)).n16_i16[0]
#define vaddvq_s16(src1) neon_addvq16(__int16x8_t_to_n128(src1)).n16_i16[0]
// Two-lane 32-bit case: implemented as a pairwise add of the vector with
// itself, then element 0 of the result is read.
#define vaddv_s32(src1) neon_addp32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src1)).n64_i32[0]
#define vaddvq_s32(src1) _CopyInt32FromFloat(neon_addvq32(__int32x4_t_to_n128(src1)))
// vaddvq_*64 and vpaddd_*64 expand to the same expression.
#define vaddvq_s64(src1) neon_addps64(__int64x2_t_to_n128(src1)).n64_i64[0]
#define vpaddd_s64(src1) neon_addps64(__int64x2_t_to_n128(src1)).n64_i64[0]
#define vaddv_u8(src1) neon_addv8(__uint8x8_t_to_n64(src1)).n8_u8[0]
#define vaddvq_u8(src1) neon_addvq8(__uint8x16_t_to_n128(src1)).n8_u8[0]
#define vaddv_u16(src1) neon_addv16(__uint16x4_t_to_n64(src1)).n16_u16[0]
#define vaddvq_u16(src1) neon_addvq16(__uint16x8_t_to_n128(src1)).n16_u16[0]
#define vaddv_u32(src1) neon_addp32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src1)).n64_u32[0]
#define vaddvq_u32(src1) _CopyUInt32FromFloat(neon_addvq32(__uint32x4_t_to_n128(src1)))
#define vaddvq_u64(src1) neon_addps64(__uint64x2_t_to_n128(src1)).n64_u64[0]
#define vpaddd_u64(src1) neon_addps64(__uint64x2_t_to_n128(src1)).n64_u64[0]
// Floating-point reductions. vaddv_f32 and vpadds_f32 are identical;
// vaddvq_f32 passes the same vector as both operands of neon_faddpsq32.
#define vaddv_f32(src1) neon_faddps32(__float32x2_t_to_n64(src1))
#define vpadds_f32(src1) neon_faddps32(__float32x2_t_to_n64(src1))
#define vaddvq_f32(src1) neon_faddpsq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src1))
#define vaddvq_f64(src1) neon_faddpsq64(__float64x2_t_to_n128(src1)).n64_f64[0]
#define vpaddd_f64(src1) neon_faddpsq64(__float64x2_t_to_n128(src1)).n64_f64[0]
// Widening reductions (result element is twice the input lane width).
// The two-lane 32-bit forms use neon_saddlp32 / neon_uaddlp32, whose
// prototypes appear in the SADALP/UADALP/SADDLP/UADDLP section below.
#define vaddlv_s8(src1) neon_saddlv8(__int8x8_t_to_n64(src1)).n16_i16[0]
#define vaddlvq_s8(src1) neon_saddlvq8(__int8x16_t_to_n128(src1)).n16_i16[0]
#define vaddlv_s16(src1) _CopyInt32FromFloat(neon_saddlv16(__int16x4_t_to_n64(src1)))
#define vaddlvq_s16(src1) _CopyInt32FromFloat(neon_saddlvq16(__int16x8_t_to_n128(src1)))
#define vaddlv_s32(src1) neon_saddlp32(__int32x2_t_to_n64(src1)).n64_i64[0]
#define vaddlvq_s32(src1) neon_saddlvq32(__int32x4_t_to_n128(src1)).n64_i64[0]
#define vaddlv_u8(src1) neon_uaddlv8(__uint8x8_t_to_n64(src1)).n16_u16[0]
#define vaddlvq_u8(src1) neon_uaddlvq8(__uint8x16_t_to_n128(src1)).n16_u16[0]
#define vaddlv_u16(src1) _CopyUInt32FromFloat(neon_uaddlv16(__uint16x4_t_to_n64(src1)))
#define vaddlvq_u16(src1) _CopyUInt32FromFloat(neon_uaddlvq16(__uint16x8_t_to_n128(src1)))
#define vaddlv_u32(src1) neon_uaddlp32(__uint32x2_t_to_n64(src1)).n64_u64[0]
#define vaddlvq_u32(src1) neon_uaddlvq32(__uint32x4_t_to_n128(src1)).n64_u64[0]



// SADALP/UADALP/SADDLP/UADDLP
// Prototypes for the pairwise-long intrinsics, for 8/16/32-bit source lanes
// in __n64 and __n128 (`q`) forms:
//   - neon_saddlp* / neon_uaddlp* take a single vector.
//   - neon_sadalp* / neon_uadalp* take two vectors; the wrapper macros pass
//     the wider-element accumulator first and the narrower source second.
__n64 neon_saddlp8(__n64);
__n128 neon_saddlpq8(__n128);
__n64 neon_saddlp16(__n64);
__n128 neon_saddlpq16(__n128);
__n64 neon_saddlp32(__n64);
__n128 neon_saddlpq32(__n128);
__n64 neon_uaddlp8(__n64);
__n128 neon_uaddlpq8(__n128);
__n64 neon_uaddlp16(__n64);
__n128 neon_uaddlpq16(__n128);
__n64 neon_uaddlp32(__n64);
__n128 neon_uaddlpq32(__n128);
__n64 neon_sadalp8(__n64, __n64);
__n128 neon_sadalpq8(__n128, __n128);
__n64 neon_sadalp16(__n64, __n64);
__n128 neon_sadalpq16(__n128, __n128);
__n64 neon_sadalp32(__n64, __n64);
__n128 neon_sadalpq32(__n128, __n128);
__n64 neon_uadalp8(__n64, __n64);
__n128 neon_uadalpq8(__n128, __n128);
__n64 neon_uadalp16(__n64, __n64);
__n128 neon_uadalpq16(__n128, __n128);
__n64 neon_uadalp32(__n64, __n64);
__n128 neon_uadalpq32(__n128, __n128);
// vpaddl*: pairwise widening add. The result type has lanes twice as wide and
// half as many as the source (e.g. int8x8_t -> int16x4_t), as shown by the
// differing input/output conversion helpers.
#define vpaddl_s8(src)          __n64_to_int16x4_t(neon_saddlp8(__int8x8_t_to_n64(src)))
#define vpaddlq_s8(src)         __n128_to_int16x8_t(neon_saddlpq8(__int8x16_t_to_n128(src)))
#define vpaddl_s16(src)         __n64_to_int32x2_t(neon_saddlp16(__int16x4_t_to_n64(src)))
#define vpaddlq_s16(src)        __n128_to_int32x4_t(neon_saddlpq16(__int16x8_t_to_n128(src)))
#define vpaddl_s32(src)         __n64_to_int64x1_t(neon_saddlp32(__int32x2_t_to_n64(src)))
#define vpaddlq_s32(src)        __n128_to_int64x2_t(neon_saddlpq32(__int32x4_t_to_n128(src)))
#define vpaddl_u8(src)          __n64_to_uint16x4_t(neon_uaddlp8(__uint8x8_t_to_n64(src)))
#define vpaddlq_u8(src)         __n128_to_uint16x8_t(neon_uaddlpq8(__uint8x16_t_to_n128(src)))
#define vpaddl_u16(src)         __n64_to_uint32x2_t(neon_uaddlp16(__uint16x4_t_to_n64(src)))
#define vpaddlq_u16(src)        __n128_to_uint32x4_t(neon_uaddlpq16(__uint16x8_t_to_n128(src)))
#define vpaddl_u32(src)         __n64_to_uint64x1_t(neon_uaddlp32(__uint32x2_t_to_n64(src)))
#define vpaddlq_u32(src)        __n128_to_uint64x2_t(neon_uaddlpq32(__uint32x4_t_to_n128(src)))
// vpadal*: pairwise widening add with accumulate. src1 is the accumulator and
// already has the wide result type; src2 is the narrow source vector. The
// type suffix in the macro name refers to src2's element type.
#define vpadal_s8(src1, src2)   __n64_to_int16x4_t(neon_sadalp8(__int16x4_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vpadalq_s8(src1, src2)  __n128_to_int16x8_t(neon_sadalpq8(__int16x8_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vpadal_s16(src1, src2)  __n64_to_int32x2_t(neon_sadalp16(__int32x2_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vpadalq_s16(src1, src2) __n128_to_int32x4_t(neon_sadalpq16(__int32x4_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vpadal_s32(src1, src2)  __n64_to_int64x1_t(neon_sadalp32(__int64x1_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vpadalq_s32(src1, src2) __n128_to_int64x2_t(neon_sadalpq32(__int64x2_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vpadal_u8(src1, src2)   __n64_to_uint16x4_t(neon_uadalp8(__uint16x4_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vpadalq_u8(src1, src2)  __n128_to_uint16x8_t(neon_uadalpq8(__uint16x8_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vpadal_u16(src1, src2)  __n64_to_uint32x2_t(neon_uadalp16(__uint32x2_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vpadalq_u16(src1, src2) __n128_to_uint32x4_t(neon_uadalpq16(__uint32x4_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vpadal_u32(src1, src2)  __n64_to_uint64x1_t(neon_uadalp32(__uint64x1_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vpadalq_u32(src1, src2) __n128_to_uint64x2_t(neon_uadalpq32(__uint64x2_t_to_n128(src1), __uint32x4_t_to_n128(src2)))

// AES primitives: AESE / AESD / AESMC / AESIMC
// All four intrinsics work on untyped 128-bit values (__n128). The public
// v*q_u8 wrappers below take and return uint8x16_t and only add the
// conversions to and from __n128.
// Two-operand forms:
__n128 neon_aesd(__n128, __n128);
__n128 neon_aese(__n128, __n128);
// Single-operand forms:
__n128 neon_aesimc(__n128);
__n128 neon_aesmc(__n128);
#define vaeseq_u8(lhs, rhs) \
    __n128_to_uint8x16_t(neon_aese(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vaesdq_u8(lhs, rhs) \
    __n128_to_uint8x16_t(neon_aesd(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vaesmcq_u8(vec) \
    __n128_to_uint8x16_t(neon_aesmc(__uint8x16_t_to_n128(vec)))
#define vaesimcq_u8(vec) \
    __n128_to_uint8x16_t(neon_aesimc(__uint8x16_t_to_n128(vec)))

// AND/BIC/BIF/BIT/BSL/EOR/ORN/ORR
// Prototypes for the bitwise-logic intrinsics. They carry no lane width in
// their names: one __n64 form and one __n128 (`q`) form serve every element
// type. and/eor/orn/orr/bic take two operands; bif/bit/bsl take three.
__n64  neon_and(__n64, __n64);
__n128 neon_andq(__n128, __n128);
__n64  neon_eor(__n64, __n64);
__n128 neon_eorq(__n128, __n128);
__n64  neon_orn(__n64, __n64);
__n128 neon_ornq(__n128, __n128);
__n64  neon_orr(__n64, __n64);
__n128 neon_orrq(__n128, __n128);
__n64  neon_bic(__n64, __n64);
__n128 neon_bicq(__n128, __n128);
__n64  neon_bif(__n64, __n64, __n64);
__n128 neon_bifq(__n128, __n128, __n128);
__n64  neon_bit(__n64, __n64, __n64);
__n128 neon_bitq(__n128, __n128, __n128);
__n64  neon_bsl(__n64, __n64, __n64);
__n128 neon_bslq(__n128, __n128, __n128);
// vand_<t> / vandq_<t>: typed bitwise-AND wrappers. Every element type goes
// through the same untyped intrinsic (neon_and for 64-bit vectors, neon_andq
// for 128-bit vectors); only the conversion helpers differ per type.
#define vand_s8(src1, src2)   __n64_to_int8x8_t(neon_and(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vand_u8(src1, src2)   __n64_to_uint8x8_t(neon_and(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vand_s16(src1, src2)  __n64_to_int16x4_t(neon_and(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vand_u16(src1, src2)  __n64_to_uint16x4_t(neon_and(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vand_s32(src1, src2)  __n64_to_int32x2_t(neon_and(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vand_u32(src1, src2)  __n64_to_uint32x2_t(neon_and(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vand_s64(src1, src2)  __n64_to_int64x1_t(neon_and(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vand_u64(src1, src2)  __n64_to_uint64x1_t(neon_and(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define vandq_s8(src1, src2)  __n128_to_int8x16_t(neon_andq(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vandq_u8(src1, src2)  __n128_to_uint8x16_t(neon_andq(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vandq_s16(src1, src2) __n128_to_int16x8_t(neon_andq(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vandq_u16(src1, src2) __n128_to_uint16x8_t(neon_andq(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vandq_s32(src1, src2) __n128_to_int32x4_t(neon_andq(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vandq_u32(src1, src2) __n128_to_uint32x4_t(neon_andq(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vandq_s64(src1, src2) __n128_to_int64x2_t(neon_andq(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vandq_u64(src1, src2) __n128_to_uint64x2_t(neon_andq(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
// veor_<t> / veorq_<t>: typed bitwise exclusive-OR wrappers. Every element
// type goes through the same untyped intrinsic (neon_eor for 64-bit vectors,
// neon_eorq for 128-bit vectors); only the conversion helpers differ per type.
#define veor_s8(src1, src2)   __n64_to_int8x8_t(neon_eor(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define veor_u8(src1, src2)   __n64_to_uint8x8_t(neon_eor(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define veor_s16(src1, src2)  __n64_to_int16x4_t(neon_eor(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define veor_u16(src1, src2)  __n64_to_uint16x4_t(neon_eor(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define veor_s32(src1, src2)  __n64_to_int32x2_t(neon_eor(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define veor_u32(src1, src2)  __n64_to_uint32x2_t(neon_eor(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define veor_s64(src1, src2)  __n64_to_int64x1_t(neon_eor(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define veor_u64(src1, src2)  __n64_to_uint64x1_t(neon_eor(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define veorq_s8(src1, src2)  __n128_to_int8x16_t(neon_eorq(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define veorq_u8(src1, src2)  __n128_to_uint8x16_t(neon_eorq(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define veorq_s16(src1, src2) __n128_to_int16x8_t(neon_eorq(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define veorq_u16(src1, src2) __n128_to_uint16x8_t(neon_eorq(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define veorq_s32(src1, src2) __n128_to_int32x4_t(neon_eorq(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define veorq_u32(src1, src2) __n128_to_uint32x4_t(neon_eorq(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define veorq_s64(src1, src2) __n128_to_int64x2_t(neon_eorq(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define veorq_u64(src1, src2) __n128_to_uint64x2_t(neon_eorq(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
// vorr_<t> / vorrq_<t>: typed bitwise-OR wrappers. Every element type goes
// through the same untyped intrinsic (neon_orr for 64-bit vectors, neon_orrq
// for 128-bit vectors); only the conversion helpers differ per type.
#define vorr_s8(src1, src2)   __n64_to_int8x8_t(neon_orr(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vorr_u8(src1, src2)   __n64_to_uint8x8_t(neon_orr(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vorr_s16(src1, src2)  __n64_to_int16x4_t(neon_orr(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vorr_u16(src1, src2)  __n64_to_uint16x4_t(neon_orr(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vorr_s32(src1, src2)  __n64_to_int32x2_t(neon_orr(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vorr_u32(src1, src2)  __n64_to_uint32x2_t(neon_orr(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vorr_s64(src1, src2)  __n64_to_int64x1_t(neon_orr(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vorr_u64(src1, src2)  __n64_to_uint64x1_t(neon_orr(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2)))
#define vorrq_s8(src1, src2)  __n128_to_int8x16_t(neon_orrq(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vorrq_u8(src1, src2)  __n128_to_uint8x16_t(neon_orrq(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vorrq_s16(src1, src2) __n128_to_int16x8_t(neon_orrq(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vorrq_u16(src1, src2) __n128_to_uint16x8_t(neon_orrq(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vorrq_s32(src1, src2) __n128_to_int32x4_t(neon_orrq(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vorrq_u32(src1, src2) __n128_to_uint32x4_t(neon_orrq(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vorrq_s64(src1, src2) __n128_to_int64x2_t(neon_orrq(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vorrq_u64(src1, src2) __n128_to_uint64x2_t(neon_orrq(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2)))
// vorn_<type> / vornq_<type>: typed front ends for neon_orn and neon_ornq.
// Operands are forwarded in the order given (first macro argument first),
// after the *_to_n64 / *_to_n128 conversion; the result is converted back.
#define vorn_s8(lhs, rhs) __n64_to_int8x8_t(neon_orn(__int8x8_t_to_n64(lhs), __int8x8_t_to_n64(rhs)))
#define vorn_u8(lhs, rhs) __n64_to_uint8x8_t(neon_orn(__uint8x8_t_to_n64(lhs), __uint8x8_t_to_n64(rhs)))
#define vorn_s16(lhs, rhs) __n64_to_int16x4_t(neon_orn(__int16x4_t_to_n64(lhs), __int16x4_t_to_n64(rhs)))
#define vorn_u16(lhs, rhs) __n64_to_uint16x4_t(neon_orn(__uint16x4_t_to_n64(lhs), __uint16x4_t_to_n64(rhs)))
#define vorn_s32(lhs, rhs) __n64_to_int32x2_t(neon_orn(__int32x2_t_to_n64(lhs), __int32x2_t_to_n64(rhs)))
#define vorn_u32(lhs, rhs) __n64_to_uint32x2_t(neon_orn(__uint32x2_t_to_n64(lhs), __uint32x2_t_to_n64(rhs)))
#define vorn_s64(lhs, rhs) __n64_to_int64x1_t(neon_orn(__int64x1_t_to_n64(lhs), __int64x1_t_to_n64(rhs)))
#define vorn_u64(lhs, rhs) __n64_to_uint64x1_t(neon_orn(__uint64x1_t_to_n64(lhs), __uint64x1_t_to_n64(rhs)))
// 128-bit ("q") forms.
#define vornq_s8(lhs, rhs) __n128_to_int8x16_t(neon_ornq(__int8x16_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vornq_u8(lhs, rhs) __n128_to_uint8x16_t(neon_ornq(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vornq_s16(lhs, rhs) __n128_to_int16x8_t(neon_ornq(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vornq_u16(lhs, rhs) __n128_to_uint16x8_t(neon_ornq(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vornq_s32(lhs, rhs) __n128_to_int32x4_t(neon_ornq(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vornq_u32(lhs, rhs) __n128_to_uint32x4_t(neon_ornq(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vornq_s64(lhs, rhs) __n128_to_int64x2_t(neon_ornq(__int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vornq_u64(lhs, rhs) __n128_to_uint64x2_t(neon_ornq(__uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vbic_<type> / vbicq_<type>: typed front ends for neon_bic and neon_bicq.
// Operands are forwarded in the order given (first macro argument first),
// after the *_to_n64 / *_to_n128 conversion; the result is converted back.
#define vbic_s8(lhs, rhs) __n64_to_int8x8_t(neon_bic(__int8x8_t_to_n64(lhs), __int8x8_t_to_n64(rhs)))
#define vbic_u8(lhs, rhs) __n64_to_uint8x8_t(neon_bic(__uint8x8_t_to_n64(lhs), __uint8x8_t_to_n64(rhs)))
#define vbic_s16(lhs, rhs) __n64_to_int16x4_t(neon_bic(__int16x4_t_to_n64(lhs), __int16x4_t_to_n64(rhs)))
#define vbic_u16(lhs, rhs) __n64_to_uint16x4_t(neon_bic(__uint16x4_t_to_n64(lhs), __uint16x4_t_to_n64(rhs)))
#define vbic_s32(lhs, rhs) __n64_to_int32x2_t(neon_bic(__int32x2_t_to_n64(lhs), __int32x2_t_to_n64(rhs)))
#define vbic_u32(lhs, rhs) __n64_to_uint32x2_t(neon_bic(__uint32x2_t_to_n64(lhs), __uint32x2_t_to_n64(rhs)))
#define vbic_s64(lhs, rhs) __n64_to_int64x1_t(neon_bic(__int64x1_t_to_n64(lhs), __int64x1_t_to_n64(rhs)))
#define vbic_u64(lhs, rhs) __n64_to_uint64x1_t(neon_bic(__uint64x1_t_to_n64(lhs), __uint64x1_t_to_n64(rhs)))
// 128-bit ("q") forms.
#define vbicq_s8(lhs, rhs) __n128_to_int8x16_t(neon_bicq(__int8x16_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vbicq_u8(lhs, rhs) __n128_to_uint8x16_t(neon_bicq(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vbicq_s16(lhs, rhs) __n128_to_int16x8_t(neon_bicq(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vbicq_u16(lhs, rhs) __n128_to_uint16x8_t(neon_bicq(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vbicq_s32(lhs, rhs) __n128_to_int32x4_t(neon_bicq(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vbicq_u32(lhs, rhs) __n128_to_uint32x4_t(neon_bicq(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vbicq_s64(lhs, rhs) __n128_to_int64x2_t(neon_bicq(__int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vbicq_u64(lhs, rhs) __n128_to_uint64x2_t(neon_bicq(__uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vbsl_<type> / vbslq_<type>: typed front ends for neon_bsl and neon_bslq.
// The first argument is converted as an unsigned vector with the same lane
// width as <type> (as a poly64 vector for the p64 variants). The second and
// third arguments, and the result, use the vector type named by the suffix.
#define vbsl_s8(a, b, c) __n64_to_int8x8_t(neon_bsl(__uint8x8_t_to_n64(a), __int8x8_t_to_n64(b), __int8x8_t_to_n64(c)))
#define vbsl_u8(a, b, c) __n64_to_uint8x8_t(neon_bsl(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b), __uint8x8_t_to_n64(c)))
#define vbsl_p8(a, b, c) __n64_to_poly8x8_t(neon_bsl(__uint8x8_t_to_n64(a), __poly8x8_t_to_n64(b), __poly8x8_t_to_n64(c)))
#define vbsl_s16(a, b, c) __n64_to_int16x4_t(neon_bsl(__uint16x4_t_to_n64(a), __int16x4_t_to_n64(b), __int16x4_t_to_n64(c)))
#define vbsl_u16(a, b, c) __n64_to_uint16x4_t(neon_bsl(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b), __uint16x4_t_to_n64(c)))
#define vbsl_p16(a, b, c) __n64_to_poly16x4_t(neon_bsl(__uint16x4_t_to_n64(a), __poly16x4_t_to_n64(b), __poly16x4_t_to_n64(c)))
#define vbsl_s32(a, b, c) __n64_to_int32x2_t(neon_bsl(__uint32x2_t_to_n64(a), __int32x2_t_to_n64(b), __int32x2_t_to_n64(c)))
#define vbsl_f32(a, b, c) __n64_to_float32x2_t(neon_bsl(__uint32x2_t_to_n64(a), __float32x2_t_to_n64(b), __float32x2_t_to_n64(c)))
#define vbsl_u32(a, b, c) __n64_to_uint32x2_t(neon_bsl(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b), __uint32x2_t_to_n64(c)))
#define vbsl_s64(a, b, c) __n64_to_int64x1_t(neon_bsl(__uint64x1_t_to_n64(a), __int64x1_t_to_n64(b), __int64x1_t_to_n64(c)))
#define vbsl_f64(a, b, c) __n64_to_float64x1_t(neon_bsl(__uint64x1_t_to_n64(a), __float64x1_t_to_n64(b), __float64x1_t_to_n64(c)))
#define vbsl_u64(a, b, c) __n64_to_uint64x1_t(neon_bsl(__uint64x1_t_to_n64(a), __uint64x1_t_to_n64(b), __uint64x1_t_to_n64(c)))
#define vbsl_p64(a, b, c) __n64_to_poly64x1_t(neon_bsl(__poly64x1_t_to_n64(a), __poly64x1_t_to_n64(b), __poly64x1_t_to_n64(c)))
// 128-bit ("q") forms.
#define vbslq_s8(a, b, c) __n128_to_int8x16_t(neon_bslq(__uint8x16_t_to_n128(a), __int8x16_t_to_n128(b), __int8x16_t_to_n128(c)))
#define vbslq_u8(a, b, c) __n128_to_uint8x16_t(neon_bslq(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b), __uint8x16_t_to_n128(c)))
#define vbslq_p8(a, b, c) __n128_to_poly8x16_t(neon_bslq(__uint8x16_t_to_n128(a), __poly8x16_t_to_n128(b), __poly8x16_t_to_n128(c)))
#define vbslq_s16(a, b, c) __n128_to_int16x8_t(neon_bslq(__uint16x8_t_to_n128(a), __int16x8_t_to_n128(b), __int16x8_t_to_n128(c)))
#define vbslq_u16(a, b, c) __n128_to_uint16x8_t(neon_bslq(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b), __uint16x8_t_to_n128(c)))
#define vbslq_p16(a, b, c) __n128_to_poly16x8_t(neon_bslq(__uint16x8_t_to_n128(a), __poly16x8_t_to_n128(b), __poly16x8_t_to_n128(c)))
#define vbslq_s32(a, b, c) __n128_to_int32x4_t(neon_bslq(__uint32x4_t_to_n128(a), __int32x4_t_to_n128(b), __int32x4_t_to_n128(c)))
#define vbslq_f32(a, b, c) __n128_to_float32x4_t(neon_bslq(__uint32x4_t_to_n128(a), __float32x4_t_to_n128(b), __float32x4_t_to_n128(c)))
#define vbslq_u32(a, b, c) __n128_to_uint32x4_t(neon_bslq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vbslq_s64(a, b, c) __n128_to_int64x2_t(neon_bslq(__uint64x2_t_to_n128(a), __int64x2_t_to_n128(b), __int64x2_t_to_n128(c)))
#define vbslq_u64(a, b, c) __n128_to_uint64x2_t(neon_bslq(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), __uint64x2_t_to_n128(c)))
#define vbslq_f64(a, b, c) __n128_to_float64x2_t(neon_bslq(__uint64x2_t_to_n128(a), __float64x2_t_to_n128(b), __float64x2_t_to_n128(c)))
#define vbslq_p64(a, b, c) __n128_to_poly64x2_t(neon_bslq(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b), __poly64x2_t_to_n128(c)))

// BIC/ORR immediate
//
// Untyped intrinsic declarations. Each takes one vector register value
// (__n64, or __n128 for the "q" names) plus one const int -- two const int for
// the *_shift* names -- and returns a vector of the same register type.
// NOTE(review): the int arguments are presumably the instruction immediate
// and, for *_shift*, its shift amount; the h/w suffix presumably selects
// 16-/32-bit lanes. Confirm against the compiler's intrinsic documentation.
__n64  neon_bich(__n64, const int);
__n64  neon_bicw(__n64, const int);
__n64  neon_bic_shifth(__n64, const int, const int);
__n64  neon_bic_shiftw(__n64, const int, const int);
__n128 neon_bicqh(__n128, const int);
__n128 neon_bicqw(__n128, const int);
__n128 neon_bicq_shifth(__n128, const int, const int);
__n128 neon_bicq_shiftw(__n128, const int, const int);
__n64  neon_orrh(__n64, const int);
__n64  neon_orrw(__n64, const int);
__n64  neon_orr_shifth(__n64, const int, const int);
__n64  neon_orr_shiftw(__n64, const int, const int);
__n128 neon_orrqh(__n128, const int);
__n128 neon_orrqw(__n128, const int);
__n128 neon_orrq_shifth(__n128, const int, const int);
__n128 neon_orrq_shiftw(__n128, const int, const int);

// RBIT/REV16/REV32/REV64
//
// Untyped single-operand intrinsics: __n64 in and out for the 64-bit forms,
// __n128 in and out for the "q" forms. The typed vrbit*/vrev* macros that
// follow wrap these; in those wrappers the _8/_16/_32 suffix matches the lane
// width of the wrapped vector type (e.g. neon_rev64_16 backs
// vrev64_p16/vrev64_s16/vrev64_u16).
__n64 neon_rbit(__n64);
__n128 neon_rbitq(__n128);
__n64 neon_rev16(__n64);
__n128 neon_rev16q(__n128);
__n64 neon_rev32_8(__n64);
__n128 neon_rev32q_8(__n128);
__n64 neon_rev32_16(__n64);
__n128 neon_rev32q_16(__n128);
__n64 neon_rev64_8(__n64);
__n128 neon_rev64q_8(__n128);
__n64 neon_rev64_16(__n64);
__n128 neon_rev64q_16(__n128);
__n64 neon_rev64_32(__n64);
__n128 neon_rev64q_32(__n128);
// vrbit*/vrev16*/vrev32*/vrev64*: typed single-operand wrappers. The argument
// is converted with *_to_n64 (or *_to_n128 for "q" forms), passed to the
// neon_rbit*/neon_rev* intrinsic whose suffix matches the lane width, and the
// result is converted back to the argument's vector type.
#define vrbit_p8(v) __n64_to_poly8x8_t(neon_rbit(__poly8x8_t_to_n64(v)))
#define vrbit_s8(v) __n64_to_int8x8_t(neon_rbit(__int8x8_t_to_n64(v)))
#define vrbit_u8(v) __n64_to_uint8x8_t(neon_rbit(__uint8x8_t_to_n64(v)))
#define vrbitq_p8(v) __n128_to_poly8x16_t(neon_rbitq(__poly8x16_t_to_n128(v)))
#define vrbitq_s8(v) __n128_to_int8x16_t(neon_rbitq(__int8x16_t_to_n128(v)))
#define vrbitq_u8(v) __n128_to_uint8x16_t(neon_rbitq(__uint8x16_t_to_n128(v)))
// 64-bit vrev16/vrev32/vrev64 forms.
#define vrev16_p8(v) __n64_to_poly8x8_t(neon_rev16(__poly8x8_t_to_n64(v)))
#define vrev16_s8(v) __n64_to_int8x8_t(neon_rev16(__int8x8_t_to_n64(v)))
#define vrev16_u8(v) __n64_to_uint8x8_t(neon_rev16(__uint8x8_t_to_n64(v)))
#define vrev32_p8(v) __n64_to_poly8x8_t(neon_rev32_8(__poly8x8_t_to_n64(v)))
#define vrev32_s8(v) __n64_to_int8x8_t(neon_rev32_8(__int8x8_t_to_n64(v)))
#define vrev32_u8(v) __n64_to_uint8x8_t(neon_rev32_8(__uint8x8_t_to_n64(v)))
#define vrev32_p16(v) __n64_to_poly16x4_t(neon_rev32_16(__poly16x4_t_to_n64(v)))
#define vrev32_s16(v) __n64_to_int16x4_t(neon_rev32_16(__int16x4_t_to_n64(v)))
#define vrev32_u16(v) __n64_to_uint16x4_t(neon_rev32_16(__uint16x4_t_to_n64(v)))
#define vrev64_p8(v) __n64_to_poly8x8_t(neon_rev64_8(__poly8x8_t_to_n64(v)))
#define vrev64_s8(v) __n64_to_int8x8_t(neon_rev64_8(__int8x8_t_to_n64(v)))
#define vrev64_u8(v) __n64_to_uint8x8_t(neon_rev64_8(__uint8x8_t_to_n64(v)))
#define vrev64_p16(v) __n64_to_poly16x4_t(neon_rev64_16(__poly16x4_t_to_n64(v)))
#define vrev64_s16(v) __n64_to_int16x4_t(neon_rev64_16(__int16x4_t_to_n64(v)))
#define vrev64_u16(v) __n64_to_uint16x4_t(neon_rev64_16(__uint16x4_t_to_n64(v)))
#define vrev64_s32(v) __n64_to_int32x2_t(neon_rev64_32(__int32x2_t_to_n64(v)))
#define vrev64_u32(v) __n64_to_uint32x2_t(neon_rev64_32(__uint32x2_t_to_n64(v)))
#define vrev64_f32(v) __n64_to_float32x2_t(neon_rev64_32(__float32x2_t_to_n64(v)))
// 128-bit ("q") vrev16/vrev32/vrev64 forms.
#define vrev16q_p8(v) __n128_to_poly8x16_t(neon_rev16q(__poly8x16_t_to_n128(v)))
#define vrev16q_s8(v) __n128_to_int8x16_t(neon_rev16q(__int8x16_t_to_n128(v)))
#define vrev16q_u8(v) __n128_to_uint8x16_t(neon_rev16q(__uint8x16_t_to_n128(v)))
#define vrev32q_p8(v) __n128_to_poly8x16_t(neon_rev32q_8(__poly8x16_t_to_n128(v)))
#define vrev32q_s8(v) __n128_to_int8x16_t(neon_rev32q_8(__int8x16_t_to_n128(v)))
#define vrev32q_u8(v) __n128_to_uint8x16_t(neon_rev32q_8(__uint8x16_t_to_n128(v)))
#define vrev32q_p16(v) __n128_to_poly16x8_t(neon_rev32q_16(__poly16x8_t_to_n128(v)))
#define vrev32q_s16(v) __n128_to_int16x8_t(neon_rev32q_16(__int16x8_t_to_n128(v)))
#define vrev32q_u16(v) __n128_to_uint16x8_t(neon_rev32q_16(__uint16x8_t_to_n128(v)))
#define vrev64q_p8(v) __n128_to_poly8x16_t(neon_rev64q_8(__poly8x16_t_to_n128(v)))
#define vrev64q_s8(v) __n128_to_int8x16_t(neon_rev64q_8(__int8x16_t_to_n128(v)))
#define vrev64q_u8(v) __n128_to_uint8x16_t(neon_rev64q_8(__uint8x16_t_to_n128(v)))
#define vrev64q_p16(v) __n128_to_poly16x8_t(neon_rev64q_16(__poly16x8_t_to_n128(v)))
#define vrev64q_s16(v) __n128_to_int16x8_t(neon_rev64q_16(__int16x8_t_to_n128(v)))
#define vrev64q_u16(v) __n128_to_uint16x8_t(neon_rev64q_16(__uint16x8_t_to_n128(v)))
#define vrev64q_s32(v) __n128_to_int32x4_t(neon_rev64q_32(__int32x4_t_to_n128(v)))
#define vrev64q_u32(v) __n128_to_uint32x4_t(neon_rev64q_32(__uint32x4_t_to_n128(v)))
#define vrev64q_f32(v) __n128_to_float32x4_t(neon_rev64q_32(__float32x4_t_to_n128(v)))

// CNT/CLS/CLZ
//
// Untyped single-operand intrinsics: __n64 in and out, or __n128 in and out
// for the "q" names. In the typed macros that follow, neon_cnt/neon_cntq are
// wrapped for the 8-bit p8/s8/u8 types, neon_cls* for signed types only, and
// neon_clz* for both signed and unsigned types; the 8/16/32 suffix matches
// the lane width of the wrapped vector type.
__n64  neon_cnt(__n64);
__n128 neon_cntq(__n128);
__n64  neon_cls8(__n64);
__n128 neon_clsq8(__n128);
__n64  neon_cls16(__n64);
__n128 neon_clsq16(__n128);
__n64  neon_cls32(__n64);
__n128 neon_clsq32(__n128);
__n64  neon_clz8(__n64);
__n128 neon_clzq8(__n128);
__n64  neon_clz16(__n64);
__n128 neon_clzq16(__n128);
__n64  neon_clz32(__n64);
__n128 neon_clzq32(__n128);
// vcnt*/vcls*/vclz*: typed single-operand wrappers over the neon_cnt*,
// neon_cls* and neon_clz* intrinsics. The argument is converted with
// *_to_n64 / *_to_n128 and the result is converted back to the same type.
#define vcnt_p8(v) __n64_to_poly8x8_t(neon_cnt(__poly8x8_t_to_n64(v)))
#define vcnt_s8(v) __n64_to_int8x8_t(neon_cnt(__int8x8_t_to_n64(v)))
#define vcnt_u8(v) __n64_to_uint8x8_t(neon_cnt(__uint8x8_t_to_n64(v)))
#define vcntq_p8(v) __n128_to_poly8x16_t(neon_cntq(__poly8x16_t_to_n128(v)))
#define vcntq_s8(v) __n128_to_int8x16_t(neon_cntq(__int8x16_t_to_n128(v)))
#define vcntq_u8(v) __n128_to_uint8x16_t(neon_cntq(__uint8x16_t_to_n128(v)))
// vcls: signed element types only.
#define vcls_s8(v) __n64_to_int8x8_t(neon_cls8(__int8x8_t_to_n64(v)))
#define vcls_s16(v) __n64_to_int16x4_t(neon_cls16(__int16x4_t_to_n64(v)))
#define vcls_s32(v) __n64_to_int32x2_t(neon_cls32(__int32x2_t_to_n64(v)))
#define vclsq_s8(v) __n128_to_int8x16_t(neon_clsq8(__int8x16_t_to_n128(v)))
#define vclsq_s16(v) __n128_to_int16x8_t(neon_clsq16(__int16x8_t_to_n128(v)))
#define vclsq_s32(v) __n128_to_int32x4_t(neon_clsq32(__int32x4_t_to_n128(v)))
// vclz: signed and unsigned element types share the same intrinsic per width.
#define vclz_s8(v) __n64_to_int8x8_t(neon_clz8(__int8x8_t_to_n64(v)))
#define vclz_s16(v) __n64_to_int16x4_t(neon_clz16(__int16x4_t_to_n64(v)))
#define vclz_s32(v) __n64_to_int32x2_t(neon_clz32(__int32x2_t_to_n64(v)))
#define vclz_u8(v) __n64_to_uint8x8_t(neon_clz8(__uint8x8_t_to_n64(v)))
#define vclz_u16(v) __n64_to_uint16x4_t(neon_clz16(__uint16x4_t_to_n64(v)))
#define vclz_u32(v) __n64_to_uint32x2_t(neon_clz32(__uint32x2_t_to_n64(v)))
#define vclzq_s8(v) __n128_to_int8x16_t(neon_clzq8(__int8x16_t_to_n128(v)))
#define vclzq_s16(v) __n128_to_int16x8_t(neon_clzq16(__int16x8_t_to_n128(v)))
#define vclzq_s32(v) __n128_to_int32x4_t(neon_clzq32(__int32x4_t_to_n128(v)))
#define vclzq_u8(v) __n128_to_uint8x16_t(neon_clzq8(__uint8x16_t_to_n128(v)))
#define vclzq_u16(v) __n128_to_uint16x8_t(neon_clzq16(__uint16x8_t_to_n128(v)))
#define vclzq_u32(v) __n128_to_uint32x4_t(neon_clzq32(__uint32x4_t_to_n128(v)))

// FMAX/FMAXNM/FMAXNMP/FMAXNMV/FMAXP/FMAXV/SMAX/SMAXP/SMAXV/UMAX/UMAXP/UMAXV
//
// Untyped intrinsic declarations backing the vmax*/vpmax*/vmaxv* macros that
// follow.
//  - Two-operand forms take and return __n64 (or __n128 for the "q" names).
//  - Single-operand forms return a scalar: float/double for the floating-point
//    ones, and __n8/__n16 for the 8-/16-bit integer ones (the wrapping macros
//    read element 0 through .n8_i8/.n8_u8/.n16_i16/.n16_u16).
//  - neon_smaxvq32/neon_umaxvq32 are declared as returning float; the
//    vmaxvq_s32/vmaxvq_u32 macros pass that value through
//    _CopyInt32FromFloat/_CopyUInt32FromFloat.
//    NOTE(review): presumably the float only carries the integer bit pattern
//    -- confirm against the definition of those helpers.
__n64 neon_fmax16(__n64, __n64);
__n64 neon_fmax32(__n64, __n64);
__n64 neon_fmax64(__n64, __n64);
__n128 neon_fmaxq16(__n128, __n128);
__n128 neon_fmaxq32(__n128, __n128);
__n128 neon_fmaxq64(__n128, __n128);
__n64 neon_fmaxnm16(__n64, __n64);
__n64 neon_fmaxnm32(__n64, __n64);
__n64 neon_fmaxnm64(__n64, __n64);
__n128 neon_fmaxnmq16(__n128, __n128);
__n128 neon_fmaxnmq32(__n128, __n128);
__n128 neon_fmaxnmq64(__n128, __n128);
__n64 neon_fmaxnmp16(__n64, __n64);
__n64 neon_fmaxnmp32(__n64, __n64);
__n128 neon_fmaxnmpq16(__n128, __n128);
__n128 neon_fmaxnmpq32(__n128, __n128);
__n128 neon_fmaxnmpq64(__n128, __n128);
float neon_fmaxnmps32(__n64);
double neon_fmaxnmps64(__n128);
float neon_fmaxnmv(__n128);
__n64 neon_fmaxp16(__n64, __n64);
__n64 neon_fmaxp32(__n64, __n64);
__n64 neon_fmaxp64(__n64, __n64);
__n128 neon_fmaxpq16(__n128, __n128);
__n128 neon_fmaxpq32(__n128, __n128);
__n128 neon_fmaxpq64(__n128, __n128);
float neon_fmaxps32(__n64);
double neon_fmaxps64(__n128);
float neon_fmaxv(__n128);
__n64 neon_smax8(__n64, __n64);
__n64 neon_smax16(__n64, __n64);
__n64 neon_smax32(__n64, __n64);
__n128 neon_smaxq8(__n128, __n128);
__n128 neon_smaxq16(__n128, __n128);
__n128 neon_smaxq32(__n128, __n128);
__n64 neon_smaxp8(__n64, __n64);
__n64 neon_smaxp16(__n64, __n64);
__n64 neon_smaxp32(__n64, __n64);
__n128 neon_smaxpq8(__n128, __n128);
__n128 neon_smaxpq16(__n128, __n128);
__n128 neon_smaxpq32(__n128, __n128);
__n8 neon_smaxv8(__n64);
__n8 neon_smaxvq8(__n128);
__n16 neon_smaxv16(__n64);
__n16 neon_smaxvq16(__n128);
float neon_smaxvq32(__n128);
__n64 neon_umax8(__n64, __n64);
__n64 neon_umax16(__n64, __n64);
__n64 neon_umax32(__n64, __n64);
__n128 neon_umaxq8(__n128, __n128);
__n128 neon_umaxq16(__n128, __n128);
__n128 neon_umaxq32(__n128, __n128);
__n64 neon_umaxp8(__n64, __n64);
__n64 neon_umaxp16(__n64, __n64);
__n64 neon_umaxp32(__n64, __n64);
__n128 neon_umaxpq8(__n128, __n128);
__n128 neon_umaxpq16(__n128, __n128);
__n128 neon_umaxpq32(__n128, __n128);
__n8 neon_umaxv8(__n64);
__n8 neon_umaxvq8(__n128);
__n16 neon_umaxv16(__n64);
__n16 neon_umaxvq16(__n128);
float neon_umaxvq32(__n128);
// vmax*/vmaxnm*: two-operand wrappers. Both operands are converted with
// *_to_n64 / *_to_n128, and the intrinsic result is converted back.
#define vmax_f32(a, b) __n64_to_float32x2_t(neon_fmax32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vmaxnm_f32(a, b) __n64_to_float32x2_t(neon_fmaxnm32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vmaxq_f32(a, b) __n128_to_float32x4_t(neon_fmaxq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vmaxnmq_f32(a, b) __n128_to_float32x4_t(neon_fmaxnmq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vmax_f64(a, b) __n64_to_float64x1_t(neon_fmax64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vmaxnm_f64(a, b) __n64_to_float64x1_t(neon_fmaxnm64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vmaxq_f64(a, b) __n128_to_float64x2_t(neon_fmaxq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vmaxnmq_f64(a, b) __n128_to_float64x2_t(neon_fmaxnmq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vmax_s8(a, b) __n64_to_int8x8_t(neon_smax8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vmax_s16(a, b) __n64_to_int16x4_t(neon_smax16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vmax_s32(a, b) __n64_to_int32x2_t(neon_smax32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vmax_u8(a, b) __n64_to_uint8x8_t(neon_umax8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vmax_u16(a, b) __n64_to_uint16x4_t(neon_umax16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vmax_u32(a, b) __n64_to_uint32x2_t(neon_umax32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vmaxq_s8(a, b) __n128_to_int8x16_t(neon_smaxq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vmaxq_s16(a, b) __n128_to_int16x8_t(neon_smaxq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vmaxq_s32(a, b) __n128_to_int32x4_t(neon_smaxq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vmaxq_u8(a, b) __n128_to_uint8x16_t(neon_umaxq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vmaxq_u16(a, b) __n128_to_uint16x8_t(neon_umaxq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vmaxq_u32(a, b) __n128_to_uint32x4_t(neon_umaxq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
// vpmax*: two-operand wrappers over the neon_*maxp* intrinsics.
#define vpmax_f32(a, b) __n64_to_float32x2_t(neon_fmaxp32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vpmax_s8(a, b) __n64_to_int8x8_t(neon_smaxp8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vpmax_s16(a, b) __n64_to_int16x4_t(neon_smaxp16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vpmax_s32(a, b) __n64_to_int32x2_t(neon_smaxp32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vpmax_u8(a, b) __n64_to_uint8x8_t(neon_umaxp8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vpmax_u16(a, b) __n64_to_uint16x4_t(neon_umaxp16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vpmax_u32(a, b) __n64_to_uint32x2_t(neon_umaxp32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vpmaxq_f32(a, b) __n128_to_float32x4_t(neon_fmaxpq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vpmaxq_f64(a, b) __n128_to_float64x2_t(neon_fmaxpq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vpmaxq_s8(a, b) __n128_to_int8x16_t(neon_smaxpq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vpmaxq_s16(a, b) __n128_to_int16x8_t(neon_smaxpq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vpmaxq_s32(a, b) __n128_to_int32x4_t(neon_smaxpq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vpmaxq_u8(a, b) __n128_to_uint8x16_t(neon_umaxpq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vpmaxq_u16(a, b) __n128_to_uint16x8_t(neon_umaxpq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vpmaxq_u32(a, b) __n128_to_uint32x4_t(neon_umaxpq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
// vmaxv*/vmaxnmv*: single-operand wrappers producing a scalar. The 8-/16-bit
// integer forms read element 0 of the __n8/__n16 value the intrinsic returns.
// NOTE: vmaxv_s32 and vmaxv_u32 pass the same argument as both operands of
// neon_smaxp32/neon_umaxp32 and read element 0 of the result, so the macro
// argument is expanded twice; avoid arguments with side effects.
#define vmaxv_f32(v) neon_fmaxps32(__float32x2_t_to_n64(v))
#define vmaxnmv_f32(v) neon_fmaxnmps32(__float32x2_t_to_n64(v))
#define vmaxv_s8(v) neon_smaxv8(__int8x8_t_to_n64(v)).n8_i8[0]
#define vmaxv_s16(v) neon_smaxv16(__int16x4_t_to_n64(v)).n16_i16[0]
#define vmaxv_s32(v) neon_smaxp32(__int32x2_t_to_n64(v), __int32x2_t_to_n64(v)).n64_i32[0]
#define vmaxv_u8(v) neon_umaxv8(__uint8x8_t_to_n64(v)).n8_u8[0]
#define vmaxv_u16(v) neon_umaxv16(__uint16x4_t_to_n64(v)).n16_u16[0]
#define vmaxv_u32(v) neon_umaxp32(__uint32x2_t_to_n64(v), __uint32x2_t_to_n64(v)).n64_u32[0]
// 128-bit reductions. The 32-bit integer forms convert the intrinsic's float
// return value with _CopyInt32FromFloat / _CopyUInt32FromFloat.
#define vmaxvq_f32(v) neon_fmaxv(__float32x4_t_to_n128(v))
#define vmaxnmvq_f32(v) neon_fmaxnmv(__float32x4_t_to_n128(v))
#define vmaxvq_f64(v) neon_fmaxps64(__float64x2_t_to_n128(v))
#define vmaxnmvq_f64(v) neon_fmaxnmps64(__float64x2_t_to_n128(v))
#define vmaxvq_s8(v) neon_smaxvq8(__int8x16_t_to_n128(v)).n8_i8[0]
#define vmaxvq_s16(v) neon_smaxvq16(__int16x8_t_to_n128(v)).n16_i16[0]
#define vmaxvq_s32(v) _CopyInt32FromFloat(neon_smaxvq32(__int32x4_t_to_n128(v)))
#define vmaxvq_u8(v) neon_umaxvq8(__uint8x16_t_to_n128(v)).n8_u8[0]
#define vmaxvq_u16(v) neon_umaxvq16(__uint16x8_t_to_n128(v)).n16_u16[0]
#define vmaxvq_u32(v) _CopyUInt32FromFloat(neon_umaxvq32(__uint32x4_t_to_n128(v)))
// vpmaxnm*: two-operand wrappers over the neon_fmaxnmp* intrinsics.
#define vpmaxnm_f32(a, b) __n64_to_float32x2_t(neon_fmaxnmp32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vpmaxnmq_f32(a, b) __n128_to_float32x4_t(neon_fmaxnmpq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vpmaxnmq_f64(a, b) __n128_to_float64x2_t(neon_fmaxnmpq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))

// FMIN/FMINNM/FMINNMP/FMINNMV/FMINP/FMINV/SMIN/SMINP/SMINV/UMIN/UMINP/UMINV
//
// Untyped intrinsic declarations backing the vmin*/vpmin*/vminv* macros that
// follow.
//  - Two-operand forms take and return __n64 (or __n128 for the "q" names).
//  - Single-operand forms return a scalar: float/double for the floating-point
//    ones, and __n8/__n16 for the 8-/16-bit integer ones (the wrapping macros
//    read element 0 through .n8_i8/.n8_u8/.n16_i16/.n16_u16).
//  - neon_sminvq32/neon_uminvq32 are declared as returning float; the
//    vminvq_s32/vminvq_u32 macros pass that value through
//    _CopyInt32FromFloat/_CopyUInt32FromFloat.
//    NOTE(review): presumably the float only carries the integer bit pattern
//    -- confirm against the definition of those helpers.
__n64 neon_fmin16(__n64, __n64);
__n64 neon_fmin32(__n64, __n64);
__n64 neon_fmin64(__n64, __n64);
__n128 neon_fminq16(__n128, __n128);
__n128 neon_fminq32(__n128, __n128);
__n128 neon_fminq64(__n128, __n128);
__n64 neon_fminnm16(__n64, __n64);
__n64 neon_fminnm32(__n64, __n64);
__n64 neon_fminnm64(__n64, __n64);
__n128 neon_fminnmq16(__n128, __n128);
__n128 neon_fminnmq32(__n128, __n128);
__n128 neon_fminnmq64(__n128, __n128);
__n64 neon_fminnmp16(__n64, __n64);
__n64 neon_fminnmp32(__n64, __n64);
__n128 neon_fminnmpq16(__n128, __n128);
__n128 neon_fminnmpq32(__n128, __n128);
__n128 neon_fminnmpq64(__n128, __n128);
float neon_fminnmps32(__n64);
double neon_fminnmps64(__n128);
float neon_fminnmv(__n128);
__n64 neon_fminp16(__n64, __n64);
__n64 neon_fminp32(__n64, __n64);
__n64 neon_fminp64(__n64, __n64);
__n128 neon_fminpq16(__n128, __n128);
__n128 neon_fminpq32(__n128, __n128);
__n128 neon_fminpq64(__n128, __n128);
float neon_fminps32(__n64);
double neon_fminps64(__n128);
float neon_fminv(__n128);
__n64 neon_smin8(__n64, __n64);
__n64 neon_smin16(__n64, __n64);
__n64 neon_smin32(__n64, __n64);
__n128 neon_sminq8(__n128, __n128);
__n128 neon_sminq16(__n128, __n128);
__n128 neon_sminq32(__n128, __n128);
__n64 neon_sminp8(__n64, __n64);
__n64 neon_sminp16(__n64, __n64);
__n64 neon_sminp32(__n64, __n64);
__n128 neon_sminpq8(__n128, __n128);
__n128 neon_sminpq16(__n128, __n128);
__n128 neon_sminpq32(__n128, __n128);
__n8 neon_sminv8(__n64);
__n8 neon_sminvq8(__n128);
__n16 neon_sminv16(__n64);
__n16 neon_sminvq16(__n128);
float neon_sminvq32(__n128);
__n64 neon_umin8(__n64, __n64);
__n64 neon_umin16(__n64, __n64);
__n64 neon_umin32(__n64, __n64);
__n128 neon_uminq8(__n128, __n128);
__n128 neon_uminq16(__n128, __n128);
__n128 neon_uminq32(__n128, __n128);
__n64 neon_uminp8(__n64, __n64);
__n64 neon_uminp16(__n64, __n64);
__n64 neon_uminp32(__n64, __n64);
__n128 neon_uminpq8(__n128, __n128);
__n128 neon_uminpq16(__n128, __n128);
__n128 neon_uminpq32(__n128, __n128);
__n8 neon_uminv8(__n64);
__n8 neon_uminvq8(__n128);
__n16 neon_uminv16(__n64);
__n16 neon_uminvq16(__n128);
float neon_uminvq32(__n128);
// vmin*/vminnm*: two-operand wrappers. Both operands are converted with
// *_to_n64 / *_to_n128, and the intrinsic result is converted back.
#define vmin_f32(a, b) __n64_to_float32x2_t(neon_fmin32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vminnm_f32(a, b) __n64_to_float32x2_t(neon_fminnm32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vminq_f32(a, b) __n128_to_float32x4_t(neon_fminq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vminnmq_f32(a, b) __n128_to_float32x4_t(neon_fminnmq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vmin_f64(a, b) __n64_to_float64x1_t(neon_fmin64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vminnm_f64(a, b) __n64_to_float64x1_t(neon_fminnm64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vminq_f64(a, b) __n128_to_float64x2_t(neon_fminq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vminnmq_f64(a, b) __n128_to_float64x2_t(neon_fminnmq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vmin_s8(a, b) __n64_to_int8x8_t(neon_smin8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vmin_s16(a, b) __n64_to_int16x4_t(neon_smin16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vmin_s32(a, b) __n64_to_int32x2_t(neon_smin32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vmin_u8(a, b) __n64_to_uint8x8_t(neon_umin8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vmin_u16(a, b) __n64_to_uint16x4_t(neon_umin16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vmin_u32(a, b) __n64_to_uint32x2_t(neon_umin32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vminq_s8(a, b) __n128_to_int8x16_t(neon_sminq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vminq_s16(a, b) __n128_to_int16x8_t(neon_sminq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vminq_s32(a, b) __n128_to_int32x4_t(neon_sminq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vminq_u8(a, b) __n128_to_uint8x16_t(neon_uminq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vminq_u16(a, b) __n128_to_uint16x8_t(neon_uminq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vminq_u32(a, b) __n128_to_uint32x4_t(neon_uminq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
// vpmin*: two-operand wrappers over the neon_*minp* intrinsics.
#define vpmin_f32(a, b) __n64_to_float32x2_t(neon_fminp32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vpmin_s8(a, b) __n64_to_int8x8_t(neon_sminp8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vpmin_s16(a, b) __n64_to_int16x4_t(neon_sminp16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vpmin_s32(a, b) __n64_to_int32x2_t(neon_sminp32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vpmin_u8(a, b) __n64_to_uint8x8_t(neon_uminp8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vpmin_u16(a, b) __n64_to_uint16x4_t(neon_uminp16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vpmin_u32(a, b) __n64_to_uint32x2_t(neon_uminp32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vpminq_f32(a, b) __n128_to_float32x4_t(neon_fminpq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vpminq_f64(a, b) __n128_to_float64x2_t(neon_fminpq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vpminq_s8(a, b) __n128_to_int8x16_t(neon_sminpq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vpminq_s16(a, b) __n128_to_int16x8_t(neon_sminpq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vpminq_s32(a, b) __n128_to_int32x4_t(neon_sminpq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vpminq_u8(a, b) __n128_to_uint8x16_t(neon_uminpq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vpminq_u16(a, b) __n128_to_uint16x8_t(neon_uminpq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vpminq_u32(a, b) __n128_to_uint32x4_t(neon_uminpq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
// vminv*/vminnmv*: single-operand wrappers producing a scalar. The 8-/16-bit
// integer forms read element 0 of the __n8/__n16 value the intrinsic returns.
// NOTE: vminv_s32 and vminv_u32 pass the same argument as both operands of
// neon_sminp32/neon_uminp32 and read element 0 of the result, so the macro
// argument is expanded twice; avoid arguments with side effects.
#define vminv_f32(v) neon_fminps32(__float32x2_t_to_n64(v))
#define vminnmv_f32(v) neon_fminnmps32(__float32x2_t_to_n64(v))
#define vminv_s8(v) neon_sminv8(__int8x8_t_to_n64(v)).n8_i8[0]
#define vminv_s16(v) neon_sminv16(__int16x4_t_to_n64(v)).n16_i16[0]
#define vminv_s32(v) neon_sminp32(__int32x2_t_to_n64(v), __int32x2_t_to_n64(v)).n64_i32[0]
#define vminv_u8(v) neon_uminv8(__uint8x8_t_to_n64(v)).n8_u8[0]
#define vminv_u16(v) neon_uminv16(__uint16x4_t_to_n64(v)).n16_u16[0]
#define vminv_u32(v) neon_uminp32(__uint32x2_t_to_n64(v), __uint32x2_t_to_n64(v)).n64_u32[0]
// 128-bit reductions. The 32-bit integer forms convert the intrinsic's float
// return value with _CopyInt32FromFloat / _CopyUInt32FromFloat.
#define vminvq_f32(v) neon_fminv(__float32x4_t_to_n128(v))
#define vminnmvq_f32(v) neon_fminnmv(__float32x4_t_to_n128(v))
#define vminvq_f64(v) neon_fminps64(__float64x2_t_to_n128(v))
#define vminnmvq_f64(v) neon_fminnmps64(__float64x2_t_to_n128(v))
#define vminvq_s8(v) neon_sminvq8(__int8x16_t_to_n128(v)).n8_i8[0]
#define vminvq_s16(v) neon_sminvq16(__int16x8_t_to_n128(v)).n16_i16[0]
#define vminvq_s32(v) _CopyInt32FromFloat(neon_sminvq32(__int32x4_t_to_n128(v)))
#define vminvq_u8(v) neon_uminvq8(__uint8x16_t_to_n128(v)).n8_u8[0]
#define vminvq_u16(v) neon_uminvq16(__uint16x8_t_to_n128(v)).n16_u16[0]
#define vminvq_u32(v) _CopyUInt32FromFloat(neon_uminvq32(__uint32x4_t_to_n128(v)))
// vpminnm*: two-operand wrappers over the neon_fminnmp* intrinsics.
#define vpminnm_f32(a, b) __n64_to_float32x2_t(neon_fminnmp32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vpminnmq_f32(a, b) __n128_to_float32x4_t(neon_fminnmpq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vpminnmq_f64(a, b) __n128_to_float64x2_t(neon_fminnmpq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
// Scalar-result pairwise wrappers: single vector in, float/double out.
// The last four are max-family wrappers (neon_fmaxps*/neon_fmaxnmps*) that
// are defined here alongside their min counterparts.
#define vpmins_f32(v) neon_fminps32(__float32x2_t_to_n64(v))
#define vpminqd_f64(v) neon_fminps64(__float64x2_t_to_n128(v))
#define vpminnms_f32(v) neon_fminnmps32(__float32x2_t_to_n64(v))
#define vpminnmqd_f64(v) neon_fminnmps64(__float64x2_t_to_n128(v))
#define vpmaxs_f32(v) neon_fmaxps32(__float32x2_t_to_n64(v))
#define vpmaxqd_f64(v) neon_fmaxps64(__float64x2_t_to_n128(v))
#define vpmaxnms_f32(v) neon_fmaxnmps32(__float32x2_t_to_n64(v))
#define vpmaxnmqd_f64(v) neon_fmaxnmps64(__float64x2_t_to_n128(v))

// EXT
// Intrinsics behind the vext_* / vextq_* wrappers. The numeric suffix matches
// the element width of the wrappers that use each one; the "q" forms take
// 128-bit vectors. _Pos receives the wrappers' `pos` argument.
__n64 neon_ext8(__n64 _Src1, __n64 _Src2, const int _Pos);
__n64 neon_ext16(__n64 _Src1, __n64 _Src2, const int _Pos);
__n64 neon_ext32(__n64 _Src1, __n64 _Src2, const int _Pos);
__n64 neon_ext64(__n64 _Src1, __n64 _Src2, const int _Pos);
__n128 neon_extq8(__n128 _Src1, __n128 _Src2, const int _Pos);
__n128 neon_extq16(__n128 _Src1, __n128 _Src2, const int _Pos);
__n128 neon_extq32(__n128 _Src1, __n128 _Src2, const int _Pos);
__n128 neon_extq64(__n128 _Src1, __n128 _Src2, const int _Pos);
// vext_<type>(src1, src2, pos), 64-bit vectors: both operands are converted
// to __n64, passed with pos to the neon_ext{8,16,32,64} intrinsic selected by
// element width, and the result is converted back to the typed vector.
// Signed, unsigned, polynomial and float types of the same element width
// share one intrinsic.
#define vext_s8(src1, src2, pos)  __n64_to_int8x8_t(neon_ext8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), (pos)))
#define vext_u8(src1, src2, pos)  __n64_to_uint8x8_t(neon_ext8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), (pos)))
#define vext_s16(src1, src2, pos)  __n64_to_int16x4_t(neon_ext16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (pos)))
#define vext_u16(src1, src2, pos)  __n64_to_uint16x4_t(neon_ext16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), (pos)))
#define vext_s32(src1, src2, pos)  __n64_to_int32x2_t(neon_ext32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (pos)))
#define vext_u32(src1, src2, pos)  __n64_to_uint32x2_t(neon_ext32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), (pos)))
#define vext_s64(src1, src2, pos)  __n64_to_int64x1_t(neon_ext64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2), (pos)))
#define vext_u64(src1, src2, pos)  __n64_to_uint64x1_t(neon_ext64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2), (pos)))
#define vext_p8(src1, src2, pos)  __n64_to_poly8x8_t(neon_ext8(__poly8x8_t_to_n64(src1), __poly8x8_t_to_n64(src2), (pos)))
#define vext_p16(src1, src2, pos)  __n64_to_poly16x4_t(neon_ext16(__poly16x4_t_to_n64(src1), __poly16x4_t_to_n64(src2), (pos)))
#define vext_p64(src1, src2, pos)  __n64_to_poly64x1_t(neon_ext64(__poly64x1_t_to_n64(src1), __poly64x1_t_to_n64(src2), (pos)))
#define vext_f32(src1, src2, pos)  __n64_to_float32x2_t(neon_ext32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), (pos)))
#define vext_f64(src1, src2, pos)  __n64_to_float64x1_t(neon_ext64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), (pos)))
// vextq_<type>: the same pattern for 128-bit vectors, via __n128 and the
// neon_extq{8,16,32,64} intrinsics.
#define vextq_s8(src1, src2, pos)  __n128_to_int8x16_t(neon_extq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), (pos)))
#define vextq_u8(src1, src2, pos)  __n128_to_uint8x16_t(neon_extq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), (pos)))
#define vextq_s16(src1, src2, pos)  __n128_to_int16x8_t(neon_extq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (pos)))
#define vextq_u16(src1, src2, pos)  __n128_to_uint16x8_t(neon_extq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), (pos)))
#define vextq_s32(src1, src2, pos)  __n128_to_int32x4_t(neon_extq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (pos)))
#define vextq_u32(src1, src2, pos)  __n128_to_uint32x4_t(neon_extq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), (pos)))
#define vextq_s64(src1, src2, pos)  __n128_to_int64x2_t(neon_extq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2), (pos)))
#define vextq_u64(src1, src2, pos)  __n128_to_uint64x2_t(neon_extq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2), (pos)))
#define vextq_p8(src1, src2, pos)  __n128_to_poly8x16_t(neon_extq8(__poly8x16_t_to_n128(src1), __poly8x16_t_to_n128(src2), (pos)))
#define vextq_p16(src1, src2, pos)  __n128_to_poly16x8_t(neon_extq16(__poly16x8_t_to_n128(src1), __poly16x8_t_to_n128(src2), (pos)))
#define vextq_p64(src1, src2, pos)  __n128_to_poly64x2_t(neon_extq64(__poly64x2_t_to_n128(src1), __poly64x2_t_to_n128(src2), (pos)))
#define vextq_f32(src1, src2, pos)  __n128_to_float32x4_t(neon_extq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), (pos)))
#define vextq_f64(src1, src2, pos)  __n128_to_float64x2_t(neon_extq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), (pos)))

// FABD/SABD/SABA/UABD/UABA
__n64  neon_fabd16(__n64, __n64);
__n128 neon_fabdq16(__n128, __n128);
__n64  neon_fabd32(__n64, __n64);
__n128 neon_fabdq32(__n128, __n128);
__n64  neon_fabd64(__n64, __n64);
__n128 neon_fabdq64(__n128, __n128);
float  neon_fabds32(float, float);
double neon_fabds64(double, double);
__n64  neon_sabd8(__n64, __n64);
__n64  neon_sabd16(__n64, __n64);
__n64  neon_sabd32(__n64, __n64);
__n128 neon_sabdq8(__n128, __n128);
__n128 neon_sabdq16(__n128, __n128);
__n128 neon_sabdq32(__n128, __n128);
__n64  neon_saba8(__n64, __n64, __n64);
__n64  neon_saba16(__n64, __n64, __n64);
__n64  neon_saba32(__n64, __n64, __n64);
__n128 neon_sabaq8(__n128, __n128, __n128);
__n128 neon_sabaq16(__n128, __n128, __n128);
__n128 neon_sabaq32(__n128, __n128, __n128);
__n64  neon_uabd8(__n64, __n64);
__n64  neon_uabd16(__n64, __n64);
__n64  neon_uabd32(__n64, __n64);
__n128 neon_uabdq8(__n128, __n128);
__n128 neon_uabdq16(__n128, __n128);
__n128 neon_uabdq32(__n128, __n128);
__n64  neon_uaba8(__n64, __n64, __n64);
__n64  neon_uaba16(__n64, __n64, __n64);
__n64  neon_uaba32(__n64, __n64, __n64);
__n128 neon_uabaq8(__n128, __n128, __n128);
__n128 neon_uabaq16(__n128, __n128, __n128);
__n128 neon_uabaq32(__n128, __n128, __n128);
// vabd*: two-operand wrappers. Vector forms convert both operands to
// __n64/__n128, call the neon_fabd* (float), neon_sabd* (signed) or
// neon_uabd* (unsigned) intrinsic for the element width, and convert the
// result back. The scalar forms vabds_f32 / vabdd_f64 forward their
// arguments to neon_fabds32 / neon_fabds64 unchanged.
#define vabd_f32(src1, src2) __n64_to_float32x2_t(neon_fabd32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vabds_f32(src1, src2) neon_fabds32((src1), (src2))
#define vabdd_f64(src1, src2) neon_fabds64((src1), (src2))
#define vabd_f64(src1, src2) __n64_to_float64x1_t(neon_fabd64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
#define vabdq_f32(src1, src2) __n128_to_float32x4_t(neon_fabdq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vabd_s8(src1, src2) __n64_to_int8x8_t(neon_sabd8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vabd_s16(src1, src2) __n64_to_int16x4_t(neon_sabd16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vabd_s32(src1, src2) __n64_to_int32x2_t(neon_sabd32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vabd_u8(src1, src2) __n64_to_uint8x8_t(neon_uabd8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vabd_u16(src1, src2) __n64_to_uint16x4_t(neon_uabd16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vabd_u32(src1, src2) __n64_to_uint32x2_t(neon_uabd32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vabdq_s8(src1, src2) __n128_to_int8x16_t(neon_sabdq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vabdq_s16(src1, src2) __n128_to_int16x8_t(neon_sabdq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vabdq_s32(src1, src2) __n128_to_int32x4_t(neon_sabdq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vabdq_u8(src1, src2) __n128_to_uint8x16_t(neon_uabdq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
#define vabdq_u16(src1, src2) __n128_to_uint16x8_t(neon_uabdq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vabdq_u32(src1, src2) __n128_to_uint32x4_t(neon_uabdq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vabdq_f64(src1, src2) __n128_to_float64x2_t(neon_fabdq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))
// vaba*: three-operand wrappers over neon_saba* / neon_uaba*; src1, src2 and
// src3 are converted and forwarded in that order.
#define vaba_s8(src1, src2, src3) __n64_to_int8x8_t(neon_saba8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), __int8x8_t_to_n64(src3)))
#define vaba_s16(src1, src2, src3) __n64_to_int16x4_t(neon_saba16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3)))
#define vaba_s32(src1, src2, src3) __n64_to_int32x2_t(neon_saba32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3)))
#define vaba_u8(src1, src2, src3) __n64_to_uint8x8_t(neon_uaba8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), __uint8x8_t_to_n64(src3)))
#define vaba_u16(src1, src2, src3) __n64_to_uint16x4_t(neon_uaba16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x4_t_to_n64(src3)))
#define vaba_u32(src1, src2, src3) __n64_to_uint32x2_t(neon_uaba32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x2_t_to_n64(src3)))
#define vabaq_s8(src1, src2, src3) __n128_to_int8x16_t(neon_sabaq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), __int8x16_t_to_n128(src3)))
#define vabaq_s16(src1, src2, src3) __n128_to_int16x8_t(neon_sabaq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3)))
#define vabaq_s32(src1, src2, src3) __n128_to_int32x4_t(neon_sabaq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3)))
#define vabaq_u8(src1, src2, src3) __n128_to_uint8x16_t(neon_uabaq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), __uint8x16_t_to_n128(src3)))
#define vabaq_u16(src1, src2, src3) __n128_to_uint16x8_t(neon_uabaq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x8_t_to_n128(src3)))
#define vabaq_u32(src1, src2, src3) __n128_to_uint32x4_t(neon_uabaq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x4_t_to_n128(src3)))

// FDIV
// Intrinsics behind the vdiv_* / vdivq_* wrappers; parameters mirror the
// wrappers' src1/src2 arguments. 64-bit vector forms first, then 128-bit.
__n64 neon_fdiv16(__n64 _Src1, __n64 _Src2);
__n64 neon_fdiv32(__n64 _Src1, __n64 _Src2);
__n64 neon_fdiv64(__n64 _Src1, __n64 _Src2);
__n128 neon_fdivq16(__n128 _Src1, __n128 _Src2);
__n128 neon_fdivq32(__n128 _Src1, __n128 _Src2);
__n128 neon_fdivq64(__n128 _Src1, __n128 _Src2);
// vdiv* wrappers: convert both operands to __n64/__n128, call the
// neon_fdiv{32,64} / neon_fdivq{32,64} intrinsic, convert the result back.
// Only the 32- and 64-bit element widths are wrapped here.
#define vdiv_f32(src1, src2) __n64_to_float32x2_t(neon_fdiv32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vdiv_f64(src1, src2) __n64_to_float64x1_t(neon_fdiv64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
#define vdivq_f32(src1, src2) __n128_to_float32x4_t(neon_fdivq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vdivq_f64(src1, src2) __n128_to_float64x2_t(neon_fdivq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))

// FSQRT/FRSQRTE/URSQRTE/FRSQRTS
// Intrinsics behind the vsqrt* / vrsqrte* / vrsqrts* wrappers. Parameter
// names mirror the wrappers' src / src1 / src2 arguments.

// FSQRT: one vector operand.
__n64 neon_fsqrt16(__n64 _Src);
__n128 neon_fsqrtq16(__n128 _Src);
__n64 neon_fsqrt32(__n64 _Src);
__n128 neon_fsqrtq32(__n128 _Src);
__n64 neon_fsqrt64(__n64 _Src);
__n128 neon_fsqrtq64(__n128 _Src);

// FRSQRTE: one operand, vector and scalar forms.
__n64 neon_frsqrte16(__n64 _Src);
__n128 neon_frsqrteq16(__n128 _Src);
__n64 neon_frsqrte32(__n64 _Src);
__n128 neon_frsqrteq32(__n128 _Src);
__n64 neon_frsqrte64(__n64 _Src);
__n128 neon_frsqrteq64(__n128 _Src);
float neon_frsqrtes32(float _Src);
double neon_frsqrtes64(double _Src);

// URSQRTE: one vector operand, 32-bit elements only.
__n64 neon_ursqrte32(__n64 _Src);
__n128 neon_ursqrteq32(__n128 _Src);

// FRSQRTS: two operands, vector and scalar forms.
__n64 neon_frsqrts16(__n64 _Src1, __n64 _Src2);
__n128 neon_frsqrtsq16(__n128 _Src1, __n128 _Src2);
__n64 neon_frsqrts32(__n64 _Src1, __n64 _Src2);
__n128 neon_frsqrtsq32(__n128 _Src1, __n128 _Src2);
__n64 neon_frsqrts64(__n64 _Src1, __n64 _Src2);
__n128 neon_frsqrtsq64(__n128 _Src1, __n128 _Src2);
float neon_frsqrtss32(float _Src1, float _Src2);
double neon_frsqrtss64(double _Src1, double _Src2);
// vsqrt*: one-operand wrappers over neon_fsqrt{32,64} / neon_fsqrtq{32,64}.
#define vsqrt_f32(src)           __n64_to_float32x2_t(neon_fsqrt32(__float32x2_t_to_n64(src)))
#define vsqrt_f64(src)           __n64_to_float64x1_t(neon_fsqrt64(__float64x1_t_to_n64(src)))
#define vsqrtq_f32(src)          __n128_to_float32x4_t(neon_fsqrtq32(__float32x4_t_to_n128(src)))
#define vsqrtq_f64(src)          __n128_to_float64x2_t(neon_fsqrtq64(__float64x2_t_to_n128(src)))
// vrsqrte*: one-operand wrappers. Float types use neon_frsqrte*, the u32
// types use neon_ursqrte*; the scalar forms (vrsqrtes_f32 / vrsqrted_f64)
// forward their argument straight to neon_frsqrtes32 / neon_frsqrtes64.
#define vrsqrte_f32(src)         __n64_to_float32x2_t(neon_frsqrte32(__float32x2_t_to_n64(src)))
#define vrsqrte_u32(src)         __n64_to_uint32x2_t(neon_ursqrte32(__uint32x2_t_to_n64(src)))
#define vrsqrte_f64(src)         __n64_to_float64x1_t(neon_frsqrte64(__float64x1_t_to_n64(src)))
#define vrsqrteq_f32(src)        __n128_to_float32x4_t(neon_frsqrteq32(__float32x4_t_to_n128(src)))
#define vrsqrteq_u32(src)        __n128_to_uint32x4_t(neon_ursqrteq32(__uint32x4_t_to_n128(src)))
#define vrsqrteq_f64(src)        __n128_to_float64x2_t(neon_frsqrteq64(__float64x2_t_to_n128(src)))
#define vrsqrtes_f32(src1)       neon_frsqrtes32(src1)
#define vrsqrted_f64(src1)       neon_frsqrtes64(src1)
// vrsqrts*: two-operand wrappers over neon_frsqrts* (vector) and
// neon_frsqrtss32 / neon_frsqrtss64 (scalar).
#define vrsqrts_f32(src1, src2)  __n64_to_float32x2_t(neon_frsqrts32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vrsqrts_f64(src1, src2)  __n64_to_float64x1_t(neon_frsqrts64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
#define vrsqrtsq_f32(src1, src2) __n128_to_float32x4_t(neon_frsqrtsq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vrsqrtsq_f64(src1, src2) __n128_to_float64x2_t(neon_frsqrtsq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))
#define vrsqrtss_f32(src1, src2) neon_frsqrtss32((src1), (src2))
#define vrsqrtsd_f64(src1, src2) neon_frsqrtss64((src1), (src2))

// PMUL/MUL/MLA/MLS/SQDMULH/SQRDMULH/SQRDMLAH/SQRDMLSH/FMUL/FMLA/FMLS/FMULX/FMLAL/FMLSL
// Polynomial multiply intrinsics. neon_pmul / neon_pmulq return a vector of
// the same width as their operands; every neon_pmull* form returns __n128.
__n64 neon_pmul(__n64 _Src1, __n64 _Src2);
__n128 neon_pmulq(__n128 _Src1, __n128 _Src2);
__n128 neon_pmull_8(__n64 _Src1, __n64 _Src2);
__n128 neon_pmull_q8(__n128 _Src1, __n128 _Src2);
__n128 neon_pmull2_8(__n128 _Src1, __n128 _Src2);
__n128 neon_pmull_64(__n64 _Src1, __n64 _Src2);
__n128 neon_pmull_q64(__n128 _Src1, __n128 _Src2);
__n128 neon_pmull2_64(__n128 _Src1, __n128 _Src2);
// FMUL intrinsics. In the "*ind*" forms the second operand is a 64-bit
// vector, or a 128-bit vector when the name ends in "q", and _Lane receives
// the `lane` argument of the vmul*_lane* / vmul*_laneq* wrappers.
__n64 neon_fmulvind16(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_fmulvind16q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_fmulqvind16(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_fmulqvind16q(__n128 _Src1, __n128 _Src2, const int _Lane);
__n64 neon_fmulvind32(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_fmulvind32q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_fmulqvind32(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_fmulqvind32q(__n128 _Src1, __n128 _Src2, const int _Lane);
__n64 neon_fmulvind64(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_fmulvind64q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_fmulqvind64(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_fmulqvind64q(__n128 _Src1, __n128 _Src2, const int _Lane);

// Plain vector-by-vector forms.
__n64 neon_fmul16(__n64 _Src1, __n64 _Src2);
__n128 neon_fmulq16(__n128 _Src1, __n128 _Src2);
__n64 neon_fmul32(__n64 _Src1, __n64 _Src2);
__n128 neon_fmulq32(__n128 _Src1, __n128 _Src2);
__n64 neon_fmul64(__n64 _Src1, __n64 _Src2);
__n128 neon_fmulq64(__n128 _Src1, __n128 _Src2);

// Scalar first operand, vector second operand plus lane.
float neon_fmulsind32(float _Src1, __n64 _Src2, const int _Lane);
double neon_fmulsind64(double _Src1, __n64 _Src2, const int _Lane);
float neon_fmulsind32q(float _Src1, __n128 _Src2, const int _Lane);
double neon_fmulsind64q(double _Src1, __n128 _Src2, const int _Lane);
__n64  neon_fmlavind16 (__n64, __n64,  __n64,  const int);
__n64  neon_fmlavind16q (__n64, __n64,  __n128,  const int);
__n128 neon_fmlaqvind16(__n128, __n128, __n64, const int);
__n128 neon_fmlaqvind16q(__n128, __n128, __n128, const int);
__n64  neon_fmlavind32 (__n64, __n64,  __n64,  const int);
__n64  neon_fmlavind32q (__n64, __n64,  __n128,  const int);
__n128 neon_fmlaqvind32(__n128, __n128, __n64, const int);
__n128 neon_fmlaqvind32q(__n128, __n128, __n128, const int);
__n64  neon_fmlavind64 (__n64, __n64,  __n64,  const int);
__n64  neon_fmlavind64q (__n64, __n64,  __n128,  const int);
__n128 neon_fmlaqvind64(__n128, __n128, __n64, const int);
__n128 neon_fmlaqvind64q(__n128, __n128, __n128, const int);
__n64  neon_fmla16 (__n64, __n64,  __n64);
__n64  neon_fmla32 (__n64, __n64,  __n64);
__n64  neon_fmla64 (__n64, __n64,  __n64);
__n128 neon_fmlaq16(__n128, __n128, __n128);
__n128 neon_fmlaq32(__n128, __n128, __n128);
__n128 neon_fmlaq64(__n128, __n128, __n128);
float  neon_fmlasind32(float,  float, __n64, const int);
double neon_fmlasind64(double, double, __n64, const int);
float  neon_fmlasind32q(float,  float, __n128, const int);
double neon_fmlasind64q(double, double, __n128, const int);
__n64  neon_fmlsvind16 (__n64,  __n64,  __n64,  const int);
__n64  neon_fmlsvind16q(__n64,  __n64,  __n128, const int);
__n128 neon_fmlsqvind16(__n128, __n128, __n64, const int);
__n128 neon_fmlsqvind16q(__n128, __n128, __n128, const int);
__n64  neon_fmlsvind32 (__n64,  __n64,  __n64,  const int);
__n64  neon_fmlsvind32q(__n64,  __n64,  __n128, const int);
__n128 neon_fmlsqvind32(__n128, __n128, __n64, const int);
__n128 neon_fmlsqvind32q(__n128, __n128, __n128, const int);
__n64  neon_fmlsvind64 (__n64,  __n64,  __n64,  const int);
__n64  neon_fmlsvind64q(__n64,  __n64,  __n128, const int);
__n128 neon_fmlsqvind64(__n128, __n128, __n64, const int);
__n128 neon_fmlsqvind64q(__n128, __n128, __n128, const int);
__n64  neon_fmls16 (__n64,  __n64,  __n64);
__n64  neon_fmls32 (__n64,  __n64,  __n64);
__n64  neon_fmls64 (__n64,  __n64,  __n64);
__n128 neon_fmlsq16(__n128, __n128, __n128);
__n128 neon_fmlsq32(__n128, __n128, __n128);
__n128 neon_fmlsq64(__n128, __n128, __n128);
float  neon_fmlssind32(float,  float, __n64, const int);
double neon_fmlssind64(double, double, __n64, const int);
float  neon_fmlssind32q(float,  float, __n128, const int);
double neon_fmlssind64q(double, double, __n128, const int);
__n64  neon_fmulxvind16 (__n64,  __n64,  const int);
__n64  neon_fmulxvind16q(__n64,  __n128, const int);
__n128 neon_fmulxqvind16(__n128, __n64, const int);
__n128 neon_fmulxqvind16q(__n128, __n128, const int);
__n64  neon_fmulxvind32 (__n64,  __n64,  const int);
__n64  neon_fmulxvind32q(__n64,  __n128, const int);
__n128 neon_fmulxqvind32(__n128, __n64, const int);
__n128 neon_fmulxqvind32q(__n128, __n128, const int);
__n64  neon_fmulxvind64 (__n64,  __n64,  const int);
__n64  neon_fmulxvind64q(__n64,  __n128, const int);
__n128 neon_fmulxqvind64(__n128, __n64, const int);
__n128 neon_fmulxqvind64q(__n128, __n128, const int);
__n64  neon_fmulx16 (__n64,  __n64);
__n128 neon_fmulxq16(__n128, __n128);
__n64  neon_fmulx32 (__n64,  __n64);
__n128 neon_fmulxq32(__n128, __n128);
__n64  neon_fmulx64 (__n64,  __n64);
__n128 neon_fmulxq64(__n128, __n128);
float  neon_fmulxs32(float,  float);
double neon_fmulxs64(double, double);
float  neon_fmulxsind32(float, __n64, const int);
double neon_fmulxsind64(double, __n64, const int);
float  neon_fmulxsind32q(float, __n128, const int);
double neon_fmulxsind64q(double, __n128, const int);
// Integer MUL intrinsics. The "*ind*" forms exist for 16- and 32-bit
// elements only; _Lane receives the `lane` argument of the integer
// vmul*_lane* / vmul*_laneq* wrappers.
__n64 neon_mulvind16(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_mulvind32(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_mulvind16q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n64 neon_mulvind32q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_mulqvind16(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_mulqvind32(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_mulqvind16q(__n128 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_mulqvind32q(__n128 _Src1, __n128 _Src2, const int _Lane);

// Plain vector-by-vector forms (8-, 16- and 32-bit elements).
__n64 neon_mul8(__n64 _Src1, __n64 _Src2);
__n64 neon_mul16(__n64 _Src1, __n64 _Src2);
__n64 neon_mul32(__n64 _Src1, __n64 _Src2);
__n128 neon_mulq8(__n128 _Src1, __n128 _Src2);
__n128 neon_mulq16(__n128 _Src1, __n128 _Src2);
__n128 neon_mulq32(__n128 _Src1, __n128 _Src2);
// Integer MLS intrinsics: three data operands, plus a const int in the
// "*ind*" forms.
// NOTE(review): _Lane is presumed to be a lane index into _Src3, mirroring
// the neon_mul*ind* declarations - no wrapper for these is visible nearby
// to confirm it.
__n64 neon_mlsvind16(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_mlsvind32(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_mlsvind16q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n64 neon_mlsvind32q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_mlsqvind16(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_mlsqvind32(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_mlsqvind16q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_mlsqvind32q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);

// Plain three-vector forms.
__n64 neon_mls8(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n64 neon_mls16(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n64 neon_mls32(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n128 neon_mlsq8(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n128 neon_mlsq16(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n128 neon_mlsq32(__n128 _Src1, __n128 _Src2, __n128 _Src3);
// Integer MLA intrinsics: three data operands, plus a const int in the
// "*ind*" forms.
// NOTE(review): _Lane is presumed to be a lane index into _Src3, mirroring
// the neon_mul*ind* declarations - no wrapper for these is visible nearby
// to confirm it.
__n64 neon_mlavind16(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_mlavind32(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_mlavind16q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n64 neon_mlavind32q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_mlaqvind16(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_mlaqvind32(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_mlaqvind16q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_mlaqvind32q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);

// Plain three-vector forms.
__n64 neon_mla8(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n64 neon_mla16(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n64 neon_mla32(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n128 neon_mlaq8(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n128 neon_mlaq16(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n128 neon_mlaq32(__n128 _Src1, __n128 _Src2, __n128 _Src3);
// SQDMULH intrinsics (16- and 32-bit elements): two data operands, plus a
// const int in the "*ind*" forms.
// NOTE(review): _Lane is presumed to be a lane index into _Src2, mirroring
// the neon_mul*ind* declarations - no wrapper for these is visible nearby
// to confirm it.
__n64 neon_sqdmulhvind16(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_sqdmulhvind32(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_sqdmulhvind16q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n64 neon_sqdmulhvind32q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_sqdmulhqvind16(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_sqdmulhqvind32(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_sqdmulhqvind16q(__n128 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_sqdmulhqvind32q(__n128 _Src1, __n128 _Src2, const int _Lane);

// Plain vector-by-vector forms.
__n64 neon_sqdmulh16(__n64 _Src1, __n64 _Src2);
__n64 neon_sqdmulh32(__n64 _Src1, __n64 _Src2);
__n128 neon_sqdmulhq16(__n128 _Src1, __n128 _Src2);
__n128 neon_sqdmulhq32(__n128 _Src1, __n128 _Src2);

// Scalar forms: the 16-bit ones are declared on __n16 and the 32-bit ones
// on float.
__n16 neon_sqdmulhsind16(__n16 _Src1, __n64 _Src2, const int _Lane);
float neon_sqdmulhsind32(float _Src1, __n64 _Src2, const int _Lane);
__n16 neon_sqdmulhsind16q(__n16 _Src1, __n128 _Src2, const int _Lane);
float neon_sqdmulhsind32q(float _Src1, __n128 _Src2, const int _Lane);
__n16 neon_sqdmulhs16(__n16 _Src1, __n16 _Src2);
float neon_sqdmulhs32(float _Src1, float _Src2);
// SQRDMULH intrinsics (16- and 32-bit elements): two data operands, plus a
// const int in the "*ind*" forms.
// NOTE(review): _Lane is presumed to be a lane index into _Src2, mirroring
// the neon_mul*ind* declarations - no wrapper for these is visible nearby
// to confirm it.
__n64 neon_sqrdmulhvind16(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_sqrdmulhvind32(__n64 _Src1, __n64 _Src2, const int _Lane);
__n64 neon_sqrdmulhvind16q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n64 neon_sqrdmulhvind32q(__n64 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_sqrdmulhqvind16(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_sqrdmulhqvind32(__n128 _Src1, __n64 _Src2, const int _Lane);
__n128 neon_sqrdmulhqvind16q(__n128 _Src1, __n128 _Src2, const int _Lane);
__n128 neon_sqrdmulhqvind32q(__n128 _Src1, __n128 _Src2, const int _Lane);

// Plain vector-by-vector forms.
__n64 neon_sqrdmulh16(__n64 _Src1, __n64 _Src2);
__n64 neon_sqrdmulh32(__n64 _Src1, __n64 _Src2);
__n128 neon_sqrdmulhq16(__n128 _Src1, __n128 _Src2);
__n128 neon_sqrdmulhq32(__n128 _Src1, __n128 _Src2);

// Scalar forms: the 16-bit ones are declared on __n16 and the 32-bit ones
// on float.
__n16 neon_sqrdmulhsind16(__n16 _Src1, __n64 _Src2, const int _Lane);
float neon_sqrdmulhsind32(float _Src1, __n64 _Src2, const int _Lane);
__n16 neon_sqrdmulhsind16q(__n16 _Src1, __n128 _Src2, const int _Lane);
float neon_sqrdmulhsind32q(float _Src1, __n128 _Src2, const int _Lane);
__n16 neon_sqrdmulhs16(__n16 _Src1, __n16 _Src2);
float neon_sqrdmulhs32(float _Src1, float _Src2);
__n64  neon_sqrdmlahvind16 (__n64,  __n64,  __n64,  const int);
__n64  neon_sqrdmlahvind32 (__n64,  __n64,  __n64,  const int);
__n64  neon_sqrdmlahvind16q(__n64,  __n64,  __n128, const int);
__n64  neon_sqrdmlahvind32q(__n64,  __n64,  __n128, const int);
__n128 neon_sqrdmlahqvind16(__n128, __n128, __n64, const int);
__n128 neon_sqrdmlahqvind32(__n128, __n128, __n64, const int);
__n128 neon_sqrdmlahqvind16q(__n128, __n128, __n128, const int);
__n128 neon_sqrdmlahqvind32q(__n128, __n128, __n128, const int);
__n64  neon_sqrdmlah16 (__n64,  __n64,  __n64);
__n64  neon_sqrdmlah32 (__n64,  __n64,  __n64);
__n128 neon_sqrdmlahq16(__n128, __n128, __n128);
__n128 neon_sqrdmlahq32(__n128, __n128, __n128);
__n16  neon_sqrdmlahsind16(__n16, __n16, __n64, const int);
float  neon_sqrdmlahsind32(float, float, __n64, const int);
__n16  neon_sqrdmlahsind16q(__n16, __n16, __n128, const int);
float  neon_sqrdmlahsind32q(float, float, __n128, const int);
__n16  neon_sqrdmlahs16 (__n16,  __n16,  __n16);
float  neon_sqrdmlahs32 (float,  float,  float);
__n64  neon_sqrdmlshvind16 (__n64,  __n64,  __n64,  const int);
__n64  neon_sqrdmlshvind32 (__n64,  __n64,  __n64,  const int);
__n64  neon_sqrdmlshvind16q(__n64,  __n64,  __n128, const int);
__n64  neon_sqrdmlshvind32q(__n64,  __n64,  __n128, const int);
__n128 neon_sqrdmlshqvind16(__n128, __n128, __n64, const int);
__n128 neon_sqrdmlshqvind32(__n128, __n128, __n64, const int);
__n128 neon_sqrdmlshqvind16q(__n128, __n128, __n128, const int);
__n128 neon_sqrdmlshqvind32q(__n128, __n128, __n128, const int);
__n64  neon_sqrdmlsh16 (__n64,  __n64,  __n64);
__n64  neon_sqrdmlsh32 (__n64,  __n64,  __n64);
__n128 neon_sqrdmlshq16(__n128, __n128, __n128);
__n128 neon_sqrdmlshq32(__n128, __n128, __n128);
__n16  neon_sqrdmlshsind16(__n16, __n16, __n64, const int);
float  neon_sqrdmlshsind32(float, float, __n64, const int);
__n16  neon_sqrdmlshsind16q(__n16, __n16, __n128, const int);
float  neon_sqrdmlshsind32q(float, float, __n128, const int);
__n16  neon_sqrdmlshs16 (__n16,  __n16,  __n16);
float  neon_sqrdmlshs32 (float,  float,  float);
// FMLAL / FMLSL / FMLAL2 / FMLSL2 intrinsics. Each family declares the same
// six shapes: a plain 64-bit and 128-bit three-vector form, followed by four
// "*ind*" forms that add a const int.
// NOTE(review): _Lane is presumed to be a lane index into _Src3, mirroring
// the neon_fmul*ind* declarations - no wrapper for these is visible nearby
// to confirm it.

// FMLAL
__n64 neon_fmlal_16(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n128 neon_fmlal_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n64 neon_fmlalvind_16(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_fmlalvind_16q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_fmlalqvind_16(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_fmlalqvind_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);

// FMLSL
__n64 neon_fmlsl_16(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n128 neon_fmlsl_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n64 neon_fmlslvind_16(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_fmlslvind_16q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_fmlslqvind_16(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_fmlslqvind_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);

// FMLAL2
__n64 neon_fmlal2_16(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n128 neon_fmlal2_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n64 neon_fmlal2vind_16(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_fmlal2vind_16q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_fmlal2qvind_16(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_fmlal2qvind_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);

// FMLSL2
__n64 neon_fmlsl2_16(__n64 _Src1, __n64 _Src2, __n64 _Src3);
__n128 neon_fmlsl2_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3);
__n64 neon_fmlsl2vind_16(__n64 _Src1, __n64 _Src2, __n64 _Src3, const int _Lane);
__n64 neon_fmlsl2vind_16q(__n64 _Src1, __n64 _Src2, __n128 _Src3, const int _Lane);
__n128 neon_fmlsl2qvind_16(__n128 _Src1, __n128 _Src2, __n64 _Src3, const int _Lane);
__n128 neon_fmlsl2qvind_16q(__n128 _Src1, __n128 _Src2, __n128 _Src3, const int _Lane);
// vmul* / vmull* element-wise wrappers: convert both operands to
// __n64/__n128, call the intrinsic, convert the result back.
//   - poly8 types use neon_pmul / neon_pmulq; vmull_p8 and vmull_high_p8 use
//     neon_pmull_8 / neon_pmull2_8 and produce poly16x8_t.
//   - float types use neon_fmul* / neon_fmulq*.
//   - signed and unsigned integer types of the same width share the same
//     neon_mul* / neon_mulq* intrinsic.
#define vmul_p8(src1, src2) __n64_to_poly8x8_t(neon_pmul(__poly8x8_t_to_n64(src1), __poly8x8_t_to_n64(src2)))
#define vmull_p8(src1, src2) __n128_to_poly16x8_t(neon_pmull_8(__poly8x8_t_to_n64(src1), __poly8x8_t_to_n64(src2)))
#define vmull_high_p8(src1, src2) __n128_to_poly16x8_t(neon_pmull2_8(__poly8x16_t_to_n128(src1), __poly8x16_t_to_n128(src2)))
#define vmul_f32(src1, src2) __n64_to_float32x2_t(neon_fmul32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vmul_f64(src1, src2) __n64_to_float64x1_t(neon_fmul64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
#define vmul_s16(src1, src2) __n64_to_int16x4_t(neon_mul16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vmul_s32(src1, src2) __n64_to_int32x2_t(neon_mul32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vmul_s8(src1, src2) __n64_to_int8x8_t(neon_mul8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vmul_u16(src1, src2) __n64_to_uint16x4_t(neon_mul16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2)))
#define vmul_u32(src1, src2) __n64_to_uint32x2_t(neon_mul32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2)))
#define vmul_u8(src1, src2) __n64_to_uint8x8_t(neon_mul8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2)))
#define vmulq_p8(src1, src2) __n128_to_poly8x16_t(neon_pmulq(__poly8x16_t_to_n128(src1), __poly8x16_t_to_n128(src2)))
#define vmulq_f32(src1, src2) __n128_to_float32x4_t(neon_fmulq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vmulq_f64(src1, src2) __n128_to_float64x2_t(neon_fmulq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))
#define vmulq_s16(src1, src2) __n128_to_int16x8_t(neon_mulq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vmulq_s32(src1, src2) __n128_to_int32x4_t(neon_mulq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vmulq_s8(src1, src2) __n128_to_int8x16_t(neon_mulq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vmulq_u16(src1, src2) __n128_to_uint16x8_t(neon_mulq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2)))
#define vmulq_u32(src1, src2) __n128_to_uint32x4_t(neon_mulq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2)))
#define vmulq_u8(src1, src2) __n128_to_uint8x16_t(neon_mulq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2)))
// Multiply-by-lane wrappers. `lane` is forwarded (parenthesized) as the
// intrinsic's final const int argument. The wrapper name fixes the operand
// widths and therefore the intrinsic variant:
//   vmul_lane_*   : 64-bit src1,  64-bit src2  -> neon_[f]mulvind<N>
//   vmulq_lane_*  : 128-bit src1, 64-bit src2  -> neon_[f]mulqvind<N>
//   vmul_laneq_*  : 64-bit src1,  128-bit src2 -> neon_[f]mulvind<N>q
//   vmulq_laneq_* : 128-bit src1, 128-bit src2 -> neon_[f]mulqvind<N>q
#define vmul_lane_f32(src1, src2, lane) __n64_to_float32x2_t(neon_fmulvind32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), (lane)))
#define vmul_lane_f64(src1, src2, lane) __n64_to_float64x1_t(neon_fmulvind64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), (lane)))
#define vmul_lane_s16(src1, src2, lane) __n64_to_int16x4_t(neon_mulvind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (lane)))
#define vmul_lane_s32(src1, src2, lane) __n64_to_int32x2_t(neon_mulvind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (lane)))
#define vmul_lane_u16(src1, src2, lane) __n64_to_uint16x4_t(neon_mulvind16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), (lane)))
#define vmul_lane_u32(src1, src2, lane) __n64_to_uint32x2_t(neon_mulvind32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), (lane)))
#define vmulq_lane_f32(src1, src2, lane) __n128_to_float32x4_t(neon_fmulqvind32(__float32x4_t_to_n128(src1), __float32x2_t_to_n64(src2), (lane)))
#define vmulq_lane_f64(src1, src2, lane) __n128_to_float64x2_t(neon_fmulqvind64(__float64x2_t_to_n128(src1), __float64x1_t_to_n64(src2), (lane)))
#define vmulq_lane_s16(src1, src2, lane) __n128_to_int16x8_t(neon_mulqvind16(__int16x8_t_to_n128(src1), __int16x4_t_to_n64(src2), (lane)))
#define vmulq_lane_s32(src1, src2, lane) __n128_to_int32x4_t(neon_mulqvind32(__int32x4_t_to_n128(src1), __int32x2_t_to_n64(src2), (lane)))
#define vmulq_lane_u16(src1, src2, lane) __n128_to_uint16x8_t(neon_mulqvind16(__uint16x8_t_to_n128(src1), __uint16x4_t_to_n64(src2), (lane)))
#define vmulq_lane_u32(src1, src2, lane) __n128_to_uint32x4_t(neon_mulqvind32(__uint32x4_t_to_n128(src1), __uint32x2_t_to_n64(src2), (lane)))
#define vmul_laneq_f32(src1, src2, lane) __n64_to_float32x2_t(neon_fmulvind32q(__float32x2_t_to_n64(src1), __float32x4_t_to_n128(src2), (lane)))
#define vmul_laneq_f64(src1, src2, lane) __n64_to_float64x1_t(neon_fmulvind64q(__float64x1_t_to_n64(src1), __float64x2_t_to_n128(src2), (lane)))
#define vmul_laneq_s16(src1, src2, lane) __n64_to_int16x4_t(neon_mulvind16q(__int16x4_t_to_n64(src1), __int16x8_t_to_n128(src2), (lane)))
#define vmul_laneq_s32(src1, src2, lane) __n64_to_int32x2_t(neon_mulvind32q(__int32x2_t_to_n64(src1), __int32x4_t_to_n128(src2), (lane)))
#define vmul_laneq_u16(src1, src2, lane) __n64_to_uint16x4_t(neon_mulvind16q(__uint16x4_t_to_n64(src1), __uint16x8_t_to_n128(src2), (lane)))
#define vmul_laneq_u32(src1, src2, lane) __n64_to_uint32x2_t(neon_mulvind32q(__uint32x2_t_to_n64(src1), __uint32x4_t_to_n128(src2), (lane)))
#define vmulq_laneq_f32(src1, src2, lane) __n128_to_float32x4_t(neon_fmulqvind32q(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), (lane)))
#define vmulq_laneq_f64(src1, src2, lane) __n128_to_float64x2_t(neon_fmulqvind64q(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), (lane)))
#define vmulq_laneq_s16(src1, src2, lane) __n128_to_int16x8_t(neon_mulqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (lane)))
#define vmulq_laneq_s32(src1, src2, lane) __n128_to_int32x4_t(neon_mulqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (lane)))
#define vmulq_laneq_u16(src1, src2, lane) __n128_to_uint16x8_t(neon_mulqvind16q(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), (lane)))
#define vmulq_laneq_u32(src1, src2, lane) __n128_to_uint32x4_t(neon_mulqvind32q(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), (lane)))
// Scalar-by-lane forms: src1 is passed through as a scalar; only src2 is
// converted to __n64 (lane) or __n128 (laneq).
#define vmuls_lane_f32(src1, src2, lane) neon_fmulsind32((src1), __float32x2_t_to_n64(src2), (lane))
#define vmuld_lane_f64(src1, src2, lane) neon_fmulsind64((src1), __float64x1_t_to_n64(src2), (lane))
#define vmuls_laneq_f32(src1, src2, lane) neon_fmulsind32q((src1), __float32x4_t_to_n128(src2), (lane))
#define vmuld_laneq_f64(src1, src2, lane) neon_fmulsind64q((src1), __float64x2_t_to_n128(src2), (lane))
// vmulx family: floating-point "multiply extended" wrappers over the
// neon_fmulx* intrinsics (presumably the FMULX instruction -- see the ACLE
// reference for its special-case behavior; not derivable from this header).
//
// Vector x vector forms: both operands and the result use the same typed vector.
#define vmulx_f32(src1, src2) __n64_to_float32x2_t(neon_fmulx32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2)))
#define vmulx_f64(src1, src2) __n64_to_float64x1_t(neon_fmulx64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2)))
#define vmulxq_f32(src1, src2) __n128_to_float32x4_t(neon_fmulxq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2)))
#define vmulxq_f64(src1, src2) __n128_to_float64x2_t(neon_fmulxq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2)))
// Vector x lane forms: src2 is a 64-bit vector for _lane and a 128-bit vector
// for _laneq; the width of src1 and of the result follows the vmulx_/vmulxq_ prefix.
#define vmulx_lane_f32(src1, src2, lane) __n64_to_float32x2_t(neon_fmulxvind32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), (lane)))
#define vmulx_lane_f64(src1, src2, lane) __n64_to_float64x1_t(neon_fmulxvind64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), (lane)))
#define vmulxq_lane_f32(src1, src2, lane) __n128_to_float32x4_t(neon_fmulxqvind32(__float32x4_t_to_n128(src1), __float32x2_t_to_n64(src2), (lane)))
#define vmulxq_lane_f64(src1, src2, lane) __n128_to_float64x2_t(neon_fmulxqvind64(__float64x2_t_to_n128(src1), __float64x1_t_to_n64(src2), (lane)))
#define vmulx_laneq_f32(src1, src2, lane) __n64_to_float32x2_t(neon_fmulxvind32q(__float32x2_t_to_n64(src1), __float32x4_t_to_n128(src2), (lane)))
#define vmulx_laneq_f64(src1, src2, lane) __n64_to_float64x1_t(neon_fmulxvind64q(__float64x1_t_to_n64(src1), __float64x2_t_to_n128(src2), (lane)))
#define vmulxq_laneq_f32(src1, src2, lane) __n128_to_float32x4_t(neon_fmulxqvind32q(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), (lane)))
#define vmulxq_laneq_f64(src1, src2, lane) __n128_to_float64x2_t(neon_fmulxqvind64q(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), (lane)))
// Scalar forms: scalar operands are forwarded as-is and the intrinsic result is
// used directly, with no conversion helpers on either side.
#define vmulxs_f32(src1, src2) neon_fmulxs32((src1), (src2))
#define vmulxd_f64(src1, src2) neon_fmulxs64((src1), (src2))
// Scalar x lane forms: only the vector operand src2 is wrapped.
#define vmulxs_lane_f32(src1, src2, lane) neon_fmulxsind32((src1), __float32x2_t_to_n64(src2), (lane))
#define vmulxd_lane_f64(src1, src2, lane) neon_fmulxsind64((src1), __float64x1_t_to_n64(src2), (lane))
#define vmulxs_laneq_f32(src1, src2, lane) neon_fmulxsind32q((src1), __float32x4_t_to_n128(src2), (lane))
#define vmulxd_laneq_f64(src1, src2, lane) neon_fmulxsind64q((src1), __float64x2_t_to_n128(src2), (lane))
// vqdmulh / vqrdmulh by lane: wrappers over the neon_sqdmulh*vind* and
// neon_sqrdmulh*vind* intrinsics (per ACLE naming, saturating doubling multiply
// returning the high half, without / with rounding -- confirm against the ACLE
// reference). Only signed 16- and 32-bit element types are provided here.
//   src1 - typed vector (64-bit for vq*mulh_, 128-bit for vq*mulhq_).
//   src2 - typed vector supplying the lane (64-bit for _lane, 128-bit for _laneq).
//   lane - forwarded parenthesized to the intrinsic.
//
// _lane forms: src2 is a 64-bit vector.
#define vqdmulh_lane_s16(src1, src2, lane) __n64_to_int16x4_t(neon_sqdmulhvind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (lane)))
#define vqdmulh_lane_s32(src1, src2, lane) __n64_to_int32x2_t(neon_sqdmulhvind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (lane)))
#define vqrdmulh_lane_s16(src1, src2, lane) __n64_to_int16x4_t(neon_sqrdmulhvind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (lane)))
#define vqrdmulh_lane_s32(src1, src2, lane) __n64_to_int32x2_t(neon_sqrdmulhvind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (lane)))
#define vqdmulhq_lane_s16(src1, src2, lane) __n128_to_int16x8_t(neon_sqdmulhqvind16(__int16x8_t_to_n128(src1), __int16x4_t_to_n64(src2), (lane)))
#define vqdmulhq_lane_s32(src1, src2, lane) __n128_to_int32x4_t(neon_sqdmulhqvind32(__int32x4_t_to_n128(src1), __int32x2_t_to_n64(src2), (lane)))
#define vqrdmulhq_lane_s16(src1, src2, lane) __n128_to_int16x8_t(neon_sqrdmulhqvind16(__int16x8_t_to_n128(src1), __int16x4_t_to_n64(src2), (lane)))
#define vqrdmulhq_lane_s32(src1, src2, lane) __n128_to_int32x4_t(neon_sqrdmulhqvind32(__int32x4_t_to_n128(src1), __int32x2_t_to_n64(src2), (lane)))
// _laneq forms: src2 is a 128-bit vector.
#define vqdmulh_laneq_s16(src1, src2, lane) __n64_to_int16x4_t(neon_sqdmulhvind16q(__int16x4_t_to_n64(src1), __int16x8_t_to_n128(src2), (lane)))
#define vqdmulh_laneq_s32(src1, src2, lane) __n64_to_int32x2_t(neon_sqdmulhvind32q(__int32x2_t_to_n64(src1), __int32x4_t_to_n128(src2), (lane)))
#define vqrdmulh_laneq_s16(src1, src2, lane) __n64_to_int16x4_t(neon_sqrdmulhvind16q(__int16x4_t_to_n64(src1), __int16x8_t_to_n128(src2), (lane)))
#define vqrdmulh_laneq_s32(src1, src2, lane) __n64_to_int32x2_t(neon_sqrdmulhvind32q(__int32x2_t_to_n64(src1), __int32x4_t_to_n128(src2), (lane)))
#define vqdmulhq_laneq_s16(src1, src2, lane) __n128_to_int16x8_t(neon_sqdmulhqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (lane)))
#define vqdmulhq_laneq_s32(src1, src2, lane) __n128_to_int32x4_t(neon_sqdmulhqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (lane)))
#define vqrdmulhq_laneq_s16(src1, src2, lane) __n128_to_int16x8_t(neon_sqrdmulhqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (lane)))
#define vqrdmulhq_laneq_s32(src1, src2, lane) __n128_to_int32x4_t(neon_sqrdmulhqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (lane)))
// vqdmulh / vqrdmulh, vector x vector forms. Both operands and the result use
// the same typed vector; the 64-bit forms go through __n64 and the "q" forms
// through __n128. Only signed 16- and 32-bit element types are provided here.
#define vqdmulh_s16(src1, src2) __n64_to_int16x4_t(neon_sqdmulh16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqdmulh_s32(src1, src2) __n64_to_int32x2_t(neon_sqdmulh32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqrdmulh_s16(src1, src2) __n64_to_int16x4_t(neon_sqrdmulh16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqrdmulh_s32(src1, src2) __n64_to_int32x2_t(neon_sqrdmulh32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqdmulhq_s16(src1, src2) __n128_to_int16x8_t(neon_sqdmulhq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqdmulhq_s32(src1, src2) __n128_to_int32x4_t(neon_sqdmulhq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqrdmulhq_s16(src1, src2) __n128_to_int16x8_t(neon_sqrdmulhq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqrdmulhq_s32(src1, src2) __n128_to_int32x4_t(neon_sqrdmulhq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
// Scalar vqdmulh / vqrdmulh forms.
//
// 16-bit ("h") forms: scalar operands are wrapped with __int16ToN16_v and the
// result is read back through the .n16_i16[0] member of the intrinsic's return
// value. The expansion ends in that member access and is not enclosed in outer
// parentheses.
//
// 32-bit ("s") forms: scalar operands are passed through _CopyFloatFromInt32 and
// the result through _CopyInt32FromFloat (presumably bit-pattern copies so the
// value travels in a floating-point/SIMD register -- confirm against the helper
// definitions).
//
// In the _lane/_laneq forms only the vector operand src2 is wrapped with a
// _to_n64/_to_n128 helper; `lane` is forwarded parenthesized.
#define vqdmulhh_s16(src1, src2) neon_sqdmulhs16(__int16ToN16_v(src1), __int16ToN16_v(src2)).n16_i16[0]
#define vqdmulhs_s32(src1, src2) _CopyInt32FromFloat(neon_sqdmulhs32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2)))
#define vqdmulhh_lane_s16(src1, src2, lane) neon_sqdmulhsind16(__int16ToN16_v(src1), __int16x4_t_to_n64(src2), (lane)).n16_i16[0]
#define vqdmulhs_lane_s32(src1, src2, lane) _CopyInt32FromFloat(neon_sqdmulhsind32(_CopyFloatFromInt32(src1), __int32x2_t_to_n64(src2), (lane)))
#define vqdmulhh_laneq_s16(src1, src2, lane) neon_sqdmulhsind16q(__int16ToN16_v(src1), __int16x8_t_to_n128(src2), (lane)).n16_i16[0]
#define vqdmulhs_laneq_s32(src1, src2, lane) _CopyInt32FromFloat(neon_sqdmulhsind32q(_CopyFloatFromInt32(src1), __int32x4_t_to_n128(src2), (lane)))
// Rounding variants (neon_sqrdmulh*), same operand handling as above.
#define vqrdmulhh_s16(src1, src2) neon_sqrdmulhs16(__int16ToN16_v(src1), __int16ToN16_v(src2)).n16_i16[0]
#define vqrdmulhs_s32(src1, src2) _CopyInt32FromFloat(neon_sqrdmulhs32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2)))
#define vqrdmulhh_lane_s16(src1, src2, lane) neon_sqrdmulhsind16(__int16ToN16_v(src1), __int16x4_t_to_n64(src2), (lane)).n16_i16[0]
#define vqrdmulhs_lane_s32(src1, src2, lane) _CopyInt32FromFloat(neon_sqrdmulhsind32(_CopyFloatFromInt32(src1), __int32x2_t_to_n64(src2), (lane)))
#define vqrdmulhh_laneq_s16(src1, src2, lane) neon_sqrdmulhsind16q(__int16ToN16_v(src1), __int16x8_t_to_n128(src2), (lane)).n16_i16[0]
#define vqrdmulhs_laneq_s32(src1, src2, lane) _CopyInt32FromFloat(neon_sqrdmulhsind32q(_CopyFloatFromInt32(src1), __int32x4_t_to_n128(src2), (lane)))
// vqrdmlah family: three-operand wrappers over the neon_sqrdmlah* intrinsics
// (per ACLE naming, saturating rounding doubling multiply-accumulate returning
// the high half; presumably requires the Armv8.1 RDM extension -- TODO confirm).
//   src1, src2, src3 - forwarded to the intrinsic in this order. Which of them
//                      is the accumulator is not visible from this header;
//                      consult the ACLE reference.
//   lane             - where present, selects an element of src3 (the only
//                      operand whose width changes between _lane and _laneq).
//
// Vector x lane forms.
#define vqrdmlah_lane_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_sqrdmlahvind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3), (lane)))
#define vqrdmlah_lane_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_sqrdmlahvind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3), (lane)))
#define vqrdmlahq_lane_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_sqrdmlahqvind16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x4_t_to_n64(src3), (lane)))
#define vqrdmlahq_lane_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_sqrdmlahqvind32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x2_t_to_n64(src3), (lane)))
#define vqrdmlah_laneq_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_sqrdmlahvind16q(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x8_t_to_n128(src3), (lane)))
#define vqrdmlah_laneq_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_sqrdmlahvind32q(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x4_t_to_n128(src3), (lane)))
#define vqrdmlahq_laneq_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_sqrdmlahqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3), (lane)))
#define vqrdmlahq_laneq_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_sqrdmlahqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3), (lane)))
// Vector x vector forms.
#define vqrdmlah_s16(src1, src2, src3) __n64_to_int16x4_t(neon_sqrdmlah16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3)))
#define vqrdmlah_s32(src1, src2, src3) __n64_to_int32x2_t(neon_sqrdmlah32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3)))
#define vqrdmlahq_s16(src1, src2, src3) __n128_to_int16x8_t(neon_sqrdmlahq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3)))
#define vqrdmlahq_s32(src1, src2, src3) __n128_to_int32x4_t(neon_sqrdmlahq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3)))
// Scalar forms: 16-bit scalars go through __int16ToN16_v and are read back via
// .n16_i16[0]; 32-bit scalars go through _CopyFloatFromInt32 / _CopyInt32FromFloat.
#define vqrdmlahh_s16(src1, src2, src3) neon_sqrdmlahs16(__int16ToN16_v(src1), __int16ToN16_v(src2), __int16ToN16_v(src3)).n16_i16[0]
#define vqrdmlahs_s32(src1, src2, src3) _CopyInt32FromFloat(neon_sqrdmlahs32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2), _CopyFloatFromInt32(src3)))
#define vqrdmlahh_lane_s16(src1, src2, src3, lane) neon_sqrdmlahsind16(__int16ToN16_v(src1), __int16ToN16_v(src2), __int16x4_t_to_n64(src3), (lane)).n16_i16[0]
#define vqrdmlahs_lane_s32(src1, src2, src3, lane) _CopyInt32FromFloat(neon_sqrdmlahsind32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2), __int32x2_t_to_n64(src3), (lane)))
#define vqrdmlahh_laneq_s16(src1, src2, src3, lane) neon_sqrdmlahsind16q(__int16ToN16_v(src1), __int16ToN16_v(src2), __int16x8_t_to_n128(src3), (lane)).n16_i16[0]
#define vqrdmlahs_laneq_s32(src1, src2, src3, lane) _CopyInt32FromFloat(neon_sqrdmlahsind32q(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2), __int32x4_t_to_n128(src3), (lane)))
// vqrdmlsh family: three-operand wrappers over the neon_sqrdmlsh* intrinsics
// (per ACLE naming, the subtracting counterpart of vqrdmlah; presumably requires
// the Armv8.1 RDM extension -- TODO confirm).
//   src1, src2, src3 - forwarded to the intrinsic in this order; operand roles
//                      are not visible from this header, consult the ACLE reference.
//   lane             - where present, selects an element of src3 (the only
//                      operand whose width changes between _lane and _laneq).
//
// Vector x lane forms.
#define vqrdmlsh_lane_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_sqrdmlshvind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3), (lane)))
#define vqrdmlsh_lane_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_sqrdmlshvind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3), (lane)))
#define vqrdmlshq_lane_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_sqrdmlshqvind16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x4_t_to_n64(src3), (lane)))
#define vqrdmlshq_lane_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_sqrdmlshqvind32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x2_t_to_n64(src3), (lane)))
#define vqrdmlsh_laneq_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_sqrdmlshvind16q(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x8_t_to_n128(src3), (lane)))
#define vqrdmlsh_laneq_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_sqrdmlshvind32q(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x4_t_to_n128(src3), (lane)))
#define vqrdmlshq_laneq_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_sqrdmlshqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3), (lane)))
#define vqrdmlshq_laneq_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_sqrdmlshqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3), (lane)))
// Vector x vector forms.
#define vqrdmlsh_s16(src1, src2, src3) __n64_to_int16x4_t(neon_sqrdmlsh16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3)))
#define vqrdmlsh_s32(src1, src2, src3) __n64_to_int32x2_t(neon_sqrdmlsh32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3)))
#define vqrdmlshq_s16(src1, src2, src3) __n128_to_int16x8_t(neon_sqrdmlshq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3)))
#define vqrdmlshq_s32(src1, src2, src3) __n128_to_int32x4_t(neon_sqrdmlshq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3)))
// Scalar forms: 16-bit scalars go through __int16ToN16_v and are read back via
// .n16_i16[0]; 32-bit scalars go through _CopyFloatFromInt32 / _CopyInt32FromFloat.
#define vqrdmlshh_s16(src1, src2, src3) neon_sqrdmlshs16(__int16ToN16_v(src1), __int16ToN16_v(src2), __int16ToN16_v(src3)).n16_i16[0]
#define vqrdmlshs_s32(src1, src2, src3) _CopyInt32FromFloat(neon_sqrdmlshs32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2), _CopyFloatFromInt32(src3)))
#define vqrdmlshh_lane_s16(src1, src2, src3, lane) neon_sqrdmlshsind16(__int16ToN16_v(src1), __int16ToN16_v(src2), __int16x4_t_to_n64(src3), (lane)).n16_i16[0]
#define vqrdmlshs_lane_s32(src1, src2, src3, lane) _CopyInt32FromFloat(neon_sqrdmlshsind32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2), __int32x2_t_to_n64(src3), (lane)))
#define vqrdmlshh_laneq_s16(src1, src2, src3, lane) neon_sqrdmlshsind16q(__int16ToN16_v(src1), __int16ToN16_v(src2), __int16x8_t_to_n128(src3), (lane)).n16_i16[0]
#define vqrdmlshs_laneq_s32(src1, src2, src3, lane) _CopyInt32FromFloat(neon_sqrdmlshsind32q(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2), __int32x4_t_to_n128(src3), (lane)))
// vmla / vmls by lane, 64-bit destination forms.
//   src1, src2 - 64-bit typed vectors, forwarded in this order.
//   src3       - typed vector supplying the lane: 64-bit for _lane, 128-bit for _laneq.
//   lane       - forwarded parenthesized to the intrinsic.
// Signed and unsigned variants of the same element width share one intrinsic
// (neon_mlavind16 / neon_mlavind32 and their mls / "q" counterparts).
//
// NOTE: the f32 forms expand to neon_fmlavind32* / neon_fmlsvind32*, the same
// intrinsics the vfma_lane_f32 / vfms_lane_f32 macros in this header expand to,
// so in this header vmla_lane(q)_f32 and vfma_lane(q)_f32 are the same operation
// (likewise vmls/vfms).
#define vmla_lane_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlavind32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3), (lane)))
#define vmla_lane_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_mlavind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3), (lane)))
#define vmla_lane_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_mlavind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3), (lane)))
#define vmla_lane_u16(src1, src2, src3, lane) __n64_to_uint16x4_t(neon_mlavind16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x4_t_to_n64(src3), (lane)))
#define vmla_lane_u32(src1, src2, src3, lane) __n64_to_uint32x2_t(neon_mlavind32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x2_t_to_n64(src3), (lane)))
#define vmla_laneq_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlavind32q(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x4_t_to_n128(src3), (lane)))
#define vmla_laneq_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_mlavind16q(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x8_t_to_n128(src3), (lane)))
#define vmla_laneq_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_mlavind32q(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x4_t_to_n128(src3), (lane)))
#define vmla_laneq_u16(src1, src2, src3, lane) __n64_to_uint16x4_t(neon_mlavind16q(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x8_t_to_n128(src3), (lane)))
#define vmla_laneq_u32(src1, src2, src3, lane) __n64_to_uint32x2_t(neon_mlavind32q(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x4_t_to_n128(src3), (lane)))
// Multiply-subtract counterparts (neon_mls* / neon_fmls*).
#define vmls_lane_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlsvind32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3), (lane)))
#define vmls_lane_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_mlsvind16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3), (lane)))
#define vmls_lane_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_mlsvind32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3), (lane)))
#define vmls_lane_u16(src1, src2, src3, lane) __n64_to_uint16x4_t(neon_mlsvind16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x4_t_to_n64(src3), (lane)))
#define vmls_lane_u32(src1, src2, src3, lane) __n64_to_uint32x2_t(neon_mlsvind32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x2_t_to_n64(src3), (lane)))
#define vmls_laneq_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlsvind32q(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x4_t_to_n128(src3), (lane)))
#define vmls_laneq_s16(src1, src2, src3, lane) __n64_to_int16x4_t(neon_mlsvind16q(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x8_t_to_n128(src3), (lane)))
#define vmls_laneq_s32(src1, src2, src3, lane) __n64_to_int32x2_t(neon_mlsvind32q(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x4_t_to_n128(src3), (lane)))
#define vmls_laneq_u16(src1, src2, src3, lane) __n64_to_uint16x4_t(neon_mlsvind16q(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x8_t_to_n128(src3), (lane)))
#define vmls_laneq_u32(src1, src2, src3, lane) __n64_to_uint32x2_t(neon_mlsvind32q(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x4_t_to_n128(src3), (lane)))
// Scalar fused multiply-add / multiply-subtract by lane
// (neon_fmlasind* / neon_fmlssind*).
//   src1, src2 - scalar operands, forwarded as-is (parenthesized).
//   src3       - typed vector supplying the lane: 64-bit for _lane, 128-bit for _laneq.
//   lane       - forwarded parenthesized to the intrinsic.
// The intrinsic result is used directly; no result conversion is applied.
#define vfmas_lane_f32(src1, src2, src3, lane) neon_fmlasind32((src1), (src2), __float32x2_t_to_n64(src3), (lane))
#define vfmad_lane_f64(src1, src2, src3, lane) neon_fmlasind64((src1), (src2), __float64x1_t_to_n64(src3), (lane))
#define vfmas_laneq_f32(src1, src2, src3, lane) neon_fmlasind32q((src1), (src2), __float32x4_t_to_n128(src3), (lane))
#define vfmad_laneq_f64(src1, src2, src3, lane) neon_fmlasind64q((src1), (src2), __float64x2_t_to_n128(src3), (lane))
#define vfmss_lane_f32(src1, src2, src3, lane) neon_fmlssind32((src1), (src2), __float32x2_t_to_n64(src3), (lane))
#define vfmsd_lane_f64(src1, src2, src3, lane) neon_fmlssind64((src1), (src2), __float64x1_t_to_n64(src3), (lane))
#define vfmss_laneq_f32(src1, src2, src3, lane) neon_fmlssind32q((src1), (src2), __float32x4_t_to_n128(src3), (lane))
#define vfmsd_laneq_f64(src1, src2, src3, lane) neon_fmlssind64q((src1), (src2), __float64x2_t_to_n128(src3), (lane))
// vmlaq / vmlsq by lane, 128-bit destination forms.
//   src1, src2 - 128-bit typed vectors, forwarded in this order.
//   src3       - typed vector supplying the lane: 64-bit for _lane, 128-bit for _laneq.
//   lane       - forwarded parenthesized to the intrinsic.
// Signed and unsigned variants of the same element width share one intrinsic.
//
// NOTE: the f32 forms expand to neon_fmlaqvind32* / neon_fmlsqvind32*, the same
// intrinsics the vfmaq_lane(q)_f32 / vfmsq_lane(q)_f32 macros in this header
// expand to.
#define vmlaq_lane_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlaqvind32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x2_t_to_n64(src3), (lane)))
#define vmlaq_lane_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_mlaqvind16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x4_t_to_n64(src3), (lane)))
#define vmlaq_lane_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_mlaqvind32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x2_t_to_n64(src3), (lane)))
#define vmlaq_lane_u16(src1, src2, src3, lane) __n128_to_uint16x8_t(neon_mlaqvind16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x4_t_to_n64(src3), (lane)))
#define vmlaq_lane_u32(src1, src2, src3, lane) __n128_to_uint32x4_t(neon_mlaqvind32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x2_t_to_n64(src3), (lane)))
#define vmlaq_laneq_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlaqvind32q(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3), (lane)))
#define vmlaq_laneq_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_mlaqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3), (lane)))
#define vmlaq_laneq_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_mlaqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3), (lane)))
#define vmlaq_laneq_u16(src1, src2, src3, lane) __n128_to_uint16x8_t(neon_mlaqvind16q(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x8_t_to_n128(src3), (lane)))
#define vmlaq_laneq_u32(src1, src2, src3, lane) __n128_to_uint32x4_t(neon_mlaqvind32q(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x4_t_to_n128(src3), (lane)))
// Multiply-subtract counterparts (neon_mlsq* / neon_fmlsq*).
#define vmlsq_lane_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlsqvind32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x2_t_to_n64(src3), (lane)))
#define vmlsq_lane_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_mlsqvind16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x4_t_to_n64(src3), (lane)))
#define vmlsq_lane_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_mlsqvind32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x2_t_to_n64(src3), (lane)))
#define vmlsq_lane_u16(src1, src2, src3, lane) __n128_to_uint16x8_t(neon_mlsqvind16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x4_t_to_n64(src3), (lane)))
#define vmlsq_lane_u32(src1, src2, src3, lane) __n128_to_uint32x4_t(neon_mlsqvind32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x2_t_to_n64(src3), (lane)))
#define vmlsq_laneq_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlsqvind32q(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3), (lane)))
#define vmlsq_laneq_s16(src1, src2, src3, lane) __n128_to_int16x8_t(neon_mlsqvind16q(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3), (lane)))
#define vmlsq_laneq_s32(src1, src2, src3, lane) __n128_to_int32x4_t(neon_mlsqvind32q(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3), (lane)))
#define vmlsq_laneq_u16(src1, src2, src3, lane) __n128_to_uint16x8_t(neon_mlsqvind16q(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x8_t_to_n128(src3), (lane)))
#define vmlsq_laneq_u32(src1, src2, src3, lane) __n128_to_uint32x4_t(neon_mlsqvind32q(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x4_t_to_n128(src3), (lane)))
// vmla / vmls, floating-point vector x vector forms. All three operands and the
// result use the same typed vector.
//
// NOTE: these expand to neon_fmla* / neon_fmls*, the same intrinsics the
// vfma_* / vfms_* macros in this header expand to, so in this header vmla on
// floats is the same operation as vfma (presumably a fused multiply-add rather
// than a separately rounded multiply and add -- confirm against the compiler
// documentation if the distinction matters to the caller).
#define vmla_f32(src1, src2, src3) __n64_to_float32x2_t(neon_fmla32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3)))
#define vmla_f64(src1, src2, src3) __n64_to_float64x1_t(neon_fmla64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x1_t_to_n64(src3)))
#define vmls_f32(src1, src2, src3) __n64_to_float32x2_t(neon_fmls32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3)))
#define vmls_f64(src1, src2, src3) __n64_to_float64x1_t(neon_fmls64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x1_t_to_n64(src3)))
#define vmlaq_f32(src1, src2, src3) __n128_to_float32x4_t(neon_fmlaq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3)))
#define vmlsq_f32(src1, src2, src3) __n128_to_float32x4_t(neon_fmlsq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3)))
#define vmlaq_f64(src1, src2, src3) __n128_to_float64x2_t(neon_fmlaq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x2_t_to_n128(src3)))
#define vmlsq_f64(src1, src2, src3) __n128_to_float64x2_t(neon_fmlsq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x2_t_to_n128(src3)))
// vmla / vmls, integer vector x vector forms for 8-, 16- and 32-bit elements.
// All three operands and the result use the same typed vector. Signed and
// unsigned variants of the same element width expand to the same intrinsic
// (neon_mla8/16/32, neon_mls8/16/32 and their "q" counterparts); only the
// conversion helpers differ.
//
// 64-bit multiply-accumulate.
#define vmla_s16(src1, src2, src3) __n64_to_int16x4_t(neon_mla16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3)))
#define vmla_s32(src1, src2, src3) __n64_to_int32x2_t(neon_mla32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3)))
#define vmla_s8(src1, src2, src3) __n64_to_int8x8_t(neon_mla8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), __int8x8_t_to_n64(src3)))
#define vmla_u16(src1, src2, src3) __n64_to_uint16x4_t(neon_mla16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x4_t_to_n64(src3)))
#define vmla_u32(src1, src2, src3) __n64_to_uint32x2_t(neon_mla32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x2_t_to_n64(src3)))
#define vmla_u8(src1, src2, src3) __n64_to_uint8x8_t(neon_mla8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), __uint8x8_t_to_n64(src3)))
// 64-bit multiply-subtract.
#define vmls_s16(src1, src2, src3) __n64_to_int16x4_t(neon_mls16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), __int16x4_t_to_n64(src3)))
#define vmls_s32(src1, src2, src3) __n64_to_int32x2_t(neon_mls32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), __int32x2_t_to_n64(src3)))
#define vmls_s8(src1, src2, src3) __n64_to_int8x8_t(neon_mls8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), __int8x8_t_to_n64(src3)))
#define vmls_u16(src1, src2, src3) __n64_to_uint16x4_t(neon_mls16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), __uint16x4_t_to_n64(src3)))
#define vmls_u32(src1, src2, src3) __n64_to_uint32x2_t(neon_mls32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), __uint32x2_t_to_n64(src3)))
#define vmls_u8(src1, src2, src3) __n64_to_uint8x8_t(neon_mls8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), __uint8x8_t_to_n64(src3)))
// 128-bit multiply-accumulate.
#define vmlaq_s16(src1, src2, src3) __n128_to_int16x8_t(neon_mlaq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3)))
#define vmlaq_s32(src1, src2, src3) __n128_to_int32x4_t(neon_mlaq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3)))
#define vmlaq_s8(src1, src2, src3) __n128_to_int8x16_t(neon_mlaq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), __int8x16_t_to_n128(src3)))
#define vmlaq_u16(src1, src2, src3) __n128_to_uint16x8_t(neon_mlaq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x8_t_to_n128(src3)))
#define vmlaq_u32(src1, src2, src3) __n128_to_uint32x4_t(neon_mlaq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x4_t_to_n128(src3)))
#define vmlaq_u8(src1, src2, src3) __n128_to_uint8x16_t(neon_mlaq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), __uint8x16_t_to_n128(src3)))
// 128-bit multiply-subtract.
#define vmlsq_s16(src1, src2, src3) __n128_to_int16x8_t(neon_mlsq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), __int16x8_t_to_n128(src3)))
#define vmlsq_s32(src1, src2, src3) __n128_to_int32x4_t(neon_mlsq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), __int32x4_t_to_n128(src3)))
#define vmlsq_s8(src1, src2, src3) __n128_to_int8x16_t(neon_mlsq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), __int8x16_t_to_n128(src3)))
#define vmlsq_u16(src1, src2, src3) __n128_to_uint16x8_t(neon_mlsq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), __uint16x8_t_to_n128(src3)))
#define vmlsq_u32(src1, src2, src3) __n128_to_uint32x4_t(neon_mlsq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), __uint32x4_t_to_n128(src3)))
#define vmlsq_u8(src1, src2, src3) __n128_to_uint8x16_t(neon_mlsq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), __uint8x16_t_to_n128(src3)))

// vfma / vfms by lane (neon_fmla*vind* / neon_fmls*vind*). Only f32 and f64
// element types are defined in this group.
//   src1, src2 - typed vectors (64-bit for vfma_/vfms_, 128-bit for vfmaq_/vfmsq_).
//   src3       - typed vector supplying the lane: 64-bit for _lane, 128-bit for _laneq.
//   lane       - forwarded parenthesized to the intrinsic.
//
// 64-bit destination forms.
#define vfma_lane_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlavind32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3), (lane)))
#define vfma_lane_f64(src1, src2, src3, lane) __n64_to_float64x1_t(neon_fmlavind64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x1_t_to_n64(src3), (lane)))
#define vfma_laneq_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlavind32q(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x4_t_to_n128(src3), (lane)))
#define vfma_laneq_f64(src1, src2, src3, lane) __n64_to_float64x1_t(neon_fmlavind64q(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x2_t_to_n128(src3), (lane)))
#define vfms_lane_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlsvind32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3), (lane)))
#define vfms_lane_f64(src1, src2, src3, lane) __n64_to_float64x1_t(neon_fmlsvind64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x1_t_to_n64(src3), (lane)))
#define vfms_laneq_f32(src1, src2, src3, lane) __n64_to_float32x2_t(neon_fmlsvind32q(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x4_t_to_n128(src3), (lane)))
#define vfms_laneq_f64(src1, src2, src3, lane) __n64_to_float64x1_t(neon_fmlsvind64q(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x2_t_to_n128(src3), (lane)))
// 128-bit destination forms.
#define vfmaq_lane_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlaqvind32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x2_t_to_n64(src3), (lane)))
#define vfmaq_lane_f64(src1, src2, src3, lane) __n128_to_float64x2_t(neon_fmlaqvind64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x1_t_to_n64(src3), (lane)))
#define vfmaq_laneq_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlaqvind32q(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3), (lane)))
#define vfmaq_laneq_f64(src1, src2, src3, lane) __n128_to_float64x2_t(neon_fmlaqvind64q(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x2_t_to_n128(src3), (lane)))
#define vfmsq_lane_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlsqvind32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x2_t_to_n64(src3), (lane)))
#define vfmsq_lane_f64(src1, src2, src3, lane) __n128_to_float64x2_t(neon_fmlsqvind64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x1_t_to_n64(src3), (lane)))
#define vfmsq_laneq_f32(src1, src2, src3, lane) __n128_to_float32x4_t(neon_fmlsqvind32q(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3), (lane)))
#define vfmsq_laneq_f64(src1, src2, src3, lane) __n128_to_float64x2_t(neon_fmlsqvind64q(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x2_t_to_n128(src3), (lane)))
// vfma / vfms, vector x vector forms (neon_fmla* / neon_fmls*). All three
// operands and the result use the same typed vector; the 64-bit forms go
// through __n64 and the "q" forms through __n128.
#define vfma_f32(src1, src2, src3) __n64_to_float32x2_t(neon_fmla32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3)))
#define vfms_f32(src1, src2, src3) __n64_to_float32x2_t(neon_fmls32(__float32x2_t_to_n64(src1), __float32x2_t_to_n64(src2), __float32x2_t_to_n64(src3)))
#define vfma_f64(src1, src2, src3) __n64_to_float64x1_t(neon_fmla64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x1_t_to_n64(src3)))
#define vfms_f64(src1, src2, src3) __n64_to_float64x1_t(neon_fmls64(__float64x1_t_to_n64(src1), __float64x1_t_to_n64(src2), __float64x1_t_to_n64(src3)))
#define vfmaq_f32(src1, src2, src3) __n128_to_float32x4_t(neon_fmlaq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3)))
#define vfmsq_f32(src1, src2, src3) __n128_to_float32x4_t(neon_fmlsq32(__float32x4_t_to_n128(src1), __float32x4_t_to_n128(src2), __float32x4_t_to_n128(src3)))
#define vfmaq_f64(src1, src2, src3) __n128_to_float64x2_t(neon_fmlaq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x2_t_to_n128(src3)))
#define vfmsq_f64(src1, src2, src3) __n128_to_float64x2_t(neon_fmlsq64(__float64x2_t_to_n128(src1), __float64x2_t_to_n128(src2), __float64x2_t_to_n128(src3)))

//  Multiply by scalar
//  Each _n_ form is expressed through the matching _lane_ macro: the scalar Rt is
//  turned into a 64-bit vector with vmov_n_<type> and lane 0 of that vector is
//  used as the multiplier.
//    Vd - vector operand, forwarded parenthesized as the first _lane_ argument.
//    Rt - scalar multiplier, passed to vmov_n_<type>.
//  The "q" forms also use the 64-bit vmov_n_<type>, matching the 64-bit lane
//  operand taken by the *_lane_* macros they expand to.
#define vmul_n_s16(Vd, Rt)             vmul_lane_s16((Vd), vmov_n_s16(Rt), 0)
#define vmul_n_s32(Vd, Rt)             vmul_lane_s32((Vd), vmov_n_s32(Rt), 0)
#define vmul_n_u16(Vd, Rt)             vmul_lane_u16((Vd), vmov_n_u16(Rt), 0)
#define vmul_n_u32(Vd, Rt)             vmul_lane_u32((Vd), vmov_n_u32(Rt), 0)
#define vmul_n_f32(Vd, Rt)             vmul_lane_f32((Vd), vmov_n_f32(Rt), 0)
#define vmul_n_f64(Vd, Rt)             vmul_lane_f64((Vd), vmov_n_f64(Rt), 0)
#define vmulq_n_s16(Vd, Rt)            vmulq_lane_s16((Vd), vmov_n_s16(Rt), 0)
#define vmulq_n_s32(Vd, Rt)            vmulq_lane_s32((Vd), vmov_n_s32(Rt), 0)
#define vmulq_n_u16(Vd, Rt)            vmulq_lane_u16((Vd), vmov_n_u16(Rt), 0)
#define vmulq_n_u32(Vd, Rt)            vmulq_lane_u32((Vd), vmov_n_u32(Rt), 0)
#define vmulq_n_f32(Vd, Rt)            vmulq_lane_f32((Vd), vmov_n_f32(Rt), 0)
#define vmulq_n_f64(Vd, Rt)            vmulq_lane_f64((Vd), vmov_n_f64(Rt), 0)
//  vqdmulh / vqrdmulh by scalar, same construction (signed 16/32-bit only).
#define vqdmulh_n_s16(Vd, Rt)          vqdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0)
#define vqdmulh_n_s32(Vd, Rt)          vqdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0)
#define vqdmulhq_n_s16(Vd, Rt)         vqdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0)
#define vqdmulhq_n_s32(Vd, Rt)         vqdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0)
#define vqrdmulh_n_s16(Vd, Rt)         vqrdmulh_lane_s16((Vd), vmov_n_s16(Rt), 0)
#define vqrdmulh_n_s32(Vd, Rt)         vqrdmulh_lane_s32((Vd), vmov_n_s32(Rt), 0)
#define vqrdmulhq_n_s16(Vd, Rt)        vqrdmulhq_lane_s16((Vd), vmov_n_s16(Rt), 0)
#define vqrdmulhq_n_s32(Vd, Rt)        vqrdmulhq_lane_s32((Vd), vmov_n_s32(Rt), 0)
//  Multiply by scalar with accumulate
//  vmla / vmls by scalar. Each macro forwards to the matching _lane_ macro with
//  the scalar Rt turned into a 64-bit vector by vmov_n_<type> and lane 0 selected.
//    Vd, Vn - vector operands, forwarded parenthesized in this order.
//    Rt     - scalar multiplier, passed to vmov_n_<type>.
#define vmla_n_s16(Vd, Vn, Rt)         vmla_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vmla_n_s32(Vd, Vn, Rt)         vmla_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vmla_n_u16(Vd, Vn, Rt)         vmla_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vmla_n_u32(Vd, Vn, Rt)         vmla_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vmla_n_f32(Vd, Vn, Rt)         vmla_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vmlaq_n_s16(Vd, Vn, Rt)        vmlaq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vmlaq_n_s32(Vd, Vn, Rt)        vmlaq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vmlaq_n_f32(Vd, Vn, Rt)        vmlaq_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vmlaq_n_u16(Vd, Vn, Rt)        vmlaq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vmlaq_n_u32(Vd, Vn, Rt)        vmlaq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
//  Multiply-subtract counterparts.
#define vmls_n_s16(Vd, Vn, Rt)         vmls_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vmls_n_s32(Vd, Vn, Rt)         vmls_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vmls_n_u16(Vd, Vn, Rt)         vmls_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vmls_n_u32(Vd, Vn, Rt)         vmls_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vmls_n_f32(Vd, Vn, Rt)         vmls_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vmlsq_n_s16(Vd, Vn, Rt)        vmlsq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vmlsq_n_s32(Vd, Vn, Rt)        vmlsq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vmlsq_n_u16(Vd, Vn, Rt)        vmlsq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vmlsq_n_u32(Vd, Vn, Rt)        vmlsq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vmlsq_n_f32(Vd, Vn, Rt)        vmlsq_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vfma_n_s16(Vd, Vn, Rt)         vfma_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vfma_n_s32(Vd, Vn, Rt)         vfma_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vfma_n_u16(Vd, Vn, Rt)         vfma_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vfma_n_u32(Vd, Vn, Rt)         vfma_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vfma_n_f32(Vd, Vn, Rt)         vfma_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vfma_n_f64(Vd, Vn, Rt)         vfma_f64((Vd), (Vn), vmov_n_f64(Rt))
#define vfmaq_n_s16(Vd, Vn, Rt)        vfmaq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vfmaq_n_s32(Vd, Vn, Rt)        vfmaq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vfmaq_n_f32(Vd, Vn, Rt)        vfmaq_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vfmaq_n_f64(Vd, Vn, Rt)        vfmaq_lane_f64((Vd), (Vn), vmov_n_f64(Rt), 0)
#define vfmaq_n_u16(Vd, Vn, Rt)        vfmaq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vfmaq_n_u32(Vd, Vn, Rt)        vfmaq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vfms_n_s16(Vd, Vn, Rt)         vfms_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vfms_n_s32(Vd, Vn, Rt)         vfms_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vfms_n_u16(Vd, Vn, Rt)         vfms_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vfms_n_u32(Vd, Vn, Rt)         vfms_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vfms_n_f32(Vd, Vn, Rt)         vfms_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vfms_n_f64(Vd, Vn, Rt)         vfms_f64((Vd), (Vn), vmov_n_f64(Rt))
#define vfmsq_n_s16(Vd, Vn, Rt)        vfmsq_lane_s16((Vd), (Vn), vmov_n_s16(Rt), 0)
#define vfmsq_n_s32(Vd, Vn, Rt)        vfmsq_lane_s32((Vd), (Vn), vmov_n_s32(Rt), 0)
#define vfmsq_n_u16(Vd, Vn, Rt)        vfmsq_lane_u16((Vd), (Vn), vmov_n_u16(Rt), 0)
#define vfmsq_n_u32(Vd, Vn, Rt)        vfmsq_lane_u32((Vd), (Vn), vmov_n_u32(Rt), 0)
#define vfmsq_n_f32(Vd, Vn, Rt)        vfmsq_lane_f32((Vd), (Vn), vmov_n_f32(Rt), 0)
#define vfmsq_n_f64(Vd, Vn, Rt)        vfmsq_lane_f64((Vd), (Vn), vmov_n_f64(Rt), 0)

// SMULL(2)/UMULL(2)/SMLAL(2)/UMLAL(2)/SMLSL(2)/UMLSL(2)/SQDMULL(2)/SQDMLAL(2)/SQDMLSL(2)
//
// Untyped declarations behind the vmull_* macros. Every one returns __n128.
//   _8 / _16 / _32 : two vector operands (__n64 each; __n128 each for the "2" forms)
//   _i16 / _i32    : second operand is an __n64, followed by a const int that the
//                    vmull_*lane_* macros fill with their last argument
//   _qi16 / _qi32  : second operand is an __n128, followed by a const int that the
//                    vmull_*laneq_* macros fill with their last argument
__n128 neon_smull_8(__n64, __n64);
__n128 neon_smull_16(__n64, __n64);
__n128 neon_smull_32(__n64, __n64);
__n128 neon_smull2_8(__n128, __n128);
__n128 neon_smull2_16(__n128, __n128);
__n128 neon_smull2_32(__n128, __n128);
__n128 neon_smull_i16(__n64, __n64, const int);
__n128 neon_smull_i32(__n64, __n64, const int);
__n128 neon_smull2_i16(__n128, __n64, const int);
__n128 neon_smull2_i32(__n128, __n64, const int);
__n128 neon_smull_qi16(__n64, __n128, const int);
__n128 neon_smull_qi32(__n64, __n128, const int);
__n128 neon_smull2_qi16(__n128, __n128, const int);
__n128 neon_smull2_qi32(__n128, __n128, const int);
// Unsigned counterparts (same operand shapes as the smull declarations above).
__n128 neon_umull_8(__n64, __n64);
__n128 neon_umull_16(__n64, __n64);
__n128 neon_umull_32(__n64, __n64);
__n128 neon_umull2_8(__n128, __n128);
__n128 neon_umull2_16(__n128, __n128);
__n128 neon_umull2_32(__n128, __n128);
__n128 neon_umull_i16(__n64, __n64, const int);
__n128 neon_umull_i32(__n64, __n64, const int);
__n128 neon_umull2_i16(__n128, __n64, const int);
__n128 neon_umull2_i32(__n128, __n64, const int);
__n128 neon_umull_qi16(__n64, __n128, const int);
__n128 neon_umull_qi32(__n64, __n128, const int);
__n128 neon_umull2_qi16(__n128, __n128, const int);
__n128 neon_umull2_qi32(__n128, __n128, const int);
// Typed vmull_* wrappers. Each converts its vector arguments to __n64/__n128,
// calls the neon_smull*/neon_umull* declaration of matching shape, and converts
// the __n128 result to the typed result vector. For the lane/laneq forms the
// last argument is forwarded unchanged as the intrinsic's const int operand.

// Signed
#define vmull_s8(a, b) \
    __n128_to_int16x8_t(neon_smull_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vmull_s16(a, b) \
    __n128_to_int32x4_t(neon_smull_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vmull_s32(a, b) \
    __n128_to_int64x2_t(neon_smull_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vmull_high_s8(a, b) \
    __n128_to_int16x8_t(neon_smull2_8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vmull_high_s16(a, b) \
    __n128_to_int32x4_t(neon_smull2_16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vmull_high_s32(a, b) \
    __n128_to_int64x2_t(neon_smull2_32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vmull_lane_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_smull_i16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(v), (lane)))
#define vmull_lane_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_smull_i32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(v), (lane)))
#define vmull_high_lane_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_smull2_i16(__int16x8_t_to_n128(a), __int16x4_t_to_n64(v), (lane)))
#define vmull_high_lane_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_smull2_i32(__int32x4_t_to_n128(a), __int32x2_t_to_n64(v), (lane)))
#define vmull_laneq_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_smull_qi16(__int16x4_t_to_n64(a), __int16x8_t_to_n128(v), (lane)))
#define vmull_laneq_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_smull_qi32(__int32x2_t_to_n64(a), __int32x4_t_to_n128(v), (lane)))
#define vmull_high_laneq_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_smull2_qi16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(v), (lane)))
#define vmull_high_laneq_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_smull2_qi32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(v), (lane)))

// Unsigned
#define vmull_u8(a, b) \
    __n128_to_uint16x8_t(neon_umull_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vmull_u16(a, b) \
    __n128_to_uint32x4_t(neon_umull_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vmull_u32(a, b) \
    __n128_to_uint64x2_t(neon_umull_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vmull_high_u8(a, b) \
    __n128_to_uint16x8_t(neon_umull2_8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vmull_high_u16(a, b) \
    __n128_to_uint32x4_t(neon_umull2_16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vmull_high_u32(a, b) \
    __n128_to_uint64x2_t(neon_umull2_32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vmull_lane_u16(a, v, lane) \
    __n128_to_uint32x4_t(neon_umull_i16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(v), (lane)))
#define vmull_lane_u32(a, v, lane) \
    __n128_to_uint64x2_t(neon_umull_i32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(v), (lane)))
#define vmull_high_lane_u16(a, v, lane) \
    __n128_to_uint32x4_t(neon_umull2_i16(__uint16x8_t_to_n128(a), __uint16x4_t_to_n64(v), (lane)))
#define vmull_high_lane_u32(a, v, lane) \
    __n128_to_uint64x2_t(neon_umull2_i32(__uint32x4_t_to_n128(a), __uint32x2_t_to_n64(v), (lane)))
#define vmull_laneq_u16(a, v, lane) \
    __n128_to_uint32x4_t(neon_umull_qi16(__uint16x4_t_to_n64(a), __uint16x8_t_to_n128(v), (lane)))
#define vmull_laneq_u32(a, v, lane) \
    __n128_to_uint64x2_t(neon_umull_qi32(__uint32x2_t_to_n64(a), __uint32x4_t_to_n128(v), (lane)))
#define vmull_high_laneq_u16(a, v, lane) \
    __n128_to_uint32x4_t(neon_umull2_qi16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(v), (lane)))
#define vmull_high_laneq_u32(a, v, lane) \
    __n128_to_uint64x2_t(neon_umull2_qi32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(v), (lane)))
// vmull by scalar: the scalar goes through vmov_n_* and lane 0 of that result
// is used with the corresponding vmull_*lane_* macro.
#define vmull_n_s16(vec, scalar) vmull_lane_s16((vec), vmov_n_s16(scalar), 0)
#define vmull_n_s32(vec, scalar) vmull_lane_s32((vec), vmov_n_s32(scalar), 0)
#define vmull_high_n_s16(vec, scalar) vmull_high_lane_s16((vec), vmov_n_s16(scalar), 0)
#define vmull_high_n_s32(vec, scalar) vmull_high_lane_s32((vec), vmov_n_s32(scalar), 0)
#define vmull_n_u16(vec, scalar) vmull_lane_u16((vec), vmov_n_u16(scalar), 0)
#define vmull_n_u32(vec, scalar) vmull_lane_u32((vec), vmov_n_u32(scalar), 0)
#define vmull_high_n_u16(vec, scalar) vmull_high_lane_u16((vec), vmov_n_u16(scalar), 0)
#define vmull_high_n_u32(vec, scalar) vmull_high_lane_u32((vec), vmov_n_u32(scalar), 0)
// Untyped SMLAL(2)/UMLAL(2) declarations behind the vmlal_* macros. Every one
// takes an __n128 as its first operand and returns __n128.
//   _8 / _16 / _32 : two further vector operands (__n64 each; __n128 each for "2" forms)
//   _i16 / _i32    : last vector operand is an __n64, followed by a const int
//   _qi16 / _qi32  : last vector operand is an __n128, followed by a const int
// The vmlal_*lane_* / vmlal_*laneq_* macros pass their last argument as that const int.
__n128 neon_smlal_8(__n128, __n64, __n64);
__n128 neon_smlal_16(__n128, __n64, __n64);
__n128 neon_smlal_32(__n128, __n64, __n64);
__n128 neon_smlal2_8(__n128, __n128, __n128);
__n128 neon_smlal2_16(__n128, __n128, __n128);
__n128 neon_smlal2_32(__n128, __n128, __n128);
__n128 neon_smlal_i16(__n128, __n64, __n64, const int);
__n128 neon_smlal_i32(__n128, __n64, __n64, const int);
__n128 neon_smlal2_i16(__n128, __n128, __n64, const int);
__n128 neon_smlal2_i32(__n128, __n128, __n64, const int);
__n128 neon_smlal_qi16(__n128, __n64, __n128, const int);
__n128 neon_smlal_qi32(__n128, __n64, __n128, const int);
__n128 neon_smlal2_qi16(__n128, __n128, __n128, const int);
__n128 neon_smlal2_qi32(__n128, __n128, __n128, const int);
// Unsigned counterparts (same operand shapes as the smlal declarations above).
__n128 neon_umlal_8(__n128, __n64, __n64);
__n128 neon_umlal_16(__n128, __n64, __n64);
__n128 neon_umlal_32(__n128, __n64, __n64);
__n128 neon_umlal2_8(__n128, __n128, __n128);
__n128 neon_umlal2_16(__n128, __n128, __n128);
__n128 neon_umlal2_32(__n128, __n128, __n128);
__n128 neon_umlal_i16(__n128, __n64, __n64, const int);
__n128 neon_umlal_i32(__n128, __n64, __n64, const int);
__n128 neon_umlal2_i16(__n128, __n128, __n64, const int);
__n128 neon_umlal2_i32(__n128, __n128, __n64, const int);
__n128 neon_umlal_qi16(__n128, __n64, __n128, const int);
__n128 neon_umlal_qi32(__n128, __n64, __n128, const int);
__n128 neon_umlal2_qi16(__n128, __n128, __n128, const int);
__n128 neon_umlal2_qi32(__n128, __n128, __n128, const int);
// Typed vmlal_* wrappers. The first argument is converted to __n128 with the
// converter of the result type, the remaining vectors with their own
// converters; the neon_smlal*/neon_umlal* result is converted back to the
// typed result vector. For lane/laneq forms the last argument is forwarded
// unchanged as the intrinsic's const int operand.

// Signed
#define vmlal_s8(acc, a, b) \
    __n128_to_int16x8_t(neon_smlal_8(__int16x8_t_to_n128(acc), __int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vmlal_s16(acc, a, b) \
    __n128_to_int32x4_t(neon_smlal_16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vmlal_s32(acc, a, b) \
    __n128_to_int64x2_t(neon_smlal_32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vmlal_high_s8(acc, a, b) \
    __n128_to_int16x8_t(neon_smlal2_8(__int16x8_t_to_n128(acc), __int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vmlal_high_s16(acc, a, b) \
    __n128_to_int32x4_t(neon_smlal2_16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vmlal_high_s32(acc, a, b) \
    __n128_to_int64x2_t(neon_smlal2_32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vmlal_lane_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlal_i16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x4_t_to_n64(v), (lane)))
#define vmlal_lane_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlal_i32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x2_t_to_n64(v), (lane)))
#define vmlal_high_lane_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlal2_i16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x4_t_to_n64(v), (lane)))
#define vmlal_high_lane_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlal2_i32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x2_t_to_n64(v), (lane)))
#define vmlal_laneq_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlal_qi16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x8_t_to_n128(v), (lane)))
#define vmlal_laneq_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlal_qi32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x4_t_to_n128(v), (lane)))
#define vmlal_high_laneq_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlal2_qi16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x8_t_to_n128(v), (lane)))
#define vmlal_high_laneq_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlal2_qi32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x4_t_to_n128(v), (lane)))

// Unsigned
#define vmlal_u8(acc, a, b) \
    __n128_to_uint16x8_t(neon_umlal_8(__uint16x8_t_to_n128(acc), __uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vmlal_u16(acc, a, b) \
    __n128_to_uint32x4_t(neon_umlal_16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vmlal_u32(acc, a, b) \
    __n128_to_uint64x2_t(neon_umlal_32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vmlal_high_u8(acc, a, b) \
    __n128_to_uint16x8_t(neon_umlal2_8(__uint16x8_t_to_n128(acc), __uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vmlal_high_u16(acc, a, b) \
    __n128_to_uint32x4_t(neon_umlal2_16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vmlal_high_u32(acc, a, b) \
    __n128_to_uint64x2_t(neon_umlal2_32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vmlal_lane_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlal_i16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(a), __uint16x4_t_to_n64(v), (lane)))
#define vmlal_lane_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlal_i32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(a), __uint32x2_t_to_n64(v), (lane)))
#define vmlal_high_lane_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlal2_i16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(a), __uint16x4_t_to_n64(v), (lane)))
#define vmlal_high_lane_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlal2_i32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(a), __uint32x2_t_to_n64(v), (lane)))
#define vmlal_laneq_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlal_qi16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(a), __uint16x8_t_to_n128(v), (lane)))
#define vmlal_laneq_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlal_qi32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(a), __uint32x4_t_to_n128(v), (lane)))
#define vmlal_high_laneq_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlal2_qi16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(a), __uint16x8_t_to_n128(v), (lane)))
#define vmlal_high_laneq_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlal2_qi32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(a), __uint32x4_t_to_n128(v), (lane)))
// vmlal by scalar: the scalar goes through vmov_n_* and lane 0 of that result
// is used with the corresponding vmlal_*lane_* macro.
#define vmlal_n_s16(acc, vec, scalar) vmlal_lane_s16((acc), (vec), vmov_n_s16(scalar), 0)
#define vmlal_n_s32(acc, vec, scalar) vmlal_lane_s32((acc), (vec), vmov_n_s32(scalar), 0)
#define vmlal_high_n_s16(acc, vec, scalar) vmlal_high_lane_s16((acc), (vec), vmov_n_s16(scalar), 0)
#define vmlal_high_n_s32(acc, vec, scalar) vmlal_high_lane_s32((acc), (vec), vmov_n_s32(scalar), 0)
#define vmlal_n_u16(acc, vec, scalar) vmlal_lane_u16((acc), (vec), vmov_n_u16(scalar), 0)
#define vmlal_n_u32(acc, vec, scalar) vmlal_lane_u32((acc), (vec), vmov_n_u32(scalar), 0)
#define vmlal_high_n_u16(acc, vec, scalar) vmlal_high_lane_u16((acc), (vec), vmov_n_u16(scalar), 0)
#define vmlal_high_n_u32(acc, vec, scalar) vmlal_high_lane_u32((acc), (vec), vmov_n_u32(scalar), 0)
// Untyped SMLSL(2)/UMLSL(2) declarations behind the vmlsl_* macros. Every one
// takes an __n128 as its first operand and returns __n128.
//   _8 / _16 / _32 : two further vector operands (__n64 each; __n128 each for "2" forms)
//   _i16 / _i32    : last vector operand is an __n64, followed by a const int
//   _qi16 / _qi32  : last vector operand is an __n128, followed by a const int
// The vmlsl_*lane_* / vmlsl_*laneq_* macros pass their last argument as that const int.
__n128 neon_smlsl_8(__n128, __n64, __n64);
__n128 neon_smlsl_16(__n128, __n64, __n64);
__n128 neon_smlsl_32(__n128, __n64, __n64);
__n128 neon_smlsl2_8(__n128, __n128, __n128);
__n128 neon_smlsl2_16(__n128, __n128, __n128);
__n128 neon_smlsl2_32(__n128, __n128, __n128);
__n128 neon_smlsl_i16(__n128, __n64, __n64, const int);
__n128 neon_smlsl_i32(__n128, __n64, __n64, const int);
__n128 neon_smlsl2_i16(__n128, __n128, __n64, const int);
__n128 neon_smlsl2_i32(__n128, __n128, __n64, const int);
__n128 neon_smlsl_qi16(__n128, __n64, __n128, const int);
__n128 neon_smlsl_qi32(__n128, __n64, __n128, const int);
__n128 neon_smlsl2_qi16(__n128, __n128, __n128, const int);
__n128 neon_smlsl2_qi32(__n128, __n128, __n128, const int);
// Unsigned counterparts (same operand shapes as the smlsl declarations above).
__n128 neon_umlsl_8(__n128, __n64, __n64);
__n128 neon_umlsl_16(__n128, __n64, __n64);
__n128 neon_umlsl_32(__n128, __n64, __n64);
__n128 neon_umlsl2_8(__n128, __n128, __n128);
__n128 neon_umlsl2_16(__n128, __n128, __n128);
__n128 neon_umlsl2_32(__n128, __n128, __n128);
__n128 neon_umlsl_i16(__n128, __n64, __n64, const int);
__n128 neon_umlsl_i32(__n128, __n64, __n64, const int);
__n128 neon_umlsl2_i16(__n128, __n128, __n64, const int);
__n128 neon_umlsl2_i32(__n128, __n128, __n64, const int);
__n128 neon_umlsl_qi16(__n128, __n64, __n128, const int);
__n128 neon_umlsl_qi32(__n128, __n64, __n128, const int);
__n128 neon_umlsl2_qi16(__n128, __n128, __n128, const int);
__n128 neon_umlsl2_qi32(__n128, __n128, __n128, const int);
// Typed vmlsl_* wrappers. The first argument is converted to __n128 with the
// converter of the result type, the remaining vectors with their own
// converters; the neon_smlsl*/neon_umlsl* result is converted back to the
// typed result vector. For lane/laneq forms the last argument is forwarded
// unchanged as the intrinsic's const int operand.

// Signed
#define vmlsl_s8(acc, a, b) \
    __n128_to_int16x8_t(neon_smlsl_8(__int16x8_t_to_n128(acc), __int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vmlsl_s16(acc, a, b) \
    __n128_to_int32x4_t(neon_smlsl_16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vmlsl_s32(acc, a, b) \
    __n128_to_int64x2_t(neon_smlsl_32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vmlsl_high_s8(acc, a, b) \
    __n128_to_int16x8_t(neon_smlsl2_8(__int16x8_t_to_n128(acc), __int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vmlsl_high_s16(acc, a, b) \
    __n128_to_int32x4_t(neon_smlsl2_16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vmlsl_high_s32(acc, a, b) \
    __n128_to_int64x2_t(neon_smlsl2_32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vmlsl_lane_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlsl_i16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x4_t_to_n64(v), (lane)))
#define vmlsl_lane_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlsl_i32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x2_t_to_n64(v), (lane)))
#define vmlsl_high_lane_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlsl2_i16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x4_t_to_n64(v), (lane)))
#define vmlsl_high_lane_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlsl2_i32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x2_t_to_n64(v), (lane)))
#define vmlsl_laneq_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlsl_qi16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x8_t_to_n128(v), (lane)))
#define vmlsl_laneq_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlsl_qi32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x4_t_to_n128(v), (lane)))
#define vmlsl_high_laneq_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_smlsl2_qi16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x8_t_to_n128(v), (lane)))
#define vmlsl_high_laneq_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_smlsl2_qi32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x4_t_to_n128(v), (lane)))

// Unsigned
#define vmlsl_u8(acc, a, b) \
    __n128_to_uint16x8_t(neon_umlsl_8(__uint16x8_t_to_n128(acc), __uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vmlsl_u16(acc, a, b) \
    __n128_to_uint32x4_t(neon_umlsl_16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vmlsl_u32(acc, a, b) \
    __n128_to_uint64x2_t(neon_umlsl_32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vmlsl_high_u8(acc, a, b) \
    __n128_to_uint16x8_t(neon_umlsl2_8(__uint16x8_t_to_n128(acc), __uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vmlsl_high_u16(acc, a, b) \
    __n128_to_uint32x4_t(neon_umlsl2_16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vmlsl_high_u32(acc, a, b) \
    __n128_to_uint64x2_t(neon_umlsl2_32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vmlsl_lane_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlsl_i16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(a), __uint16x4_t_to_n64(v), (lane)))
#define vmlsl_lane_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlsl_i32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(a), __uint32x2_t_to_n64(v), (lane)))
#define vmlsl_high_lane_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlsl2_i16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(a), __uint16x4_t_to_n64(v), (lane)))
#define vmlsl_high_lane_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlsl2_i32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(a), __uint32x2_t_to_n64(v), (lane)))
#define vmlsl_laneq_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlsl_qi16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(a), __uint16x8_t_to_n128(v), (lane)))
#define vmlsl_laneq_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlsl_qi32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(a), __uint32x4_t_to_n128(v), (lane)))
#define vmlsl_high_laneq_u16(acc, a, v, lane) \
    __n128_to_uint32x4_t(neon_umlsl2_qi16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(a), __uint16x8_t_to_n128(v), (lane)))
#define vmlsl_high_laneq_u32(acc, a, v, lane) \
    __n128_to_uint64x2_t(neon_umlsl2_qi32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(a), __uint32x4_t_to_n128(v), (lane)))
// vmlsl by scalar: the scalar goes through vmov_n_* and lane 0 of that result
// is used with the corresponding vmlsl_*lane_* macro.
#define vmlsl_n_s16(acc, vec, scalar) vmlsl_lane_s16((acc), (vec), vmov_n_s16(scalar), 0)
#define vmlsl_n_s32(acc, vec, scalar) vmlsl_lane_s32((acc), (vec), vmov_n_s32(scalar), 0)
#define vmlsl_high_n_s16(acc, vec, scalar) vmlsl_high_lane_s16((acc), (vec), vmov_n_s16(scalar), 0)
#define vmlsl_high_n_s32(acc, vec, scalar) vmlsl_high_lane_s32((acc), (vec), vmov_n_s32(scalar), 0)
#define vmlsl_n_u16(acc, vec, scalar) vmlsl_lane_u16((acc), (vec), vmov_n_u16(scalar), 0)
#define vmlsl_n_u32(acc, vec, scalar) vmlsl_lane_u32((acc), (vec), vmov_n_u32(scalar), 0)
#define vmlsl_high_n_u16(acc, vec, scalar) vmlsl_high_lane_u16((acc), (vec), vmov_n_u16(scalar), 0)
#define vmlsl_high_n_u32(acc, vec, scalar) vmlsl_high_lane_u32((acc), (vec), vmov_n_u32(scalar), 0)
// Untyped SQDMULL(2) declarations behind the vqdmull_* macros (16- and 32-bit
// element forms only). Every one returns __n128.
//   _16 / _32     : two vector operands (__n64 each; __n128 each for the "2" forms)
//   _i16 / _i32   : second operand is an __n64, followed by a const int
//   _qi16 / _qi32 : second operand is an __n128, followed by a const int
// The vqdmull_*lane_* / vqdmull_*laneq_* macros pass their last argument as that const int.
__n128 neon_sqdmull_16(__n64, __n64);
__n128 neon_sqdmull_32(__n64, __n64);
__n128 neon_sqdmull2_16(__n128, __n128);
__n128 neon_sqdmull2_32(__n128, __n128);
__n128 neon_sqdmull_i16(__n64, __n64, const int);
__n128 neon_sqdmull_i32(__n64, __n64, const int);
__n128 neon_sqdmull2_i16(__n128, __n64, const int);
__n128 neon_sqdmull2_i32(__n128, __n64, const int);
__n128 neon_sqdmull_qi16(__n64, __n128, const int);
__n128 neon_sqdmull_qi32(__n64, __n128, const int);
__n128 neon_sqdmull2_qi16(__n128, __n128, const int);
__n128 neon_sqdmull2_qi32(__n128, __n128, const int);
// Typed vqdmull_* wrappers (signed 16/32-bit elements). Vector arguments are
// converted to __n64/__n128, the matching neon_sqdmull* declaration is called,
// and the __n128 result is converted to the typed result vector. For
// lane/laneq forms the last argument is forwarded unchanged as the const int.
#define vqdmull_s16(a, b) \
    __n128_to_int32x4_t(neon_sqdmull_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vqdmull_s32(a, b) \
    __n128_to_int64x2_t(neon_sqdmull_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vqdmull_high_s16(a, b) \
    __n128_to_int32x4_t(neon_sqdmull2_16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vqdmull_high_s32(a, b) \
    __n128_to_int64x2_t(neon_sqdmull2_32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vqdmull_lane_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmull_i16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(v), (lane)))
#define vqdmull_lane_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmull_i32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(v), (lane)))
#define vqdmull_high_lane_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmull2_i16(__int16x8_t_to_n128(a), __int16x4_t_to_n64(v), (lane)))
#define vqdmull_high_lane_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmull2_i32(__int32x4_t_to_n128(a), __int32x2_t_to_n64(v), (lane)))
#define vqdmull_laneq_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmull_qi16(__int16x4_t_to_n64(a), __int16x8_t_to_n128(v), (lane)))
#define vqdmull_laneq_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmull_qi32(__int32x2_t_to_n64(a), __int32x4_t_to_n128(v), (lane)))
#define vqdmull_high_laneq_s16(a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmull2_qi16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(v), (lane)))
#define vqdmull_high_laneq_s32(a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmull2_qi32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(v), (lane)))
// vqdmull by scalar: the scalar goes through vmov_n_* and lane 0 of that
// result is used with the corresponding vqdmull_*lane_* macro.
#define vqdmull_n_s16(vec, scalar) vqdmull_lane_s16((vec), vmov_n_s16(scalar), 0)
#define vqdmull_n_s32(vec, scalar) vqdmull_lane_s32((vec), vmov_n_s32(scalar), 0)
#define vqdmull_high_n_s16(vec, scalar) vqdmull_high_lane_s16((vec), vmov_n_s16(scalar), 0)
#define vqdmull_high_n_s32(vec, scalar) vqdmull_high_lane_s32((vec), vmov_n_s32(scalar), 0)
// Scalar SQDMULL declarations behind the vqdmullh_* / vqdmulls_* macros.
// The "h" forms take an __n16 first operand and return float; the macros pass
// that result through _CopyInt32FromFloat (presumably float is only a 32-bit
// container here -- confirm). The "s" forms take a float first operand, which
// the macros produce with _CopyFloatFromInt32, and return __n64, from which
// the macros read n64_i64[0].
// _i* forms take an __n64 second operand plus a const int; _qi* forms take an
// __n128 second operand plus a const int.
float neon_sqdmullh_16(__n16, __n16);
__n64 neon_sqdmulls_32(float, float);
float neon_sqdmullh_i16(__n16, __n64, const int);
__n64 neon_sqdmulls_i32(float, __n64, const int);
float neon_sqdmullh_qi16(__n16, __n128, const int);
__n64 neon_sqdmulls_qi32(float, __n128, const int);
// Scalar vqdmull wrappers.
//   "h" forms: 16-bit scalars go through __int16ToN16_v; the float returned by
//              the intrinsic is passed through _CopyInt32FromFloat.
//   "s" forms: 32-bit scalars go through _CopyFloatFromInt32; the result is
//              element 0 of the returned __n64's n64_i64 member.
// In the lane/laneq forms the vector is converted to __n64/__n128 and the lane
// argument is forwarded unchanged.
#define vqdmullh_s16(a, b) \
    _CopyInt32FromFloat(neon_sqdmullh_16(__int16ToN16_v(a), __int16ToN16_v(b)))
#define vqdmulls_s32(a, b) \
    neon_sqdmulls_32(_CopyFloatFromInt32(a), _CopyFloatFromInt32(b)).n64_i64[0]
#define vqdmullh_lane_s16(a, v, lane) \
    _CopyInt32FromFloat(neon_sqdmullh_i16(__int16ToN16_v(a), __int16x4_t_to_n64(v), (lane)))
#define vqdmulls_lane_s32(a, v, lane) \
    neon_sqdmulls_i32(_CopyFloatFromInt32(a), __int32x2_t_to_n64(v), (lane)).n64_i64[0]
#define vqdmullh_laneq_s16(a, v, lane) \
    _CopyInt32FromFloat(neon_sqdmullh_qi16(__int16ToN16_v(a), __int16x8_t_to_n128(v), (lane)))
#define vqdmulls_laneq_s32(a, v, lane) \
    neon_sqdmulls_qi32(_CopyFloatFromInt32(a), __int32x4_t_to_n128(v), (lane)).n64_i64[0]
// Untyped SQDMLAL(2) declarations behind the vqdmlal_* macros (16- and 32-bit
// element forms only). Every one takes an __n128 as its first operand and
// returns __n128.
//   _16 / _32     : two further vector operands (__n64 each; __n128 each for "2" forms)
//   _i16 / _i32   : last vector operand is an __n64, followed by a const int
//   _qi16 / _qi32 : last vector operand is an __n128, followed by a const int
// The vqdmlal_*lane_* / vqdmlal_*laneq_* macros pass their last argument as that const int.
__n128 neon_sqdmlal_16(__n128, __n64, __n64);
__n128 neon_sqdmlal_32(__n128, __n64, __n64);
__n128 neon_sqdmlal2_16(__n128, __n128, __n128);
__n128 neon_sqdmlal2_32(__n128, __n128, __n128);
__n128 neon_sqdmlal_i16(__n128, __n64, __n64, const int);
__n128 neon_sqdmlal_i32(__n128, __n64, __n64, const int);
__n128 neon_sqdmlal2_i16(__n128, __n128, __n64, const int);
__n128 neon_sqdmlal2_i32(__n128, __n128, __n64, const int);
__n128 neon_sqdmlal_qi16(__n128, __n64, __n128, const int);
__n128 neon_sqdmlal_qi32(__n128, __n64, __n128, const int);
__n128 neon_sqdmlal2_qi16(__n128, __n128, __n128, const int);
__n128 neon_sqdmlal2_qi32(__n128, __n128, __n128, const int);
// Typed vqdmlal_* wrappers (signed 16/32-bit elements). The first argument is
// converted to __n128 with the converter of the result type, the remaining
// vectors with their own converters; the neon_sqdmlal* result is converted
// back to the typed result vector. For lane/laneq forms the last argument is
// forwarded unchanged as the const int.
#define vqdmlal_s16(acc, a, b) \
    __n128_to_int32x4_t(neon_sqdmlal_16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vqdmlal_s32(acc, a, b) \
    __n128_to_int64x2_t(neon_sqdmlal_32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vqdmlal_high_s16(acc, a, b) \
    __n128_to_int32x4_t(neon_sqdmlal2_16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vqdmlal_high_s32(acc, a, b) \
    __n128_to_int64x2_t(neon_sqdmlal2_32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vqdmlal_lane_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmlal_i16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x4_t_to_n64(v), (lane)))
#define vqdmlal_lane_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmlal_i32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x2_t_to_n64(v), (lane)))
#define vqdmlal_high_lane_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmlal2_i16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x4_t_to_n64(v), (lane)))
#define vqdmlal_high_lane_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmlal2_i32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x2_t_to_n64(v), (lane)))
#define vqdmlal_laneq_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmlal_qi16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(a), __int16x8_t_to_n128(v), (lane)))
#define vqdmlal_laneq_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmlal_qi32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(a), __int32x4_t_to_n128(v), (lane)))
#define vqdmlal_high_laneq_s16(acc, a, v, lane) \
    __n128_to_int32x4_t(neon_sqdmlal2_qi16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(a), __int16x8_t_to_n128(v), (lane)))
#define vqdmlal_high_laneq_s32(acc, a, v, lane) \
    __n128_to_int64x2_t(neon_sqdmlal2_qi32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(a), __int32x4_t_to_n128(v), (lane)))
// vqdmlal by scalar: the scalar goes through vmov_n_* and lane 0 of that
// result is used with the corresponding vqdmlal_*lane_* macro.
#define vqdmlal_n_s16(acc, vec, scalar) vqdmlal_lane_s16((acc), (vec), vmov_n_s16(scalar), 0)
#define vqdmlal_n_s32(acc, vec, scalar) vqdmlal_lane_s32((acc), (vec), vmov_n_s32(scalar), 0)
#define vqdmlal_high_n_s16(acc, vec, scalar) vqdmlal_high_lane_s16((acc), (vec), vmov_n_s16(scalar), 0)
#define vqdmlal_high_n_s32(acc, vec, scalar) vqdmlal_high_lane_s32((acc), (vec), vmov_n_s32(scalar), 0)
// Scalar SQDMLAL declarations behind the vqdmlalh_* / vqdmlals_* macros.
// The "h" forms take a float first operand (the macros produce it with
// _CopyFloatFromInt32) and an __n16 second operand, and return float, which
// the macros pass through _CopyInt32FromFloat (presumably float is only a
// 32-bit container here -- confirm). The "s" forms take an __n64 first operand
// and a float second operand, and return __n64, from which the macros read
// n64_i64[0].
// _i* forms take an __n64 last vector operand plus a const int; _qi* forms
// take an __n128 last vector operand plus a const int.
float  neon_sqdmlalh_16(float, __n16, __n16);
__n64  neon_sqdmlals_32(__n64, float, float);
float  neon_sqdmlalh_i16(float, __n16, __n64, const int);
__n64  neon_sqdmlals_i32(__n64, float, __n64, const int);
float  neon_sqdmlalh_qi16(float, __n16, __n128, const int);
__n64  neon_sqdmlals_qi32(__n64, float, __n128, const int);
// Scalar vqdmlal wrappers.
//   "h" forms: the first argument goes through _CopyFloatFromInt32, 16-bit
//              scalars through __int16ToN16_v; the float returned by the
//              intrinsic is passed through _CopyInt32FromFloat.
//   "s" forms: the first argument goes through __int64ToN64_v, 32-bit scalars
//              through _CopyFloatFromInt32; the result is element 0 of the
//              returned __n64's n64_i64 member.
// In the lane/laneq forms the vector is converted to __n64/__n128 and the lane
// argument is forwarded unchanged.
#define vqdmlalh_s16(acc, a, b) \
    _CopyInt32FromFloat(neon_sqdmlalh_16(_CopyFloatFromInt32(acc), __int16ToN16_v(a), __int16ToN16_v(b)))
#define vqdmlals_s32(acc, a, b) \
    neon_sqdmlals_32(__int64ToN64_v(acc), _CopyFloatFromInt32(a), _CopyFloatFromInt32(b)).n64_i64[0]
#define vqdmlalh_lane_s16(acc, a, v, lane) \
    _CopyInt32FromFloat(neon_sqdmlalh_i16(_CopyFloatFromInt32(acc), __int16ToN16_v(a), __int16x4_t_to_n64(v), (lane)))
#define vqdmlals_lane_s32(acc, a, v, lane) \
    neon_sqdmlals_i32(__int64ToN64_v(acc), _CopyFloatFromInt32(a), __int32x2_t_to_n64(v), (lane)).n64_i64[0]
#define vqdmlalh_laneq_s16(acc, a, v, lane) \
    _CopyInt32FromFloat(neon_sqdmlalh_qi16(_CopyFloatFromInt32(acc), __int16ToN16_v(a), __int16x8_t_to_n128(v), (lane)))
#define vqdmlals_laneq_s32(acc, a, v, lane) \
    neon_sqdmlals_qi32(__int64ToN64_v(acc), _CopyFloatFromInt32(a), __int32x4_t_to_n128(v), (lane)).n64_i64[0]
// Untyped SQDMLSL(2) declarations behind the vqdmlsl_* macros (16- and 32-bit
// element forms only). Every one takes an __n128 as its first operand and
// returns __n128.
//   _16 / _32     : two further vector operands (__n64 each; __n128 each for "2" forms)
//   _i16 / _i32   : last vector operand is an __n64, followed by a const int
//   _qi16 / _qi32 : last vector operand is an __n128, followed by a const int
// The vqdmlsl_*lane_* / vqdmlsl_*laneq_* macros pass their last argument as that const int.
__n128 neon_sqdmlsl_16(__n128, __n64, __n64);
__n128 neon_sqdmlsl_32(__n128, __n64, __n64);
__n128 neon_sqdmlsl2_16(__n128, __n128, __n128);
__n128 neon_sqdmlsl2_32(__n128, __n128, __n128);
__n128 neon_sqdmlsl_i16(__n128, __n64, __n64, const int);
__n128 neon_sqdmlsl_i32(__n128, __n64, __n64, const int);
__n128 neon_sqdmlsl2_i16(__n128, __n128, __n64, const int);
__n128 neon_sqdmlsl2_i32(__n128, __n128, __n64, const int);
__n128 neon_sqdmlsl_qi16(__n128, __n64, __n128, const int);
__n128 neon_sqdmlsl_qi32(__n128, __n64, __n128, const int);
__n128 neon_sqdmlsl2_qi16(__n128, __n128, __n128, const int);
__n128 neon_sqdmlsl2_qi32(__n128, __n128, __n128, const int);
// Vector vqdmlsl wrappers: convert each typed operand to __n64/__n128, call the
// matching neon_sqdmlsl* intrinsic, and convert the __n128 result back to the
// typed vector.  The "_n" forms forward to the corresponding "_lane" form with
// the third operand wrapped by vmov_n_s16/vmov_n_s32 and lane 0.
#define vqdmlsl_s16(a, b, c) __n128_to_int32x4_t(neon_sqdmlsl_16(__int32x4_t_to_n128(a), __int16x4_t_to_n64(b), __int16x4_t_to_n64(c)))
#define vqdmlsl_s32(a, b, c) __n128_to_int64x2_t(neon_sqdmlsl_32(__int64x2_t_to_n128(a), __int32x2_t_to_n64(b), __int32x2_t_to_n64(c)))
#define vqdmlsl_high_s16(a, b, c) __n128_to_int32x4_t(neon_sqdmlsl2_16(__int32x4_t_to_n128(a), __int16x8_t_to_n128(b), __int16x8_t_to_n128(c)))
#define vqdmlsl_high_s32(a, b, c) __n128_to_int64x2_t(neon_sqdmlsl2_32(__int64x2_t_to_n128(a), __int32x4_t_to_n128(b), __int32x4_t_to_n128(c)))
#define vqdmlsl_lane_s16(a, b, c, idx) __n128_to_int32x4_t(neon_sqdmlsl_i16(__int32x4_t_to_n128(a), __int16x4_t_to_n64(b), __int16x4_t_to_n64(c), (idx)))
#define vqdmlsl_lane_s32(a, b, c, idx) __n128_to_int64x2_t(neon_sqdmlsl_i32(__int64x2_t_to_n128(a), __int32x2_t_to_n64(b), __int32x2_t_to_n64(c), (idx)))
#define vqdmlsl_high_lane_s16(a, b, c, idx) __n128_to_int32x4_t(neon_sqdmlsl2_i16(__int32x4_t_to_n128(a), __int16x8_t_to_n128(b), __int16x4_t_to_n64(c), (idx)))
#define vqdmlsl_high_lane_s32(a, b, c, idx) __n128_to_int64x2_t(neon_sqdmlsl2_i32(__int64x2_t_to_n128(a), __int32x4_t_to_n128(b), __int32x2_t_to_n64(c), (idx)))
#define vqdmlsl_laneq_s16(a, b, c, idx) __n128_to_int32x4_t(neon_sqdmlsl_qi16(__int32x4_t_to_n128(a), __int16x4_t_to_n64(b), __int16x8_t_to_n128(c), (idx)))
#define vqdmlsl_laneq_s32(a, b, c, idx) __n128_to_int64x2_t(neon_sqdmlsl_qi32(__int64x2_t_to_n128(a), __int32x2_t_to_n64(b), __int32x4_t_to_n128(c), (idx)))
#define vqdmlsl_high_laneq_s16(a, b, c, idx) __n128_to_int32x4_t(neon_sqdmlsl2_qi16(__int32x4_t_to_n128(a), __int16x8_t_to_n128(b), __int16x8_t_to_n128(c), (idx)))
#define vqdmlsl_high_laneq_s32(a, b, c, idx) __n128_to_int64x2_t(neon_sqdmlsl2_qi32(__int64x2_t_to_n128(a), __int32x4_t_to_n128(b), __int32x4_t_to_n128(c), (idx)))
#define vqdmlsl_n_s16(a, b, c) vqdmlsl_lane_s16((a), (b), vmov_n_s16(c), 0)
#define vqdmlsl_n_s32(a, b, c) vqdmlsl_lane_s32((a), (b), vmov_n_s32(c), 0)
#define vqdmlsl_high_n_s16(a, b, c) vqdmlsl_high_lane_s16((a), (b), vmov_n_s16(c), 0)
#define vqdmlsl_high_n_s32(a, b, c) vqdmlsl_high_lane_s32((a), (b), vmov_n_s32(c), 0)
// Scalar SQDMLSL intrinsic prototypes, wrapped by the vqdmlslh_* and
// vqdmlsls_* macros that follow.
//   - "h" forms: first operand and result are passed as float, the second
//     operand is an __n16.
//   - "s" forms: first operand and result are __n64, the second operand is
//     passed as float.
//   - _i* variants take an __n64 third operand plus a constant int (the
//     wrappers pass their lane argument there); _qi* variants take an __n128
//     third operand instead.
float neon_sqdmlslh_16(float, __n16, __n16);
__n64 neon_sqdmlsls_32(__n64, float, float);
float neon_sqdmlslh_i16(float, __n16, __n64, const int);
__n64 neon_sqdmlsls_i32(__n64, float, __n64, const int);
float neon_sqdmlslh_qi16(float, __n16, __n128, const int);
__n64 neon_sqdmlsls_qi32(__n64, float, __n128, const int);
// Scalar vqdmlsl wrappers.  The "h" forms move the first operand and the
// result through float via _CopyFloatFromInt32 / _CopyInt32FromFloat; the "s"
// forms return element 0 of the intrinsic result's n64_i64 member.
#define vqdmlslh_s16(a, b, c) _CopyInt32FromFloat(neon_sqdmlslh_16(_CopyFloatFromInt32(a), __int16ToN16_v(b), __int16ToN16_v(c)))
#define vqdmlsls_s32(a, b, c) neon_sqdmlsls_32(__int64ToN64_v(a), _CopyFloatFromInt32(b), _CopyFloatFromInt32(c)).n64_i64[0]
#define vqdmlslh_lane_s16(a, b, c, idx) _CopyInt32FromFloat(neon_sqdmlslh_i16(_CopyFloatFromInt32(a), __int16ToN16_v(b), __int16x4_t_to_n64(c), (idx)))
#define vqdmlsls_lane_s32(a, b, c, idx) neon_sqdmlsls_i32(__int64ToN64_v(a), _CopyFloatFromInt32(b), __int32x2_t_to_n64(c), (idx)).n64_i64[0]
#define vqdmlslh_laneq_s16(a, b, c, idx) _CopyInt32FromFloat(neon_sqdmlslh_qi16(_CopyFloatFromInt32(a), __int16ToN16_v(b), __int16x8_t_to_n128(c), (idx)))
#define vqdmlsls_laneq_s32(a, b, c, idx) neon_sqdmlsls_qi32(__int64ToN64_v(a), _CopyFloatFromInt32(b), __int32x4_t_to_n128(c), (idx)).n64_i64[0]

// SDOT/UDOT
//
// Dot-product intrinsic prototypes.  neon_sdot* back the vdot*_s32 macros and
// neon_udot* back the vdot*_u32 macros that follow.
//   - no "q"         : __n64 result and first two operands.
//   - "q"            : __n128 result and first two operands.
//   - _lane / _laneq : add a constant-int last argument; the third operand is
//     an __n64 for neon_[su]dot_lane and neon_[su]dotq_lane, and an __n128 for
//     neon_[su]dot_laneq and neon_[su]dotq_laneq.
__n64 neon_sdot(__n64, __n64, __n64);
__n64 neon_udot(__n64, __n64, __n64);
__n128 neon_sdotq(__n128, __n128, __n128);
__n128 neon_udotq(__n128, __n128, __n128);
__n64 neon_sdot_lane(__n64, __n64, __n64, const int);
__n64 neon_udot_lane(__n64, __n64, __n64, const int);
__n128 neon_sdotq_laneq(__n128, __n128, __n128, const int);
__n128 neon_udotq_laneq(__n128, __n128, __n128, const int);
__n64 neon_sdot_laneq(__n64, __n64, __n128, const int);
__n64 neon_udot_laneq(__n64, __n64, __n128, const int);
__n128 neon_sdotq_lane(__n128, __n128, __n64, const int);
__n128 neon_udotq_lane(__n128, __n128, __n64, const int);
// vdot wrappers: the first operand and the result are 32-bit-lane vectors, the
// second and third operands are 8-bit-lane vectors.  Each macro converts its
// typed operands to __n64/__n128, calls the matching neon_sdot*/neon_udot*
// intrinsic and converts the result back.  The lane index is passed through
// parenthesized as the last intrinsic argument.
#define vdot_u32(a, b, c) __n64_to_uint32x2_t(neon_udot(__uint32x2_t_to_n64(a), __uint8x8_t_to_n64(b), __uint8x8_t_to_n64(c)))
#define vdot_s32(a, b, c) __n64_to_int32x2_t(neon_sdot(__int32x2_t_to_n64(a), __int8x8_t_to_n64(b), __int8x8_t_to_n64(c)))
#define vdotq_u32(a, b, c) __n128_to_uint32x4_t(neon_udotq(__uint32x4_t_to_n128(a), __uint8x16_t_to_n128(b), __uint8x16_t_to_n128(c)))
#define vdotq_s32(a, b, c) __n128_to_int32x4_t(neon_sdotq(__int32x4_t_to_n128(a), __int8x16_t_to_n128(b), __int8x16_t_to_n128(c)))
#define vdot_lane_u32(a, b, c, idx) __n64_to_uint32x2_t(neon_udot_lane(__uint32x2_t_to_n64(a), __uint8x8_t_to_n64(b), __uint8x8_t_to_n64(c), (idx)))
#define vdot_lane_s32(a, b, c, idx) __n64_to_int32x2_t(neon_sdot_lane(__int32x2_t_to_n64(a), __int8x8_t_to_n64(b), __int8x8_t_to_n64(c), (idx)))
#define vdotq_laneq_u32(a, b, c, idx) __n128_to_uint32x4_t(neon_udotq_laneq(__uint32x4_t_to_n128(a), __uint8x16_t_to_n128(b), __uint8x16_t_to_n128(c), (idx)))
#define vdotq_laneq_s32(a, b, c, idx) __n128_to_int32x4_t(neon_sdotq_laneq(__int32x4_t_to_n128(a), __int8x16_t_to_n128(b), __int8x16_t_to_n128(c), (idx)))
#define vdot_laneq_u32(a, b, c, idx) __n64_to_uint32x2_t(neon_udot_laneq(__uint32x2_t_to_n64(a), __uint8x8_t_to_n64(b), __uint8x16_t_to_n128(c), (idx)))
#define vdot_laneq_s32(a, b, c, idx) __n64_to_int32x2_t(neon_sdot_laneq(__int32x2_t_to_n64(a), __int8x8_t_to_n64(b), __int8x16_t_to_n128(c), (idx)))
#define vdotq_lane_u32(a, b, c, idx) __n128_to_uint32x4_t(neon_udotq_lane(__uint32x4_t_to_n128(a), __uint8x16_t_to_n128(b), __uint8x8_t_to_n64(c), (idx)))
#define vdotq_lane_s32(a, b, c, idx) __n128_to_int32x4_t(neon_sdotq_lane(__int32x4_t_to_n128(a), __int8x16_t_to_n128(b), __int8x8_t_to_n64(c), (idx)))

// CMEQ/CMGE/CMGT/CMHI/CMHS/CMLE/CMLT/CMTST/FACGE/FACGT/FCMEQ/FCMGE/FCMGT/FCMLE/FCMLT
//
// Floating-point compare intrinsic prototypes.  Naming used below:
//   - a "q" after the mnemonic marks the __n128 form (otherwise __n64);
//   - the trailing 16/32/64 selects the variant (the vc*_f32 / vc*_f64
//     wrapper macros use the 32 and 64 variants respectively);
//   - "s32" / "s64" forms take and return float / double;
//   - a "z" after the mnemonic marks the one-operand forms used by the
//     vc*z* wrapper macros.

// FACGE
__n64 neon_facge16(__n64, __n64);
__n64 neon_facge32(__n64, __n64);
__n64 neon_facge64(__n64, __n64);
__n128 neon_facgeq16(__n128, __n128);
__n128 neon_facgeq32(__n128, __n128);
__n128 neon_facgeq64(__n128, __n128);
float neon_facges32(float, float);
double neon_facges64(double, double);
// FACGT
__n64 neon_facgt16(__n64, __n64);
__n64 neon_facgt32(__n64, __n64);
__n64 neon_facgt64(__n64, __n64);
__n128 neon_facgtq16(__n128, __n128);
__n128 neon_facgtq32(__n128, __n128);
__n128 neon_facgtq64(__n128, __n128);
float neon_facgts32(float, float);
double neon_facgts64(double, double);
// FCMEQ (two-operand and one-operand "z" forms)
__n64 neon_fcmeq16(__n64, __n64);
__n128 neon_fcmeqq16(__n128, __n128);
__n64 neon_fcmeq32(__n64, __n64);
__n128 neon_fcmeqq32(__n128, __n128);
__n64 neon_fcmeq64(__n64, __n64);
__n128 neon_fcmeqq64(__n128, __n128);
__n64 neon_fcmeqz16(__n64);
__n128 neon_fcmeqzq16(__n128);
__n64 neon_fcmeqz32(__n64);
__n128 neon_fcmeqzq32(__n128);
__n64 neon_fcmeqz64(__n64);
__n128 neon_fcmeqzq64(__n128);
float neon_fcmeqs32(float, float);
double neon_fcmeqs64(double, double);
float neon_fcmeqzs32(float);
double neon_fcmeqzs64(double);
// FCMGE (two-operand and one-operand "z" forms)
__n64 neon_fcmge16(__n64, __n64);
__n64 neon_fcmge32(__n64, __n64);
__n64 neon_fcmge64(__n64, __n64);
__n128 neon_fcmgeq16(__n128, __n128);
__n128 neon_fcmgeq32(__n128, __n128);
__n128 neon_fcmgeq64(__n128, __n128);
__n64 neon_fcmgez16(__n64);
__n64 neon_fcmgez32(__n64);
__n64 neon_fcmgez64(__n64);
__n128 neon_fcmgezq16(__n128);
__n128 neon_fcmgezq32(__n128);
__n128 neon_fcmgezq64(__n128);
float neon_fcmges32(float, float);
double neon_fcmges64(double, double);
float neon_fcmgezs32(float);
double neon_fcmgezs64(double);
// FCMGT (two-operand and one-operand "z" forms)
__n64 neon_fcmgt16(__n64, __n64);
__n64 neon_fcmgt32(__n64, __n64);
__n64 neon_fcmgt64(__n64, __n64);
__n128 neon_fcmgtq16(__n128, __n128);
__n128 neon_fcmgtq32(__n128, __n128);
__n128 neon_fcmgtq64(__n128, __n128);
__n64 neon_fcmgtz16(__n64);
__n128 neon_fcmgtzq16(__n128);
__n64 neon_fcmgtz32(__n64);
__n128 neon_fcmgtzq32(__n128);
__n64 neon_fcmgtz64(__n64);
__n128 neon_fcmgtzq64(__n128);
float neon_fcmgts32(float, float);
double neon_fcmgts64(double, double);
float neon_fcmgtzs32(float);
double neon_fcmgtzs64(double);
// FCMLE: only one-operand "z" forms are declared; the two-operand vcle*
// wrappers are built on the FCMGE intrinsics with swapped operands.
__n64 neon_fcmlez16(__n64);
__n128 neon_fcmlezq16(__n128);
__n64 neon_fcmlez32(__n64);
__n128 neon_fcmlezq32(__n128);
__n64 neon_fcmlez64(__n64);
__n128 neon_fcmlezq64(__n128);
float neon_fcmlezs32(float);
double neon_fcmlezs64(double);
// FCMLT: only one-operand "z" forms are declared; the two-operand vclt*
// wrappers are built on the FCMGT intrinsics with swapped operands.
__n64 neon_fcmltz16(__n64);
__n128 neon_fcmltzq16(__n128);
__n64 neon_fcmltz32(__n64);
__n128 neon_fcmltzq32(__n128);
__n64 neon_fcmltz64(__n64);
__n128 neon_fcmltzq64(__n128);
float neon_fcmltzs32(float);
double neon_fcmltzs64(double);
// Integer compare intrinsic prototypes (CMEQ, CMGE, CMGT, CMHI, CMHS, CMLE,
// CMLT, CMTST).  Naming used below:
//   - a "q" after the mnemonic marks the __n128 form (otherwise __n64);
//   - the trailing 8/16/32/64 selects the variant (the vc*_s8 / _s16 / _s32 /
//     _s64 wrapper macros use the matching number);
//   - "s64" forms take and return double;
//   - a "z" after the mnemonic marks the one-operand forms used by the
//     vc*z* wrapper macros.
// In the wrapper macros, signed vcge/vcgt map onto cmge/cmgt and unsigned
// vcge/vcgt map onto cmhs/cmhi.

// CMEQ (two-operand and one-operand "z" forms)
__n64 neon_cmeq8(__n64, __n64);
__n128 neon_cmeqq8(__n128, __n128);
__n64 neon_cmeq16(__n64, __n64);
__n128 neon_cmeqq16(__n128, __n128);
__n64 neon_cmeq32(__n64, __n64);
__n128 neon_cmeqq32(__n128, __n128);
__n64 neon_cmeq64(__n64, __n64);
__n128 neon_cmeqq64(__n128, __n128);
__n64 neon_cmeqz8(__n64);
__n128 neon_cmeqzq8(__n128);
__n64 neon_cmeqz16(__n64);
__n128 neon_cmeqzq16(__n128);
__n64 neon_cmeqz32(__n64);
__n128 neon_cmeqzq32(__n128);
__n64 neon_cmeqz64(__n64);
__n128 neon_cmeqzq64(__n128);
double neon_cmeqs64(double, double);
double neon_cmeqzs64(double);
// CMGE (two-operand and one-operand "z" forms)
__n64 neon_cmge8(__n64, __n64);
__n128 neon_cmgeq8(__n128, __n128);
__n64 neon_cmge16(__n64, __n64);
__n128 neon_cmgeq16(__n128, __n128);
__n64 neon_cmge32(__n64, __n64);
__n128 neon_cmgeq32(__n128, __n128);
__n64 neon_cmge64(__n64, __n64);
__n128 neon_cmgeq64(__n128, __n128);
__n64 neon_cmgez8(__n64);
__n128 neon_cmgezq8(__n128);
__n64 neon_cmgez16(__n64);
__n128 neon_cmgezq16(__n128);
__n64 neon_cmgez32(__n64);
__n128 neon_cmgezq32(__n128);
__n64 neon_cmgez64(__n64);
__n128 neon_cmgezq64(__n128);
double neon_cmges64(double, double);
double neon_cmgezs64(double);
// CMGT (two-operand and one-operand "z" forms)
__n64 neon_cmgt8(__n64, __n64);
__n128 neon_cmgtq8(__n128, __n128);
__n64 neon_cmgt16(__n64, __n64);
__n128 neon_cmgtq16(__n128, __n128);
__n64 neon_cmgt32(__n64, __n64);
__n128 neon_cmgtq32(__n128, __n128);
__n64 neon_cmgt64(__n64, __n64);
__n128 neon_cmgtq64(__n128, __n128);
__n64 neon_cmgtz8(__n64);
__n128 neon_cmgtzq8(__n128);
__n64 neon_cmgtz16(__n64);
__n128 neon_cmgtzq16(__n128);
__n64 neon_cmgtz32(__n64);
__n128 neon_cmgtzq32(__n128);
__n64 neon_cmgtz64(__n64);
__n128 neon_cmgtzq64(__n128);
double neon_cmgts64(double, double);
double neon_cmgtzs64(double);
// CMHI (two-operand forms only)
__n64 neon_cmhi8(__n64, __n64);
__n128 neon_cmhiq8(__n128, __n128);
__n64 neon_cmhi16(__n64, __n64);
__n128 neon_cmhiq16(__n128, __n128);
__n64 neon_cmhi32(__n64, __n64);
__n128 neon_cmhiq32(__n128, __n128);
__n64 neon_cmhi64(__n64, __n64);
__n128 neon_cmhiq64(__n128, __n128);
double neon_cmhis64(double, double);
// CMHS (two-operand forms only)
__n64 neon_cmhs8(__n64, __n64);
__n128 neon_cmhsq8(__n128, __n128);
__n64 neon_cmhs16(__n64, __n64);
__n128 neon_cmhsq16(__n128, __n128);
__n64 neon_cmhs32(__n64, __n64);
__n128 neon_cmhsq32(__n128, __n128);
__n64 neon_cmhs64(__n64, __n64);
__n128 neon_cmhsq64(__n128, __n128);
double neon_cmhss64(double, double);
// CMLE (one-operand "z" forms only)
__n64 neon_cmlez8(__n64);
__n128 neon_cmlezq8(__n128);
__n64 neon_cmlez16(__n64);
__n128 neon_cmlezq16(__n128);
__n64 neon_cmlez32(__n64);
__n128 neon_cmlezq32(__n128);
__n64 neon_cmlez64(__n64);
__n128 neon_cmlezq64(__n128);
double neon_cmlezs64(double);
// CMLT (one-operand "z" forms only)
__n64 neon_cmltz8(__n64);
__n128 neon_cmltzq8(__n128);
__n64 neon_cmltz16(__n64);
__n128 neon_cmltzq16(__n128);
__n64 neon_cmltz32(__n64);
__n128 neon_cmltzq32(__n128);
__n64 neon_cmltz64(__n64);
__n128 neon_cmltzq64(__n128);
double neon_cmltzs64(double);
// CMTST (two-operand forms only)
__n64 neon_cmtst8(__n64, __n64);
__n128 neon_cmtstq8(__n128, __n128);
__n64 neon_cmtst16(__n64, __n64);
__n128 neon_cmtstq16(__n128, __n128);
__n64 neon_cmtst32(__n64, __n64);
__n128 neon_cmtstq32(__n128, __n128);
__n64 neon_cmtst64(__n64, __n64);
__n128 neon_cmtstq64(__n128, __n128);
double neon_cmtsts64(double, double);
// vceqz*: one-operand wrappers over neon_cmeqz* (integer and poly operands)
// and neon_fcmeqz* (floating-point operands).  Vector forms return the
// unsigned vector type of matching lane layout; the scalar d/s forms return
// n64_u64[0] of the intrinsic result or go through _CopyUInt32FromFloat /
// _CopyUInt64FromDouble.
#define vceqz_f32(x) __n64_to_uint32x2_t(neon_fcmeqz32(__float32x2_t_to_n64(x)))
#define vceqz_s16(x) __n64_to_uint16x4_t(neon_cmeqz16(__int16x4_t_to_n64(x)))
#define vceqz_s32(x) __n64_to_uint32x2_t(neon_cmeqz32(__int32x2_t_to_n64(x)))
#define vceqz_s64(x) __n64_to_uint64x1_t(neon_cmeqz64(__int64x1_t_to_n64(x)))
#define vceqz_u64(x) __n64_to_uint64x1_t(neon_cmeqz64(__uint64x1_t_to_n64(x)))
#define vceqz_p64(x) __n64_to_uint64x1_t(neon_cmeqz64(__poly64x1_t_to_n64(x)))
#define vceqz_f64(x) __n64_to_uint64x1_t(neon_fcmeqz64(__float64x1_t_to_n64(x)))
#define vceqzd_s64(x) neon_cmeqz64(__int64ToN64_v(x)).n64_u64[0]
#define vceqzd_u64(x) neon_cmeqz64(__uint64ToN64_v(x)).n64_u64[0]
#define vceqzs_f32(x) _CopyUInt32FromFloat(neon_fcmeqzs32(x))
#define vceqzd_f64(x) _CopyUInt64FromDouble(neon_fcmeqzs64(x))
#define vceqz_s8(x) __n64_to_uint8x8_t(neon_cmeqz8(__int8x8_t_to_n64(x)))
#define vceqz_u16(x) __n64_to_uint16x4_t(neon_cmeqz16(__uint16x4_t_to_n64(x)))
#define vceqz_u32(x) __n64_to_uint32x2_t(neon_cmeqz32(__uint32x2_t_to_n64(x)))
#define vceqz_u8(x) __n64_to_uint8x8_t(neon_cmeqz8(__uint8x8_t_to_n64(x)))
#define vceqzq_f32(x) __n128_to_uint32x4_t(neon_fcmeqzq32(__float32x4_t_to_n128(x)))
#define vceqzq_s64(x) __n128_to_uint64x2_t(neon_cmeqzq64(__int64x2_t_to_n128(x)))
#define vceqzq_u64(x) __n128_to_uint64x2_t(neon_cmeqzq64(__uint64x2_t_to_n128(x)))
#define vceqzq_p64(x) __n128_to_uint64x2_t(neon_cmeqzq64(__poly64x2_t_to_n128(x)))
#define vceqzq_f64(x) __n128_to_uint64x2_t(neon_fcmeqzq64(__float64x2_t_to_n128(x)))
#define vceqzq_s16(x) __n128_to_uint16x8_t(neon_cmeqzq16(__int16x8_t_to_n128(x)))
#define vceqzq_s32(x) __n128_to_uint32x4_t(neon_cmeqzq32(__int32x4_t_to_n128(x)))
#define vceqzq_s8(x) __n128_to_uint8x16_t(neon_cmeqzq8(__int8x16_t_to_n128(x)))
#define vceqzq_u16(x) __n128_to_uint16x8_t(neon_cmeqzq16(__uint16x8_t_to_n128(x)))
#define vceqzq_u32(x) __n128_to_uint32x4_t(neon_cmeqzq32(__uint32x4_t_to_n128(x)))
#define vceqzq_u8(x) __n128_to_uint8x16_t(neon_cmeqzq8(__uint8x16_t_to_n128(x)))
#define vceqz_p8(x) __n64_to_uint8x8_t(neon_cmeqz8(__poly8x8_t_to_n64(x)))
#define vceqzq_p8(x) __n128_to_uint8x16_t(neon_cmeqzq8(__poly8x16_t_to_n128(x)))
// vceq*: two-operand wrappers over neon_cmeq* (integer and poly operands) and
// neon_fcmeq* (floating-point operands).  Operands are passed in the order
// given.  Vector forms return the unsigned vector type of matching lane
// layout; the scalar d/s forms return n64_u64[0] of the intrinsic result or go
// through _CopyUInt32FromFloat / _CopyUInt64FromDouble.
#define vceq_f32(a, b) __n64_to_uint32x2_t(neon_fcmeq32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vceq_f64(a, b) __n64_to_uint64x1_t(neon_fcmeq64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vceq_p8(a, b) __n64_to_uint8x8_t(neon_cmeq8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vceq_s16(a, b) __n64_to_uint16x4_t(neon_cmeq16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vceq_s32(a, b) __n64_to_uint32x2_t(neon_cmeq32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vceq_s8(a, b) __n64_to_uint8x8_t(neon_cmeq8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vceq_s64(a, b) __n64_to_uint64x1_t(neon_cmeq64(__int64x1_t_to_n64(a), __int64x1_t_to_n64(b)))
#define vceq_u16(a, b) __n64_to_uint16x4_t(neon_cmeq16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vceq_u32(a, b) __n64_to_uint32x2_t(neon_cmeq32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vceq_u8(a, b) __n64_to_uint8x8_t(neon_cmeq8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vceq_u64(a, b) __n64_to_uint64x1_t(neon_cmeq64(__uint64x1_t_to_n64(a), __uint64x1_t_to_n64(b)))
#define vceq_p64(a, b) __n64_to_uint64x1_t(neon_cmeq64(__poly64x1_t_to_n64(a), __poly64x1_t_to_n64(b)))
#define vceqd_s64(a, b) neon_cmeq64(__int64ToN64_v(a), __int64ToN64_v(b)).n64_u64[0]
#define vceqd_u64(a, b) neon_cmeq64(__uint64ToN64_v(a), __uint64ToN64_v(b)).n64_u64[0]
#define vceqs_f32(a, b) _CopyUInt32FromFloat(neon_fcmeqs32((a), (b)))
#define vceqd_f64(a, b) _CopyUInt64FromDouble(neon_fcmeqs64((a), (b)))
#define vceqq_f32(a, b) __n128_to_uint32x4_t(neon_fcmeqq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vceqq_f64(a, b) __n128_to_uint64x2_t(neon_fcmeqq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vceqq_p8(a, b) __n128_to_uint8x16_t(neon_cmeqq8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vceqq_s16(a, b) __n128_to_uint16x8_t(neon_cmeqq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vceqq_s32(a, b) __n128_to_uint32x4_t(neon_cmeqq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vceqq_s8(a, b) __n128_to_uint8x16_t(neon_cmeqq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vceqq_s64(a, b) __n128_to_uint64x2_t(neon_cmeqq64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vceqq_u16(a, b) __n128_to_uint16x8_t(neon_cmeqq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vceqq_u32(a, b) __n128_to_uint32x4_t(neon_cmeqq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vceqq_u8(a, b) __n128_to_uint8x16_t(neon_cmeqq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vceqq_u64(a, b) __n128_to_uint64x2_t(neon_cmeqq64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vceqq_p64(a, b) __n128_to_uint64x2_t(neon_cmeqq64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))
// vcgez*: one-operand wrappers over neon_cmgez* (signed integer operands) and
// neon_fcmgez* (floating-point operands).  Vector forms return the unsigned
// vector type of matching lane layout; scalar forms return n64_u64[0] or go
// through _CopyUInt32FromFloat / _CopyUInt64FromDouble.
#define vcgez_f32(x) __n64_to_uint32x2_t(neon_fcmgez32(__float32x2_t_to_n64(x)))
#define vcgez_f64(x) __n64_to_uint64x1_t(neon_fcmgez64(__float64x1_t_to_n64(x)))
#define vcgez_s8(x) __n64_to_uint8x8_t(neon_cmgez8(__int8x8_t_to_n64(x)))
#define vcgez_s16(x) __n64_to_uint16x4_t(neon_cmgez16(__int16x4_t_to_n64(x)))
#define vcgez_s32(x) __n64_to_uint32x2_t(neon_cmgez32(__int32x2_t_to_n64(x)))
#define vcgez_s64(x) __n64_to_uint64x1_t(neon_cmgez64(__int64x1_t_to_n64(x)))
#define vcgezd_s64(x) neon_cmgez64(__int64ToN64_v(x)).n64_u64[0]
#define vcgezs_f32(x) _CopyUInt32FromFloat(neon_fcmgezs32(x))
#define vcgezd_f64(x) _CopyUInt64FromDouble(neon_fcmgezs64(x))
#define vcgezq_f32(x) __n128_to_uint32x4_t(neon_fcmgezq32(__float32x4_t_to_n128(x)))
#define vcgezq_f64(x) __n128_to_uint64x2_t(neon_fcmgezq64(__float64x2_t_to_n128(x)))
#define vcgezq_s8(x) __n128_to_uint8x16_t(neon_cmgezq8(__int8x16_t_to_n128(x)))
#define vcgezq_s16(x) __n128_to_uint16x8_t(neon_cmgezq16(__int16x8_t_to_n128(x)))
#define vcgezq_s32(x) __n128_to_uint32x4_t(neon_cmgezq32(__int32x4_t_to_n128(x)))
#define vcgezq_s64(x) __n128_to_uint64x2_t(neon_cmgezq64(__int64x2_t_to_n128(x)))
// vcge*: two-operand wrappers, operands passed in the order given.
//   floating-point -> neon_fcmge*
//   signed integer -> neon_cmge*
//   unsigned       -> neon_cmhs*
// Vector forms return the unsigned vector type of matching lane layout; scalar
// forms return n64_u64[0] or go through _CopyUInt32FromFloat /
// _CopyUInt64FromDouble.
#define vcge_f32(a, b) __n64_to_uint32x2_t(neon_fcmge32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vcge_f64(a, b) __n64_to_uint64x1_t(neon_fcmge64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vcge_s8(a, b) __n64_to_uint8x8_t(neon_cmge8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vcge_s16(a, b) __n64_to_uint16x4_t(neon_cmge16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vcge_s32(a, b) __n64_to_uint32x2_t(neon_cmge32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vcge_s64(a, b) __n64_to_uint64x1_t(neon_cmge64(__int64x1_t_to_n64(a), __int64x1_t_to_n64(b)))
#define vcge_u8(a, b) __n64_to_uint8x8_t(neon_cmhs8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vcge_u16(a, b) __n64_to_uint16x4_t(neon_cmhs16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vcge_u32(a, b) __n64_to_uint32x2_t(neon_cmhs32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vcge_u64(a, b) __n64_to_uint64x1_t(neon_cmhs64(__uint64x1_t_to_n64(a), __uint64x1_t_to_n64(b)))
#define vcged_s64(a, b) neon_cmge64(__int64ToN64_v(a), __int64ToN64_v(b)).n64_u64[0]
#define vcged_u64(a, b) neon_cmhs64(__uint64ToN64_v(a), __uint64ToN64_v(b)).n64_u64[0]
#define vcges_f32(a, b) _CopyUInt32FromFloat(neon_fcmges32((a), (b)))
#define vcged_f64(a, b) _CopyUInt64FromDouble(neon_fcmges64((a), (b)))
#define vcgeq_f32(a, b) __n128_to_uint32x4_t(neon_fcmgeq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vcgeq_f64(a, b) __n128_to_uint64x2_t(neon_fcmgeq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vcgeq_s8(a, b) __n128_to_uint8x16_t(neon_cmgeq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vcgeq_s16(a, b) __n128_to_uint16x8_t(neon_cmgeq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vcgeq_s32(a, b) __n128_to_uint32x4_t(neon_cmgeq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vcgeq_s64(a, b) __n128_to_uint64x2_t(neon_cmgeq64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vcgeq_u8(a, b) __n128_to_uint8x16_t(neon_cmhsq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vcgeq_u16(a, b) __n128_to_uint16x8_t(neon_cmhsq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vcgeq_u32(a, b) __n128_to_uint32x4_t(neon_cmhsq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vcgeq_u64(a, b) __n128_to_uint64x2_t(neon_cmhsq64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
// vclez*: one-operand "less than or equal to zero" compares.
//   signed integer -> neon_cmlez*
//   floating-point -> neon_fcmlez*
// Vector forms return the unsigned vector type of matching lane layout; scalar
// forms return n64_u64[0] of the intrinsic result or go through
// _CopyUInt32FromFloat / _CopyUInt64FromDouble.
#define vclez_f32(src) __n64_to_uint32x2_t(neon_fcmlez32(__float32x2_t_to_n64(src)))
#define vclez_f64(src) __n64_to_uint64x1_t(neon_fcmlez64(__float64x1_t_to_n64(src)))
#define vclez_s8(src) __n64_to_uint8x8_t(neon_cmlez8(__int8x8_t_to_n64(src)))
#define vclez_s16(src) __n64_to_uint16x4_t(neon_cmlez16(__int16x4_t_to_n64(src)))
#define vclez_s32(src) __n64_to_uint32x2_t(neon_cmlez32(__int32x2_t_to_n64(src)))
#define vclez_s64(src) __n64_to_uint64x1_t(neon_cmlez64(__int64x1_t_to_n64(src)))
// Must use the CMLE (cmlez) intrinsic like every other vclez form; using the
// CMLT (cmltz) one would report "false" for an input of exactly zero.
#define vclezd_s64(src) neon_cmlez64(__int64ToN64_v(src)).n64_u64[0]
#define vclezs_f32(src) _CopyUInt32FromFloat(neon_fcmlezs32(src))
#define vclezd_f64(src) _CopyUInt64FromDouble(neon_fcmlezs64(src))
#define vclezq_f32(src) __n128_to_uint32x4_t(neon_fcmlezq32(__float32x4_t_to_n128(src)))
#define vclezq_f64(src) __n128_to_uint64x2_t(neon_fcmlezq64(__float64x2_t_to_n128(src)))
#define vclezq_s8(src) __n128_to_uint8x16_t(neon_cmlezq8(__int8x16_t_to_n128(src)))
#define vclezq_s16(src) __n128_to_uint16x8_t(neon_cmlezq16(__int16x8_t_to_n128(src)))
#define vclezq_s32(src) __n128_to_uint32x4_t(neon_cmlezq32(__int32x4_t_to_n128(src)))
#define vclezq_s64(src) __n128_to_uint64x2_t(neon_cmlezq64(__int64x2_t_to_n128(src)))
// vcle (two-operand form) has no intrinsic of its own here: every variant
// calls the corresponding vcge intrinsic (neon_fcmge* / neon_cmge* /
// neon_cmhs*) with the two operands swapped, i.e. vcle(a, b) == vcge(b, a).
#define vcle_f32(a, b) __n64_to_uint32x2_t(neon_fcmge32(__float32x2_t_to_n64(b), __float32x2_t_to_n64(a)))
#define vcle_f64(a, b) __n64_to_uint64x1_t(neon_fcmge64(__float64x1_t_to_n64(b), __float64x1_t_to_n64(a)))
#define vcle_s8(a, b) __n64_to_uint8x8_t(neon_cmge8(__int8x8_t_to_n64(b), __int8x8_t_to_n64(a)))
#define vcle_s16(a, b) __n64_to_uint16x4_t(neon_cmge16(__int16x4_t_to_n64(b), __int16x4_t_to_n64(a)))
#define vcle_s32(a, b) __n64_to_uint32x2_t(neon_cmge32(__int32x2_t_to_n64(b), __int32x2_t_to_n64(a)))
#define vcle_s64(a, b) __n64_to_uint64x1_t(neon_cmge64(__int64x1_t_to_n64(b), __int64x1_t_to_n64(a)))
#define vcle_u8(a, b) __n64_to_uint8x8_t(neon_cmhs8(__uint8x8_t_to_n64(b), __uint8x8_t_to_n64(a)))
#define vcle_u16(a, b) __n64_to_uint16x4_t(neon_cmhs16(__uint16x4_t_to_n64(b), __uint16x4_t_to_n64(a)))
#define vcle_u32(a, b) __n64_to_uint32x2_t(neon_cmhs32(__uint32x2_t_to_n64(b), __uint32x2_t_to_n64(a)))
#define vcle_u64(a, b) __n64_to_uint64x1_t(neon_cmhs64(__uint64x1_t_to_n64(b), __uint64x1_t_to_n64(a)))
#define vcled_s64(a, b) neon_cmge64(__int64ToN64_v(b), __int64ToN64_v(a)).n64_u64[0]
#define vcled_u64(a, b) neon_cmhs64(__uint64ToN64_v(b), __uint64ToN64_v(a)).n64_u64[0]
#define vcles_f32(a, b) _CopyUInt32FromFloat(neon_fcmges32((b), (a)))
#define vcled_f64(a, b) _CopyUInt64FromDouble(neon_fcmges64((b), (a)))
#define vcleq_f32(a, b) __n128_to_uint32x4_t(neon_fcmgeq32(__float32x4_t_to_n128(b), __float32x4_t_to_n128(a)))
#define vcleq_f64(a, b) __n128_to_uint64x2_t(neon_fcmgeq64(__float64x2_t_to_n128(b), __float64x2_t_to_n128(a)))
#define vcleq_s8(a, b) __n128_to_uint8x16_t(neon_cmgeq8(__int8x16_t_to_n128(b), __int8x16_t_to_n128(a)))
#define vcleq_s16(a, b) __n128_to_uint16x8_t(neon_cmgeq16(__int16x8_t_to_n128(b), __int16x8_t_to_n128(a)))
#define vcleq_s32(a, b) __n128_to_uint32x4_t(neon_cmgeq32(__int32x4_t_to_n128(b), __int32x4_t_to_n128(a)))
#define vcleq_s64(a, b) __n128_to_uint64x2_t(neon_cmgeq64(__int64x2_t_to_n128(b), __int64x2_t_to_n128(a)))
#define vcleq_u8(a, b) __n128_to_uint8x16_t(neon_cmhsq8(__uint8x16_t_to_n128(b), __uint8x16_t_to_n128(a)))
#define vcleq_u16(a, b) __n128_to_uint16x8_t(neon_cmhsq16(__uint16x8_t_to_n128(b), __uint16x8_t_to_n128(a)))
#define vcleq_u32(a, b) __n128_to_uint32x4_t(neon_cmhsq32(__uint32x4_t_to_n128(b), __uint32x4_t_to_n128(a)))
#define vcleq_u64(a, b) __n128_to_uint64x2_t(neon_cmhsq64(__uint64x2_t_to_n128(b), __uint64x2_t_to_n128(a)))
// vcgtz*: one-operand wrappers over neon_cmgtz* (signed integer operands) and
// neon_fcmgtz* (floating-point operands).  Vector forms return the unsigned
// vector type of matching lane layout; scalar forms return n64_u64[0] or go
// through _CopyUInt32FromFloat / _CopyUInt64FromDouble.
#define vcgtz_f32(x) __n64_to_uint32x2_t(neon_fcmgtz32(__float32x2_t_to_n64(x)))
#define vcgtz_f64(x) __n64_to_uint64x1_t(neon_fcmgtz64(__float64x1_t_to_n64(x)))
#define vcgtz_s8(x) __n64_to_uint8x8_t(neon_cmgtz8(__int8x8_t_to_n64(x)))
#define vcgtz_s16(x) __n64_to_uint16x4_t(neon_cmgtz16(__int16x4_t_to_n64(x)))
#define vcgtz_s32(x) __n64_to_uint32x2_t(neon_cmgtz32(__int32x2_t_to_n64(x)))
#define vcgtz_s64(x) __n64_to_uint64x1_t(neon_cmgtz64(__int64x1_t_to_n64(x)))
#define vcgtzd_s64(x) neon_cmgtz64(__int64ToN64_v(x)).n64_u64[0]
#define vcgtzs_f32(x) _CopyUInt32FromFloat(neon_fcmgtzs32(x))
#define vcgtzd_f64(x) _CopyUInt64FromDouble(neon_fcmgtzs64(x))
#define vcgtzq_f32(x) __n128_to_uint32x4_t(neon_fcmgtzq32(__float32x4_t_to_n128(x)))
#define vcgtzq_f64(x) __n128_to_uint64x2_t(neon_fcmgtzq64(__float64x2_t_to_n128(x)))
#define vcgtzq_s8(x) __n128_to_uint8x16_t(neon_cmgtzq8(__int8x16_t_to_n128(x)))
#define vcgtzq_s16(x) __n128_to_uint16x8_t(neon_cmgtzq16(__int16x8_t_to_n128(x)))
#define vcgtzq_s32(x) __n128_to_uint32x4_t(neon_cmgtzq32(__int32x4_t_to_n128(x)))
#define vcgtzq_s64(x) __n128_to_uint64x2_t(neon_cmgtzq64(__int64x2_t_to_n128(x)))
// vcgt*: two-operand wrappers, operands passed in the order given.
//   floating-point -> neon_fcmgt*
//   signed integer -> neon_cmgt*
//   unsigned       -> neon_cmhi*
// Vector forms return the unsigned vector type of matching lane layout; scalar
// forms return n64_u64[0] or go through _CopyUInt32FromFloat /
// _CopyUInt64FromDouble.
#define vcgt_f32(a, b) __n64_to_uint32x2_t(neon_fcmgt32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vcgt_f64(a, b) __n64_to_uint64x1_t(neon_fcmgt64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vcgt_s16(a, b) __n64_to_uint16x4_t(neon_cmgt16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vcgt_s32(a, b) __n64_to_uint32x2_t(neon_cmgt32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vcgt_s8(a, b) __n64_to_uint8x8_t(neon_cmgt8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vcgt_s64(a, b) __n64_to_uint64x1_t(neon_cmgt64(__int64x1_t_to_n64(a), __int64x1_t_to_n64(b)))
#define vcgt_u16(a, b) __n64_to_uint16x4_t(neon_cmhi16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vcgt_u32(a, b) __n64_to_uint32x2_t(neon_cmhi32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vcgt_u8(a, b) __n64_to_uint8x8_t(neon_cmhi8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vcgt_u64(a, b) __n64_to_uint64x1_t(neon_cmhi64(__uint64x1_t_to_n64(a), __uint64x1_t_to_n64(b)))
#define vcgtd_s64(a, b) neon_cmgt64(__int64ToN64_v(a), __int64ToN64_v(b)).n64_u64[0]
#define vcgtd_u64(a, b) neon_cmhi64(__uint64ToN64_v(a), __uint64ToN64_v(b)).n64_u64[0]
#define vcgts_f32(a, b) _CopyUInt32FromFloat(neon_fcmgts32((a), (b)))
#define vcgtd_f64(a, b) _CopyUInt64FromDouble(neon_fcmgts64((a), (b)))
#define vcgtq_f32(a, b) __n128_to_uint32x4_t(neon_fcmgtq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vcgtq_f64(a, b) __n128_to_uint64x2_t(neon_fcmgtq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vcgtq_s16(a, b) __n128_to_uint16x8_t(neon_cmgtq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vcgtq_s32(a, b) __n128_to_uint32x4_t(neon_cmgtq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vcgtq_s8(a, b) __n128_to_uint8x16_t(neon_cmgtq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vcgtq_s64(a, b) __n128_to_uint64x2_t(neon_cmgtq64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vcgtq_u16(a, b) __n128_to_uint16x8_t(neon_cmhiq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vcgtq_u32(a, b) __n128_to_uint32x4_t(neon_cmhiq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vcgtq_u8(a, b) __n128_to_uint8x16_t(neon_cmhiq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vcgtq_u64(a, b) __n128_to_uint64x2_t(neon_cmhiq64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
// vcltz*: one-operand wrappers over neon_cmltz* (signed integer operands) and
// neon_fcmltz* (floating-point operands).  Vector forms return the unsigned
// vector type of matching lane layout; scalar forms return n64_u64[0] or go
// through _CopyUInt32FromFloat / _CopyUInt64FromDouble.
#define vcltz_f32(x) __n64_to_uint32x2_t(neon_fcmltz32(__float32x2_t_to_n64(x)))
#define vcltz_f64(x) __n64_to_uint64x1_t(neon_fcmltz64(__float64x1_t_to_n64(x)))
#define vcltz_s8(x) __n64_to_uint8x8_t(neon_cmltz8(__int8x8_t_to_n64(x)))
#define vcltz_s16(x) __n64_to_uint16x4_t(neon_cmltz16(__int16x4_t_to_n64(x)))
#define vcltz_s32(x) __n64_to_uint32x2_t(neon_cmltz32(__int32x2_t_to_n64(x)))
#define vcltz_s64(x) __n64_to_uint64x1_t(neon_cmltz64(__int64x1_t_to_n64(x)))
#define vcltzd_s64(x) neon_cmltz64(__int64ToN64_v(x)).n64_u64[0]
#define vcltzs_f32(x) _CopyUInt32FromFloat(neon_fcmltzs32(x))
#define vcltzd_f64(x) _CopyUInt64FromDouble(neon_fcmltzs64(x))
#define vcltzq_f32(x) __n128_to_uint32x4_t(neon_fcmltzq32(__float32x4_t_to_n128(x)))
#define vcltzq_f64(x) __n128_to_uint64x2_t(neon_fcmltzq64(__float64x2_t_to_n128(x)))
#define vcltzq_s8(x) __n128_to_uint8x16_t(neon_cmltzq8(__int8x16_t_to_n128(x)))
#define vcltzq_s16(x) __n128_to_uint16x8_t(neon_cmltzq16(__int16x8_t_to_n128(x)))
#define vcltzq_s32(x) __n128_to_uint32x4_t(neon_cmltzq32(__int32x4_t_to_n128(x)))
#define vcltzq_s64(x) __n128_to_uint64x2_t(neon_cmltzq64(__int64x2_t_to_n128(x)))
// vclt (two-operand form) has no intrinsic of its own here: every variant
// calls the corresponding vcgt intrinsic (neon_fcmgt* / neon_cmgt* /
// neon_cmhi*) with the two operands swapped, i.e. vclt(a, b) == vcgt(b, a).
#define vclt_f32(a, b) __n64_to_uint32x2_t(neon_fcmgt32(__float32x2_t_to_n64(b), __float32x2_t_to_n64(a)))
#define vclt_f64(a, b) __n64_to_uint64x1_t(neon_fcmgt64(__float64x1_t_to_n64(b), __float64x1_t_to_n64(a)))
#define vclt_s16(a, b) __n64_to_uint16x4_t(neon_cmgt16(__int16x4_t_to_n64(b), __int16x4_t_to_n64(a)))
#define vclt_s32(a, b) __n64_to_uint32x2_t(neon_cmgt32(__int32x2_t_to_n64(b), __int32x2_t_to_n64(a)))
#define vclt_s8(a, b) __n64_to_uint8x8_t(neon_cmgt8(__int8x8_t_to_n64(b), __int8x8_t_to_n64(a)))
#define vclt_s64(a, b) __n64_to_uint64x1_t(neon_cmgt64(__int64x1_t_to_n64(b), __int64x1_t_to_n64(a)))
#define vclt_u16(a, b) __n64_to_uint16x4_t(neon_cmhi16(__uint16x4_t_to_n64(b), __uint16x4_t_to_n64(a)))
#define vclt_u32(a, b) __n64_to_uint32x2_t(neon_cmhi32(__uint32x2_t_to_n64(b), __uint32x2_t_to_n64(a)))
#define vclt_u8(a, b) __n64_to_uint8x8_t(neon_cmhi8(__uint8x8_t_to_n64(b), __uint8x8_t_to_n64(a)))
#define vclt_u64(a, b) __n64_to_uint64x1_t(neon_cmhi64(__uint64x1_t_to_n64(b), __uint64x1_t_to_n64(a)))
#define vcltd_s64(a, b) neon_cmgt64(__int64ToN64_v(b), __int64ToN64_v(a)).n64_u64[0]
#define vcltd_u64(a, b) neon_cmhi64(__uint64ToN64_v(b), __uint64ToN64_v(a)).n64_u64[0]
#define vclts_f32(a, b) _CopyUInt32FromFloat(neon_fcmgts32((b), (a)))
#define vcltd_f64(a, b) _CopyUInt64FromDouble(neon_fcmgts64((b), (a)))
#define vcltq_f32(a, b) __n128_to_uint32x4_t(neon_fcmgtq32(__float32x4_t_to_n128(b), __float32x4_t_to_n128(a)))
#define vcltq_f64(a, b) __n128_to_uint64x2_t(neon_fcmgtq64(__float64x2_t_to_n128(b), __float64x2_t_to_n128(a)))
#define vcltq_s16(a, b) __n128_to_uint16x8_t(neon_cmgtq16(__int16x8_t_to_n128(b), __int16x8_t_to_n128(a)))
#define vcltq_s32(a, b) __n128_to_uint32x4_t(neon_cmgtq32(__int32x4_t_to_n128(b), __int32x4_t_to_n128(a)))
#define vcltq_s8(a, b) __n128_to_uint8x16_t(neon_cmgtq8(__int8x16_t_to_n128(b), __int8x16_t_to_n128(a)))
#define vcltq_s64(a, b) __n128_to_uint64x2_t(neon_cmgtq64(__int64x2_t_to_n128(b), __int64x2_t_to_n128(a)))
#define vcltq_u16(a, b) __n128_to_uint16x8_t(neon_cmhiq16(__uint16x8_t_to_n128(b), __uint16x8_t_to_n128(a)))
#define vcltq_u32(a, b) __n128_to_uint32x4_t(neon_cmhiq32(__uint32x4_t_to_n128(b), __uint32x4_t_to_n128(a)))
#define vcltq_u8(a, b) __n128_to_uint8x16_t(neon_cmhiq8(__uint8x16_t_to_n128(b), __uint8x16_t_to_n128(a)))
#define vcltq_u64(a, b) __n128_to_uint64x2_t(neon_cmhiq64(__uint64x2_t_to_n128(b), __uint64x2_t_to_n128(a)))
// vcage/vcagt forward their operands in order to neon_facge*/neon_facgt*.
// vcale/vcalt register forms are aliases of vcage/vcagt with the operands
// reversed, so they reuse the same intrinsics with (rhs, lhs).
// -- vcage: neon_facge*(lhs, rhs) --
#define vcage_f32(lhs, rhs)  __n64_to_uint32x2_t(neon_facge32(__float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs)))
#define vcage_f64(lhs, rhs)  __n64_to_uint64x1_t(neon_facge64(__float64x1_t_to_n64(lhs), __float64x1_t_to_n64(rhs)))
#define vcageq_f32(lhs, rhs) __n128_to_uint32x4_t(neon_facgeq32(__float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs)))
#define vcageq_f64(lhs, rhs) __n128_to_uint64x2_t(neon_facgeq64(__float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs)))
#define vcages_f32(lhs, rhs) _CopyUInt32FromFloat(neon_facges32((lhs), (rhs)))
#define vcaged_f64(lhs, rhs) _CopyUInt64FromDouble(neon_facges64((lhs), (rhs)))
// -- vcagt: neon_facgt*(lhs, rhs) --
#define vcagt_f32(lhs, rhs)  __n64_to_uint32x2_t(neon_facgt32(__float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs)))
#define vcagt_f64(lhs, rhs)  __n64_to_uint64x1_t(neon_facgt64(__float64x1_t_to_n64(lhs), __float64x1_t_to_n64(rhs)))
#define vcagtq_f32(lhs, rhs) __n128_to_uint32x4_t(neon_facgtq32(__float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs)))
#define vcagtq_f64(lhs, rhs) __n128_to_uint64x2_t(neon_facgtq64(__float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs)))
#define vcagts_f32(lhs, rhs) _CopyUInt32FromFloat(neon_facgts32((lhs), (rhs)))
#define vcagtd_f64(lhs, rhs) _CopyUInt64FromDouble(neon_facgts64((lhs), (rhs)))
// -- vcale: neon_facge*(rhs, lhs) --
#define vcale_f32(lhs, rhs)  __n64_to_uint32x2_t(neon_facge32(__float32x2_t_to_n64(rhs), __float32x2_t_to_n64(lhs)))
#define vcale_f64(lhs, rhs)  __n64_to_uint64x1_t(neon_facge64(__float64x1_t_to_n64(rhs), __float64x1_t_to_n64(lhs)))
#define vcaleq_f32(lhs, rhs) __n128_to_uint32x4_t(neon_facgeq32(__float32x4_t_to_n128(rhs), __float32x4_t_to_n128(lhs)))
#define vcaleq_f64(lhs, rhs) __n128_to_uint64x2_t(neon_facgeq64(__float64x2_t_to_n128(rhs), __float64x2_t_to_n128(lhs)))
#define vcales_f32(lhs, rhs) _CopyUInt32FromFloat(neon_facges32((rhs), (lhs)))
#define vcaled_f64(lhs, rhs) _CopyUInt64FromDouble(neon_facges64((rhs), (lhs)))
// -- vcalt: neon_facgt*(rhs, lhs) --
#define vcalt_f32(lhs, rhs)  __n64_to_uint32x2_t(neon_facgt32(__float32x2_t_to_n64(rhs), __float32x2_t_to_n64(lhs)))
#define vcalt_f64(lhs, rhs)  __n64_to_uint64x1_t(neon_facgt64(__float64x1_t_to_n64(rhs), __float64x1_t_to_n64(lhs)))
#define vcaltq_f32(lhs, rhs) __n128_to_uint32x4_t(neon_facgtq32(__float32x4_t_to_n128(rhs), __float32x4_t_to_n128(lhs)))
#define vcaltq_f64(lhs, rhs) __n128_to_uint64x2_t(neon_facgtq64(__float64x2_t_to_n128(rhs), __float64x2_t_to_n128(lhs)))
#define vcalts_f32(lhs, rhs) _CopyUInt32FromFloat(neon_facgts32((rhs), (lhs)))
#define vcaltd_f64(lhs, rhs) _CopyUInt64FromDouble(neon_facgts64((rhs), (lhs)))

#if defined(_ARM64_EXTENDED_INTRINSICS)
// Compatibility spellings: each "vac*" name is a plain alias of the
// corresponding "vca*" float32 macro (64-bit form, then 128-bit "q" form).
#define vacge_f32  vcage_f32
#define vacgeq_f32 vcageq_f32
#define vacgt_f32  vcagt_f32
#define vacgtq_f32 vcagtq_f32
#define vacle_f32  vcale_f32
#define vacleq_f32 vcaleq_f32
#define vaclt_f32  vcalt_f32
#define vacltq_f32 vcaltq_f32
#endif  /* _ARM64_EXTENDED_INTRINSICS */

// vtst*: wrappers over neon_cmtst*. Signed, unsigned and polynomial inputs of
// the same lane width share one intrinsic; only the typed<->untyped
// conversion helpers differ. Results are unsigned vectors/scalars.
// -- 64-bit vectors (__n64) --
#define vtst_s8(a, b)   __n64_to_uint8x8_t(neon_cmtst8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vtst_s16(a, b)  __n64_to_uint16x4_t(neon_cmtst16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vtst_s32(a, b)  __n64_to_uint32x2_t(neon_cmtst32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vtst_s64(a, b)  __n64_to_uint64x1_t(neon_cmtst64(__int64x1_t_to_n64(a), __int64x1_t_to_n64(b)))
#define vtst_u8(a, b)   __n64_to_uint8x8_t(neon_cmtst8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vtst_u16(a, b)  __n64_to_uint16x4_t(neon_cmtst16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vtst_u32(a, b)  __n64_to_uint32x2_t(neon_cmtst32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vtst_u64(a, b)  __n64_to_uint64x1_t(neon_cmtst64(__uint64x1_t_to_n64(a), __uint64x1_t_to_n64(b)))
#define vtst_p8(a, b)   __n64_to_uint8x8_t(neon_cmtst8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vtst_p64(a, b)  __n64_to_uint64x1_t(neon_cmtst64(__poly64x1_t_to_n64(a), __poly64x1_t_to_n64(b)))
// -- 128-bit vectors (__n128) --
#define vtstq_s8(a, b)  __n128_to_uint8x16_t(neon_cmtstq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vtstq_s16(a, b) __n128_to_uint16x8_t(neon_cmtstq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vtstq_s32(a, b) __n128_to_uint32x4_t(neon_cmtstq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vtstq_s64(a, b) __n128_to_uint64x2_t(neon_cmtstq64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vtstq_u8(a, b)  __n128_to_uint8x16_t(neon_cmtstq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vtstq_u16(a, b) __n128_to_uint16x8_t(neon_cmtstq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vtstq_u32(a, b) __n128_to_uint32x4_t(neon_cmtstq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vtstq_u64(a, b) __n128_to_uint64x2_t(neon_cmtstq64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vtstq_p8(a, b)  __n128_to_uint8x16_t(neon_cmtstq8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vtstq_p64(a, b) __n128_to_uint64x2_t(neon_cmtstq64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))
// -- 64-bit scalars --
#define vtstd_s64(a, b) neon_cmtst64(__int64ToN64_v(a), __int64ToN64_v(b)).n64_u64[0]
#define vtstd_u64(a, b) neon_cmtst64(__uint64ToN64_v(a), __uint64ToN64_v(b)).n64_u64[0]

// FCVTAS/FCVTAU/FCVTMS/FCVTMU/FCVTNS/FCVTNU/FCVTPS/FCVTPU/FCVTZS/FCVTZU/SCVTF/UCVTF
//
// Naming pattern of the declarations below:
//   neon_<op>NN   - operates on a 64-bit vector (__n64), NN = 16/32/64
//   neon_<op>qNN  - operates on a 128-bit vector (__n128)
//   neon_<op>s32  - scalar form taking and returning float
//   neon_<op>s64  - scalar form taking and returning double
// The scalar forms return the integer result in a float/double; the scalar
// vcvt* macros extract it with the _Copy*FromFloat/_Copy*FromDouble helpers.
// Rounding-mode letters follow the Arm A64 mnemonics (A = nearest, ties away;
// M = toward minus infinity; N = nearest, ties to even; P = toward plus
// infinity; Z = toward zero). A trailing S/U selects a signed/unsigned result.

// FCVTAS
__n64  neon_fcvtas16(__n64);
__n64  neon_fcvtas32(__n64);
__n64  neon_fcvtas64(__n64);
__n128 neon_fcvtasq16(__n128);
__n128 neon_fcvtasq32(__n128);
__n128 neon_fcvtasq64(__n128);
float  neon_fcvtass32(float);
double neon_fcvtass64(double);
// FCVTAU
__n64  neon_fcvtau16(__n64);
__n64  neon_fcvtau32(__n64);
__n64  neon_fcvtau64(__n64);
__n128 neon_fcvtauq16(__n128);
__n128 neon_fcvtauq32(__n128);
__n128 neon_fcvtauq64(__n128);
float  neon_fcvtaus32(float);
double neon_fcvtaus64(double);
// FCVTMS
__n64  neon_fcvtms16(__n64);
__n64  neon_fcvtms32(__n64);
__n64  neon_fcvtms64(__n64);
__n128 neon_fcvtmsq16(__n128);
__n128 neon_fcvtmsq32(__n128);
__n128 neon_fcvtmsq64(__n128);
float  neon_fcvtmss32(float);
double neon_fcvtmss64(double);
// FCVTMU
__n64  neon_fcvtmu16(__n64);
__n64  neon_fcvtmu32(__n64);
__n64  neon_fcvtmu64(__n64);
__n128 neon_fcvtmuq16(__n128);
__n128 neon_fcvtmuq32(__n128);
__n128 neon_fcvtmuq64(__n128);
float  neon_fcvtmus32(float);
double neon_fcvtmus64(double);
// FCVTNS
__n64  neon_fcvtns16(__n64);
__n64  neon_fcvtns32(__n64);
__n64  neon_fcvtns64(__n64);
__n128 neon_fcvtnsq16(__n128);
__n128 neon_fcvtnsq32(__n128);
__n128 neon_fcvtnsq64(__n128);
float  neon_fcvtnss32(float);
double neon_fcvtnss64(double);
// FCVTNU
__n64  neon_fcvtnu16(__n64);
__n64  neon_fcvtnu32(__n64);
__n64  neon_fcvtnu64(__n64);
__n128 neon_fcvtnuq16(__n128);
__n128 neon_fcvtnuq32(__n128);
__n128 neon_fcvtnuq64(__n128);
float  neon_fcvtnus32(float);
double neon_fcvtnus64(double);
// FCVTPS
__n64  neon_fcvtps16(__n64);
__n64  neon_fcvtps32(__n64);
__n64  neon_fcvtps64(__n64);
__n128 neon_fcvtpsq16(__n128);
__n128 neon_fcvtpsq32(__n128);
__n128 neon_fcvtpsq64(__n128);
float  neon_fcvtpss32(float);
double neon_fcvtpss64(double);
// FCVTPU
__n64  neon_fcvtpu16(__n64);
__n64  neon_fcvtpu32(__n64);
__n64  neon_fcvtpu64(__n64);
__n128 neon_fcvtpuq16(__n128);
__n128 neon_fcvtpuq32(__n128);
__n128 neon_fcvtpuq64(__n128);
float  neon_fcvtpus32(float);
double neon_fcvtpus64(double);
// FCVTZS
__n64  neon_fcvtzs16(__n64);
__n64  neon_fcvtzs32(__n64);
__n64  neon_fcvtzs64(__n64);
__n128 neon_fcvtzsq16(__n128);
__n128 neon_fcvtzsq32(__n128);
__n128 neon_fcvtzsq64(__n128);
float  neon_fcvtzss32(float);
double neon_fcvtzss64(double);
// FCVTZU
__n64  neon_fcvtzu16(__n64);
__n64  neon_fcvtzu32(__n64);
__n64  neon_fcvtzu64(__n64);
__n128 neon_fcvtzuq16(__n128);
__n128 neon_fcvtzuq32(__n128);
__n128 neon_fcvtzuq64(__n128);
float  neon_fcvtzus32(float);
double neon_fcvtzus64(double);
// SCVTF: signed integer -> floating-point conversion intrinsics.
// Vector forms take/return untyped __n64 (64-bit) or __n128 (128-bit, "q")
// registers; the scalar forms take a signed integer and return float/double.
// Each intrinsic is declared exactly once, matching the neon_ucvtf* group.
__n64  neon_scvtf16(__n64);
__n64  neon_scvtf32(__n64);
__n64  neon_scvtf64(__n64);
__n128 neon_scvtfq16(__n128);
__n128 neon_scvtfq32(__n128);
__n128 neon_scvtfq64(__n128);
float  neon_scvtfs32(__int32);
double neon_scvtfs64(__int64);
// UCVTF: unsigned integer -> floating-point conversion intrinsics.
// Vector forms take/return untyped __n64 (64-bit) or __n128 (128-bit, "q")
// registers; the scalar forms take an unsigned integer and return
// float/double.
__n64  neon_ucvtf16(__n64);
__n64  neon_ucvtf32(__n64);
__n64  neon_ucvtf64(__n64);
__n128 neon_ucvtfq16(__n128);
__n128 neon_ucvtfq32(__n128);
__n128 neon_ucvtfq64(__n128);
float  neon_ucvtfs32(unsigned __int32);
double neon_ucvtfs64(unsigned __int64);
// "fp" variants of FCVTZS/FCVTZU/SCVTF/UCVTF: same shapes as the plain
// intrinsics plus a trailing `const int` operand. The vcvt*_n_* macros
// forward their second argument to it.
// NOTE(review): presumably the fixed-point fractional-bit count (the ACLE
// `n` argument), which must be a compile-time constant - confirm against the
// compiler documentation.

// FCVTZS (float -> signed), with extra int operand
__n64  neon_fcvtzsfp16(__n64, const int);
__n64  neon_fcvtzsfp32(__n64, const int);
__n64  neon_fcvtzsfp64(__n64, const int);
__n128 neon_fcvtzsfpq16(__n128, const int);
__n128 neon_fcvtzsfpq32(__n128, const int);
__n128 neon_fcvtzsfpq64(__n128, const int);
float  neon_fcvtzsfps32(float, const int);
double neon_fcvtzsfps64(double, const int);
// FCVTZU (float -> unsigned), with extra int operand
__n64  neon_fcvtzufp16(__n64, const int);
__n64  neon_fcvtzufp32(__n64, const int);
__n64  neon_fcvtzufp64(__n64, const int);
__n128 neon_fcvtzufpq16(__n128, const int);
__n128 neon_fcvtzufpq32(__n128, const int);
__n128 neon_fcvtzufpq64(__n128, const int);
float  neon_fcvtzufps32(float, const int);
double neon_fcvtzufps64(double, const int);
// SCVTF (signed -> float), with extra int operand
__n64  neon_scvtffp16(__n64, const int);
__n64  neon_scvtffp32(__n64, const int);
__n64  neon_scvtffp64(__n64, const int);
__n128 neon_scvtffpq16(__n128, const int);
__n128 neon_scvtffpq32(__n128, const int);
__n128 neon_scvtffpq64(__n128, const int);
float  neon_scvtffps32(__int32, const int);
double neon_scvtffps64(__int64, const int);
// UCVTF (unsigned -> float), with extra int operand
__n64  neon_ucvtffp16(__n64, const int);
__n64  neon_ucvtffp32(__n64, const int);
__n64  neon_ucvtffp64(__n64, const int);
__n128 neon_ucvtffpq16(__n128, const int);
__n128 neon_ucvtffpq32(__n128, const int);
__n128 neon_ucvtffpq64(__n128, const int);
float  neon_ucvtffps32(unsigned __int32, const int);
double neon_ucvtffps64(unsigned __int64, const int);
// vcvt[q]_n_<dst>_<src>: vector conversions that pass a second operand `n`
// straight through to the neon_*fp* intrinsics.
// -- 64-bit vectors: integer -> float --
#define vcvt_n_f32_s32(vec, n)  __n64_to_float32x2_t(neon_scvtffp32(__int32x2_t_to_n64(vec), (n)))
#define vcvt_n_f32_u32(vec, n)  __n64_to_float32x2_t(neon_ucvtffp32(__uint32x2_t_to_n64(vec), (n)))
#define vcvt_n_f64_s64(vec, n)  __n64_to_float64x1_t(neon_scvtffp64(__int64x1_t_to_n64(vec), (n)))
#define vcvt_n_f64_u64(vec, n)  __n64_to_float64x1_t(neon_ucvtffp64(__uint64x1_t_to_n64(vec), (n)))
// -- 64-bit vectors: float -> integer --
#define vcvt_n_s32_f32(vec, n)  __n64_to_int32x2_t(neon_fcvtzsfp32(__float32x2_t_to_n64(vec), (n)))
#define vcvt_n_u32_f32(vec, n)  __n64_to_uint32x2_t(neon_fcvtzufp32(__float32x2_t_to_n64(vec), (n)))
#define vcvt_n_s64_f64(vec, n)  __n64_to_int64x1_t(neon_fcvtzsfp64(__float64x1_t_to_n64(vec), (n)))
#define vcvt_n_u64_f64(vec, n)  __n64_to_uint64x1_t(neon_fcvtzufp64(__float64x1_t_to_n64(vec), (n)))
// -- 128-bit vectors: integer -> float --
#define vcvtq_n_f32_s32(vec, n) __n128_to_float32x4_t(neon_scvtffpq32(__int32x4_t_to_n128(vec), (n)))
#define vcvtq_n_f32_u32(vec, n) __n128_to_float32x4_t(neon_ucvtffpq32(__uint32x4_t_to_n128(vec), (n)))
#define vcvtq_n_f64_s64(vec, n) __n128_to_float64x2_t(neon_scvtffpq64(__int64x2_t_to_n128(vec), (n)))
#define vcvtq_n_f64_u64(vec, n) __n128_to_float64x2_t(neon_ucvtffpq64(__uint64x2_t_to_n128(vec), (n)))
// -- 128-bit vectors: float -> integer --
#define vcvtq_n_s32_f32(vec, n) __n128_to_int32x4_t(neon_fcvtzsfpq32(__float32x4_t_to_n128(vec), (n)))
#define vcvtq_n_u32_f32(vec, n) __n128_to_uint32x4_t(neon_fcvtzufpq32(__float32x4_t_to_n128(vec), (n)))
#define vcvtq_n_s64_f64(vec, n) __n128_to_int64x2_t(neon_fcvtzsfpq64(__float64x2_t_to_n128(vec), (n)))
#define vcvtq_n_u64_f64(vec, n) __n128_to_uint64x2_t(neon_fcvtzufpq64(__float64x2_t_to_n128(vec), (n)))
// vcvt{a,m,n,p}[q]_<int>_<float>: float -> integer vector conversions. The
// letter after "vcvt" selects the neon_fcvt{a,m,n,p}{s,u}* intrinsic family;
// 64-bit forms are listed first, then the 128-bit "q" forms.
// -- vcvta --
#define vcvta_s32_f32(vec)  __n64_to_int32x2_t(neon_fcvtas32(__float32x2_t_to_n64(vec)))
#define vcvta_s64_f64(vec)  __n64_to_int64x1_t(neon_fcvtas64(__float64x1_t_to_n64(vec)))
#define vcvta_u32_f32(vec)  __n64_to_uint32x2_t(neon_fcvtau32(__float32x2_t_to_n64(vec)))
#define vcvta_u64_f64(vec)  __n64_to_uint64x1_t(neon_fcvtau64(__float64x1_t_to_n64(vec)))
#define vcvtaq_s32_f32(vec) __n128_to_int32x4_t(neon_fcvtasq32(__float32x4_t_to_n128(vec)))
#define vcvtaq_s64_f64(vec) __n128_to_int64x2_t(neon_fcvtasq64(__float64x2_t_to_n128(vec)))
#define vcvtaq_u32_f32(vec) __n128_to_uint32x4_t(neon_fcvtauq32(__float32x4_t_to_n128(vec)))
#define vcvtaq_u64_f64(vec) __n128_to_uint64x2_t(neon_fcvtauq64(__float64x2_t_to_n128(vec)))
// -- vcvtm --
#define vcvtm_s32_f32(vec)  __n64_to_int32x2_t(neon_fcvtms32(__float32x2_t_to_n64(vec)))
#define vcvtm_s64_f64(vec)  __n64_to_int64x1_t(neon_fcvtms64(__float64x1_t_to_n64(vec)))
#define vcvtm_u32_f32(vec)  __n64_to_uint32x2_t(neon_fcvtmu32(__float32x2_t_to_n64(vec)))
#define vcvtm_u64_f64(vec)  __n64_to_uint64x1_t(neon_fcvtmu64(__float64x1_t_to_n64(vec)))
#define vcvtmq_s32_f32(vec) __n128_to_int32x4_t(neon_fcvtmsq32(__float32x4_t_to_n128(vec)))
#define vcvtmq_s64_f64(vec) __n128_to_int64x2_t(neon_fcvtmsq64(__float64x2_t_to_n128(vec)))
#define vcvtmq_u32_f32(vec) __n128_to_uint32x4_t(neon_fcvtmuq32(__float32x4_t_to_n128(vec)))
#define vcvtmq_u64_f64(vec) __n128_to_uint64x2_t(neon_fcvtmuq64(__float64x2_t_to_n128(vec)))
// -- vcvtn --
#define vcvtn_s32_f32(vec)  __n64_to_int32x2_t(neon_fcvtns32(__float32x2_t_to_n64(vec)))
#define vcvtn_s64_f64(vec)  __n64_to_int64x1_t(neon_fcvtns64(__float64x1_t_to_n64(vec)))
#define vcvtn_u32_f32(vec)  __n64_to_uint32x2_t(neon_fcvtnu32(__float32x2_t_to_n64(vec)))
#define vcvtn_u64_f64(vec)  __n64_to_uint64x1_t(neon_fcvtnu64(__float64x1_t_to_n64(vec)))
#define vcvtnq_s32_f32(vec) __n128_to_int32x4_t(neon_fcvtnsq32(__float32x4_t_to_n128(vec)))
#define vcvtnq_s64_f64(vec) __n128_to_int64x2_t(neon_fcvtnsq64(__float64x2_t_to_n128(vec)))
#define vcvtnq_u32_f32(vec) __n128_to_uint32x4_t(neon_fcvtnuq32(__float32x4_t_to_n128(vec)))
#define vcvtnq_u64_f64(vec) __n128_to_uint64x2_t(neon_fcvtnuq64(__float64x2_t_to_n128(vec)))
// -- vcvtp --
#define vcvtp_s32_f32(vec)  __n64_to_int32x2_t(neon_fcvtps32(__float32x2_t_to_n64(vec)))
#define vcvtp_s64_f64(vec)  __n64_to_int64x1_t(neon_fcvtps64(__float64x1_t_to_n64(vec)))
#define vcvtp_u32_f32(vec)  __n64_to_uint32x2_t(neon_fcvtpu32(__float32x2_t_to_n64(vec)))
#define vcvtp_u64_f64(vec)  __n64_to_uint64x1_t(neon_fcvtpu64(__float64x1_t_to_n64(vec)))
#define vcvtpq_s32_f32(vec) __n128_to_int32x4_t(neon_fcvtpsq32(__float32x4_t_to_n128(vec)))
#define vcvtpq_s64_f64(vec) __n128_to_int64x2_t(neon_fcvtpsq64(__float64x2_t_to_n128(vec)))
#define vcvtpq_u32_f32(vec) __n128_to_uint32x4_t(neon_fcvtpuq32(__float32x4_t_to_n128(vec)))
#define vcvtpq_u64_f64(vec) __n128_to_uint64x2_t(neon_fcvtpuq64(__float64x2_t_to_n128(vec)))
// vcvt[q]_<dst>_<src>: plain vector conversions. Integer -> float uses
// neon_scvtf*/neon_ucvtf*; float -> integer uses neon_fcvtzs*/neon_fcvtzu*.
// -- integer -> float --
#define vcvt_f32_s32(vec)  __n64_to_float32x2_t(neon_scvtf32(__int32x2_t_to_n64(vec)))
#define vcvt_f32_u32(vec)  __n64_to_float32x2_t(neon_ucvtf32(__uint32x2_t_to_n64(vec)))
#define vcvt_f64_s64(vec)  __n64_to_float64x1_t(neon_scvtf64(__int64x1_t_to_n64(vec)))
#define vcvt_f64_u64(vec)  __n64_to_float64x1_t(neon_ucvtf64(__uint64x1_t_to_n64(vec)))
#define vcvtq_f32_s32(vec) __n128_to_float32x4_t(neon_scvtfq32(__int32x4_t_to_n128(vec)))
#define vcvtq_f32_u32(vec) __n128_to_float32x4_t(neon_ucvtfq32(__uint32x4_t_to_n128(vec)))
#define vcvtq_f64_s64(vec) __n128_to_float64x2_t(neon_scvtfq64(__int64x2_t_to_n128(vec)))
#define vcvtq_f64_u64(vec) __n128_to_float64x2_t(neon_ucvtfq64(__uint64x2_t_to_n128(vec)))
// -- float -> integer --
#define vcvt_s32_f32(vec)  __n64_to_int32x2_t(neon_fcvtzs32(__float32x2_t_to_n64(vec)))
#define vcvt_u32_f32(vec)  __n64_to_uint32x2_t(neon_fcvtzu32(__float32x2_t_to_n64(vec)))
#define vcvt_s64_f64(vec)  __n64_to_int64x1_t(neon_fcvtzs64(__float64x1_t_to_n64(vec)))
#define vcvt_u64_f64(vec)  __n64_to_uint64x1_t(neon_fcvtzu64(__float64x1_t_to_n64(vec)))
#define vcvtq_s32_f32(vec) __n128_to_int32x4_t(neon_fcvtzsq32(__float32x4_t_to_n128(vec)))
#define vcvtq_u32_f32(vec) __n128_to_uint32x4_t(neon_fcvtzuq32(__float32x4_t_to_n128(vec)))
#define vcvtq_s64_f64(vec) __n128_to_int64x2_t(neon_fcvtzsq64(__float64x2_t_to_n128(vec)))
#define vcvtq_u64_f64(vec) __n128_to_uint64x2_t(neon_fcvtzuq64(__float64x2_t_to_n128(vec)))
// Scalar conversions. The float -> integer intrinsics return their result in
// a float/double, so the integer value is extracted with the
// _Copy[U]Int32FromFloat / _Copy[U]Int64FromDouble helpers. The
// integer -> float intrinsics are used directly.
// -- float -> integer via neon_fcvtz{s,u}s* --
#define vcvts_s32_f32(val)  _CopyInt32FromFloat(neon_fcvtzss32(val))
#define vcvtd_s64_f64(val)  _CopyInt64FromDouble(neon_fcvtzss64(val))
#define vcvts_u32_f32(val)  _CopyUInt32FromFloat(neon_fcvtzus32(val))
#define vcvtd_u64_f64(val)  _CopyUInt64FromDouble(neon_fcvtzus64(val))
// -- float -> integer via neon_fcvta{s,u}s* --
#define vcvtas_s32_f32(val) _CopyInt32FromFloat(neon_fcvtass32(val))
#define vcvtad_s64_f64(val) _CopyInt64FromDouble(neon_fcvtass64(val))
#define vcvtas_u32_f32(val) _CopyUInt32FromFloat(neon_fcvtaus32(val))
#define vcvtad_u64_f64(val) _CopyUInt64FromDouble(neon_fcvtaus64(val))
// -- float -> integer via neon_fcvtm{s,u}s* --
#define vcvtms_s32_f32(val) _CopyInt32FromFloat(neon_fcvtmss32(val))
#define vcvtmd_s64_f64(val) _CopyInt64FromDouble(neon_fcvtmss64(val))
#define vcvtms_u32_f32(val) _CopyUInt32FromFloat(neon_fcvtmus32(val))
#define vcvtmd_u64_f64(val) _CopyUInt64FromDouble(neon_fcvtmus64(val))
// -- float -> integer via neon_fcvtn{s,u}s* --
#define vcvtns_s32_f32(val) _CopyInt32FromFloat(neon_fcvtnss32(val))
#define vcvtnd_s64_f64(val) _CopyInt64FromDouble(neon_fcvtnss64(val))
#define vcvtns_u32_f32(val) _CopyUInt32FromFloat(neon_fcvtnus32(val))
#define vcvtnd_u64_f64(val) _CopyUInt64FromDouble(neon_fcvtnus64(val))
// -- float -> integer via neon_fcvtp{s,u}s* --
#define vcvtps_s32_f32(val) _CopyInt32FromFloat(neon_fcvtpss32(val))
#define vcvtpd_s64_f64(val) _CopyInt64FromDouble(neon_fcvtpss64(val))
#define vcvtps_u32_f32(val) _CopyUInt32FromFloat(neon_fcvtpus32(val))
#define vcvtpd_u64_f64(val) _CopyUInt64FromDouble(neon_fcvtpus64(val))
// -- float -> integer with extra operand `n` (neon_fcvtz{s,u}fps*) --
#define vcvts_n_s32_f32(val, n) _CopyInt32FromFloat(neon_fcvtzsfps32((val), (n)))
#define vcvtd_n_s64_f64(val, n) _CopyInt64FromDouble(neon_fcvtzsfps64((val), (n)))
#define vcvts_n_u32_f32(val, n) _CopyUInt32FromFloat(neon_fcvtzufps32((val), (n)))
#define vcvtd_n_u64_f64(val, n) _CopyUInt64FromDouble(neon_fcvtzufps64((val), (n)))
// -- integer -> float --
#define vcvts_f32_s32(val)  neon_scvtfs32(val)
#define vcvtd_f64_s64(val)  neon_scvtfs64(val)
#define vcvts_f32_u32(val)  neon_ucvtfs32(val)
#define vcvtd_f64_u64(val)  neon_ucvtfs64(val)
// -- integer -> float with extra operand `n` --
#define vcvts_n_f32_s32(val, n) neon_scvtffps32((val), (n))
#define vcvtd_n_f64_s64(val, n) neon_scvtffps64((val), (n))
#define vcvts_n_f32_u32(val, n) neon_ucvtffps32((val), (n))
#define vcvtd_n_f64_u64(val, n) neon_ucvtffps64((val), (n))

// FRECPE/FRECPS/FRECPX/URECPE
//
// Intrinsic declarations behind the vrecpe*/vrecps*/vrecpx* macros.
// __n64 = 64-bit vector, __n128 = 128-bit vector ("q"); names ending in
// s32/s64 (and neon_frecpx32/64) are scalar float/double forms.

// FRECPE: one operand
__n64  neon_frecpe16 (__n64);
__n128 neon_frecpeq16(__n128);
__n64  neon_frecpe32 (__n64);
__n128 neon_frecpeq32(__n128);
__n64  neon_frecpe64 (__n64);
__n128 neon_frecpeq64(__n128);
float  neon_frecpes32(float);
double neon_frecpes64(double);
// FRECPS: two operands
__n64  neon_frecps16 (__n64, __n64);
__n64  neon_frecps32 (__n64, __n64);
__n64  neon_frecps64 (__n64, __n64);
__n128 neon_frecpsq16(__n128, __n128);
__n128 neon_frecpsq32(__n128, __n128);
__n128 neon_frecpsq64(__n128, __n128);
float  neon_frecpss32(float, float);
double neon_frecpss64(double, double);
// URECPE: only 32-bit lanes are declared
__n64  neon_urecpe32 (__n64);
__n128 neon_urecpeq32(__n128);
// FRECPX: scalar only
float  neon_frecpx32(float);
double neon_frecpx64(double);
// vrecp*: typed wrappers over the neon_frecpe*/neon_urecpe*/neon_frecps*/
// neon_frecpx* intrinsics. Scalar forms map one-to-one with no conversion.
// -- FRECPE / URECPE (single operand) --
#define vrecpe_f32(vec)   __n64_to_float32x2_t(neon_frecpe32(__float32x2_t_to_n64(vec)))
#define vrecpe_f64(vec)   __n64_to_float64x1_t(neon_frecpe64(__float64x1_t_to_n64(vec)))
#define vrecpe_u32(vec)   __n64_to_uint32x2_t(neon_urecpe32(__uint32x2_t_to_n64(vec)))
#define vrecpeq_f32(vec)  __n128_to_float32x4_t(neon_frecpeq32(__float32x4_t_to_n128(vec)))
#define vrecpeq_f64(vec)  __n128_to_float64x2_t(neon_frecpeq64(__float64x2_t_to_n128(vec)))
#define vrecpeq_u32(vec)  __n128_to_uint32x4_t(neon_urecpeq32(__uint32x4_t_to_n128(vec)))
#define vrecpes_f32(val)  neon_frecpes32(val)
#define vrecped_f64(val)  neon_frecpes64(val)
// -- FRECPX (scalar only) --
#define vrecpxs_f32(val)  neon_frecpx32(val)
#define vrecpxd_f64(val)  neon_frecpx64(val)
// -- FRECPS (two operands) --
#define vrecps_f32(a, b)  __n64_to_float32x2_t(neon_frecps32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vrecps_f64(a, b)  __n64_to_float64x1_t(neon_frecps64(__float64x1_t_to_n64(a), __float64x1_t_to_n64(b)))
#define vrecpsq_f32(a, b) __n128_to_float32x4_t(neon_frecpsq32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vrecpsq_f64(a, b) __n128_to_float64x2_t(neon_frecpsq64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vrecpss_f32(a, b) neon_frecpss32((a), (b))
#define vrecpsd_f64(a, b) neon_frecpss64((a), (b))

// ZIP1/ZIP2/UZP1/UZP2/TRN1/TRN2
//
// Permute intrinsics. Names ending in _8/_16/_32 take two 64-bit vectors
// (__n64); names ending in _q8/_q16/_q32/_q64 take two 128-bit vectors
// (__n128). No 64-bit-vector form with 64-bit lanes is declared.
// The zip1/zip2/uzp1/uzp2/trn1/trn2 forms return a single vector; the
// unnumbered zip/uzp/trn forms further down return a pair (__n64x2/__n128x2).

// ZIP1
__n64 neon_zip1_8(__n64 _Dd, __n64 _Dm);
__n128 neon_zip1_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_zip1_16(__n64 _Dd, __n64 _Dm);
__n128 neon_zip1_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_zip1_32(__n64 _Dd, __n64 _Dm);
__n128 neon_zip1_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_zip1_q64(__n128 _Qd, __n128 _Qm);
// ZIP2
__n64 neon_zip2_8(__n64 _Dd, __n64 _Dm);
__n128 neon_zip2_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_zip2_16(__n64 _Dd, __n64 _Dm);
__n128 neon_zip2_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_zip2_32(__n64 _Dd, __n64 _Dm);
__n128 neon_zip2_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_zip2_q64(__n128 _Qd, __n128 _Qm);
// UZP1
__n64 neon_uzp1_8(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp1_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp1_16(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp1_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp1_32(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp1_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_uzp1_q64(__n128 _Qd, __n128 _Qm);
// UZP2
__n64 neon_uzp2_8(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp2_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp2_16(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp2_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_uzp2_32(__n64 _Dd, __n64 _Dm);
__n128 neon_uzp2_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_uzp2_q64(__n128 _Qd, __n128 _Qm);
// TRN1
__n64 neon_trn1_8(__n64 _Dd, __n64 _Dm);
__n128 neon_trn1_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_trn1_16(__n64 _Dd, __n64 _Dm);
__n128 neon_trn1_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_trn1_32(__n64 _Dd, __n64 _Dm);
__n128 neon_trn1_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_trn1_q64(__n128 _Qd, __n128 _Qm);
// TRN2
__n64 neon_trn2_8(__n64 _Dd, __n64 _Dm);
__n128 neon_trn2_q8(__n128 _Qd, __n128 _Qm);
__n64 neon_trn2_16(__n64 _Dd, __n64 _Dm);
__n128 neon_trn2_q16(__n128 _Qd, __n128 _Qm);
__n64 neon_trn2_32(__n64 _Dd, __n64 _Dm);
__n128 neon_trn2_q32(__n128 _Qd, __n128 _Qm);
__n128 neon_trn2_q64(__n128 _Qd, __n128 _Qm);
// Pair-returning ZIP (used by the vzip*/vzipq* macros)
__n64x2 neon_zip_8(__n64 _Dd, __n64 _Dm);
__n128x2 neon_zip_q8(__n128 _Qd, __n128 _Qm);
__n64x2 neon_zip_16(__n64 _Dd, __n64 _Dm);
__n128x2 neon_zip_q16(__n128 _Qd, __n128 _Qm);
__n64x2 neon_zip_32(__n64 _Dd, __n64 _Dm);
__n128x2 neon_zip_q32(__n128 _Qd, __n128 _Qm);
__n128x2 neon_zip_q64(__n128 _Qd, __n128 _Qm);
// Pair-returning UZP (used by the vuzp* macros)
__n64x2 neon_uzp_8(__n64 _Dd, __n64 _Dm);
__n128x2 neon_uzp_q8(__n128 _Qd, __n128 _Qm);
__n64x2 neon_uzp_16(__n64 _Dd, __n64 _Dm);
__n128x2 neon_uzp_q16(__n128 _Qd, __n128 _Qm);
__n64x2 neon_uzp_32(__n64 _Dd, __n64 _Dm);
__n128x2 neon_uzp_q32(__n128 _Qd, __n128 _Qm);
__n128x2 neon_uzp_q64(__n128 _Qd, __n128 _Qm);
// Pair-returning TRN
__n64x2 neon_trn_8(__n64 _Dd, __n64 _Dm);
__n128x2 neon_trn_q8(__n128 _Qd, __n128 _Qm);
__n64x2 neon_trn_16(__n64 _Dd, __n64 _Dm);
__n128x2 neon_trn_q16(__n128 _Qd, __n128 _Qm);
__n64x2 neon_trn_32(__n64 _Dd, __n64 _Dm);
__n128x2 neon_trn_q32(__n128 _Qd, __n128 _Qm);
__n128x2 neon_trn_q64(__n128 _Qd, __n128 _Qm);
// vzip[q]_*: two-vector zip returning a pair (the "x2" types), implemented
// with the pair-returning neon_zip_* intrinsics. Lane width selects the
// intrinsic; the element type only selects the conversion helpers.
// -- 64-bit vectors (__n64 -> __n64x2) --
#define vzip_s8(a, b)   __n64x2_to_int8x8x2_t(neon_zip_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vzip_s16(a, b)  __n64x2_to_int16x4x2_t(neon_zip_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vzip_s32(a, b)  __n64x2_to_int32x2x2_t(neon_zip_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vzip_u8(a, b)   __n64x2_to_uint8x8x2_t(neon_zip_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vzip_u16(a, b)  __n64x2_to_uint16x4x2_t(neon_zip_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vzip_u32(a, b)  __n64x2_to_uint32x2x2_t(neon_zip_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vzip_f32(a, b)  __n64x2_to_float32x2x2_t(neon_zip_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vzip_p8(a, b)   __n64x2_to_poly8x8x2_t(neon_zip_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vzip_p16(a, b)  __n64x2_to_poly16x4x2_t(neon_zip_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// -- 128-bit vectors (__n128 -> __n128x2) --
#define vzipq_s8(a, b)  __n128x2_to_int8x16x2_t(neon_zip_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vzipq_s16(a, b) __n128x2_to_int16x8x2_t(neon_zip_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vzipq_s32(a, b) __n128x2_to_int32x4x2_t(neon_zip_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vzipq_u8(a, b)  __n128x2_to_uint8x16x2_t(neon_zip_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vzipq_u16(a, b) __n128x2_to_uint16x8x2_t(neon_zip_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vzipq_u32(a, b) __n128x2_to_uint32x4x2_t(neon_zip_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vzipq_f32(a, b) __n128x2_to_float32x4x2_t(neon_zip_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vzipq_p8(a, b)  __n128x2_to_poly8x16x2_t(neon_zip_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vzipq_p16(a, b) __n128x2_to_poly16x8x2_t(neon_zip_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))

// vzip1[q]_*: single-result ZIP1 wrappers over neon_zip1_*. 64-bit lanes
// exist only in the 128-bit ("q") forms.
// -- 64-bit vectors (__n64) --
#define vzip1_s8(a, b)   __n64_to_int8x8_t(neon_zip1_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vzip1_s16(a, b)  __n64_to_int16x4_t(neon_zip1_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vzip1_s32(a, b)  __n64_to_int32x2_t(neon_zip1_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vzip1_u8(a, b)   __n64_to_uint8x8_t(neon_zip1_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vzip1_u16(a, b)  __n64_to_uint16x4_t(neon_zip1_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vzip1_u32(a, b)  __n64_to_uint32x2_t(neon_zip1_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vzip1_f32(a, b)  __n64_to_float32x2_t(neon_zip1_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vzip1_p8(a, b)   __n64_to_poly8x8_t(neon_zip1_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vzip1_p16(a, b)  __n64_to_poly16x4_t(neon_zip1_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// -- 128-bit vectors (__n128) --
#define vzip1q_s8(a, b)  __n128_to_int8x16_t(neon_zip1_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vzip1q_s16(a, b) __n128_to_int16x8_t(neon_zip1_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vzip1q_s32(a, b) __n128_to_int32x4_t(neon_zip1_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vzip1q_s64(a, b) __n128_to_int64x2_t(neon_zip1_q64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vzip1q_u8(a, b)  __n128_to_uint8x16_t(neon_zip1_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vzip1q_u16(a, b) __n128_to_uint16x8_t(neon_zip1_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vzip1q_u32(a, b) __n128_to_uint32x4_t(neon_zip1_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vzip1q_u64(a, b) __n128_to_uint64x2_t(neon_zip1_q64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vzip1q_f32(a, b) __n128_to_float32x4_t(neon_zip1_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vzip1q_f64(a, b) __n128_to_float64x2_t(neon_zip1_q64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vzip1q_p8(a, b)  __n128_to_poly8x16_t(neon_zip1_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vzip1q_p16(a, b) __n128_to_poly16x8_t(neon_zip1_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))
#define vzip1q_p64(a, b) __n128_to_poly64x2_t(neon_zip1_q64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))

// vzip2[q]_*: single-result ZIP2 wrappers over neon_zip2_*. 64-bit lanes
// exist only in the 128-bit ("q") forms.
// -- 64-bit vectors (__n64) --
#define vzip2_s8(a, b)   __n64_to_int8x8_t(neon_zip2_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vzip2_s16(a, b)  __n64_to_int16x4_t(neon_zip2_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vzip2_s32(a, b)  __n64_to_int32x2_t(neon_zip2_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vzip2_u8(a, b)   __n64_to_uint8x8_t(neon_zip2_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vzip2_u16(a, b)  __n64_to_uint16x4_t(neon_zip2_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vzip2_u32(a, b)  __n64_to_uint32x2_t(neon_zip2_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vzip2_f32(a, b)  __n64_to_float32x2_t(neon_zip2_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vzip2_p8(a, b)   __n64_to_poly8x8_t(neon_zip2_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vzip2_p16(a, b)  __n64_to_poly16x4_t(neon_zip2_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// -- 128-bit vectors (__n128) --
#define vzip2q_s8(a, b)  __n128_to_int8x16_t(neon_zip2_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vzip2q_s16(a, b) __n128_to_int16x8_t(neon_zip2_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vzip2q_s32(a, b) __n128_to_int32x4_t(neon_zip2_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vzip2q_s64(a, b) __n128_to_int64x2_t(neon_zip2_q64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vzip2q_u8(a, b)  __n128_to_uint8x16_t(neon_zip2_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vzip2q_u16(a, b) __n128_to_uint16x8_t(neon_zip2_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vzip2q_u32(a, b) __n128_to_uint32x4_t(neon_zip2_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vzip2q_u64(a, b) __n128_to_uint64x2_t(neon_zip2_q64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vzip2q_f32(a, b) __n128_to_float32x4_t(neon_zip2_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vzip2q_f64(a, b) __n128_to_float64x2_t(neon_zip2_q64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vzip2q_p8(a, b)  __n128_to_poly8x16_t(neon_zip2_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vzip2q_p16(a, b) __n128_to_poly16x8_t(neon_zip2_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))
#define vzip2q_p64(a, b) __n128_to_poly64x2_t(neon_zip2_q64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))

// vuzp / vuzpq: typed wrappers over neon_uzp_<bits> / neon_uzp_q<bits>.
// The intrinsic result is a two-vector aggregate, converted from
// __n64x2 / __n128x2 to the matching <type>x2_t.

// 64-bit vector forms (__n64 operands, __n64x2 result)
#define vuzp_s8(a, b) __n64x2_to_int8x8x2_t(neon_uzp_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vuzp_s16(a, b) __n64x2_to_int16x4x2_t(neon_uzp_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vuzp_s32(a, b) __n64x2_to_int32x2x2_t(neon_uzp_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vuzp_u8(a, b) __n64x2_to_uint8x8x2_t(neon_uzp_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vuzp_u16(a, b) __n64x2_to_uint16x4x2_t(neon_uzp_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vuzp_u32(a, b) __n64x2_to_uint32x2x2_t(neon_uzp_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vuzp_f32(a, b) __n64x2_to_float32x2x2_t(neon_uzp_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vuzp_p8(a, b) __n64x2_to_poly8x8x2_t(neon_uzp_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vuzp_p16(a, b) __n64x2_to_poly16x4x2_t(neon_uzp_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// 128-bit vector forms (__n128 operands, __n128x2 result)
#define vuzpq_s8(a, b) __n128x2_to_int8x16x2_t(neon_uzp_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vuzpq_s16(a, b) __n128x2_to_int16x8x2_t(neon_uzp_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vuzpq_s32(a, b) __n128x2_to_int32x4x2_t(neon_uzp_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vuzpq_u8(a, b) __n128x2_to_uint8x16x2_t(neon_uzp_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vuzpq_u16(a, b) __n128x2_to_uint16x8x2_t(neon_uzp_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vuzpq_u32(a, b) __n128x2_to_uint32x4x2_t(neon_uzp_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vuzpq_f32(a, b) __n128x2_to_float32x4x2_t(neon_uzp_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vuzpq_p8(a, b) __n128x2_to_poly8x16x2_t(neon_uzp_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vuzpq_p16(a, b) __n128x2_to_poly16x8x2_t(neon_uzp_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))

// vuzp1 / vuzp1q: typed wrappers over neon_uzp1_<bits> / neon_uzp1_q<bits>.
// Operands are converted to the untyped carrier, the intrinsic for the
// element width is called, and the single-vector result is converted back.

// 64-bit vector forms (__n64 carrier)
#define vuzp1_s8(a, b) __n64_to_int8x8_t(neon_uzp1_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vuzp1_s16(a, b) __n64_to_int16x4_t(neon_uzp1_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vuzp1_s32(a, b) __n64_to_int32x2_t(neon_uzp1_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vuzp1_u8(a, b) __n64_to_uint8x8_t(neon_uzp1_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vuzp1_u16(a, b) __n64_to_uint16x4_t(neon_uzp1_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vuzp1_u32(a, b) __n64_to_uint32x2_t(neon_uzp1_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vuzp1_f32(a, b) __n64_to_float32x2_t(neon_uzp1_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vuzp1_p8(a, b) __n64_to_poly8x8_t(neon_uzp1_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vuzp1_p16(a, b) __n64_to_poly16x4_t(neon_uzp1_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// 128-bit vector forms (__n128 carrier)
#define vuzp1q_s8(a, b) __n128_to_int8x16_t(neon_uzp1_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vuzp1q_s16(a, b) __n128_to_int16x8_t(neon_uzp1_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vuzp1q_s32(a, b) __n128_to_int32x4_t(neon_uzp1_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vuzp1q_s64(a, b) __n128_to_int64x2_t(neon_uzp1_q64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vuzp1q_u8(a, b) __n128_to_uint8x16_t(neon_uzp1_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vuzp1q_u16(a, b) __n128_to_uint16x8_t(neon_uzp1_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vuzp1q_u32(a, b) __n128_to_uint32x4_t(neon_uzp1_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vuzp1q_u64(a, b) __n128_to_uint64x2_t(neon_uzp1_q64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vuzp1q_f32(a, b) __n128_to_float32x4_t(neon_uzp1_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vuzp1q_f64(a, b) __n128_to_float64x2_t(neon_uzp1_q64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vuzp1q_p8(a, b) __n128_to_poly8x16_t(neon_uzp1_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vuzp1q_p16(a, b) __n128_to_poly16x8_t(neon_uzp1_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))
#define vuzp1q_p64(a, b) __n128_to_poly64x2_t(neon_uzp1_q64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))

// vuzp2 / vuzp2q: typed wrappers over neon_uzp2_<bits> / neon_uzp2_q<bits>.
// Operands are converted to the untyped carrier, the intrinsic for the
// element width is called, and the single-vector result is converted back.

// 64-bit vector forms (__n64 carrier)
#define vuzp2_s8(a, b) __n64_to_int8x8_t(neon_uzp2_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vuzp2_s16(a, b) __n64_to_int16x4_t(neon_uzp2_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vuzp2_s32(a, b) __n64_to_int32x2_t(neon_uzp2_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vuzp2_u8(a, b) __n64_to_uint8x8_t(neon_uzp2_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vuzp2_u16(a, b) __n64_to_uint16x4_t(neon_uzp2_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vuzp2_u32(a, b) __n64_to_uint32x2_t(neon_uzp2_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vuzp2_f32(a, b) __n64_to_float32x2_t(neon_uzp2_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vuzp2_p8(a, b) __n64_to_poly8x8_t(neon_uzp2_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vuzp2_p16(a, b) __n64_to_poly16x4_t(neon_uzp2_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// 128-bit vector forms (__n128 carrier)
#define vuzp2q_s8(a, b) __n128_to_int8x16_t(neon_uzp2_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vuzp2q_s16(a, b) __n128_to_int16x8_t(neon_uzp2_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vuzp2q_s32(a, b) __n128_to_int32x4_t(neon_uzp2_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vuzp2q_s64(a, b) __n128_to_int64x2_t(neon_uzp2_q64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vuzp2q_u8(a, b) __n128_to_uint8x16_t(neon_uzp2_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vuzp2q_u16(a, b) __n128_to_uint16x8_t(neon_uzp2_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vuzp2q_u32(a, b) __n128_to_uint32x4_t(neon_uzp2_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vuzp2q_u64(a, b) __n128_to_uint64x2_t(neon_uzp2_q64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vuzp2q_f32(a, b) __n128_to_float32x4_t(neon_uzp2_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vuzp2q_f64(a, b) __n128_to_float64x2_t(neon_uzp2_q64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vuzp2q_p8(a, b) __n128_to_poly8x16_t(neon_uzp2_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vuzp2q_p16(a, b) __n128_to_poly16x8_t(neon_uzp2_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))
#define vuzp2q_p64(a, b) __n128_to_poly64x2_t(neon_uzp2_q64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))

// vtrn / vtrnq: typed wrappers over neon_trn_<bits> / neon_trn_q<bits>.
// The intrinsic result is a two-vector aggregate, converted from
// __n64x2 / __n128x2 to the matching <type>x2_t.

// 64-bit vector forms (__n64 operands, __n64x2 result)
#define vtrn_s8(a, b) __n64x2_to_int8x8x2_t(neon_trn_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vtrn_s16(a, b) __n64x2_to_int16x4x2_t(neon_trn_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vtrn_s32(a, b) __n64x2_to_int32x2x2_t(neon_trn_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vtrn_u8(a, b) __n64x2_to_uint8x8x2_t(neon_trn_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vtrn_u16(a, b) __n64x2_to_uint16x4x2_t(neon_trn_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vtrn_u32(a, b) __n64x2_to_uint32x2x2_t(neon_trn_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vtrn_f32(a, b) __n64x2_to_float32x2x2_t(neon_trn_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vtrn_p8(a, b) __n64x2_to_poly8x8x2_t(neon_trn_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vtrn_p16(a, b) __n64x2_to_poly16x4x2_t(neon_trn_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// 128-bit vector forms (__n128 operands, __n128x2 result)
#define vtrnq_s8(a, b) __n128x2_to_int8x16x2_t(neon_trn_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vtrnq_s16(a, b) __n128x2_to_int16x8x2_t(neon_trn_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vtrnq_s32(a, b) __n128x2_to_int32x4x2_t(neon_trn_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vtrnq_u8(a, b) __n128x2_to_uint8x16x2_t(neon_trn_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vtrnq_u16(a, b) __n128x2_to_uint16x8x2_t(neon_trn_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vtrnq_u32(a, b) __n128x2_to_uint32x4x2_t(neon_trn_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vtrnq_f32(a, b) __n128x2_to_float32x4x2_t(neon_trn_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vtrnq_p8(a, b) __n128x2_to_poly8x16x2_t(neon_trn_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vtrnq_p16(a, b) __n128x2_to_poly16x8x2_t(neon_trn_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))

// vtrn1 / vtrn1q: typed wrappers over neon_trn1_<bits> / neon_trn1_q<bits>.
// Operands are converted to the untyped carrier, the intrinsic for the
// element width is called, and the single-vector result is converted back.

// 64-bit vector forms (__n64 carrier)
#define vtrn1_s8(a, b) __n64_to_int8x8_t(neon_trn1_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vtrn1_s16(a, b) __n64_to_int16x4_t(neon_trn1_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vtrn1_s32(a, b) __n64_to_int32x2_t(neon_trn1_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vtrn1_u8(a, b) __n64_to_uint8x8_t(neon_trn1_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vtrn1_u16(a, b) __n64_to_uint16x4_t(neon_trn1_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vtrn1_u32(a, b) __n64_to_uint32x2_t(neon_trn1_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vtrn1_f32(a, b) __n64_to_float32x2_t(neon_trn1_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vtrn1_p8(a, b) __n64_to_poly8x8_t(neon_trn1_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vtrn1_p16(a, b) __n64_to_poly16x4_t(neon_trn1_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// 128-bit vector forms (__n128 carrier)
#define vtrn1q_s8(a, b) __n128_to_int8x16_t(neon_trn1_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vtrn1q_s16(a, b) __n128_to_int16x8_t(neon_trn1_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vtrn1q_s32(a, b) __n128_to_int32x4_t(neon_trn1_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vtrn1q_s64(a, b) __n128_to_int64x2_t(neon_trn1_q64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vtrn1q_u8(a, b) __n128_to_uint8x16_t(neon_trn1_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vtrn1q_u16(a, b) __n128_to_uint16x8_t(neon_trn1_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vtrn1q_u32(a, b) __n128_to_uint32x4_t(neon_trn1_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vtrn1q_u64(a, b) __n128_to_uint64x2_t(neon_trn1_q64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vtrn1q_f32(a, b) __n128_to_float32x4_t(neon_trn1_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vtrn1q_f64(a, b) __n128_to_float64x2_t(neon_trn1_q64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vtrn1q_p8(a, b) __n128_to_poly8x16_t(neon_trn1_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vtrn1q_p16(a, b) __n128_to_poly16x8_t(neon_trn1_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))
#define vtrn1q_p64(a, b) __n128_to_poly64x2_t(neon_trn1_q64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))

// vtrn2 / vtrn2q: typed wrappers over neon_trn2_<bits> / neon_trn2_q<bits>.
// Operands are converted to the untyped carrier, the intrinsic for the
// element width is called, and the single-vector result is converted back.

// 64-bit vector forms (__n64 carrier)
#define vtrn2_s8(a, b) __n64_to_int8x8_t(neon_trn2_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b)))
#define vtrn2_s16(a, b) __n64_to_int16x4_t(neon_trn2_16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b)))
#define vtrn2_s32(a, b) __n64_to_int32x2_t(neon_trn2_32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b)))
#define vtrn2_u8(a, b) __n64_to_uint8x8_t(neon_trn2_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b)))
#define vtrn2_u16(a, b) __n64_to_uint16x4_t(neon_trn2_16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b)))
#define vtrn2_u32(a, b) __n64_to_uint32x2_t(neon_trn2_32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b)))
#define vtrn2_f32(a, b) __n64_to_float32x2_t(neon_trn2_32(__float32x2_t_to_n64(a), __float32x2_t_to_n64(b)))
#define vtrn2_p8(a, b) __n64_to_poly8x8_t(neon_trn2_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b)))
#define vtrn2_p16(a, b) __n64_to_poly16x4_t(neon_trn2_16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b)))
// 128-bit vector forms (__n128 carrier)
#define vtrn2q_s8(a, b) __n128_to_int8x16_t(neon_trn2_q8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b)))
#define vtrn2q_s16(a, b) __n128_to_int16x8_t(neon_trn2_q16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b)))
#define vtrn2q_s32(a, b) __n128_to_int32x4_t(neon_trn2_q32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b)))
#define vtrn2q_s64(a, b) __n128_to_int64x2_t(neon_trn2_q64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b)))
#define vtrn2q_u8(a, b) __n128_to_uint8x16_t(neon_trn2_q8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b)))
#define vtrn2q_u16(a, b) __n128_to_uint16x8_t(neon_trn2_q16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b)))
#define vtrn2q_u32(a, b) __n128_to_uint32x4_t(neon_trn2_q32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vtrn2q_u64(a, b) __n128_to_uint64x2_t(neon_trn2_q64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vtrn2q_f32(a, b) __n128_to_float32x4_t(neon_trn2_q32(__float32x4_t_to_n128(a), __float32x4_t_to_n128(b)))
#define vtrn2q_f64(a, b) __n128_to_float64x2_t(neon_trn2_q64(__float64x2_t_to_n128(a), __float64x2_t_to_n128(b)))
#define vtrn2q_p8(a, b) __n128_to_poly8x16_t(neon_trn2_q8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b)))
#define vtrn2q_p16(a, b) __n128_to_poly16x8_t(neon_trn2_q16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b)))
#define vtrn2q_p64(a, b) __n128_to_poly64x2_t(neon_trn2_q64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b)))

// FRINTA/FRINTI/FRINTM/FRINTN/FRINTP/FRINTX/FRINTZ
// One prototype per (rounding variant, element width, vector size):
// the _16/_32/_64 forms take and return __n64, the _q16/_q32/_q64 forms
// take and return __n128. neon_frintns_f32 is the only form declared
// directly on `float`.
__n64 neon_frinta_16(__n64);
__n64 neon_frinta_32(__n64);
__n64 neon_frinta_64(__n64);
__n128 neon_frinta_q16(__n128);
__n128 neon_frinta_q32(__n128);
__n128 neon_frinta_q64(__n128);
__n64 neon_frinti_16(__n64);
__n64 neon_frinti_32(__n64);
__n64 neon_frinti_64(__n64);
__n128 neon_frinti_q16(__n128);
__n128 neon_frinti_q32(__n128);
__n128 neon_frinti_q64(__n128);
__n64 neon_frintm_16(__n64);
__n64 neon_frintm_32(__n64);
__n64 neon_frintm_64(__n64);
__n128 neon_frintm_q16(__n128);
__n128 neon_frintm_q32(__n128);
__n128 neon_frintm_q64(__n128);
// neon_frintn_q16 is declared exactly once, in sequence with the other q forms.
__n64 neon_frintn_16(__n64);
__n64 neon_frintn_32(__n64);
__n64 neon_frintn_64(__n64);
__n128 neon_frintn_q16(__n128);
__n128 neon_frintn_q32(__n128);
__n128 neon_frintn_q64(__n128);
__n64 neon_frintp_16(__n64);
__n64 neon_frintp_32(__n64);
__n64 neon_frintp_64(__n64);
__n128 neon_frintp_q16(__n128);
__n128 neon_frintp_q32(__n128);
__n128 neon_frintp_q64(__n128);
__n64 neon_frintx_16(__n64);
__n64 neon_frintx_32(__n64);
__n64 neon_frintx_64(__n64);
__n128 neon_frintx_q16(__n128);
__n128 neon_frintx_q32(__n128);
__n128 neon_frintx_q64(__n128);
__n64 neon_frintz_16(__n64);
__n64 neon_frintz_32(__n64);
__n64 neon_frintz_64(__n64);
__n128 neon_frintz_q16(__n128);
__n128 neon_frintz_q32(__n128);
__n128 neon_frintz_q64(__n128);
float neon_frintns_f32(float);
// vrnd* family: typed wrappers over the neon_frint<variant>_* intrinsics.
// Each macro converts its operand to the untyped carrier, calls the
// intrinsic for the element width, and converts the result back.

// 64-bit vector forms (__n64 carrier): frinti/a/m/n/p/x
#define vrndi_f32(a) __n64_to_float32x2_t(neon_frinti_32(__float32x2_t_to_n64(a)))
#define vrndi_f64(a) __n64_to_float64x1_t(neon_frinti_64(__float64x1_t_to_n64(a)))
#define vrnda_f32(a) __n64_to_float32x2_t(neon_frinta_32(__float32x2_t_to_n64(a)))
#define vrnda_f64(a) __n64_to_float64x1_t(neon_frinta_64(__float64x1_t_to_n64(a)))
#define vrndm_f32(a) __n64_to_float32x2_t(neon_frintm_32(__float32x2_t_to_n64(a)))
#define vrndm_f64(a) __n64_to_float64x1_t(neon_frintm_64(__float64x1_t_to_n64(a)))
#define vrndn_f32(a) __n64_to_float32x2_t(neon_frintn_32(__float32x2_t_to_n64(a)))
#define vrndn_f64(a) __n64_to_float64x1_t(neon_frintn_64(__float64x1_t_to_n64(a)))
#define vrndp_f32(a) __n64_to_float32x2_t(neon_frintp_32(__float32x2_t_to_n64(a)))
#define vrndp_f64(a) __n64_to_float64x1_t(neon_frintp_64(__float64x1_t_to_n64(a)))
#define vrndx_f32(a) __n64_to_float32x2_t(neon_frintx_32(__float32x2_t_to_n64(a)))
#define vrndx_f64(a) __n64_to_float64x1_t(neon_frintx_64(__float64x1_t_to_n64(a)))
// 128-bit vector forms (__n128 carrier): frinti/a/m/n/p/x
#define vrndiq_f32(a) __n128_to_float32x4_t(neon_frinti_q32(__float32x4_t_to_n128(a)))
#define vrndiq_f64(a) __n128_to_float64x2_t(neon_frinti_q64(__float64x2_t_to_n128(a)))
#define vrndaq_f32(a) __n128_to_float32x4_t(neon_frinta_q32(__float32x4_t_to_n128(a)))
#define vrndaq_f64(a) __n128_to_float64x2_t(neon_frinta_q64(__float64x2_t_to_n128(a)))
#define vrndmq_f32(a) __n128_to_float32x4_t(neon_frintm_q32(__float32x4_t_to_n128(a)))
#define vrndmq_f64(a) __n128_to_float64x2_t(neon_frintm_q64(__float64x2_t_to_n128(a)))
#define vrndnq_f32(a) __n128_to_float32x4_t(neon_frintn_q32(__float32x4_t_to_n128(a)))
#define vrndnq_f64(a) __n128_to_float64x2_t(neon_frintn_q64(__float64x2_t_to_n128(a)))
#define vrndpq_f32(a) __n128_to_float32x4_t(neon_frintp_q32(__float32x4_t_to_n128(a)))
#define vrndpq_f64(a) __n128_to_float64x2_t(neon_frintp_q64(__float64x2_t_to_n128(a)))
#define vrndxq_f32(a) __n128_to_float32x4_t(neon_frintx_q32(__float32x4_t_to_n128(a)))
#define vrndxq_f64(a) __n128_to_float64x2_t(neon_frintx_q64(__float64x2_t_to_n128(a)))
// Unsuffixed vrnd / vrndq map to the frintz intrinsics
#define vrnd_f32(a) __n64_to_float32x2_t(neon_frintz_32(__float32x2_t_to_n64(a)))
#define vrnd_f64(a) __n64_to_float64x1_t(neon_frintz_64(__float64x1_t_to_n64(a)))
#define vrndq_f32(a) __n128_to_float32x4_t(neon_frintz_q32(__float32x4_t_to_n128(a)))
#define vrndq_f64(a) __n128_to_float64x2_t(neon_frintz_q64(__float64x2_t_to_n128(a)))
// Scalar form: forwards straight to neon_frintns_f32, no conversion layer
#define vrndns_f32(a) neon_frintns_f32(a)

// SHA1C/SHA1M/SHA1P/SHA256H2/SHA256H/SHA1SU0/SHA256SU1/SHA1SU1/SHA256SU0/SHA1H/SHA512H/SHA512H2/SHA512SU0/SHA512SU1
//
// Untyped prototypes for the SHA intrinsics; vector operands and results use __n128.
// The "ui" variants (sha1cui/sha1mui/sha1pui/sha1hui) take an `unsigned __int32`
// where the corresponding non-"ui" prototype takes an __n128.

// SHA1C / SHA1M / SHA1P: three operands, all-vector and "ui" forms
__n128 neon_sha1c(__n128, __n128, __n128);
__n128 neon_sha1cui(__n128, unsigned __int32, __n128);
__n128 neon_sha1m(__n128, __n128, __n128);
__n128 neon_sha1mui(__n128, unsigned __int32, __n128);
__n128 neon_sha1p(__n128, __n128, __n128);
__n128 neon_sha1pui(__n128, unsigned __int32, __n128);
// Three-operand vector forms
__n128 neon_sha256h2(__n128, __n128, __n128);
__n128 neon_sha256h(__n128, __n128, __n128);
__n128 neon_sha1su0(__n128, __n128, __n128);
__n128 neon_sha256su1(__n128, __n128, __n128);
// Two-operand vector forms
__n128 neon_sha1su1(__n128, __n128);
__n128 neon_sha256su0(__n128, __n128);
// SHA1H: single operand, vector and 32-bit integer forms
__n128 neon_sha1h(__n128);
unsigned __int32  neon_sha1hui(unsigned __int32);
// SHA512 forms
__n128 neon_sha512h(__n128, __n128, __n128);
__n128 neon_sha512h2(__n128, __n128, __n128);
__n128 neon_sha512su0(__n128, __n128);
__n128 neon_sha512su1(__n128, __n128, __n128);
// SHA typed wrappers. Vector operands are converted between uint32x4_t /
// uint64x2_t and __n128 around the intrinsic call; non-vector operands are
// passed through unchanged (parenthesized).

// SHA1 C/P/M use the "ui" intrinsics: the second operand is forwarded as-is.
#define vsha1cq_u32(a, b, c) __n128_to_uint32x4_t(neon_sha1cui(__uint32x4_t_to_n128(a), (b), __uint32x4_t_to_n128(c)))
#define vsha1pq_u32(a, b, c) __n128_to_uint32x4_t(neon_sha1pui(__uint32x4_t_to_n128(a), (b), __uint32x4_t_to_n128(c)))
#define vsha1mq_u32(a, b, c) __n128_to_uint32x4_t(neon_sha1mui(__uint32x4_t_to_n128(a), (b), __uint32x4_t_to_n128(c)))
// Two-operand forms
#define vsha1su1q_u32(a, b) __n128_to_uint32x4_t(neon_sha1su1(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vsha256su0q_u32(a, b) __n128_to_uint32x4_t(neon_sha256su0(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
// Three-operand all-vector forms
#define vsha1su0q_u32(a, b, c) __n128_to_uint32x4_t(neon_sha1su0(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vsha256hq_u32(a, b, c) __n128_to_uint32x4_t(neon_sha256h(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vsha256h2q_u32(a, b, c) __n128_to_uint32x4_t(neon_sha256h2(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vsha256su1q_u32(a, b, c) __n128_to_uint32x4_t(neon_sha256su1(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
// Integer form: forwards straight to neon_sha1hui
#define vsha1h_u32(a) neon_sha1hui(a)
// SHA512 forms on uint64x2_t
#define vsha512hq_u64(a, b, c) __n128_to_uint64x2_t(neon_sha512h(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), __uint64x2_t_to_n128(c)))
#define vsha512h2q_u64(a, b, c) __n128_to_uint64x2_t(neon_sha512h2(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), __uint64x2_t_to_n128(c)))
#define vsha512su0q_u64(a, b) __n128_to_uint64x2_t(neon_sha512su0(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vsha512su1q_u64(a, b, c) __n128_to_uint64x2_t(neon_sha512su1(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), __uint64x2_t_to_n128(c)))

// EOR3/RAX1/XAR/BCAX
//
// Untyped prototypes on __n128. neon_eor3q and neon_bcaxq take three vector
// operands, neon_rax1q takes two, and neon_xarq takes two vectors plus a
// `const int` (presumably the instruction's immediate - confirm valid range
// against the compiler documentation).
__n128 neon_eor3q(__n128, __n128, __n128);
__n128 neon_rax1q(__n128, __n128);
__n128 neon_xarq(__n128, __n128, const int);
__n128 neon_bcaxq(__n128, __n128, __n128);
// Typed wrappers for neon_eor3q / neon_rax1q / neon_xarq / neon_bcaxq.
// The same untyped intrinsic backs every element type of a family; only the
// conversion macros to and from __n128 differ.

// veor3q: three vector operands -> neon_eor3q
#define veor3q_u8(a, b, c) __n128_to_uint8x16_t(neon_eor3q(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b), __uint8x16_t_to_n128(c)))
#define veor3q_u16(a, b, c) __n128_to_uint16x8_t(neon_eor3q(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b), __uint16x8_t_to_n128(c)))
#define veor3q_u32(a, b, c) __n128_to_uint32x4_t(neon_eor3q(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define veor3q_u64(a, b, c) __n128_to_uint64x2_t(neon_eor3q(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), __uint64x2_t_to_n128(c)))
#define veor3q_s8(a, b, c) __n128_to_int8x16_t(neon_eor3q(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b), __int8x16_t_to_n128(c)))
#define veor3q_s16(a, b, c) __n128_to_int16x8_t(neon_eor3q(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b), __int16x8_t_to_n128(c)))
#define veor3q_s32(a, b, c) __n128_to_int32x4_t(neon_eor3q(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b), __int32x4_t_to_n128(c)))
#define veor3q_s64(a, b, c) __n128_to_int64x2_t(neon_eor3q(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b), __int64x2_t_to_n128(c)))
// vrax1q / vxarq: uint64x2_t only; vxarq forwards its third argument unchanged
#define vrax1q_u64(a, b) __n128_to_uint64x2_t(neon_rax1q(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b)))
#define vxarq_u64(a, b, imm) __n128_to_uint64x2_t(neon_xarq(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), (imm)))
// vbcaxq: three vector operands -> neon_bcaxq
#define vbcaxq_u8(a, b, c) __n128_to_uint8x16_t(neon_bcaxq(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b), __uint8x16_t_to_n128(c)))
#define vbcaxq_u16(a, b, c) __n128_to_uint16x8_t(neon_bcaxq(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b), __uint16x8_t_to_n128(c)))
#define vbcaxq_u32(a, b, c) __n128_to_uint32x4_t(neon_bcaxq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vbcaxq_u64(a, b, c) __n128_to_uint64x2_t(neon_bcaxq(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), __uint64x2_t_to_n128(c)))
#define vbcaxq_s8(a, b, c) __n128_to_int8x16_t(neon_bcaxq(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b), __int8x16_t_to_n128(c)))
#define vbcaxq_s16(a, b, c) __n128_to_int16x8_t(neon_bcaxq(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b), __int16x8_t_to_n128(c)))
#define vbcaxq_s32(a, b, c) __n128_to_int32x4_t(neon_bcaxq(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b), __int32x4_t_to_n128(c)))
#define vbcaxq_s64(a, b, c) __n128_to_int64x2_t(neon_bcaxq(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b), __int64x2_t_to_n128(c)))

// SM3/SM4
//
// Untyped prototypes on __n128. The four neon_sm3tt* prototypes take three
// vectors plus a trailing `const int` (presumably an immediate selector -
// confirm valid range against the compiler documentation); the other SM3
// prototypes take three vectors and the SM4 prototypes take two.
__n128 neon_sm3ss1q(__n128, __n128, __n128);
__n128 neon_sm3tt1aq(__n128, __n128, __n128, const int);
__n128 neon_sm3tt1bq(__n128, __n128, __n128, const int);
__n128 neon_sm3tt2aq(__n128, __n128, __n128, const int);
__n128 neon_sm3tt2bq(__n128, __n128, __n128, const int);
__n128 neon_sm3partw1q(__n128, __n128, __n128);
__n128 neon_sm3partw2q(__n128, __n128, __n128);
__n128 neon_sm4eq(__n128, __n128);
__n128 neon_sm4ekeyq(__n128, __n128);
// SM3/SM4 typed wrappers on uint32x4_t: vector operands are converted to
// __n128 for the intrinsic call and the result is converted back.
#define vsm3ss1q_u32(a, b, c) __n128_to_uint32x4_t(neon_sm3ss1q(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
// The vsm3tt* forms forward their fourth argument unchanged (parenthesized).
#define vsm3tt1aq_u32(a, b, c, imm) __n128_to_uint32x4_t(neon_sm3tt1aq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c), (imm)))
#define vsm3tt1bq_u32(a, b, c, imm) __n128_to_uint32x4_t(neon_sm3tt1bq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c), (imm)))
#define vsm3tt2aq_u32(a, b, c, imm) __n128_to_uint32x4_t(neon_sm3tt2aq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c), (imm)))
#define vsm3tt2bq_u32(a, b, c, imm) __n128_to_uint32x4_t(neon_sm3tt2bq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c), (imm)))
#define vsm3partw1q_u32(a, b, c) __n128_to_uint32x4_t(neon_sm3partw1q(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vsm3partw2q_u32(a, b, c) __n128_to_uint32x4_t(neon_sm3partw2q(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), __uint32x4_t_to_n128(c)))
#define vsm4eq_u32(a, b) __n128_to_uint32x4_t(neon_sm4eq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))
#define vsm4ekeyq_u32(a, b) __n128_to_uint32x4_t(neon_sm4ekeyq(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b)))

// SMMLA/SUDOT/UMMLA/USMMLA/USDOT AArch64 Int8 matrix multiplication instructions (FEAT_I8MM)
//
// These are declared directly as prototypes on the typed vector types, with
// no neon_* intrinsic / conversion-macro layer. In every prototype the return
// type matches the type of the first parameter (__r). The *_lane / *_laneq
// forms add a trailing `const int __lane` (presumably a constant lane index
// into __b - confirm the valid range against the ACLE).

// SMMLA
// Both 8-bit operands are signed.
int32x4_t vmmlaq_s32(int32x4_t __r, int8x16_t __a, int8x16_t __b);

// SUDOT
// __a is signed 8-bit, __b is unsigned 8-bit; only lane-indexed forms are declared here.
int32x2_t vsudot_lane_s32(int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __lane);
int32x2_t vsudot_laneq_s32(int32x2_t __r, int8x8_t __a, uint8x16_t __b, const int __lane);
int32x4_t vsudotq_lane_s32(int32x4_t __r, int8x16_t __a, uint8x8_t __b, const int __lane);
int32x4_t vsudotq_laneq_s32(int32x4_t __r, int8x16_t __a, uint8x16_t __b, const int __lane);

// UMMLA
// Both 8-bit operands are unsigned.
uint32x4_t vmmlaq_u32(uint32x4_t __r, uint8x16_t __a, uint8x16_t __b);

// USMMLA
// __a is unsigned 8-bit, __b is signed 8-bit.
int32x4_t vusmmlaq_s32(int32x4_t __r, uint8x16_t __a, int8x16_t __b);

// USDOT
// __a is unsigned 8-bit, __b is signed 8-bit; plain, _lane and _laneq forms.
int32x2_t vusdot_s32(int32x2_t __r, uint8x8_t __a, int8x8_t __b);
int32x2_t vusdot_lane_s32(int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __lane);
int32x2_t vusdot_laneq_s32(int32x2_t __r, uint8x8_t __a, int8x16_t __b, const int __lane);
int32x4_t vusdotq_s32(int32x4_t __r, uint8x16_t __a, int8x16_t __b);
int32x4_t vusdotq_lane_s32(int32x4_t __r, uint8x16_t __a, int8x8_t __b, const int __lane);
int32x4_t vusdotq_laneq_s32(int32x4_t __r, uint8x16_t __a, int8x16_t __b, const int __lane);

// SRI/SRSHR/SSHR/SSRA/USHR/URSRA/USRA/URSHR/SRSRA/SHL/SLI/SQSHLU/SQSHL/UQSHL/SQRSHL/URSHL/SRSHL/USHL/UQRSHL/SSHL
//
// Naming pattern visible in the declarations below:
//   neon_<op>i<bits>   - __n64 vector operand(s), <bits> = 8/16/32
//   neon_<op>iq<bits>  - __n128 vector operand(s), <bits> = 8/16/32/64
//   neon_<op>is64      - __n64 operand(s) with an "s64" suffix
//                        (presumably the scalar 64-bit form - confirm)
// The trailing `const int` is presumably the immediate shift amount; confirm
// the valid range per element width against the compiler documentation.

// SRI: two vector operands + const int
__n64  neon_srii8  (__n64,  __n64,  const int);
__n128 neon_sriiq8 (__n128, __n128, const int);
__n64  neon_srii16 (__n64,  __n64,  const int);
__n128 neon_sriiq16(__n128, __n128, const int);
__n64  neon_srii32 (__n64,  __n64,  const int);
__n128 neon_sriiq32(__n128, __n128, const int);
__n128 neon_sriiq64(__n128, __n128, const int);
__n64  neon_sriis64(__n64,  __n64,  const int);
// SRSHR: one vector operand + const int
__n64  neon_srshri8  (__n64,  const int);
__n128 neon_srshriq8 (__n128, const int);
__n64  neon_srshri16 (__n64,  const int);
__n128 neon_srshriq16(__n128, const int);
__n64  neon_srshri32 (__n64,  const int);
__n128 neon_srshriq32(__n128, const int);
__n128 neon_srshriq64(__n128, const int);
__n64  neon_srshris64(__n64,  const int);
// SSHR: one vector operand + const int
__n64  neon_sshri8  (__n64,  const int);
__n128 neon_sshriq8 (__n128, const int);
__n64  neon_sshri16 (__n64,  const int);
__n128 neon_sshriq16(__n128, const int);
__n64  neon_sshri32 (__n64,  const int);
__n128 neon_sshriq32(__n128, const int);
__n128 neon_sshriq64(__n128, const int);
__n64  neon_sshris64(__n64,  const int);
// SSRA: two vector operands + const int
__n64  neon_ssrai8  (__n64,  __n64,  const int);
__n128 neon_ssraiq8 (__n128, __n128, const int);
__n64  neon_ssrai16 (__n64,  __n64,  const int);
__n128 neon_ssraiq16(__n128, __n128, const int);
__n64  neon_ssrai32 (__n64,  __n64,  const int);
__n128 neon_ssraiq32(__n128, __n128, const int);
__n128 neon_ssraiq64(__n128, __n128, const int);
__n64  neon_ssrais64(__n64,  __n64,  const int);
// USHR: one vector operand + const int
__n64  neon_ushri8  (__n64,  const int);
__n128 neon_ushriq8 (__n128, const int);
__n64  neon_ushri16 (__n64,  const int);
__n128 neon_ushriq16(__n128, const int);
__n64  neon_ushri32 (__n64,  const int);
__n128 neon_ushriq32(__n128, const int);
__n128 neon_ushriq64(__n128, const int);
__n64  neon_ushris64(__n64,  const int);
// URSRA: two vector operands + const int
__n64  neon_ursrai8  (__n64,  __n64,  const int);
__n128 neon_ursraiq8 (__n128, __n128, const int);
__n64  neon_ursrai16 (__n64,  __n64,  const int);
__n128 neon_ursraiq16(__n128, __n128, const int);
__n64  neon_ursrai32 (__n64,  __n64,  const int);
__n128 neon_ursraiq32(__n128, __n128, const int);
__n128 neon_ursraiq64(__n128, __n128, const int);
__n64  neon_ursrais64(__n64,  __n64,  const int);
// USRA: two vector operands + const int
__n64  neon_usrai8  (__n64,  __n64,  const int);
__n128 neon_usraiq8 (__n128, __n128, const int);
__n64  neon_usrai16 (__n64,  __n64,  const int);
__n128 neon_usraiq16(__n128, __n128, const int);
__n64  neon_usrai32 (__n64,  __n64,  const int);
__n128 neon_usraiq32(__n128, __n128, const int);
__n128 neon_usraiq64(__n128, __n128, const int);
__n64  neon_usrais64(__n64,  __n64,  const int);
// URSHR: one vector operand + const int
__n64  neon_urshri8  (__n64,  const int);
__n128 neon_urshriq8 (__n128, const int);
__n64  neon_urshri16 (__n64,  const int);
__n128 neon_urshriq16(__n128, const int);
__n64  neon_urshri32 (__n64,  const int);
__n128 neon_urshriq32(__n128, const int);
__n128 neon_urshriq64(__n128, const int);
__n64  neon_urshris64(__n64,  const int);
// SRSRA: two vector operands + const int
__n64  neon_srsrai8  (__n64,  __n64,  const int);
__n128 neon_srsraiq8 (__n128, __n128, const int);
__n64  neon_srsrai16 (__n64,  __n64,  const int);
__n128 neon_srsraiq16(__n128, __n128, const int);
__n64  neon_srsrai32 (__n64,  __n64,  const int);
__n128 neon_srsraiq32(__n128, __n128, const int);
__n128 neon_srsraiq64(__n128, __n128, const int);
__n64  neon_srsrais64(__n64,  __n64,  const int);
// SHL (immediate): one vector operand + const int.
// __n64 forms for 8/16/32-bit elements, __n128 ("q") forms for 8/16/32/64,
// plus an "s64" form on __n64 (presumably the scalar 64-bit form - confirm).
__n64  neon_shli8  (__n64,  const int);
__n128 neon_shliq8 (__n128, const int);
__n64  neon_shli16 (__n64,  const int);
__n128 neon_shliq16(__n128, const int);
__n64  neon_shli32 (__n64,  const int);
__n128 neon_shliq32(__n128, const int);
__n128 neon_shliq64(__n128, const int);
__n64  neon_shlis64(__n64,  const int);
// SLI: same set of forms as SHL above, but with two vector operands + const int.
__n64  neon_slii8  (__n64,  __n64,  const int);
__n128 neon_sliiq8 (__n128, __n128, const int);
__n64  neon_slii16 (__n64,  __n64,  const int);
__n128 neon_sliiq16(__n128, __n128, const int);
__n64  neon_slii32 (__n64,  __n64,  const int);
__n128 neon_sliiq32(__n128, __n128, const int);
__n128 neon_sliiq64(__n128, __n128, const int);
__n64  neon_sliis64(__n64,  __n64,  const int);
// SQSHLU (immediate), vector forms: one vector operand + const int, declared
// for 8/16/32/64-bit elements on both __n64 and __n128 ("q").
__n64  neon_sqshlui8  (__n64,  const int);
__n128 neon_sqshluiq8 (__n128, const int);
__n64  neon_sqshlui16 (__n64,  const int);
__n128 neon_sqshluiq16(__n128, const int);
__n64  neon_sqshlui32 (__n64,  const int);
__n128 neon_sqshluiq32(__n128, const int);
__n64  neon_sqshlui64 (__n64,  const int);
__n128 neon_sqshluiq64(__n128, const int);
// SQSHLU "s<bits>" forms (presumably scalar - confirm): the operand/return
// carrier is __n8, __n16, float and __n64 for 8, 16, 32 and 64 bits. Note the
// 32-bit form is declared on `float`, not on an integer type.
__n8   neon_sqshluis8(__n8,  const int);
__n16  neon_sqshluis16(__n16, const int);
float  neon_sqshluis32(float, const int);
__n64  neon_sqshluis64(__n64, const int);
// neon_sqshl*: prototypes behind the signed vqshl* wrapper macros below, in
// four shapes:
//   neon_sqshli*  - vector, const int immediate   (vqshl_n_s*, vqshlq_n_s*)
//   neon_sqshl*   - vector, second vector operand (vqshl_s*, vqshlq_s*)
//   neon_sqshlis* - scalar, const int immediate   (vqshl{b,h,s,d}_n_s*)
//   neon_sqshls*  - scalar, second scalar operand (vqshl{b,h,s,d}_s*)
// Scalar operands travel in __n8, __n16, float (32-bit) or __n64.
// NOTE(review): naming suggests A64 SQSHL (signed saturating shift left) -
// confirm against the Arm ISA reference.
__n64  neon_sqshli8  (__n64,  const int);
__n128 neon_sqshliq8 (__n128, const int);
__n64  neon_sqshli16 (__n64,  const int);
__n128 neon_sqshliq16(__n128, const int);
__n64  neon_sqshli32 (__n64,  const int);
__n128 neon_sqshliq32(__n128, const int);
__n64  neon_sqshli64 (__n64,  const int);
__n128 neon_sqshliq64(__n128, const int);
__n64  neon_sqshl8  (__n64,  __n64);
__n128 neon_sqshlq8 (__n128, __n128);
__n64  neon_sqshl16 (__n64,  __n64);
__n128 neon_sqshlq16(__n128, __n128);
__n64  neon_sqshl32 (__n64,  __n64);
__n128 neon_sqshlq32(__n128, __n128);
__n64  neon_sqshl64 (__n64,  __n64);
__n128 neon_sqshlq64(__n128, __n128);
__n8   neon_sqshlis8(__n8,  const int);
__n16  neon_sqshlis16(__n16, const int);
float  neon_sqshlis32(float, const int);
__n64  neon_sqshlis64(__n64, const int);
__n8   neon_sqshls8(__n8,  __n8);
__n16  neon_sqshls16(__n16, __n16);
float  neon_sqshls32(float, float);
__n64  neon_sqshls64(__n64, __n64);
// neon_uqshl*: prototypes behind the unsigned vqshl* wrapper macros below, in
// the same four shapes as the signed set:
//   neon_uqshli*  - vector, const int immediate   (vqshl_n_u*, vqshlq_n_u*)
//   neon_uqshl*   - vector, second vector operand (vqshl_u*, vqshlq_u*)
//   neon_uqshlis* - scalar, const int immediate   (vqshl{b,h,s,d}_n_u*)
//   neon_uqshls*  - scalar, second scalar operand (vqshl{b,h,s,d}_u*)
// Scalar operands travel in __n8, __n16, float (32-bit) or __n64.
// NOTE(review): naming suggests A64 UQSHL (unsigned saturating shift left) -
// confirm against the Arm ISA reference.
__n64  neon_uqshli8  (__n64,  const int);
__n128 neon_uqshliq8 (__n128, const int);
__n64  neon_uqshli16 (__n64,  const int);
__n128 neon_uqshliq16(__n128, const int);
__n64  neon_uqshli32 (__n64,  const int);
__n128 neon_uqshliq32(__n128, const int);
__n64  neon_uqshli64 (__n64,  const int);
__n128 neon_uqshliq64(__n128, const int);
__n64  neon_uqshl8  (__n64,  __n64);
__n128 neon_uqshlq8 (__n128, __n128);
__n64  neon_uqshl16 (__n64,  __n64);
__n128 neon_uqshlq16(__n128, __n128);
__n64  neon_uqshl32 (__n64,  __n64);
__n128 neon_uqshlq32(__n128, __n128);
__n64  neon_uqshl64 (__n64,  __n64);
__n128 neon_uqshlq64(__n128, __n128);
__n8   neon_uqshlis8(__n8,  const int);
__n16  neon_uqshlis16(__n16, const int);
float  neon_uqshlis32(float, const int);
__n64  neon_uqshlis64(__n64, const int);
__n8   neon_uqshls8(__n8,  __n8);
__n16  neon_uqshls16(__n16, __n16);
float  neon_uqshls32(float, float);
__n64  neon_uqshls64(__n64, __n64);
// neon_sqrshl*: prototypes wrapped by the vqrshl_s*, vqrshlq_s* and scalar
// vqrshl{b,h,s,d}_s* macros below. Both operands have the same width; scalar
// forms (neon_sqrshls*) use __n8, __n16, float (32-bit) or __n64.
// NOTE(review): naming suggests A64 SQRSHL (signed saturating rounding shift
// left, register) - confirm against the Arm ISA reference.
__n64  neon_sqrshl8  (__n64,  __n64);
__n128 neon_sqrshlq8 (__n128, __n128);
__n64  neon_sqrshl16 (__n64,  __n64);
__n128 neon_sqrshlq16(__n128, __n128);
__n64  neon_sqrshl32 (__n64,  __n64);
__n128 neon_sqrshlq32(__n128, __n128);
__n64  neon_sqrshl64 (__n64,  __n64);
__n128 neon_sqrshlq64(__n128, __n128);
__n8   neon_sqrshls8(__n8,  __n8);
__n16  neon_sqrshls16(__n16, __n16);
float  neon_sqrshls32(float, float);
__n64  neon_sqrshls64(__n64, __n64);
// neon_urshl*: prototypes wrapped by the vrshl_u*, vrshlq_u* and vrshld_u64
// macros below. Both operands have the same width; neon_urshls64 is the
// scalar 64-bit form used by vrshld_u64.
// NOTE(review): naming suggests A64 URSHL (unsigned rounding shift left,
// register) - confirm against the Arm ISA reference.
__n64  neon_urshl8  (__n64,  __n64);
__n128 neon_urshlq8 (__n128, __n128);
__n64  neon_urshl16 (__n64,  __n64);
__n128 neon_urshlq16(__n128, __n128);
__n64  neon_urshl32 (__n64,  __n64);
__n128 neon_urshlq32(__n128, __n128);
__n64  neon_urshl64 (__n64,  __n64);
__n128 neon_urshlq64(__n128, __n128);
__n64  neon_urshls64(__n64, __n64);
// neon_srshl*: prototypes wrapped by the vrshl_s*, vrshlq_s* and vrshld_s64
// macros below. Both operands have the same width; neon_srshls64 is the
// scalar 64-bit form used by vrshld_s64.
// NOTE(review): naming suggests A64 SRSHL (signed rounding shift left,
// register) - confirm against the Arm ISA reference.
__n64  neon_srshl8  (__n64,  __n64);
__n128 neon_srshlq8 (__n128, __n128);
__n64  neon_srshl16 (__n64,  __n64);
__n128 neon_srshlq16(__n128, __n128);
__n64  neon_srshl32 (__n64,  __n64);
__n128 neon_srshlq32(__n128, __n128);
__n64  neon_srshl64 (__n64,  __n64);
__n128 neon_srshlq64(__n128, __n128);
__n64  neon_srshls64(__n64, __n64);
// neon_ushl*: prototypes wrapped by the vshl_u*, vshlq_u* and vshld_u64
// macros below. There is no separate 64-bit __n64 vector entry here:
// neon_ushls64 serves both vshl_u64 and vshld_u64.
// NOTE(review): naming suggests A64 USHL (unsigned shift left, register) -
// confirm against the Arm ISA reference.
__n64  neon_ushl8  (__n64,  __n64);
__n128 neon_ushlq8 (__n128, __n128);
__n64  neon_ushl16 (__n64,  __n64);
__n128 neon_ushlq16(__n128, __n128);
__n64  neon_ushl32 (__n64,  __n64);
__n128 neon_ushlq32(__n128, __n128);
__n128 neon_ushlq64(__n128, __n128);
__n64  neon_ushls64(__n64, __n64);
// neon_uqrshl*: prototypes wrapped by the vqrshl_u*, vqrshlq_u* and scalar
// vqrshl{b,h,s,d}_u* macros below. Both operands have the same width; scalar
// forms (neon_uqrshls*) use __n8, __n16, float (32-bit) or __n64.
// NOTE(review): naming suggests A64 UQRSHL (unsigned saturating rounding
// shift left, register) - confirm against the Arm ISA reference.
__n64  neon_uqrshl8  (__n64,  __n64);
__n128 neon_uqrshlq8 (__n128, __n128);
__n64  neon_uqrshl16 (__n64,  __n64);
__n128 neon_uqrshlq16(__n128, __n128);
__n64  neon_uqrshl32 (__n64,  __n64);
__n128 neon_uqrshlq32(__n128, __n128);
__n64  neon_uqrshl64 (__n64,  __n64);
__n128 neon_uqrshlq64(__n128, __n128);
__n8   neon_uqrshls8(__n8, __n8);
__n16  neon_uqrshls16(__n16, __n16);
float  neon_uqrshls32(float, float);
__n64  neon_uqrshls64(__n64, __n64);
// neon_sshl*: prototypes wrapped by the vshl_s*, vshlq_s* and vshld_s64
// macros below. There is no separate 64-bit __n64 vector entry here:
// neon_sshls64 serves both vshl_s64 and vshld_s64.
// NOTE(review): naming suggests A64 SSHL (signed shift left, register) -
// confirm against the Arm ISA reference.
__n64  neon_sshl8  (__n64,  __n64);
__n128 neon_sshlq8 (__n128, __n128);
__n64  neon_sshl16 (__n64,  __n64);
__n128 neon_sshlq16(__n128, __n128);
__n64  neon_sshl32 (__n64,  __n64);
__n128 neon_sshlq32(__n128, __n128);
__n128 neon_sshlq64(__n128, __n128);
__n64  neon_sshls64(__n64, __n64);
// vsri_n_* / vsrid_n_* / vsriq_n_*: typed wrappers over the neon_srii*
// intrinsics.
//   - Vector forms convert src1 and src2 to __n64 (or __n128 for the 'q'
//     forms), pass src3 through as the immediate, and convert the result back
//     to the caller's vector type.
//   - The scalar d-forms box their 64-bit operands with __int64ToN64_v /
//     __uint64ToN64_v and read lane 0 of the returned __n64.
// Signed, unsigned and poly variants of one element width share an intrinsic.
#define vsri_n_p16(src1, src2, src3) __n64_to_poly16x4_t(neon_srii16(__poly16x4_t_to_n64(src1), __poly16x4_t_to_n64(src2), (src3)))
#define vsri_n_p64(src1, src2, src3) __n64_to_poly64x1_t(neon_sriis64(__poly64x1_t_to_n64(src1), __poly64x1_t_to_n64(src2), (src3)))
#define vsri_n_p8(src1, src2, src3) __n64_to_poly8x8_t(neon_srii8(__poly8x8_t_to_n64(src1), __poly8x8_t_to_n64(src2), (src3)))
#define vsri_n_s16(src1, src2, src3) __n64_to_int16x4_t(neon_srii16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (src3)))
#define vsri_n_s32(src1, src2, src3) __n64_to_int32x2_t(neon_srii32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (src3)))
#define vsri_n_s64(src1, src2, src3) __n64_to_int64x1_t(neon_sriis64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2), (src3)))
#define vsri_n_s8(src1, src2, src3) __n64_to_int8x8_t(neon_srii8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), (src3)))
#define vsri_n_u16(src1, src2, src3) __n64_to_uint16x4_t(neon_srii16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), (src3)))
#define vsri_n_u32(src1, src2, src3) __n64_to_uint32x2_t(neon_srii32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), (src3)))
#define vsri_n_u64(src1, src2, src3) __n64_to_uint64x1_t(neon_sriis64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2), (src3)))
#define vsri_n_u8(src1, src2, src3) __n64_to_uint8x8_t(neon_srii8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), (src3)))
#define vsrid_n_s64(src1, src2, src3) neon_sriis64(__int64ToN64_v(src1), __int64ToN64_v(src2), (src3)).n64_i64[0]
#define vsrid_n_u64(src1, src2, src3) neon_sriis64(__uint64ToN64_v(src1), __uint64ToN64_v(src2), (src3)).n64_u64[0]
#define vsriq_n_p16(src1, src2, src3) __n128_to_poly16x8_t(neon_sriiq16(__poly16x8_t_to_n128(src1), __poly16x8_t_to_n128(src2), (src3)))
#define vsriq_n_p64(src1, src2, src3) __n128_to_poly64x2_t(neon_sriiq64(__poly64x2_t_to_n128(src1), __poly64x2_t_to_n128(src2), (src3)))
#define vsriq_n_p8(src1, src2, src3) __n128_to_poly8x16_t(neon_sriiq8(__poly8x16_t_to_n128(src1), __poly8x16_t_to_n128(src2), (src3)))
#define vsriq_n_s16(src1, src2, src3) __n128_to_int16x8_t(neon_sriiq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (src3)))
#define vsriq_n_s32(src1, src2, src3) __n128_to_int32x4_t(neon_sriiq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (src3)))
#define vsriq_n_s64(src1, src2, src3) __n128_to_int64x2_t(neon_sriiq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2), (src3)))
#define vsriq_n_s8(src1, src2, src3) __n128_to_int8x16_t(neon_sriiq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), (src3)))
#define vsriq_n_u16(src1, src2, src3) __n128_to_uint16x8_t(neon_sriiq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), (src3)))
#define vsriq_n_u32(src1, src2, src3) __n128_to_uint32x4_t(neon_sriiq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), (src3)))
#define vsriq_n_u64(src1, src2, src3) __n128_to_uint64x2_t(neon_sriiq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2), (src3)))
#define vsriq_n_u8(src1, src2, src3) __n128_to_uint8x16_t(neon_sriiq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), (src3)))
// vrshr_n_* / vshr_n_* / vshrd_n_*: 64-bit-vector and scalar wrappers taking
// one operand (src1) and an immediate (src2).
//   - vrshr_n_s*/u* call neon_srshri*/neon_urshri*.
//   - vshr_n_s*/u*  call neon_sshri*/neon_ushri*.
//   - vshrd_n_s64/u64 box a 64-bit scalar into an __n64 and read lane 0 of
//     the result.
// Signed and unsigned variants use different intrinsics here.
#define vrshr_n_s16(src1, src2) __n64_to_int16x4_t(neon_srshri16(__int16x4_t_to_n64(src1), (src2)))
#define vrshr_n_s32(src1, src2) __n64_to_int32x2_t(neon_srshri32(__int32x2_t_to_n64(src1), (src2)))
#define vrshr_n_s64(src1, src2) __n64_to_int64x1_t(neon_srshris64(__int64x1_t_to_n64(src1), (src2)))
#define vrshr_n_s8(src1, src2) __n64_to_int8x8_t(neon_srshri8(__int8x8_t_to_n64(src1), (src2)))
#define vrshr_n_u16(src1, src2) __n64_to_uint16x4_t(neon_urshri16(__uint16x4_t_to_n64(src1), (src2)))
#define vrshr_n_u32(src1, src2) __n64_to_uint32x2_t(neon_urshri32(__uint32x2_t_to_n64(src1), (src2)))
#define vrshr_n_u64(src1, src2) __n64_to_uint64x1_t(neon_urshris64(__uint64x1_t_to_n64(src1), (src2)))
#define vrshr_n_u8(src1, src2) __n64_to_uint8x8_t(neon_urshri8(__uint8x8_t_to_n64(src1), (src2)))
#define vshr_n_s16(src1, src2) __n64_to_int16x4_t(neon_sshri16(__int16x4_t_to_n64(src1), (src2)))
#define vshr_n_s32(src1, src2) __n64_to_int32x2_t(neon_sshri32(__int32x2_t_to_n64(src1), (src2)))
#define vshr_n_s64(src1, src2) __n64_to_int64x1_t(neon_sshris64(__int64x1_t_to_n64(src1), (src2)))
#define vshr_n_s8(src1, src2) __n64_to_int8x8_t(neon_sshri8(__int8x8_t_to_n64(src1), (src2)))
#define vshr_n_u16(src1, src2) __n64_to_uint16x4_t(neon_ushri16(__uint16x4_t_to_n64(src1), (src2)))
#define vshr_n_u32(src1, src2) __n64_to_uint32x2_t(neon_ushri32(__uint32x2_t_to_n64(src1), (src2)))
#define vshr_n_u64(src1, src2) __n64_to_uint64x1_t(neon_ushris64(__uint64x1_t_to_n64(src1), (src2)))
#define vshr_n_u8(src1, src2) __n64_to_uint8x8_t(neon_ushri8(__uint8x8_t_to_n64(src1), (src2)))
#define vshrd_n_s64(src1, src2) neon_sshris64(__int64ToN64_v(src1), (src2)).n64_i64[0]
#define vshrd_n_u64(src1, src2) neon_ushris64(__uint64ToN64_v(src1), (src2)).n64_u64[0]
// vrshrq_n_* / vrshrd_n_* / vshrq_n_*: 128-bit-vector wrappers (plus the
// scalar vrshrd_n_* pair) taking one operand (src1) and an immediate (src2).
//   - vrshrq_n_s*/u* call neon_srshriq*/neon_urshriq*.
//   - vshrq_n_s*/u*  call neon_sshriq*/neon_ushriq*.
//   - vrshrd_n_s64/u64 box a 64-bit scalar into an __n64, call
//     neon_srshris64/neon_urshris64, and read lane 0 of the result.
#define vrshrq_n_s16(src1, src2) __n128_to_int16x8_t(neon_srshriq16(__int16x8_t_to_n128(src1), (src2)))
#define vrshrq_n_s32(src1, src2) __n128_to_int32x4_t(neon_srshriq32(__int32x4_t_to_n128(src1), (src2)))
#define vrshrq_n_s64(src1, src2) __n128_to_int64x2_t(neon_srshriq64(__int64x2_t_to_n128(src1), (src2)))
#define vrshrq_n_s8(src1, src2) __n128_to_int8x16_t(neon_srshriq8(__int8x16_t_to_n128(src1), (src2)))
#define vrshrq_n_u16(src1, src2) __n128_to_uint16x8_t(neon_urshriq16(__uint16x8_t_to_n128(src1), (src2)))
#define vrshrq_n_u32(src1, src2) __n128_to_uint32x4_t(neon_urshriq32(__uint32x4_t_to_n128(src1), (src2)))
#define vrshrq_n_u64(src1, src2) __n128_to_uint64x2_t(neon_urshriq64(__uint64x2_t_to_n128(src1), (src2)))
#define vrshrq_n_u8(src1, src2) __n128_to_uint8x16_t(neon_urshriq8(__uint8x16_t_to_n128(src1), (src2)))
#define vrshrd_n_s64(src1, src2) neon_srshris64(__int64ToN64_v(src1), (src2)).n64_i64[0]
#define vrshrd_n_u64(src1, src2) neon_urshris64(__uint64ToN64_v(src1), (src2)).n64_u64[0]
#define vshrq_n_s16(src1, src2) __n128_to_int16x8_t(neon_sshriq16(__int16x8_t_to_n128(src1), (src2)))
#define vshrq_n_s32(src1, src2) __n128_to_int32x4_t(neon_sshriq32(__int32x4_t_to_n128(src1), (src2)))
#define vshrq_n_s64(src1, src2) __n128_to_int64x2_t(neon_sshriq64(__int64x2_t_to_n128(src1), (src2)))
#define vshrq_n_s8(src1, src2) __n128_to_int8x16_t(neon_sshriq8(__int8x16_t_to_n128(src1), (src2)))
#define vshrq_n_u16(src1, src2) __n128_to_uint16x8_t(neon_ushriq16(__uint16x8_t_to_n128(src1), (src2)))
#define vshrq_n_u32(src1, src2) __n128_to_uint32x4_t(neon_ushriq32(__uint32x4_t_to_n128(src1), (src2)))
#define vshrq_n_u64(src1, src2) __n128_to_uint64x2_t(neon_ushriq64(__uint64x2_t_to_n128(src1), (src2)))
#define vshrq_n_u8(src1, src2) __n128_to_uint8x16_t(neon_ushriq8(__uint8x16_t_to_n128(src1), (src2)))
// vrsra_n_* / vsra_n_* and their scalar d-forms: three-argument wrappers
// (src1, src2, immediate src3) over 64-bit vectors.
//   - vrsra_n_s*/u* call neon_srsrai*/neon_ursrai*.
//   - vsra_n_s*/u*  call neon_ssrai*/neon_usrai*.
//   - vrsrad_n_* and vsrad_n_* box 64-bit scalars into __n64 values, call the
//     matching *s64 intrinsic, and read lane 0 of the result.
// Both vector operands are converted with the same typed helper.
#define vrsra_n_s16(src1, src2, src3) __n64_to_int16x4_t(neon_srsrai16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (src3)))
#define vrsra_n_s32(src1, src2, src3) __n64_to_int32x2_t(neon_srsrai32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (src3)))
#define vrsra_n_s64(src1, src2, src3) __n64_to_int64x1_t(neon_srsrais64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2), (src3)))
#define vrsra_n_s8(src1, src2, src3) __n64_to_int8x8_t(neon_srsrai8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), (src3)))
#define vrsra_n_u16(src1, src2, src3) __n64_to_uint16x4_t(neon_ursrai16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), (src3)))
#define vrsra_n_u32(src1, src2, src3) __n64_to_uint32x2_t(neon_ursrai32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), (src3)))
#define vrsra_n_u64(src1, src2, src3) __n64_to_uint64x1_t(neon_ursrais64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2), (src3)))
#define vrsra_n_u8(src1, src2, src3) __n64_to_uint8x8_t(neon_ursrai8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), (src3)))
#define vrsrad_n_s64(src1, src2, src3) neon_srsrais64(__int64ToN64_v(src1), __int64ToN64_v(src2), (src3)).n64_i64[0]
#define vrsrad_n_u64(src1, src2, src3) neon_ursrais64(__uint64ToN64_v(src1), __uint64ToN64_v(src2), (src3)).n64_u64[0]
#define vsra_n_s16(src1, src2, src3) __n64_to_int16x4_t(neon_ssrai16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2), (src3)))
#define vsra_n_s32(src1, src2, src3) __n64_to_int32x2_t(neon_ssrai32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2), (src3)))
#define vsra_n_s64(src1, src2, src3) __n64_to_int64x1_t(neon_ssrais64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2), (src3)))
#define vsra_n_s8(src1, src2, src3) __n64_to_int8x8_t(neon_ssrai8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2), (src3)))
#define vsra_n_u16(src1, src2, src3) __n64_to_uint16x4_t(neon_usrai16(__uint16x4_t_to_n64(src1), __uint16x4_t_to_n64(src2), (src3)))
#define vsra_n_u32(src1, src2, src3) __n64_to_uint32x2_t(neon_usrai32(__uint32x2_t_to_n64(src1), __uint32x2_t_to_n64(src2), (src3)))
#define vsra_n_u64(src1, src2, src3) __n64_to_uint64x1_t(neon_usrais64(__uint64x1_t_to_n64(src1), __uint64x1_t_to_n64(src2), (src3)))
#define vsra_n_u8(src1, src2, src3) __n64_to_uint8x8_t(neon_usrai8(__uint8x8_t_to_n64(src1), __uint8x8_t_to_n64(src2), (src3)))
#define vsrad_n_s64(src1, src2, src3) neon_ssrais64(__int64ToN64_v(src1), __int64ToN64_v(src2), (src3)).n64_i64[0]
#define vsrad_n_u64(src1, src2, src3) neon_usrais64(__uint64ToN64_v(src1), __uint64ToN64_v(src2), (src3)).n64_u64[0]
// vrsraq_n_* / vsraq_n_*: 128-bit counterparts of the three-argument
// vrsra_n_* / vsra_n_* wrappers. src1 and src2 are converted to __n128, src3
// is passed through as the immediate, and the __n128 result is converted back
// to the caller's vector type.
//   - vrsraq_n_s*/u* call neon_srsraiq*/neon_ursraiq*.
//   - vsraq_n_s*/u*  call neon_ssraiq*/neon_usraiq*.
#define vrsraq_n_s16(src1, src2, src3) __n128_to_int16x8_t(neon_srsraiq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (src3)))
#define vrsraq_n_s32(src1, src2, src3) __n128_to_int32x4_t(neon_srsraiq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (src3)))
#define vrsraq_n_s64(src1, src2, src3) __n128_to_int64x2_t(neon_srsraiq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2), (src3)))
#define vrsraq_n_s8(src1, src2, src3) __n128_to_int8x16_t(neon_srsraiq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), (src3)))
#define vrsraq_n_u16(src1, src2, src3) __n128_to_uint16x8_t(neon_ursraiq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), (src3)))
#define vrsraq_n_u32(src1, src2, src3) __n128_to_uint32x4_t(neon_ursraiq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), (src3)))
#define vrsraq_n_u64(src1, src2, src3) __n128_to_uint64x2_t(neon_ursraiq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2), (src3)))
#define vrsraq_n_u8(src1, src2, src3) __n128_to_uint8x16_t(neon_ursraiq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), (src3)))
#define vsraq_n_s16(src1, src2, src3) __n128_to_int16x8_t(neon_ssraiq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2), (src3)))
#define vsraq_n_s32(src1, src2, src3) __n128_to_int32x4_t(neon_ssraiq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2), (src3)))
#define vsraq_n_s64(src1, src2, src3) __n128_to_int64x2_t(neon_ssraiq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2), (src3)))
#define vsraq_n_s8(src1, src2, src3) __n128_to_int8x16_t(neon_ssraiq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2), (src3)))
#define vsraq_n_u16(src1, src2, src3) __n128_to_uint16x8_t(neon_usraiq16(__uint16x8_t_to_n128(src1), __uint16x8_t_to_n128(src2), (src3)))
#define vsraq_n_u32(src1, src2, src3) __n128_to_uint32x4_t(neon_usraiq32(__uint32x4_t_to_n128(src1), __uint32x4_t_to_n128(src2), (src3)))
#define vsraq_n_u64(src1, src2, src3) __n128_to_uint64x2_t(neon_usraiq64(__uint64x2_t_to_n128(src1), __uint64x2_t_to_n128(src2), (src3)))
#define vsraq_n_u8(src1, src2, src3) __n128_to_uint8x16_t(neon_usraiq8(__uint8x16_t_to_n128(src1), __uint8x16_t_to_n128(src2), (src3)))
// vqshl_n_* family: wrappers taking one operand (src1) and an immediate
// (src2), over neon_sqshli* (signed) and neon_uqshli* (unsigned).
//   - vqshl_n_*  : 64-bit vectors via __n64.
//   - vqshl{b,h,s,d}_n_* : scalars, using the neon_sqshlis*/neon_uqshlis*
//     forms. 8/16/64-bit values are boxed into __n8/__n16/__n64 and lane 0 of
//     the result is read back; the 32-bit forms move the integer through a
//     float with the _CopyFloatFrom* / _Copy*FromFloat helpers.
//   - vqshlq_n_* : 128-bit vectors via __n128.
// The accessor or conversion on the result matches the signedness of the
// macro's suffix.
#define vqshl_n_s16(src1, src2) __n64_to_int16x4_t(neon_sqshli16(__int16x4_t_to_n64(src1), (src2)))
#define vqshl_n_s32(src1, src2) __n64_to_int32x2_t(neon_sqshli32(__int32x2_t_to_n64(src1), (src2)))
#define vqshl_n_s64(src1, src2) __n64_to_int64x1_t(neon_sqshli64(__int64x1_t_to_n64(src1), (src2)))
#define vqshl_n_s8(src1, src2) __n64_to_int8x8_t(neon_sqshli8(__int8x8_t_to_n64(src1), (src2)))
#define vqshl_n_u16(src1, src2) __n64_to_uint16x4_t(neon_uqshli16(__uint16x4_t_to_n64(src1), (src2)))
#define vqshl_n_u32(src1, src2) __n64_to_uint32x2_t(neon_uqshli32(__uint32x2_t_to_n64(src1), (src2)))
#define vqshl_n_u64(src1, src2) __n64_to_uint64x1_t(neon_uqshli64(__uint64x1_t_to_n64(src1), (src2)))
#define vqshl_n_u8(src1, src2) __n64_to_uint8x8_t(neon_uqshli8(__uint8x8_t_to_n64(src1), (src2)))
#define vqshlb_n_s8(src1, src2) neon_sqshlis8(__int8ToN8_v(src1), (src2)).n8_i8[0]
#define vqshlh_n_s16(src1, src2) neon_sqshlis16(__int16ToN16_v(src1), (src2)).n16_i16[0]
#define vqshls_n_s32(src1, src2) _CopyInt32FromFloat(neon_sqshlis32(_CopyFloatFromInt32(src1), (src2)))
#define vqshld_n_s64(src1, src2) neon_sqshlis64(__int64ToN64_v(src1), (src2)).n64_i64[0]
#define vqshlb_n_u8(src1, src2) neon_uqshlis8(__uint8ToN8_v(src1), (src2)).n8_u8[0]
#define vqshlh_n_u16(src1, src2) neon_uqshlis16(__uint16ToN16_v(src1), (src2)).n16_u16[0]
#define vqshls_n_u32(src1, src2) _CopyUInt32FromFloat(neon_uqshlis32(_CopyFloatFromUInt32(src1), (src2)))
#define vqshld_n_u64(src1, src2) neon_uqshlis64(__uint64ToN64_v(src1), (src2)).n64_u64[0]
#define vqshlq_n_s16(src1, src2) __n128_to_int16x8_t(neon_sqshliq16(__int16x8_t_to_n128(src1), (src2)))
#define vqshlq_n_s32(src1, src2) __n128_to_int32x4_t(neon_sqshliq32(__int32x4_t_to_n128(src1), (src2)))
#define vqshlq_n_s64(src1, src2) __n128_to_int64x2_t(neon_sqshliq64(__int64x2_t_to_n128(src1), (src2)))
#define vqshlq_n_s8(src1, src2) __n128_to_int8x16_t(neon_sqshliq8(__int8x16_t_to_n128(src1), (src2)))
#define vqshlq_n_u16(src1, src2) __n128_to_uint16x8_t(neon_uqshliq16(__uint16x8_t_to_n128(src1), (src2)))
#define vqshlq_n_u32(src1, src2) __n128_to_uint32x4_t(neon_uqshliq32(__uint32x4_t_to_n128(src1), (src2)))
#define vqshlq_n_u64(src1, src2) __n128_to_uint64x2_t(neon_uqshliq64(__uint64x2_t_to_n128(src1), (src2)))
#define vqshlq_n_u8(src1, src2) __n128_to_uint8x16_t(neon_uqshliq8(__uint8x16_t_to_n128(src1), (src2)))
// vqshlu_n_s* / vqshluq_n_s*: vector wrappers over neon_sqshlui* /
// neon_sqshluiq*. The operand (src1) is a SIGNED vector, but the result is
// converted to the UNSIGNED vector type of the same shape (for example
// int16x4_t in, uint16x4_t out). src2 is passed through as the immediate.
#define vqshlu_n_s16(src1, src2) __n64_to_uint16x4_t(neon_sqshlui16(__int16x4_t_to_n64(src1), (src2)))
#define vqshlu_n_s32(src1, src2) __n64_to_uint32x2_t(neon_sqshlui32(__int32x2_t_to_n64(src1), (src2)))
#define vqshlu_n_s64(src1, src2) __n64_to_uint64x1_t(neon_sqshlui64(__int64x1_t_to_n64(src1), (src2)))
#define vqshlu_n_s8(src1, src2) __n64_to_uint8x8_t(neon_sqshlui8(__int8x8_t_to_n64(src1), (src2)))
#define vqshluq_n_s16(src1, src2) __n128_to_uint16x8_t(neon_sqshluiq16(__int16x8_t_to_n128(src1), (src2)))
#define vqshluq_n_s32(src1, src2) __n128_to_uint32x4_t(neon_sqshluiq32(__int32x4_t_to_n128(src1), (src2)))
#define vqshluq_n_s64(src1, src2) __n128_to_uint64x2_t(neon_sqshluiq64(__int64x2_t_to_n128(src1), (src2)))
#define vqshluq_n_s8(src1, src2) __n128_to_uint8x16_t(neon_sqshluiq8(__int8x16_t_to_n128(src1), (src2)))
// Scalar forms of the signed-input / unsigned-result saturating shift left by
// immediate (ACLE: uint8_t vqshlub_n_s8(int8_t, const int), and likewise for
// h/s/d). The operand is boxed with the signed helper because the input is
// signed, but the result must be read back through the UNSIGNED accessor:
// reading it as signed would turn a saturated 0xFF into -1 and hand callers
// the wrong result type. This matches the vector vqshlu_n_s* wrappers, which
// also return unsigned vectors.
#define vqshlub_n_s8(src1, src2) neon_sqshluis8(__int8ToN8_v(src1), (src2)).n8_u8[0]
#define vqshluh_n_s16(src1, src2) neon_sqshluis16(__int16ToN16_v(src1), (src2)).n16_u16[0]
#define vqshlus_n_s32(src1, src2) _CopyUInt32FromFloat(neon_sqshluis32(_CopyFloatFromInt32(src1), (src2)))
#define vqshlud_n_s64(src1, src2) neon_sqshluis64(__int64ToN64_v(src1), (src2)).n64_u64[0]
// vshl_n_* / vshlq_n_*: vector wrappers taking one operand (src1) and an
// immediate (src2), over neon_shli* (64-bit, __n64) and neon_shliq* (128-bit,
// __n128). Signed and unsigned variants of an element width call the same
// intrinsic and differ only in the conversion helpers applied to the operand
// and result. The 64x1 forms use the scalar neon_shlis64 intrinsic.
#define vshl_n_s16(src1, src2) __n64_to_int16x4_t(neon_shli16(__int16x4_t_to_n64(src1), (src2)))
#define vshl_n_s32(src1, src2) __n64_to_int32x2_t(neon_shli32(__int32x2_t_to_n64(src1), (src2)))
#define vshl_n_s8(src1, src2) __n64_to_int8x8_t(neon_shli8(__int8x8_t_to_n64(src1), (src2)))
#define vshl_n_u16(src1, src2) __n64_to_uint16x4_t(neon_shli16(__uint16x4_t_to_n64(src1), (src2)))
#define vshl_n_u32(src1, src2) __n64_to_uint32x2_t(neon_shli32(__uint32x2_t_to_n64(src1), (src2)))
#define vshl_n_u8(src1, src2) __n64_to_uint8x8_t(neon_shli8(__uint8x8_t_to_n64(src1), (src2)))
#define vshl_n_u64(src1, src2) __n64_to_uint64x1_t(neon_shlis64(__uint64x1_t_to_n64(src1), (src2)))
#define vshl_n_s64(src1, src2) __n64_to_int64x1_t(neon_shlis64(__int64x1_t_to_n64(src1), (src2)))
#define vshlq_n_s16(src1, src2) __n128_to_int16x8_t(neon_shliq16(__int16x8_t_to_n128(src1), (src2)))
#define vshlq_n_s32(src1, src2) __n128_to_int32x4_t(neon_shliq32(__int32x4_t_to_n128(src1), (src2)))
#define vshlq_n_s64(src1, src2) __n128_to_int64x2_t(neon_shliq64(__int64x2_t_to_n128(src1), (src2)))
#define vshlq_n_s8(src1, src2) __n128_to_int8x16_t(neon_shliq8(__int8x16_t_to_n128(src1), (src2)))
#define vshlq_n_u16(src1, src2) __n128_to_uint16x8_t(neon_shliq16(__uint16x8_t_to_n128(src1), (src2)))
#define vshlq_n_u32(src1, src2) __n128_to_uint32x4_t(neon_shliq32(__uint32x4_t_to_n128(src1), (src2)))
#define vshlq_n_u64(src1, src2) __n128_to_uint64x2_t(neon_shliq64(__uint64x2_t_to_n128(src1), (src2)))
#define vshlq_n_u8(src1, src2) __n128_to_uint8x16_t(neon_shliq8(__uint8x16_t_to_n128(src1), (src2)))
// Scalar 64-bit shift left by immediate (ACLE: uint64_t vshld_n_u64(uint64_t,
// const int) and int64_t vshld_n_s64(int64_t, const int)). Both wrap
// neon_shlis64: src1 is boxed into an __n64 and lane 0 of the result is read
// back. The boxing helper and the lane accessor must match the signedness in
// the macro's suffix so the expression has the documented result type; the
// unsigned form therefore uses __uint64ToN64_v / n64_u64 and the signed form
// __int64ToN64_v / n64_i64.
#define vshld_n_u64(src1, src2) neon_shlis64(__uint64ToN64_v(src1), (src2)).n64_u64[0]
#define vshld_n_s64(src1, src2) neon_shlis64(__int64ToN64_v(src1), (src2)).n64_i64[0]
// vqrshl_* / vqshl_*: two-operand wrappers over 64-bit vectors, calling
// neon_sqrshl*/neon_uqrshl* and neon_sqshl*/neon_uqshl* respectively.
// In the unsigned variants only src1 and the result use the unsigned
// conversion helpers; src2 is always converted as a SIGNED vector of the same
// shape (e.g. __int16x4_t_to_n64 in vqrshl_u16).
#define vqrshl_s16(src1, src2) __n64_to_int16x4_t(neon_sqrshl16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqrshl_s32(src1, src2) __n64_to_int32x2_t(neon_sqrshl32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqrshl_s64(src1, src2) __n64_to_int64x1_t(neon_sqrshl64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vqrshl_s8(src1, src2) __n64_to_int8x8_t(neon_sqrshl8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vqrshl_u16(src1, src2) __n64_to_uint16x4_t(neon_uqrshl16(__uint16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqrshl_u32(src1, src2) __n64_to_uint32x2_t(neon_uqrshl32(__uint32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqrshl_u64(src1, src2) __n64_to_uint64x1_t(neon_uqrshl64(__uint64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vqrshl_u8(src1, src2) __n64_to_uint8x8_t(neon_uqrshl8(__uint8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vqshl_s16(src1, src2) __n64_to_int16x4_t(neon_sqshl16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqshl_s32(src1, src2) __n64_to_int32x2_t(neon_sqshl32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqshl_s64(src1, src2) __n64_to_int64x1_t(neon_sqshl64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vqshl_s8(src1, src2) __n64_to_int8x8_t(neon_sqshl8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vqshl_u16(src1, src2) __n64_to_uint16x4_t(neon_uqshl16(__uint16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vqshl_u32(src1, src2) __n64_to_uint32x2_t(neon_uqshl32(__uint32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vqshl_u64(src1, src2) __n64_to_uint64x1_t(neon_uqshl64(__uint64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vqshl_u8(src1, src2) __n64_to_uint8x8_t(neon_uqshl8(__uint8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
// Scalar vqrshl{b,h,s,d}_* / vqshl{b,h,s,d}_*: two-operand wrappers over the
// neon_sqrshls*/neon_uqrshls* and neon_sqshls*/neon_uqshls* intrinsics.
//   - 8/16/64-bit operands are boxed into __n8/__n16/__n64 and lane 0 of the
//     result is read through the accessor matching the macro's signedness.
//   - 32-bit operands are moved through a float with the _CopyFloatFrom* /
//     _Copy*FromFloat helpers.
// In the unsigned variants src2 is still boxed with the SIGNED helper
// (e.g. __int8ToN8_v in vqrshlb_u8); only src1 and the result are unsigned.
#define vqrshlb_s8(src1, src2) neon_sqrshls8(__int8ToN8_v(src1), __int8ToN8_v(src2)).n8_i8[0]
#define vqrshlh_s16(src1, src2) neon_sqrshls16(__int16ToN16_v(src1), __int16ToN16_v(src2)).n16_i16[0]
#define vqrshls_s32(src1, src2) _CopyInt32FromFloat(neon_sqrshls32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2)))
#define vqrshld_s64(src1, src2) neon_sqrshls64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vqrshlb_u8(src1, src2) neon_uqrshls8(__uint8ToN8_v(src1), __int8ToN8_v(src2)).n8_u8[0]
#define vqrshlh_u16(src1, src2) neon_uqrshls16(__uint16ToN16_v(src1), __int16ToN16_v(src2)).n16_u16[0]
#define vqrshls_u32(src1, src2) _CopyUInt32FromFloat(neon_uqrshls32(_CopyFloatFromUInt32(src1), _CopyFloatFromInt32(src2)))
#define vqrshld_u64(src1, src2) neon_uqrshls64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0]
#define vqshlb_s8(src1, src2) neon_sqshls8(__int8ToN8_v(src1), __int8ToN8_v(src2)).n8_i8[0]
#define vqshlh_s16(src1, src2) neon_sqshls16(__int16ToN16_v(src1), __int16ToN16_v(src2)).n16_i16[0]
#define vqshls_s32(src1, src2) _CopyInt32FromFloat(neon_sqshls32(_CopyFloatFromInt32(src1), _CopyFloatFromInt32(src2)))
#define vqshld_s64(src1, src2) neon_sqshls64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vqshlb_u8(src1, src2) neon_uqshls8(__uint8ToN8_v(src1), __int8ToN8_v(src2)).n8_u8[0]
#define vqshlh_u16(src1, src2) neon_uqshls16(__uint16ToN16_v(src1), __int16ToN16_v(src2)).n16_u16[0]
#define vqshls_u32(src1, src2) _CopyUInt32FromFloat(neon_uqshls32(_CopyFloatFromUInt32(src1), _CopyFloatFromInt32(src2)))
#define vqshld_u64(src1, src2) neon_uqshls64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0]
// vrshl_* / vshl_* and their scalar d-forms: two-operand wrappers over 64-bit
// vectors.
//   - vrshl_s*/u* call neon_srshl*/neon_urshl*.
//   - vshl_s*/u*  call neon_sshl*/neon_ushl*; the 64x1 forms use the scalar
//     neon_sshls64/neon_ushls64 intrinsics.
//   - vshld_* and vrshld_* box 64-bit scalars into __n64 and read lane 0 of
//     the result.
// In the unsigned variants src2 is always converted/boxed as SIGNED; only
// src1 and the result use the unsigned helpers.
#define vrshl_s16(src1, src2) __n64_to_int16x4_t(neon_srshl16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vrshl_s32(src1, src2) __n64_to_int32x2_t(neon_srshl32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vrshl_s64(src1, src2) __n64_to_int64x1_t(neon_srshl64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vrshl_s8(src1, src2) __n64_to_int8x8_t(neon_srshl8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vrshl_u16(src1, src2) __n64_to_uint16x4_t(neon_urshl16(__uint16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vrshl_u32(src1, src2) __n64_to_uint32x2_t(neon_urshl32(__uint32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vrshl_u64(src1, src2) __n64_to_uint64x1_t(neon_urshl64(__uint64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vrshl_u8(src1, src2) __n64_to_uint8x8_t(neon_urshl8(__uint8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vshl_s16(src1, src2) __n64_to_int16x4_t(neon_sshl16(__int16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vshl_s32(src1, src2) __n64_to_int32x2_t(neon_sshl32(__int32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vshl_s64(src1, src2) __n64_to_int64x1_t(neon_sshls64(__int64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vshl_s8(src1, src2) __n64_to_int8x8_t(neon_sshl8(__int8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vshl_u16(src1, src2) __n64_to_uint16x4_t(neon_ushl16(__uint16x4_t_to_n64(src1), __int16x4_t_to_n64(src2)))
#define vshl_u32(src1, src2) __n64_to_uint32x2_t(neon_ushl32(__uint32x2_t_to_n64(src1), __int32x2_t_to_n64(src2)))
#define vshl_u64(src1, src2) __n64_to_uint64x1_t(neon_ushls64(__uint64x1_t_to_n64(src1), __int64x1_t_to_n64(src2)))
#define vshl_u8(src1, src2) __n64_to_uint8x8_t(neon_ushl8(__uint8x8_t_to_n64(src1), __int8x8_t_to_n64(src2)))
#define vshld_s64(src1, src2) neon_sshls64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vshld_u64(src1, src2) neon_ushls64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0]
#define vrshld_s64(src1, src2) neon_srshls64(__int64ToN64_v(src1), __int64ToN64_v(src2)).n64_i64[0]
#define vrshld_u64(src1, src2) neon_urshls64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0]
// 128-bit two-operand wrappers: vqrshlq_*, vqshlq_*, vrshlq_* and vshlq_*.
// Each converts src1 and src2 to __n128, calls the matching 'q' intrinsic
// (neon_{s,u}qrshlq*, neon_{s,u}qshlq*, neon_{s,u}rshlq*, neon_{s,u}shlq*),
// and converts the __n128 result back to the caller's vector type.
// In the unsigned variants src2 is always converted as a SIGNED vector of the
// same shape; only src1 and the result use the unsigned helpers.
#define vqrshlq_s16(src1, src2) __n128_to_int16x8_t(neon_sqrshlq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqrshlq_s32(src1, src2) __n128_to_int32x4_t(neon_sqrshlq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqrshlq_s64(src1, src2) __n128_to_int64x2_t(neon_sqrshlq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vqrshlq_s8(src1, src2) __n128_to_int8x16_t(neon_sqrshlq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vqrshlq_u16(src1, src2) __n128_to_uint16x8_t(neon_uqrshlq16(__uint16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqrshlq_u32(src1, src2) __n128_to_uint32x4_t(neon_uqrshlq32(__uint32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqrshlq_u64(src1, src2) __n128_to_uint64x2_t(neon_uqrshlq64(__uint64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vqrshlq_u8(src1, src2) __n128_to_uint8x16_t(neon_uqrshlq8(__uint8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vqshlq_s16(src1, src2) __n128_to_int16x8_t(neon_sqshlq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqshlq_s32(src1, src2) __n128_to_int32x4_t(neon_sqshlq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqshlq_s64(src1, src2) __n128_to_int64x2_t(neon_sqshlq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vqshlq_s8(src1, src2) __n128_to_int8x16_t(neon_sqshlq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vqshlq_u16(src1, src2) __n128_to_uint16x8_t(neon_uqshlq16(__uint16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vqshlq_u32(src1, src2) __n128_to_uint32x4_t(neon_uqshlq32(__uint32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vqshlq_u64(src1, src2) __n128_to_uint64x2_t(neon_uqshlq64(__uint64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vqshlq_u8(src1, src2) __n128_to_uint8x16_t(neon_uqshlq8(__uint8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vrshlq_s16(src1, src2) __n128_to_int16x8_t(neon_srshlq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vrshlq_s32(src1, src2) __n128_to_int32x4_t(neon_srshlq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vrshlq_s64(src1, src2) __n128_to_int64x2_t(neon_srshlq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vrshlq_s8(src1, src2) __n128_to_int8x16_t(neon_srshlq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vrshlq_u16(src1, src2) __n128_to_uint16x8_t(neon_urshlq16(__uint16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vrshlq_u32(src1, src2) __n128_to_uint32x4_t(neon_urshlq32(__uint32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vrshlq_u64(src1, src2) __n128_to_uint64x2_t(neon_urshlq64(__uint64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vrshlq_u8(src1, src2) __n128_to_uint8x16_t(neon_urshlq8(__uint8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vshlq_s16(src1, src2) __n128_to_int16x8_t(neon_sshlq16(__int16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vshlq_s32(src1, src2) __n128_to_int32x4_t(neon_sshlq32(__int32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vshlq_s64(src1, src2) __n128_to_int64x2_t(neon_sshlq64(__int64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vshlq_s8(src1, src2) __n128_to_int8x16_t(neon_sshlq8(__int8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
#define vshlq_u16(src1, src2) __n128_to_uint16x8_t(neon_ushlq16(__uint16x8_t_to_n128(src1), __int16x8_t_to_n128(src2)))
#define vshlq_u32(src1, src2) __n128_to_uint32x4_t(neon_ushlq32(__uint32x4_t_to_n128(src1), __int32x4_t_to_n128(src2)))
#define vshlq_u64(src1, src2) __n128_to_uint64x2_t(neon_ushlq64(__uint64x2_t_to_n128(src1), __int64x2_t_to_n128(src2)))
#define vshlq_u8(src1, src2) __n128_to_uint8x16_t(neon_ushlq8(__uint8x16_t_to_n128(src1), __int8x16_t_to_n128(src2)))
// vsli_n_<type>: 64-bit vector wrappers over neon_slii8 / neon_slii16 / neon_slii32 and
// neon_sliis64 (used for every 64-bit element type). Both vector operands are converted
// to __n64 with the converter for <type>; the third argument is forwarded unchanged.
#define vsli_n_p16(a, b, n) __n64_to_poly16x4_t(neon_slii16(__poly16x4_t_to_n64(a), __poly16x4_t_to_n64(b), (n)))
#define vsli_n_p64(a, b, n) __n64_to_poly64x1_t(neon_sliis64(__poly64x1_t_to_n64(a), __poly64x1_t_to_n64(b), (n)))
#define vsli_n_p8(a, b, n) __n64_to_poly8x8_t(neon_slii8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(b), (n)))
#define vsli_n_s16(a, b, n) __n64_to_int16x4_t(neon_slii16(__int16x4_t_to_n64(a), __int16x4_t_to_n64(b), (n)))
#define vsli_n_s32(a, b, n) __n64_to_int32x2_t(neon_slii32(__int32x2_t_to_n64(a), __int32x2_t_to_n64(b), (n)))
#define vsli_n_s64(a, b, n) __n64_to_int64x1_t(neon_sliis64(__int64x1_t_to_n64(a), __int64x1_t_to_n64(b), (n)))
#define vsli_n_s8(a, b, n) __n64_to_int8x8_t(neon_slii8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(b), (n)))
#define vsli_n_u16(a, b, n) __n64_to_uint16x4_t(neon_slii16(__uint16x4_t_to_n64(a), __uint16x4_t_to_n64(b), (n)))
#define vsli_n_u32(a, b, n) __n64_to_uint32x2_t(neon_slii32(__uint32x2_t_to_n64(a), __uint32x2_t_to_n64(b), (n)))
#define vsli_n_u64(a, b, n) __n64_to_uint64x1_t(neon_sliis64(__uint64x1_t_to_n64(a), __uint64x1_t_to_n64(b), (n)))
#define vsli_n_u8(a, b, n) __n64_to_uint8x8_t(neon_slii8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(b), (n)))
// vslid_n_s64 / vslid_n_u64: scalar 64-bit forms. Both scalar operands are wrapped into
// an __n64 (__int64ToN64_v / __uint64ToN64_v), passed to neon_sliis64 together with the
// third argument, and element 0 of the result (n64_i64[0] or n64_u64[0]) is read back.
// The whole expansion is parenthesized so that the trailing member/subscript access
// cannot combine with tokens at the point of use.
#define vslid_n_s64(src1, src2, src3) (neon_sliis64(__int64ToN64_v(src1), __int64ToN64_v(src2), (src3)).n64_i64[0])
#define vslid_n_u64(src1, src2, src3) (neon_sliis64(__uint64ToN64_v(src1), __uint64ToN64_v(src2), (src3)).n64_u64[0])
// vsliq_n_<type>: 128-bit vector wrappers over neon_sliiq8 / 16 / 32 / 64. Both vector
// operands are converted to __n128 with the converter for <type>; the third argument is
// forwarded unchanged.
#define vsliq_n_p16(a, b, n) __n128_to_poly16x8_t(neon_sliiq16(__poly16x8_t_to_n128(a), __poly16x8_t_to_n128(b), (n)))
#define vsliq_n_p8(a, b, n) __n128_to_poly8x16_t(neon_sliiq8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(b), (n)))
#define vsliq_n_p64(a, b, n) __n128_to_poly64x2_t(neon_sliiq64(__poly64x2_t_to_n128(a), __poly64x2_t_to_n128(b), (n)))
#define vsliq_n_s16(a, b, n) __n128_to_int16x8_t(neon_sliiq16(__int16x8_t_to_n128(a), __int16x8_t_to_n128(b), (n)))
#define vsliq_n_s32(a, b, n) __n128_to_int32x4_t(neon_sliiq32(__int32x4_t_to_n128(a), __int32x4_t_to_n128(b), (n)))
#define vsliq_n_s64(a, b, n) __n128_to_int64x2_t(neon_sliiq64(__int64x2_t_to_n128(a), __int64x2_t_to_n128(b), (n)))
#define vsliq_n_s8(a, b, n) __n128_to_int8x16_t(neon_sliiq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(b), (n)))
#define vsliq_n_u16(a, b, n) __n128_to_uint16x8_t(neon_sliiq16(__uint16x8_t_to_n128(a), __uint16x8_t_to_n128(b), (n)))
#define vsliq_n_u32(a, b, n) __n128_to_uint32x4_t(neon_sliiq32(__uint32x4_t_to_n128(a), __uint32x4_t_to_n128(b), (n)))
#define vsliq_n_u64(a, b, n) __n128_to_uint64x2_t(neon_sliiq64(__uint64x2_t_to_n128(a), __uint64x2_t_to_n128(b), (n)))
#define vsliq_n_u8(a, b, n) __n128_to_uint8x16_t(neon_sliiq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(b), (n)))

// TBL/TBX
// Table intrinsics taking 128-bit table registers. The digit after tbl/tbx matches the
// register count of the reglist type (__n128, __n128x2, __n128x3, __n128x4).
// _q8 forms take and return __n64 vectors; _qq8 forms take and return __n128 vectors.
// The tbx forms take one additional leading vector operand (src1) compared with tbl.
__n64  neon_tbx4_q8(__n64 src1, __n128x4 reglist, __n64 src2);
__n128 neon_tbx4_qq8(__n128 src1, __n128x4 reglist, __n128 src2);
__n64  neon_tbx3_q8(__n64 src1, __n128x3 reglist, __n64 src2);
__n128 neon_tbx3_qq8(__n128 src1, __n128x3 reglist, __n128 src2);
__n64  neon_tbx2_q8(__n64 src1, __n128x2 reglist, __n64 src2);
__n128 neon_tbx2_qq8(__n128 src1, __n128x2 reglist, __n128 src2);
__n64  neon_tbx1_q8(__n64 src1, __n128 reglist, __n64 src2);
__n128 neon_tbx1_qq8(__n128 src1, __n128 reglist, __n128 src2);
__n64  neon_tbl4_q8(__n128x4 reglist, __n64 src2);
__n128 neon_tbl4_qq8(__n128x4 reglist, __n128 src2);
__n64  neon_tbl3_q8(__n128x3 reglist, __n64 src2);
__n128 neon_tbl3_qq8(__n128x3 reglist, __n128 src2);
__n64  neon_tbl2_q8(__n128x2 reglist, __n64 src2);
__n128 neon_tbl2_qq8(__n128x2 reglist, __n128 src2);
__n64  neon_tbl1_q8(__n128 reglist, __n64 src2);
// NOTE(review): two-__n64 variant; its users are not visible in this part of the header.
__n64  neon_tbl1_q8_2(__n64 src1, __n64 src2);
__n128 neon_tbl1_qq8(__n128 reglist, __n128 src2);
// Table intrinsics taking 64-bit table registers (__n64, __n64x2, __n64x3, __n64x4).
// Parameter names follow the src1 / reglist / src2 convention of the declarations above.
__n64 neon_tbl1_8(__n64 reglist, __n64 src2);
__n64 neon_tbl2_8(__n64x2 reglist, __n64 src2);
__n64 neon_tbl3_8(__n64x3 reglist, __n64 src2);
__n64 neon_tbl4_8(__n64x4 reglist, __n64 src2);
__n64 neon_tbx1_8(__n64 src1, __n64 reglist, __n64 src2);
__n64 neon_tbx2_8(__n64 src1, __n64x2 reglist, __n64 src2);
__n64 neon_tbx3_8(__n64 src1, __n64x3 reglist, __n64 src2);
__n64 neon_tbx4_8(__n64 src1, __n64x4 reglist, __n64 src2);
// vtbx{4,3,2,1}_<type>: wrappers over neon_tbxN_8. Argument order is (a, tbl, idx):
// `a` and `tbl` use the converters for <type>; `idx` uses the unsigned 8-bit converter
// for the p8/u8 variants and the signed 8-bit converter for the s8 variants.
#define vtbx4_p8(a, tbl, idx) __n64_to_poly8x8_t(neon_tbx4_8(__poly8x8_t_to_n64(a), __poly8x8x4_t_to_n64x4(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx4_s8(a, tbl, idx) __n64_to_int8x8_t(neon_tbx4_8(__int8x8_t_to_n64(a), __int8x8x4_t_to_n64x4(tbl), __int8x8_t_to_n64(idx)))
#define vtbx4_u8(a, tbl, idx) __n64_to_uint8x8_t(neon_tbx4_8(__uint8x8_t_to_n64(a), __uint8x8x4_t_to_n64x4(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx3_p8(a, tbl, idx) __n64_to_poly8x8_t(neon_tbx3_8(__poly8x8_t_to_n64(a), __poly8x8x3_t_to_n64x3(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx3_s8(a, tbl, idx) __n64_to_int8x8_t(neon_tbx3_8(__int8x8_t_to_n64(a), __int8x8x3_t_to_n64x3(tbl), __int8x8_t_to_n64(idx)))
#define vtbx3_u8(a, tbl, idx) __n64_to_uint8x8_t(neon_tbx3_8(__uint8x8_t_to_n64(a), __uint8x8x3_t_to_n64x3(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx2_p8(a, tbl, idx) __n64_to_poly8x8_t(neon_tbx2_8(__poly8x8_t_to_n64(a), __poly8x8x2_t_to_n64x2(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx2_s8(a, tbl, idx) __n64_to_int8x8_t(neon_tbx2_8(__int8x8_t_to_n64(a), __int8x8x2_t_to_n64x2(tbl), __int8x8_t_to_n64(idx)))
#define vtbx2_u8(a, tbl, idx) __n64_to_uint8x8_t(neon_tbx2_8(__uint8x8_t_to_n64(a), __uint8x8x2_t_to_n64x2(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx1_p8(a, tbl, idx) __n64_to_poly8x8_t(neon_tbx1_8(__poly8x8_t_to_n64(a), __poly8x8_t_to_n64(tbl), __uint8x8_t_to_n64(idx)))
#define vtbx1_s8(a, tbl, idx) __n64_to_int8x8_t(neon_tbx1_8(__int8x8_t_to_n64(a), __int8x8_t_to_n64(tbl), __int8x8_t_to_n64(idx)))
#define vtbx1_u8(a, tbl, idx) __n64_to_uint8x8_t(neon_tbx1_8(__uint8x8_t_to_n64(a), __uint8x8_t_to_n64(tbl), __uint8x8_t_to_n64(idx)))
// vtbl{4,3,2,1}_<type>: wrappers over neon_tblN_8. Argument order is (tbl, idx):
// `tbl` uses the converter for <type>; `idx` uses the unsigned 8-bit converter for the
// p8/u8 variants and the signed 8-bit converter for the s8 variants.
#define vtbl4_p8(tbl, idx) __n64_to_poly8x8_t(neon_tbl4_8(__poly8x8x4_t_to_n64x4(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl4_s8(tbl, idx) __n64_to_int8x8_t(neon_tbl4_8(__int8x8x4_t_to_n64x4(tbl), __int8x8_t_to_n64(idx)))
#define vtbl4_u8(tbl, idx) __n64_to_uint8x8_t(neon_tbl4_8(__uint8x8x4_t_to_n64x4(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl3_p8(tbl, idx) __n64_to_poly8x8_t(neon_tbl3_8(__poly8x8x3_t_to_n64x3(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl3_s8(tbl, idx) __n64_to_int8x8_t(neon_tbl3_8(__int8x8x3_t_to_n64x3(tbl), __int8x8_t_to_n64(idx)))
#define vtbl3_u8(tbl, idx) __n64_to_uint8x8_t(neon_tbl3_8(__uint8x8x3_t_to_n64x3(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl2_p8(tbl, idx) __n64_to_poly8x8_t(neon_tbl2_8(__poly8x8x2_t_to_n64x2(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl2_s8(tbl, idx) __n64_to_int8x8_t(neon_tbl2_8(__int8x8x2_t_to_n64x2(tbl), __int8x8_t_to_n64(idx)))
#define vtbl2_u8(tbl, idx) __n64_to_uint8x8_t(neon_tbl2_8(__uint8x8x2_t_to_n64x2(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl1_p8(tbl, idx) __n64_to_poly8x8_t(neon_tbl1_8(__poly8x8_t_to_n64(tbl), __uint8x8_t_to_n64(idx)))
#define vtbl1_s8(tbl, idx) __n64_to_int8x8_t(neon_tbl1_8(__int8x8_t_to_n64(tbl), __int8x8_t_to_n64(idx)))
#define vtbl1_u8(tbl, idx) __n64_to_uint8x8_t(neon_tbl1_8(__uint8x8_t_to_n64(tbl), __uint8x8_t_to_n64(idx)))
// vqtbl1* / vqtbx1*: single 128-bit table register. Non-q names return a 64-bit vector
// (neon_*_q8), q names return a 128-bit vector (neon_*_qq8). `idx` always goes through
// the unsigned 8-bit converter, whatever the element type of the table.
#define vqtbl1_u8(tbl, idx)  __n64_to_uint8x8_t(neon_tbl1_q8(__uint8x16_t_to_n128(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl1q_u8(tbl, idx) __n128_to_uint8x16_t(neon_tbl1_qq8(__uint8x16_t_to_n128(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl1_s8(tbl, idx)  __n64_to_int8x8_t(neon_tbl1_q8(__int8x16_t_to_n128(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl1q_s8(tbl, idx) __n128_to_int8x16_t(neon_tbl1_qq8(__int8x16_t_to_n128(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl1_p8(tbl, idx)  __n64_to_poly8x8_t(neon_tbl1_q8(__poly8x16_t_to_n128(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl1q_p8(tbl, idx) __n128_to_poly8x16_t(neon_tbl1_qq8(__poly8x16_t_to_n128(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx1_u8(a, tbl, idx)  __n64_to_uint8x8_t(neon_tbx1_q8(__uint8x8_t_to_n64(a), __uint8x16_t_to_n128(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx1q_u8(a, tbl, idx) __n128_to_uint8x16_t(neon_tbx1_qq8(__uint8x16_t_to_n128(a), __uint8x16_t_to_n128(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx1_s8(a, tbl, idx)  __n64_to_int8x8_t(neon_tbx1_q8(__int8x8_t_to_n64(a), __int8x16_t_to_n128(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx1q_s8(a, tbl, idx) __n128_to_int8x16_t(neon_tbx1_qq8(__int8x16_t_to_n128(a), __int8x16_t_to_n128(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx1_p8(a, tbl, idx)  __n64_to_poly8x8_t(neon_tbx1_q8(__poly8x8_t_to_n64(a), __poly8x16_t_to_n128(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx1q_p8(a, tbl, idx) __n128_to_poly8x16_t(neon_tbx1_qq8(__poly8x16_t_to_n128(a), __poly8x16_t_to_n128(tbl), __uint8x16_t_to_n128(idx)))
// vqtbl2* / vqtbx2*: two 128-bit table registers passed as an x2 aggregate. Non-q names
// return a 64-bit vector (neon_*_q8), q names return a 128-bit vector (neon_*_qq8).
// `idx` always goes through the unsigned 8-bit converter.
#define vqtbl2_u8(tbl, idx)  __n64_to_uint8x8_t(neon_tbl2_q8(__uint8x16x2_t_to_n128x2(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl2q_u8(tbl, idx) __n128_to_uint8x16_t(neon_tbl2_qq8(__uint8x16x2_t_to_n128x2(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl2_s8(tbl, idx)  __n64_to_int8x8_t(neon_tbl2_q8(__int8x16x2_t_to_n128x2(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl2q_s8(tbl, idx) __n128_to_int8x16_t(neon_tbl2_qq8(__int8x16x2_t_to_n128x2(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl2_p8(tbl, idx)  __n64_to_poly8x8_t(neon_tbl2_q8(__poly8x16x2_t_to_n128x2(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl2q_p8(tbl, idx) __n128_to_poly8x16_t(neon_tbl2_qq8(__poly8x16x2_t_to_n128x2(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx2_u8(a, tbl, idx)  __n64_to_uint8x8_t(neon_tbx2_q8(__uint8x8_t_to_n64(a), __uint8x16x2_t_to_n128x2(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx2q_u8(a, tbl, idx) __n128_to_uint8x16_t(neon_tbx2_qq8(__uint8x16_t_to_n128(a), __uint8x16x2_t_to_n128x2(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx2_s8(a, tbl, idx)  __n64_to_int8x8_t(neon_tbx2_q8(__int8x8_t_to_n64(a), __int8x16x2_t_to_n128x2(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx2q_s8(a, tbl, idx) __n128_to_int8x16_t(neon_tbx2_qq8(__int8x16_t_to_n128(a), __int8x16x2_t_to_n128x2(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx2_p8(a, tbl, idx)  __n64_to_poly8x8_t(neon_tbx2_q8(__poly8x8_t_to_n64(a), __poly8x16x2_t_to_n128x2(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx2q_p8(a, tbl, idx) __n128_to_poly8x16_t(neon_tbx2_qq8(__poly8x16_t_to_n128(a), __poly8x16x2_t_to_n128x2(tbl), __uint8x16_t_to_n128(idx)))
// vqtbl3* / vqtbx3*: three 128-bit table registers passed as an x3 aggregate. Non-q names
// return a 64-bit vector (neon_*_q8), q names return a 128-bit vector (neon_*_qq8).
// `idx` always goes through the unsigned 8-bit converter.
#define vqtbl3_u8(tbl, idx)  __n64_to_uint8x8_t(neon_tbl3_q8(__uint8x16x3_t_to_n128x3(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl3q_u8(tbl, idx) __n128_to_uint8x16_t(neon_tbl3_qq8(__uint8x16x3_t_to_n128x3(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl3_s8(tbl, idx)  __n64_to_int8x8_t(neon_tbl3_q8(__int8x16x3_t_to_n128x3(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl3q_s8(tbl, idx) __n128_to_int8x16_t(neon_tbl3_qq8(__int8x16x3_t_to_n128x3(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl3_p8(tbl, idx)  __n64_to_poly8x8_t(neon_tbl3_q8(__poly8x16x3_t_to_n128x3(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl3q_p8(tbl, idx) __n128_to_poly8x16_t(neon_tbl3_qq8(__poly8x16x3_t_to_n128x3(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx3_u8(a, tbl, idx)  __n64_to_uint8x8_t(neon_tbx3_q8(__uint8x8_t_to_n64(a), __uint8x16x3_t_to_n128x3(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx3q_u8(a, tbl, idx) __n128_to_uint8x16_t(neon_tbx3_qq8(__uint8x16_t_to_n128(a), __uint8x16x3_t_to_n128x3(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx3_s8(a, tbl, idx)  __n64_to_int8x8_t(neon_tbx3_q8(__int8x8_t_to_n64(a), __int8x16x3_t_to_n128x3(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx3q_s8(a, tbl, idx) __n128_to_int8x16_t(neon_tbx3_qq8(__int8x16_t_to_n128(a), __int8x16x3_t_to_n128x3(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx3_p8(a, tbl, idx)  __n64_to_poly8x8_t(neon_tbx3_q8(__poly8x8_t_to_n64(a), __poly8x16x3_t_to_n128x3(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx3q_p8(a, tbl, idx) __n128_to_poly8x16_t(neon_tbx3_qq8(__poly8x16_t_to_n128(a), __poly8x16x3_t_to_n128x3(tbl), __uint8x16_t_to_n128(idx)))
// vqtbl4* / vqtbx4*: four 128-bit table registers passed as an x4 aggregate. Non-q names
// return a 64-bit vector (neon_*_q8), q names return a 128-bit vector (neon_*_qq8).
// `idx` always goes through the unsigned 8-bit converter.
#define vqtbl4_u8(tbl, idx)  __n64_to_uint8x8_t(neon_tbl4_q8(__uint8x16x4_t_to_n128x4(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl4q_u8(tbl, idx) __n128_to_uint8x16_t(neon_tbl4_qq8(__uint8x16x4_t_to_n128x4(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl4_s8(tbl, idx)  __n64_to_int8x8_t(neon_tbl4_q8(__int8x16x4_t_to_n128x4(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl4q_s8(tbl, idx) __n128_to_int8x16_t(neon_tbl4_qq8(__int8x16x4_t_to_n128x4(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbl4_p8(tbl, idx)  __n64_to_poly8x8_t(neon_tbl4_q8(__poly8x16x4_t_to_n128x4(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbl4q_p8(tbl, idx) __n128_to_poly8x16_t(neon_tbl4_qq8(__poly8x16x4_t_to_n128x4(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx4_u8(a, tbl, idx)  __n64_to_uint8x8_t(neon_tbx4_q8(__uint8x8_t_to_n64(a), __uint8x16x4_t_to_n128x4(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx4q_u8(a, tbl, idx) __n128_to_uint8x16_t(neon_tbx4_qq8(__uint8x16_t_to_n128(a), __uint8x16x4_t_to_n128x4(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx4_s8(a, tbl, idx)  __n64_to_int8x8_t(neon_tbx4_q8(__int8x8_t_to_n64(a), __int8x16x4_t_to_n128x4(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx4q_s8(a, tbl, idx) __n128_to_int8x16_t(neon_tbx4_qq8(__int8x16_t_to_n128(a), __int8x16x4_t_to_n128x4(tbl), __uint8x16_t_to_n128(idx)))
#define vqtbx4_p8(a, tbl, idx)  __n64_to_poly8x8_t(neon_tbx4_q8(__poly8x8_t_to_n64(a), __poly8x16x4_t_to_n128x4(tbl), __uint8x8_t_to_n64(idx)))
#define vqtbx4q_p8(a, tbl, idx) __n128_to_poly8x16_t(neon_tbx4_qq8(__poly8x16_t_to_n128(a), __poly8x16x4_t_to_n128x4(tbl), __uint8x16_t_to_n128(idx)))

// LD4R/LD4/LD3R/LD3/LD2R/LD2/LD1R/LD1
// LD4 family. Every intrinsic returns a four-register aggregate: __n64x4, or __n128x4
// for the _q forms. The numeric suffix selects the pointer element type (__int8..__int64).
// neon_ld4r_*: called by the vld4_dup_* / vld4q_dup_* macros in this header.
__n64x4 neon_ld4r_8(const __int8 * ptr);
__n128x4 neon_ld4r_q8(const __int8 * ptr);
__n64x4 neon_ld4r_16(const __int16 * ptr);
__n128x4 neon_ld4r_q16(const __int16 * ptr);
__n64x4 neon_ld4r_32(const __int32 * ptr);
__n128x4 neon_ld4r_q32(const __int32 * ptr);
__n64x4 neon_ld4r_64(const __int64 * ptr);
__n128x4 neon_ld4r_q64(const __int64 * ptr);
// neon_ld4m_*: called by the vld4_* / vld4q_* macros. No neon_ld4m_64 is declared in
// this run; the vld4_s64 / vld4_u64 / vld4_f64 / vld4_p64 macros call neon_ld1m4_64.
__n64x4 neon_ld4m_8(const __int8 * ptr);
__n128x4 neon_ld4m_q8(const __int8 * ptr);
__n64x4 neon_ld4m_16(const __int16 * ptr);
__n128x4 neon_ld4m_q16(const __int16 * ptr);
__n64x4 neon_ld4m_32(const __int32 * ptr);
__n128x4 neon_ld4m_q32(const __int32 * ptr);
__n128x4 neon_ld4m_q64(const __int64 * ptr);
// neon_ld4s_*: called by the vld4_lane_* / vld4q_lane_* macros. In addition to the
// pointer they take an existing aggregate (src) and a lane number.
__n64x4 neon_ld4s_8(const __int8 * ptr, __n64x4 src, const int lane);
__n128x4 neon_ld4s_q8(const __int8 * ptr, __n128x4 src, const int lane);
__n64x4 neon_ld4s_16(const __int16 * ptr, __n64x4 src, const int lane);
__n128x4 neon_ld4s_q16(const __int16 * ptr, __n128x4 src, const int lane);
__n64x4 neon_ld4s_32(const __int32 * ptr, __n64x4 src, const int lane);
__n128x4 neon_ld4s_q32(const __int32 * ptr, __n128x4 src, const int lane);
__n64x4 neon_ld4s_64(const __int64 * ptr, __n64x4 src, const int lane);
__n128x4 neon_ld4s_q64(const __int64 * ptr, __n128x4 src, const int lane);
// LD3 family. Every intrinsic returns a three-register aggregate: __n64x3, or __n128x3
// for the _q forms. The numeric suffix selects the pointer element type (__int8..__int64).
// neon_ld3r_*: called by the vld3_dup_* / vld3q_dup_* macros in this header.
__n64x3 neon_ld3r_8(const __int8 * ptr);
__n128x3 neon_ld3r_q8(const __int8 * ptr);
__n64x3 neon_ld3r_16(const __int16 * ptr);
__n128x3 neon_ld3r_q16(const __int16 * ptr);
__n64x3 neon_ld3r_32(const __int32 * ptr);
__n128x3 neon_ld3r_q32(const __int32 * ptr);
__n64x3 neon_ld3r_64(const __int64 * ptr);
__n128x3 neon_ld3r_q64(const __int64 * ptr);
// neon_ld3m_*: called by the vld3_* / vld3q_* macros. No neon_ld3m_64 is declared in
// this run; the vld3_s64 / vld3_u64 / vld3_f64 / vld3_p64 macros call neon_ld1m3_64.
__n64x3 neon_ld3m_8(const __int8 * ptr);
__n128x3 neon_ld3m_q8(const __int8 * ptr);
__n64x3 neon_ld3m_16(const __int16 * ptr);
__n128x3 neon_ld3m_q16(const __int16 * ptr);
__n64x3 neon_ld3m_32(const __int32 * ptr);
__n128x3 neon_ld3m_q32(const __int32 * ptr);
__n128x3 neon_ld3m_q64(const __int64 * ptr);
// neon_ld3s_*: called by the vld3_lane_* / vld3q_lane_* macros. In addition to the
// pointer they take an existing aggregate (src) and a lane number.
__n64x3 neon_ld3s_8(const __int8 * ptr, __n64x3 src, const int lane);
__n128x3 neon_ld3s_q8(const __int8 * ptr, __n128x3 src, const int lane);
__n64x3 neon_ld3s_16(const __int16 * ptr, __n64x3 src, const int lane);
__n128x3 neon_ld3s_q16(const __int16 * ptr, __n128x3 src, const int lane);
__n64x3 neon_ld3s_32(const __int32 * ptr, __n64x3 src, const int lane);
__n128x3 neon_ld3s_q32(const __int32 * ptr, __n128x3 src, const int lane);
__n64x3 neon_ld3s_64(const __int64 * ptr, __n64x3 src, const int lane);
__n128x3 neon_ld3s_q64(const __int64 * ptr, __n128x3 src, const int lane);
// LD2 family. Every intrinsic returns a two-register aggregate: __n64x2, or __n128x2
// for the _q forms. The numeric suffix selects the pointer element type (__int8..__int64).
// The r / m / s name suffixes follow the same scheme as the neon_ld4* and neon_ld3*
// declarations (presumably backing the vld2 dup / plain / lane macros, which are defined
// elsewhere in the header -- TODO confirm).
__n64x2 neon_ld2r_8(const __int8 * ptr);
__n128x2 neon_ld2r_q8(const __int8 * ptr);
__n64x2 neon_ld2r_16(const __int16 * ptr);
__n128x2 neon_ld2r_q16(const __int16 * ptr);
__n64x2 neon_ld2r_32(const __int32 * ptr);
__n128x2 neon_ld2r_q32(const __int32 * ptr);
__n64x2 neon_ld2r_64(const __int64 * ptr);
__n128x2 neon_ld2r_q64(const __int64 * ptr);
// As with neon_ld4m_* / neon_ld3m_*, no 64-bit D-register form (neon_ld2m_64) is
// declared in this run.
__n64x2 neon_ld2m_8(const __int8 * ptr);
__n128x2 neon_ld2m_q8(const __int8 * ptr);
__n64x2 neon_ld2m_16(const __int16 * ptr);
__n128x2 neon_ld2m_q16(const __int16 * ptr);
__n64x2 neon_ld2m_32(const __int32 * ptr);
__n128x2 neon_ld2m_q32(const __int32 * ptr);
__n128x2 neon_ld2m_q64(const __int64 * ptr);
// The s forms additionally take an existing aggregate (src) and a lane number.
__n64x2 neon_ld2s_8(const __int8 * ptr, __n64x2 src, const int lane);
__n128x2 neon_ld2s_q8(const __int8 * ptr, __n128x2 src, const int lane);
__n64x2 neon_ld2s_16(const __int16 * ptr, __n64x2 src, const int lane);
__n128x2 neon_ld2s_q16(const __int16 * ptr, __n128x2 src, const int lane);
__n64x2 neon_ld2s_32(const __int32 * ptr, __n64x2 src, const int lane);
__n128x2 neon_ld2s_q32(const __int32 * ptr, __n128x2 src, const int lane);
__n64x2 neon_ld2s_64(const __int64 * ptr, __n64x2 src, const int lane);
__n128x2 neon_ld2s_q64(const __int64 * ptr, __n128x2 src, const int lane);
// LD1 family. The r and m forms return a single register (__n64, or __n128 for _q).
// The numeric suffix selects the pointer element type (__int8..__int64).
__n64 neon_ld1r_8(const __int8 * ptr);
__n128 neon_ld1r_q8(const __int8 * ptr);
__n64 neon_ld1r_16(const __int16 * ptr);
__n128 neon_ld1r_q16(const __int16 * ptr);
__n64 neon_ld1r_32(const __int32 * ptr);
__n128 neon_ld1r_q32(const __int32 * ptr);
__n64 neon_ld1r_64(const __int64 * ptr);
__n128 neon_ld1r_q64(const __int64 * ptr);
__n64 neon_ld1m_8(const __int8 * ptr);
__n128 neon_ld1m_q8(const __int8 * ptr);
__n64 neon_ld1m_16(const __int16 * ptr);
__n128 neon_ld1m_q16(const __int16 * ptr);
__n64 neon_ld1m_32(const __int32 * ptr);
__n128 neon_ld1m_q32(const __int32 * ptr);
__n64 neon_ld1m_64(const __int64 * ptr);
__n128 neon_ld1m_q64(const __int64 * ptr);
// neon_ld1m2_* / neon_ld1m3_* / neon_ld1m4_*: return two-, three- and four-register
// aggregates respectively. neon_ld1m3_64 and neon_ld1m4_64 are the intrinsics that the
// 64-bit vld3_* and vld4_* macros (s64 / u64 / f64 / p64) expand to.
__n64x2 neon_ld1m2_8(const __int8 * ptr);
__n128x2 neon_ld1m2_q8(const __int8 * ptr);
__n64x2 neon_ld1m2_16(const __int16 * ptr);
__n128x2 neon_ld1m2_q16(const __int16 * ptr);
__n64x2 neon_ld1m2_32(const __int32 * ptr);
__n128x2 neon_ld1m2_q32(const __int32 * ptr);
__n64x2 neon_ld1m2_64(const __int64 * ptr);
__n128x2 neon_ld1m2_q64(const __int64 * ptr);
__n64x3 neon_ld1m3_8(const __int8 * ptr);
__n128x3 neon_ld1m3_q8(const __int8 * ptr);
__n64x3 neon_ld1m3_16(const __int16 * ptr);
__n128x3 neon_ld1m3_q16(const __int16 * ptr);
__n64x3 neon_ld1m3_32(const __int32 * ptr);
__n128x3 neon_ld1m3_q32(const __int32 * ptr);
__n64x3 neon_ld1m3_64(const __int64 * ptr);
__n128x3 neon_ld1m3_q64(const __int64 * ptr);
__n64x4 neon_ld1m4_8(const __int8 * ptr);
__n128x4 neon_ld1m4_q8(const __int8 * ptr);
__n64x4 neon_ld1m4_16(const __int16 * ptr);
__n128x4 neon_ld1m4_q16(const __int16 * ptr);
__n64x4 neon_ld1m4_32(const __int32 * ptr);
__n128x4 neon_ld1m4_q32(const __int32 * ptr);
__n64x4 neon_ld1m4_64(const __int64 * ptr);
__n128x4 neon_ld1m4_q64(const __int64 * ptr);
// neon_ld1s_*: take an existing register (src) and a lane number in addition to the
// pointer, and return a register of the same type as src.
__n64 neon_ld1s_8(const __int8 * ptr, __n64 src, const int lane);
__n128 neon_ld1s_q8(const __int8 * ptr, __n128 src, const int lane);
__n64 neon_ld1s_16(const __int16 * ptr, __n64 src, const int lane);
__n128 neon_ld1s_q16(const __int16 * ptr, __n128 src, const int lane);
__n64 neon_ld1s_32(const __int32 * ptr, __n64 src, const int lane);
__n128 neon_ld1s_q32(const __int32 * ptr, __n128 src, const int lane);
__n64 neon_ld1s_64(const __int64 * ptr, __n64 src, const int lane);
__n128 neon_ld1s_q64(const __int64 * ptr, __n128 src, const int lane);
// vld4_dup_<type>: wrappers over neon_ld4r_<bits>; the __n64x4 result is converted to
// the typed x4 aggregate. The pointer is cast to the const-qualified integer element
// type that the neon_ld4r_* declarations take, so a caller's const qualifier is not
// cast away.
#define vld4_dup_f32(src) __n64x4_to_float32x2x4_t(neon_ld4r_32((const __int32*)(src)))
#define vld4_dup_f64(src) __n64x4_to_float64x1x4_t(neon_ld4r_64((const __int64*)(src)))
#define vld4_dup_p64(src) __n64x4_to_poly64x1x4_t(neon_ld4r_64((const __int64*)(src)))
#define vld4_dup_p16(src) __n64x4_to_poly16x4x4_t(neon_ld4r_16((const __int16*)(src)))
#define vld4_dup_p8(src) __n64x4_to_poly8x8x4_t(neon_ld4r_8((const __int8*)(src)))
#define vld4_dup_s16(src) __n64x4_to_int16x4x4_t(neon_ld4r_16((const __int16*)(src)))
#define vld4_dup_s32(src) __n64x4_to_int32x2x4_t(neon_ld4r_32((const __int32*)(src)))
#define vld4_dup_s8(src) __n64x4_to_int8x8x4_t(neon_ld4r_8((const __int8*)(src)))
#define vld4_dup_u16(src) __n64x4_to_uint16x4x4_t(neon_ld4r_16((const __int16*)(src)))
#define vld4_dup_u32(src) __n64x4_to_uint32x2x4_t(neon_ld4r_32((const __int32*)(src)))
#define vld4_dup_u8(src) __n64x4_to_uint8x8x4_t(neon_ld4r_8((const __int8*)(src)))
#define vld4_dup_s64(src) __n64x4_to_int64x1x4_t(neon_ld4r_64((const __int64*)(src)))
#define vld4_dup_u64(src) __n64x4_to_uint64x1x4_t(neon_ld4r_64((const __int64*)(src)))
// vld4_<type>: wrappers over neon_ld4m_<bits> for 8/16/32-bit elements. The 64-bit
// element variants (s64 / u64 / f64 / p64) expand to neon_ld1m4_64 instead.
// The pointer is cast to the const-qualified integer element type that the intrinsic
// declarations take, so a caller's const qualifier is not cast away.
#define vld4_f32(src) __n64x4_to_float32x2x4_t(neon_ld4m_32((const __int32*)(src)))
#define vld4_p16(src) __n64x4_to_poly16x4x4_t(neon_ld4m_16((const __int16*)(src)))
#define vld4_p8(src) __n64x4_to_poly8x8x4_t(neon_ld4m_8((const __int8*)(src)))
#define vld4_s16(src) __n64x4_to_int16x4x4_t(neon_ld4m_16((const __int16*)(src)))
#define vld4_s32(src) __n64x4_to_int32x2x4_t(neon_ld4m_32((const __int32*)(src)))
#define vld4_s8(src) __n64x4_to_int8x8x4_t(neon_ld4m_8((const __int8*)(src)))
#define vld4_u16(src) __n64x4_to_uint16x4x4_t(neon_ld4m_16((const __int16*)(src)))
#define vld4_u32(src) __n64x4_to_uint32x2x4_t(neon_ld4m_32((const __int32*)(src)))
#define vld4_u8(src) __n64x4_to_uint8x8x4_t(neon_ld4m_8((const __int8*)(src)))
#define vld4_s64(src) __n64x4_to_int64x1x4_t(neon_ld1m4_64((const __int64*)(src)))
#define vld4_u64(src) __n64x4_to_uint64x1x4_t(neon_ld1m4_64((const __int64*)(src)))
#define vld4_f64(src) __n64x4_to_float64x1x4_t(neon_ld1m4_64((const __int64*)(src)))
#define vld4_p64(src) __n64x4_to_poly64x1x4_t(neon_ld1m4_64((const __int64*)(src)))
// vld4q_dup_<type>: wrappers over neon_ld4r_q<bits>; the __n128x4 result is converted to
// the typed x4 aggregate. The pointer is cast to the const-qualified integer element
// type that the neon_ld4r_q* declarations take, so a caller's const qualifier is not
// cast away.
#define vld4q_dup_f32(src) __n128x4_to_float32x4x4_t(neon_ld4r_q32((const __int32*)(src)))
#define vld4q_dup_f64(src) __n128x4_to_float64x2x4_t(neon_ld4r_q64((const __int64*)(src)))
#define vld4q_dup_p64(src) __n128x4_to_poly64x2x4_t(neon_ld4r_q64((const __int64*)(src)))
#define vld4q_dup_p16(src) __n128x4_to_poly16x8x4_t(neon_ld4r_q16((const __int16*)(src)))
#define vld4q_dup_p8(src) __n128x4_to_poly8x16x4_t(neon_ld4r_q8((const __int8*)(src)))
#define vld4q_dup_s16(src) __n128x4_to_int16x8x4_t(neon_ld4r_q16((const __int16*)(src)))
#define vld4q_dup_s32(src) __n128x4_to_int32x4x4_t(neon_ld4r_q32((const __int32*)(src)))
#define vld4q_dup_s8(src) __n128x4_to_int8x16x4_t(neon_ld4r_q8((const __int8*)(src)))
#define vld4q_dup_u16(src) __n128x4_to_uint16x8x4_t(neon_ld4r_q16((const __int16*)(src)))
#define vld4q_dup_u32(src) __n128x4_to_uint32x4x4_t(neon_ld4r_q32((const __int32*)(src)))
#define vld4q_dup_u8(src) __n128x4_to_uint8x16x4_t(neon_ld4r_q8((const __int8*)(src)))
#define vld4q_dup_s64(src) __n128x4_to_int64x2x4_t(neon_ld4r_q64((const __int64*)(src)))
#define vld4q_dup_u64(src) __n128x4_to_uint64x2x4_t(neon_ld4r_q64((const __int64*)(src)))
// vld4q_<type>: wrappers over neon_ld4m_q<bits> (including the 64-bit element variants,
// which use neon_ld4m_q64). The pointer is cast to the const-qualified integer element
// type that the neon_ld4m_q* declarations take, so a caller's const qualifier is not
// cast away.
#define vld4q_f32(src) __n128x4_to_float32x4x4_t(neon_ld4m_q32((const __int32*)(src)))
#define vld4q_p16(src) __n128x4_to_poly16x8x4_t(neon_ld4m_q16((const __int16*)(src)))
#define vld4q_p8(src) __n128x4_to_poly8x16x4_t(neon_ld4m_q8((const __int8*)(src)))
#define vld4q_s16(src) __n128x4_to_int16x8x4_t(neon_ld4m_q16((const __int16*)(src)))
#define vld4q_s32(src) __n128x4_to_int32x4x4_t(neon_ld4m_q32((const __int32*)(src)))
#define vld4q_s8(src) __n128x4_to_int8x16x4_t(neon_ld4m_q8((const __int8*)(src)))
#define vld4q_u16(src) __n128x4_to_uint16x8x4_t(neon_ld4m_q16((const __int16*)(src)))
#define vld4q_u32(src) __n128x4_to_uint32x4x4_t(neon_ld4m_q32((const __int32*)(src)))
#define vld4q_u8(src) __n128x4_to_uint8x16x4_t(neon_ld4m_q8((const __int8*)(src)))
#define vld4q_s64(src) __n128x4_to_int64x2x4_t(neon_ld4m_q64((const __int64*)(src)))
#define vld4q_u64(src) __n128x4_to_uint64x2x4_t(neon_ld4m_q64((const __int64*)(src)))
#define vld4q_f64(src) __n128x4_to_float64x2x4_t(neon_ld4m_q64((const __int64*)(src)))
#define vld4q_p64(src) __n128x4_to_poly64x2x4_t(neon_ld4m_q64((const __int64*)(src)))
// vld4_lane_<type>(src1, src2, src3): wrappers over neon_ld4s_<bits>. src1 is the
// pointer, src2 the existing typed x4 aggregate (converted to __n64x4), src3 the lane
// argument, forwarded unchanged. The pointer is cast to the const-qualified integer
// element type that the neon_ld4s_* declarations take, so a caller's const qualifier is
// not cast away.
#define vld4_lane_f32(src1, src2, src3) __n64x4_to_float32x2x4_t(neon_ld4s_32((const __int32*)(src1), __float32x2x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_f64(src1, src2, src3) __n64x4_to_float64x1x4_t(neon_ld4s_64((const __int64*)(src1), __float64x1x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_p64(src1, src2, src3) __n64x4_to_poly64x1x4_t(neon_ld4s_64((const __int64*)(src1), __poly64x1x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_p16(src1, src2, src3) __n64x4_to_poly16x4x4_t(neon_ld4s_16((const __int16*)(src1), __poly16x4x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_p8(src1, src2, src3) __n64x4_to_poly8x8x4_t(neon_ld4s_8((const __int8*)(src1), __poly8x8x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_s16(src1, src2, src3) __n64x4_to_int16x4x4_t(neon_ld4s_16((const __int16*)(src1), __int16x4x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_s32(src1, src2, src3) __n64x4_to_int32x2x4_t(neon_ld4s_32((const __int32*)(src1), __int32x2x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_s64(src1, src2, src3) __n64x4_to_int64x1x4_t(neon_ld4s_64((const __int64*)(src1), __int64x1x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_s8(src1, src2, src3) __n64x4_to_int8x8x4_t(neon_ld4s_8((const __int8*)(src1), __int8x8x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_u16(src1, src2, src3) __n64x4_to_uint16x4x4_t(neon_ld4s_16((const __int16*)(src1), __uint16x4x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_u32(src1, src2, src3) __n64x4_to_uint32x2x4_t(neon_ld4s_32((const __int32*)(src1), __uint32x2x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_u64(src1, src2, src3) __n64x4_to_uint64x1x4_t(neon_ld4s_64((const __int64*)(src1), __uint64x1x4_t_to_n64x4(src2), (src3)))
#define vld4_lane_u8(src1, src2, src3) __n64x4_to_uint8x8x4_t(neon_ld4s_8((const __int8*)(src1), __uint8x8x4_t_to_n64x4(src2), (src3)))
// vld4q_lane_<type>(src1, src2, src3): wrappers over neon_ld4s_q<bits>. src1 is the
// pointer, src2 the existing typed x4 aggregate (converted to __n128x4), src3 the lane
// argument, forwarded unchanged. The pointer is cast to the const-qualified integer
// element type that the neon_ld4s_q* declarations take, so a caller's const qualifier
// is not cast away.
#define vld4q_lane_f32(src1, src2, src3) __n128x4_to_float32x4x4_t(neon_ld4s_q32((const __int32*)(src1), __float32x4x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_f64(src1, src2, src3) __n128x4_to_float64x2x4_t(neon_ld4s_q64((const __int64*)(src1), __float64x2x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_p64(src1, src2, src3) __n128x4_to_poly64x2x4_t(neon_ld4s_q64((const __int64*)(src1), __poly64x2x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_p8(src1, src2, src3) __n128x4_to_poly8x16x4_t(neon_ld4s_q8((const __int8*)(src1), __poly8x16x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_p16(src1, src2, src3) __n128x4_to_poly16x8x4_t(neon_ld4s_q16((const __int16*)(src1), __poly16x8x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_s8(src1, src2, src3) __n128x4_to_int8x16x4_t(neon_ld4s_q8((const __int8*)(src1), __int8x16x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_s16(src1, src2, src3) __n128x4_to_int16x8x4_t(neon_ld4s_q16((const __int16*)(src1), __int16x8x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_s32(src1, src2, src3) __n128x4_to_int32x4x4_t(neon_ld4s_q32((const __int32*)(src1), __int32x4x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_s64(src1, src2, src3) __n128x4_to_int64x2x4_t(neon_ld4s_q64((const __int64*)(src1), __int64x2x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_u8(src1, src2, src3) __n128x4_to_uint8x16x4_t(neon_ld4s_q8((const __int8*)(src1), __uint8x16x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_u16(src1, src2, src3) __n128x4_to_uint16x8x4_t(neon_ld4s_q16((const __int16*)(src1), __uint16x8x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_u32(src1, src2, src3) __n128x4_to_uint32x4x4_t(neon_ld4s_q32((const __int32*)(src1), __uint32x4x4_t_to_n128x4(src2), (src3)))
#define vld4q_lane_u64(src1, src2, src3) __n128x4_to_uint64x2x4_t(neon_ld4s_q64((const __int64*)(src1), __uint64x2x4_t_to_n128x4(src2), (src3)))
// vld3_dup_<type>: wrappers over neon_ld3r_<bits>; the __n64x3 result is converted to
// the typed x3 aggregate. The pointer is cast to the const-qualified integer element
// type that the neon_ld3r_* declarations take, so a caller's const qualifier is not
// cast away.
#define vld3_dup_f32(src) __n64x3_to_float32x2x3_t(neon_ld3r_32((const __int32*)(src)))
#define vld3_dup_f64(src) __n64x3_to_float64x1x3_t(neon_ld3r_64((const __int64*)(src)))
#define vld3_dup_p64(src) __n64x3_to_poly64x1x3_t(neon_ld3r_64((const __int64*)(src)))
#define vld3_dup_p16(src) __n64x3_to_poly16x4x3_t(neon_ld3r_16((const __int16*)(src)))
#define vld3_dup_p8(src) __n64x3_to_poly8x8x3_t(neon_ld3r_8((const __int8*)(src)))
#define vld3_dup_s16(src) __n64x3_to_int16x4x3_t(neon_ld3r_16((const __int16*)(src)))
#define vld3_dup_s32(src) __n64x3_to_int32x2x3_t(neon_ld3r_32((const __int32*)(src)))
#define vld3_dup_s8(src) __n64x3_to_int8x8x3_t(neon_ld3r_8((const __int8*)(src)))
#define vld3_dup_u16(src) __n64x3_to_uint16x4x3_t(neon_ld3r_16((const __int16*)(src)))
#define vld3_dup_u32(src) __n64x3_to_uint32x2x3_t(neon_ld3r_32((const __int32*)(src)))
#define vld3_dup_u8(src) __n64x3_to_uint8x8x3_t(neon_ld3r_8((const __int8*)(src)))
#define vld3_dup_s64(src) __n64x3_to_int64x1x3_t(neon_ld3r_64((const __int64*)(src)))
#define vld3_dup_u64(src) __n64x3_to_uint64x1x3_t(neon_ld3r_64((const __int64*)(src)))
// vld3_<type>: wrappers over neon_ld3m_<bits> for 8/16/32-bit elements. The 64-bit
// element variants (s64 / u64 / f64 / p64) expand to neon_ld1m3_64 instead.
// The pointer is cast to the const-qualified integer element type that the intrinsic
// declarations take, so a caller's const qualifier is not cast away.
#define vld3_f32(src) __n64x3_to_float32x2x3_t(neon_ld3m_32((const __int32*)(src)))
#define vld3_p16(src) __n64x3_to_poly16x4x3_t(neon_ld3m_16((const __int16*)(src)))
#define vld3_p8(src) __n64x3_to_poly8x8x3_t(neon_ld3m_8((const __int8*)(src)))
#define vld3_s16(src) __n64x3_to_int16x4x3_t(neon_ld3m_16((const __int16*)(src)))
#define vld3_s32(src) __n64x3_to_int32x2x3_t(neon_ld3m_32((const __int32*)(src)))
#define vld3_s8(src) __n64x3_to_int8x8x3_t(neon_ld3m_8((const __int8*)(src)))
#define vld3_u16(src) __n64x3_to_uint16x4x3_t(neon_ld3m_16((const __int16*)(src)))
#define vld3_u32(src) __n64x3_to_uint32x2x3_t(neon_ld3m_32((const __int32*)(src)))
#define vld3_u8(src) __n64x3_to_uint8x8x3_t(neon_ld3m_8((const __int8*)(src)))
#define vld3_s64(src) __n64x3_to_int64x1x3_t(neon_ld1m3_64((const __int64*)(src)))
#define vld3_u64(src) __n64x3_to_uint64x1x3_t(neon_ld1m3_64((const __int64*)(src)))
#define vld3_f64(src) __n64x3_to_float64x1x3_t(neon_ld1m3_64((const __int64*)(src)))
#define vld3_p64(src) __n64x3_to_poly64x1x3_t(neon_ld1m3_64((const __int64*)(src)))
// vld3q_dup_<type>: wrappers over neon_ld3r_q<bits>; the __n128x3 result is converted to
// the typed x3 aggregate. The pointer is cast to the const-qualified integer element
// type that the neon_ld3r_q* declarations take, so a caller's const qualifier is not
// cast away.
#define vld3q_dup_f32(src) __n128x3_to_float32x4x3_t(neon_ld3r_q32((const __int32*)(src)))
#define vld3q_dup_f64(src) __n128x3_to_float64x2x3_t(neon_ld3r_q64((const __int64*)(src)))
#define vld3q_dup_p64(src) __n128x3_to_poly64x2x3_t(neon_ld3r_q64((const __int64*)(src)))
#define vld3q_dup_p16(src) __n128x3_to_poly16x8x3_t(neon_ld3r_q16((const __int16*)(src)))
#define vld3q_dup_p8(src) __n128x3_to_poly8x16x3_t(neon_ld3r_q8((const __int8*)(src)))
#define vld3q_dup_s16(src) __n128x3_to_int16x8x3_t(neon_ld3r_q16((const __int16*)(src)))
#define vld3q_dup_s32(src) __n128x3_to_int32x4x3_t(neon_ld3r_q32((const __int32*)(src)))
#define vld3q_dup_s8(src) __n128x3_to_int8x16x3_t(neon_ld3r_q8((const __int8*)(src)))
#define vld3q_dup_u16(src) __n128x3_to_uint16x8x3_t(neon_ld3r_q16((const __int16*)(src)))
#define vld3q_dup_u32(src) __n128x3_to_uint32x4x3_t(neon_ld3r_q32((const __int32*)(src)))
#define vld3q_dup_u8(src) __n128x3_to_uint8x16x3_t(neon_ld3r_q8((const __int8*)(src)))
#define vld3q_dup_s64(src) __n128x3_to_int64x2x3_t(neon_ld3r_q64((const __int64*)(src)))
#define vld3q_dup_u64(src) __n128x3_to_uint64x2x3_t(neon_ld3r_q64((const __int64*)(src)))
// vld3q_<type>: wrappers over neon_ld3m_q<bits> (including the 64-bit element variants,
// which use neon_ld3m_q64). The pointer is cast to the const-qualified integer element
// type that the neon_ld3m_q* declarations take, so a caller's const qualifier is not
// cast away.
#define vld3q_f32(src) __n128x3_to_float32x4x3_t(neon_ld3m_q32((const __int32*)(src)))
#define vld3q_p16(src) __n128x3_to_poly16x8x3_t(neon_ld3m_q16((const __int16*)(src)))
#define vld3q_p8(src) __n128x3_to_poly8x16x3_t(neon_ld3m_q8((const __int8*)(src)))
#define vld3q_s16(src) __n128x3_to_int16x8x3_t(neon_ld3m_q16((const __int16*)(src)))
#define vld3q_s32(src) __n128x3_to_int32x4x3_t(neon_ld3m_q32((const __int32*)(src)))
#define vld3q_s8(src) __n128x3_to_int8x16x3_t(neon_ld3m_q8((const __int8*)(src)))
#define vld3q_u16(src) __n128x3_to_uint16x8x3_t(neon_ld3m_q16((const __int16*)(src)))
#define vld3q_u32(src) __n128x3_to_uint32x4x3_t(neon_ld3m_q32((const __int32*)(src)))
#define vld3q_u8(src) __n128x3_to_uint8x16x3_t(neon_ld3m_q8((const __int8*)(src)))
#define vld3q_s64(src) __n128x3_to_int64x2x3_t(neon_ld3m_q64((const __int64*)(src)))
#define vld3q_u64(src) __n128x3_to_uint64x2x3_t(neon_ld3m_q64((const __int64*)(src)))
#define vld3q_f64(src) __n128x3_to_float64x2x3_t(neon_ld3m_q64((const __int64*)(src)))
#define vld3q_p64(src) __n128x3_to_poly64x2x3_t(neon_ld3m_q64((const __int64*)(src)))
// vld3_lane_<type>(src1, src2, src3): wrappers over neon_ld3s_<bits>. src1 is the
// pointer, src2 the existing typed x3 aggregate (converted to __n64x3), src3 the lane
// argument, forwarded unchanged. The pointer is cast to the const-qualified integer
// element type that the neon_ld3s_* declarations take, so a caller's const qualifier is
// not cast away.
#define vld3_lane_f32(src1, src2, src3) __n64x3_to_float32x2x3_t(neon_ld3s_32((const __int32*)(src1), __float32x2x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_f64(src1, src2, src3) __n64x3_to_float64x1x3_t(neon_ld3s_64((const __int64*)(src1), __float64x1x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_p64(src1, src2, src3) __n64x3_to_poly64x1x3_t(neon_ld3s_64((const __int64*)(src1), __poly64x1x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_p16(src1, src2, src3) __n64x3_to_poly16x4x3_t(neon_ld3s_16((const __int16*)(src1), __poly16x4x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_p8(src1, src2, src3) __n64x3_to_poly8x8x3_t(neon_ld3s_8((const __int8*)(src1), __poly8x8x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_s16(src1, src2, src3) __n64x3_to_int16x4x3_t(neon_ld3s_16((const __int16*)(src1), __int16x4x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_s32(src1, src2, src3) __n64x3_to_int32x2x3_t(neon_ld3s_32((const __int32*)(src1), __int32x2x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_s64(src1, src2, src3) __n64x3_to_int64x1x3_t(neon_ld3s_64((const __int64*)(src1), __int64x1x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_s8(src1, src2, src3) __n64x3_to_int8x8x3_t(neon_ld3s_8((const __int8*)(src1), __int8x8x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_u16(src1, src2, src3) __n64x3_to_uint16x4x3_t(neon_ld3s_16((const __int16*)(src1), __uint16x4x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_u32(src1, src2, src3) __n64x3_to_uint32x2x3_t(neon_ld3s_32((const __int32*)(src1), __uint32x2x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_u64(src1, src2, src3) __n64x3_to_uint64x1x3_t(neon_ld3s_64((const __int64*)(src1), __uint64x1x3_t_to_n64x3(src2), (src3)))
#define vld3_lane_u8(src1, src2, src3) __n64x3_to_uint8x8x3_t(neon_ld3s_8((const __int8*)(src1), __uint8x8x3_t_to_n64x3(src2), (src3)))
#define vld3q_lane_f32(src1, src2, src3) __n128x3_to_float32x4x3_t(neon_ld3s_q32((__int32*)(src1), __float32x4x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_f64(src1, src2, src3) __n128x3_to_float64x2x3_t(neon_ld3s_q64((__int64*)(src1), __float64x2x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_p64(src1, src2, src3) __n128x3_to_poly64x2x3_t(neon_ld3s_q64((__int64*)(src1), __poly64x2x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_p8(src1, src2, src3) __n128x3_to_poly8x16x3_t(neon_ld3s_q8((__int8*)(src1), __poly8x16x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_p16(src1, src2, src3) __n128x3_to_poly16x8x3_t(neon_ld3s_q16((__int16*)(src1), __poly16x8x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_s8(src1, src2, src3) __n128x3_to_int8x16x3_t(neon_ld3s_q8((__int8*)(src1), __int8x16x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_s16(src1, src2, src3) __n128x3_to_int16x8x3_t(neon_ld3s_q16((__int16*)(src1), __int16x8x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_s32(src1, src2, src3) __n128x3_to_int32x4x3_t(neon_ld3s_q32((__int32*)(src1), __int32x4x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_s64(src1, src2, src3) __n128x3_to_int64x2x3_t(neon_ld3s_q64((__int64*)(src1), __int64x2x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_u8(src1, src2, src3) __n128x3_to_uint8x16x3_t(neon_ld3s_q8((__int8*)(src1), __uint8x16x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_u16(src1, src2, src3) __n128x3_to_uint16x8x3_t(neon_ld3s_q16((__int16*)(src1), __uint16x8x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_u32(src1, src2, src3) __n128x3_to_uint32x4x3_t(neon_ld3s_q32((__int32*)(src1), __uint32x4x3_t_to_n128x3(src2), (src3)))
#define vld3q_lane_u64(src1, src2, src3) __n128x3_to_uint64x2x3_t(neon_ld3s_q64((__int64*)(src1), __uint64x2x3_t_to_n128x3(src2), (src3)))
// vld2_dup_<type>: cast the address to the element-width integer pointer,
// call neon_ld2r_{8,16,32,64}, and wrap the call in the matching
// __n64x2_to_<type> conversion helper.
#define vld2_dup_f32(ptr) \
    __n64x2_to_float32x2x2_t(neon_ld2r_32((__int32*)(ptr)))
#define vld2_dup_f64(ptr) \
    __n64x2_to_float64x1x2_t(neon_ld2r_64((__int64*)(ptr)))
#define vld2_dup_p64(ptr) \
    __n64x2_to_poly64x1x2_t(neon_ld2r_64((__int64*)(ptr)))
#define vld2_dup_p16(ptr) \
    __n64x2_to_poly16x4x2_t(neon_ld2r_16((__int16*)(ptr)))
#define vld2_dup_p8(ptr) \
    __n64x2_to_poly8x8x2_t(neon_ld2r_8((__int8*)(ptr)))
#define vld2_dup_s16(ptr) \
    __n64x2_to_int16x4x2_t(neon_ld2r_16((__int16*)(ptr)))
#define vld2_dup_s32(ptr) \
    __n64x2_to_int32x2x2_t(neon_ld2r_32((__int32*)(ptr)))
#define vld2_dup_s8(ptr) \
    __n64x2_to_int8x8x2_t(neon_ld2r_8((__int8*)(ptr)))
#define vld2_dup_u16(ptr) \
    __n64x2_to_uint16x4x2_t(neon_ld2r_16((__int16*)(ptr)))
#define vld2_dup_u32(ptr) \
    __n64x2_to_uint32x2x2_t(neon_ld2r_32((__int32*)(ptr)))
#define vld2_dup_u8(ptr) \
    __n64x2_to_uint8x8x2_t(neon_ld2r_8((__int8*)(ptr)))
#define vld2_dup_s64(ptr) \
    __n64x2_to_int64x1x2_t(neon_ld2r_64((__int64*)(ptr)))
#define vld2_dup_u64(ptr) \
    __n64x2_to_uint64x1x2_t(neon_ld2r_64((__int64*)(ptr)))

// vld2_<type>: 8/16/32-bit element variants call neon_ld2m_{8,16,32}.
#define vld2_f32(ptr) \
    __n64x2_to_float32x2x2_t(neon_ld2m_32((__int32*)(ptr)))
#define vld2_p16(ptr) \
    __n64x2_to_poly16x4x2_t(neon_ld2m_16((__int16*)(ptr)))
#define vld2_p8(ptr) \
    __n64x2_to_poly8x8x2_t(neon_ld2m_8((__int8*)(ptr)))
#define vld2_s16(ptr) \
    __n64x2_to_int16x4x2_t(neon_ld2m_16((__int16*)(ptr)))
#define vld2_s32(ptr) \
    __n64x2_to_int32x2x2_t(neon_ld2m_32((__int32*)(ptr)))
#define vld2_s8(ptr) \
    __n64x2_to_int8x8x2_t(neon_ld2m_8((__int8*)(ptr)))
#define vld2_u16(ptr) \
    __n64x2_to_uint16x4x2_t(neon_ld2m_16((__int16*)(ptr)))
#define vld2_u32(ptr) \
    __n64x2_to_uint32x2x2_t(neon_ld2m_32((__int32*)(ptr)))
#define vld2_u8(ptr) \
    __n64x2_to_uint8x8x2_t(neon_ld2m_8((__int8*)(ptr)))

// The 64-bit element variants call neon_ld1m2_64 (the intrinsic also used by
// the vld1_*64_x2 macros) instead of a neon_ld2m_64.
// NOTE(review): presumably because LD2 (multiple structures) has no 1D
// arrangement and a one-element de-interleave equals a plain two-register
// load -- confirm against the A64 instruction reference.
#define vld2_s64(ptr) \
    __n64x2_to_int64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
#define vld2_u64(ptr) \
    __n64x2_to_uint64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
#define vld2_f64(ptr) \
    __n64x2_to_float64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
#define vld2_p64(ptr) \
    __n64x2_to_poly64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
// vld2q_dup_<type>: cast the address to the element-width integer pointer,
// call neon_ld2r_q{8,16,32,64}, and wrap the call in the matching
// __n128x2_to_<type> conversion helper.
#define vld2q_dup_f32(ptr) \
    __n128x2_to_float32x4x2_t(neon_ld2r_q32((__int32*)(ptr)))
#define vld2q_dup_f64(ptr) \
    __n128x2_to_float64x2x2_t(neon_ld2r_q64((__int64*)(ptr)))
#define vld2q_dup_p64(ptr) \
    __n128x2_to_poly64x2x2_t(neon_ld2r_q64((__int64*)(ptr)))
#define vld2q_dup_p16(ptr) \
    __n128x2_to_poly16x8x2_t(neon_ld2r_q16((__int16*)(ptr)))
#define vld2q_dup_p8(ptr) \
    __n128x2_to_poly8x16x2_t(neon_ld2r_q8((__int8*)(ptr)))
#define vld2q_dup_s16(ptr) \
    __n128x2_to_int16x8x2_t(neon_ld2r_q16((__int16*)(ptr)))
#define vld2q_dup_s32(ptr) \
    __n128x2_to_int32x4x2_t(neon_ld2r_q32((__int32*)(ptr)))
#define vld2q_dup_s8(ptr) \
    __n128x2_to_int8x16x2_t(neon_ld2r_q8((__int8*)(ptr)))
#define vld2q_dup_u16(ptr) \
    __n128x2_to_uint16x8x2_t(neon_ld2r_q16((__int16*)(ptr)))
#define vld2q_dup_u32(ptr) \
    __n128x2_to_uint32x4x2_t(neon_ld2r_q32((__int32*)(ptr)))
#define vld2q_dup_u8(ptr) \
    __n128x2_to_uint8x16x2_t(neon_ld2r_q8((__int8*)(ptr)))
#define vld2q_dup_s64(ptr) \
    __n128x2_to_int64x2x2_t(neon_ld2r_q64((__int64*)(ptr)))
#define vld2q_dup_u64(ptr) \
    __n128x2_to_uint64x2x2_t(neon_ld2r_q64((__int64*)(ptr)))

// vld2q_<type>: same pattern through neon_ld2m_q{8,16,32,64}; unlike the
// 64-bit vld2_* forms, the 64-bit element variants here use neon_ld2m_q64.
#define vld2q_f32(ptr) \
    __n128x2_to_float32x4x2_t(neon_ld2m_q32((__int32*)(ptr)))
#define vld2q_p16(ptr) \
    __n128x2_to_poly16x8x2_t(neon_ld2m_q16((__int16*)(ptr)))
#define vld2q_p8(ptr) \
    __n128x2_to_poly8x16x2_t(neon_ld2m_q8((__int8*)(ptr)))
#define vld2q_s16(ptr) \
    __n128x2_to_int16x8x2_t(neon_ld2m_q16((__int16*)(ptr)))
#define vld2q_s32(ptr) \
    __n128x2_to_int32x4x2_t(neon_ld2m_q32((__int32*)(ptr)))
#define vld2q_s8(ptr) \
    __n128x2_to_int8x16x2_t(neon_ld2m_q8((__int8*)(ptr)))
#define vld2q_u16(ptr) \
    __n128x2_to_uint16x8x2_t(neon_ld2m_q16((__int16*)(ptr)))
#define vld2q_u32(ptr) \
    __n128x2_to_uint32x4x2_t(neon_ld2m_q32((__int32*)(ptr)))
#define vld2q_u8(ptr) \
    __n128x2_to_uint8x16x2_t(neon_ld2m_q8((__int8*)(ptr)))
#define vld2q_s64(ptr) \
    __n128x2_to_int64x2x2_t(neon_ld2m_q64((__int64*)(ptr)))
#define vld2q_u64(ptr) \
    __n128x2_to_uint64x2x2_t(neon_ld2m_q64((__int64*)(ptr)))
#define vld2q_f64(ptr) \
    __n128x2_to_float64x2x2_t(neon_ld2m_q64((__int64*)(ptr)))
#define vld2q_p64(ptr) \
    __n128x2_to_poly64x2x2_t(neon_ld2m_q64((__int64*)(ptr)))
// vld2_lane_<type>(ptr, vec, lane): casts ptr to the element-width integer
// pointer, converts vec with the matching __<type>x2_t_to_n64x2 helper, and
// forwards lane unchanged as the last argument of neon_ld2s_{8,16,32,64}.
// The result goes through the __n64x2_to_<type> helper.
// NOTE(review): the neon_ld2s_* prototypes are outside this section; the third
// argument is presumably a lane index -- confirm against the declarations.
#define vld2_lane_f32(ptr, vec, lane) \
    __n64x2_to_float32x2x2_t(neon_ld2s_32((__int32*)(ptr), __float32x2x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_f64(ptr, vec, lane) \
    __n64x2_to_float64x1x2_t(neon_ld2s_64((__int64*)(ptr), __float64x1x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_p64(ptr, vec, lane) \
    __n64x2_to_poly64x1x2_t(neon_ld2s_64((__int64*)(ptr), __poly64x1x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_p16(ptr, vec, lane) \
    __n64x2_to_poly16x4x2_t(neon_ld2s_16((__int16*)(ptr), __poly16x4x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_p8(ptr, vec, lane) \
    __n64x2_to_poly8x8x2_t(neon_ld2s_8((__int8*)(ptr), __poly8x8x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_s16(ptr, vec, lane) \
    __n64x2_to_int16x4x2_t(neon_ld2s_16((__int16*)(ptr), __int16x4x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_s32(ptr, vec, lane) \
    __n64x2_to_int32x2x2_t(neon_ld2s_32((__int32*)(ptr), __int32x2x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_s64(ptr, vec, lane) \
    __n64x2_to_int64x1x2_t(neon_ld2s_64((__int64*)(ptr), __int64x1x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_s8(ptr, vec, lane) \
    __n64x2_to_int8x8x2_t(neon_ld2s_8((__int8*)(ptr), __int8x8x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_u16(ptr, vec, lane) \
    __n64x2_to_uint16x4x2_t(neon_ld2s_16((__int16*)(ptr), __uint16x4x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_u32(ptr, vec, lane) \
    __n64x2_to_uint32x2x2_t(neon_ld2s_32((__int32*)(ptr), __uint32x2x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_u64(ptr, vec, lane) \
    __n64x2_to_uint64x1x2_t(neon_ld2s_64((__int64*)(ptr), __uint64x1x2_t_to_n64x2(vec), (lane)))
#define vld2_lane_u8(ptr, vec, lane) \
    __n64x2_to_uint8x8x2_t(neon_ld2s_8((__int8*)(ptr), __uint8x8x2_t_to_n64x2(vec), (lane)))

// vld2q_lane_<type>: same pattern using the neon_ld2s_q{8,16,32,64}
// intrinsics and the __n128x2 conversion helpers.
#define vld2q_lane_f32(ptr, vec, lane) \
    __n128x2_to_float32x4x2_t(neon_ld2s_q32((__int32*)(ptr), __float32x4x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_f64(ptr, vec, lane) \
    __n128x2_to_float64x2x2_t(neon_ld2s_q64((__int64*)(ptr), __float64x2x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_p64(ptr, vec, lane) \
    __n128x2_to_poly64x2x2_t(neon_ld2s_q64((__int64*)(ptr), __poly64x2x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_p8(ptr, vec, lane) \
    __n128x2_to_poly8x16x2_t(neon_ld2s_q8((__int8*)(ptr), __poly8x16x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_p16(ptr, vec, lane) \
    __n128x2_to_poly16x8x2_t(neon_ld2s_q16((__int16*)(ptr), __poly16x8x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_s8(ptr, vec, lane) \
    __n128x2_to_int8x16x2_t(neon_ld2s_q8((__int8*)(ptr), __int8x16x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_s16(ptr, vec, lane) \
    __n128x2_to_int16x8x2_t(neon_ld2s_q16((__int16*)(ptr), __int16x8x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_s32(ptr, vec, lane) \
    __n128x2_to_int32x4x2_t(neon_ld2s_q32((__int32*)(ptr), __int32x4x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_s64(ptr, vec, lane) \
    __n128x2_to_int64x2x2_t(neon_ld2s_q64((__int64*)(ptr), __int64x2x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_u8(ptr, vec, lane) \
    __n128x2_to_uint8x16x2_t(neon_ld2s_q8((__int8*)(ptr), __uint8x16x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_u16(ptr, vec, lane) \
    __n128x2_to_uint16x8x2_t(neon_ld2s_q16((__int16*)(ptr), __uint16x8x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_u32(ptr, vec, lane) \
    __n128x2_to_uint32x4x2_t(neon_ld2s_q32((__int32*)(ptr), __uint32x4x2_t_to_n128x2(vec), (lane)))
#define vld2q_lane_u64(ptr, vec, lane) \
    __n128x2_to_uint64x2x2_t(neon_ld2s_q64((__int64*)(ptr), __uint64x2x2_t_to_n128x2(vec), (lane)))
// vld1_dup_<type>: cast the address to the element-width integer pointer,
// call neon_ld1r_{8,16,32,64}, and wrap the call in the matching
// __n64_to_<type> conversion helper.
#define vld1_dup_f32(ptr) \
    __n64_to_float32x2_t(neon_ld1r_32((__int32*)(ptr)))
#define vld1_dup_f64(ptr) \
    __n64_to_float64x1_t(neon_ld1r_64((__int64*)(ptr)))
#define vld1_dup_p64(ptr) \
    __n64_to_poly64x1_t(neon_ld1r_64((__int64*)(ptr)))
#define vld1_dup_p16(ptr) \
    __n64_to_poly16x4_t(neon_ld1r_16((__int16*)(ptr)))
#define vld1_dup_p8(ptr) \
    __n64_to_poly8x8_t(neon_ld1r_8((__int8*)(ptr)))
#define vld1_dup_s16(ptr) \
    __n64_to_int16x4_t(neon_ld1r_16((__int16*)(ptr)))
#define vld1_dup_s32(ptr) \
    __n64_to_int32x2_t(neon_ld1r_32((__int32*)(ptr)))
#define vld1_dup_s8(ptr) \
    __n64_to_int8x8_t(neon_ld1r_8((__int8*)(ptr)))
#define vld1_dup_u16(ptr) \
    __n64_to_uint16x4_t(neon_ld1r_16((__int16*)(ptr)))
#define vld1_dup_u32(ptr) \
    __n64_to_uint32x2_t(neon_ld1r_32((__int32*)(ptr)))
#define vld1_dup_u8(ptr) \
    __n64_to_uint8x8_t(neon_ld1r_8((__int8*)(ptr)))
#define vld1_dup_s64(ptr) \
    __n64_to_int64x1_t(neon_ld1r_64((__int64*)(ptr)))
#define vld1_dup_u64(ptr) \
    __n64_to_uint64x1_t(neon_ld1r_64((__int64*)(ptr)))

// vld1_<type>: same pattern through neon_ld1m_{8,16,32,64}.
#define vld1_f32(ptr) \
    __n64_to_float32x2_t(neon_ld1m_32((__int32*)(ptr)))
#define vld1_p16(ptr) \
    __n64_to_poly16x4_t(neon_ld1m_16((__int16*)(ptr)))
#define vld1_p8(ptr) \
    __n64_to_poly8x8_t(neon_ld1m_8((__int8*)(ptr)))
#define vld1_s16(ptr) \
    __n64_to_int16x4_t(neon_ld1m_16((__int16*)(ptr)))
#define vld1_s32(ptr) \
    __n64_to_int32x2_t(neon_ld1m_32((__int32*)(ptr)))
#define vld1_s8(ptr) \
    __n64_to_int8x8_t(neon_ld1m_8((__int8*)(ptr)))
#define vld1_u16(ptr) \
    __n64_to_uint16x4_t(neon_ld1m_16((__int16*)(ptr)))
#define vld1_u32(ptr) \
    __n64_to_uint32x2_t(neon_ld1m_32((__int32*)(ptr)))
#define vld1_u8(ptr) \
    __n64_to_uint8x8_t(neon_ld1m_8((__int8*)(ptr)))
#define vld1_s64(ptr) \
    __n64_to_int64x1_t(neon_ld1m_64((__int64*)(ptr)))
#define vld1_u64(ptr) \
    __n64_to_uint64x1_t(neon_ld1m_64((__int64*)(ptr)))
#define vld1_f64(ptr) \
    __n64_to_float64x1_t(neon_ld1m_64((__int64*)(ptr)))
#define vld1_p64(ptr) \
    __n64_to_poly64x1_t(neon_ld1m_64((__int64*)(ptr)))
// vld1_<type>_x2: cast the address to the element-width integer pointer,
// call neon_ld1m2_{8,16,32,64}, and wrap the call in the matching
// __n64x2_to_<type> conversion helper.
#define vld1_f32_x2(ptr) \
    __n64x2_to_float32x2x2_t(neon_ld1m2_32((__int32*)(ptr)))
#define vld1_p16_x2(ptr) \
    __n64x2_to_poly16x4x2_t(neon_ld1m2_16((__int16*)(ptr)))
#define vld1_p8_x2(ptr) \
    __n64x2_to_poly8x8x2_t(neon_ld1m2_8((__int8*)(ptr)))
#define vld1_s16_x2(ptr) \
    __n64x2_to_int16x4x2_t(neon_ld1m2_16((__int16*)(ptr)))
#define vld1_s32_x2(ptr) \
    __n64x2_to_int32x2x2_t(neon_ld1m2_32((__int32*)(ptr)))
#define vld1_s8_x2(ptr) \
    __n64x2_to_int8x8x2_t(neon_ld1m2_8((__int8*)(ptr)))
#define vld1_u16_x2(ptr) \
    __n64x2_to_uint16x4x2_t(neon_ld1m2_16((__int16*)(ptr)))
#define vld1_u32_x2(ptr) \
    __n64x2_to_uint32x2x2_t(neon_ld1m2_32((__int32*)(ptr)))
#define vld1_u8_x2(ptr) \
    __n64x2_to_uint8x8x2_t(neon_ld1m2_8((__int8*)(ptr)))
#define vld1_s64_x2(ptr) \
    __n64x2_to_int64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
#define vld1_u64_x2(ptr) \
    __n64x2_to_uint64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
#define vld1_f64_x2(ptr) \
    __n64x2_to_float64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))
#define vld1_p64_x2(ptr) \
    __n64x2_to_poly64x1x2_t(neon_ld1m2_64((__int64*)(ptr)))

// vld1_<type>_x3: same pattern through neon_ld1m3_* and __n64x3_to_<type>.
#define vld1_f32_x3(ptr) \
    __n64x3_to_float32x2x3_t(neon_ld1m3_32((__int32*)(ptr)))
#define vld1_p16_x3(ptr) \
    __n64x3_to_poly16x4x3_t(neon_ld1m3_16((__int16*)(ptr)))
#define vld1_p8_x3(ptr) \
    __n64x3_to_poly8x8x3_t(neon_ld1m3_8((__int8*)(ptr)))
#define vld1_s16_x3(ptr) \
    __n64x3_to_int16x4x3_t(neon_ld1m3_16((__int16*)(ptr)))
#define vld1_s32_x3(ptr) \
    __n64x3_to_int32x2x3_t(neon_ld1m3_32((__int32*)(ptr)))
#define vld1_s8_x3(ptr) \
    __n64x3_to_int8x8x3_t(neon_ld1m3_8((__int8*)(ptr)))
#define vld1_u16_x3(ptr) \
    __n64x3_to_uint16x4x3_t(neon_ld1m3_16((__int16*)(ptr)))
#define vld1_u32_x3(ptr) \
    __n64x3_to_uint32x2x3_t(neon_ld1m3_32((__int32*)(ptr)))
#define vld1_u8_x3(ptr) \
    __n64x3_to_uint8x8x3_t(neon_ld1m3_8((__int8*)(ptr)))
#define vld1_s64_x3(ptr) \
    __n64x3_to_int64x1x3_t(neon_ld1m3_64((__int64*)(ptr)))
#define vld1_u64_x3(ptr) \
    __n64x3_to_uint64x1x3_t(neon_ld1m3_64((__int64*)(ptr)))
#define vld1_f64_x3(ptr) \
    __n64x3_to_float64x1x3_t(neon_ld1m3_64((__int64*)(ptr)))
#define vld1_p64_x3(ptr) \
    __n64x3_to_poly64x1x3_t(neon_ld1m3_64((__int64*)(ptr)))

// vld1_<type>_x4: same pattern through neon_ld1m4_* and __n64x4_to_<type>.
#define vld1_f32_x4(ptr) \
    __n64x4_to_float32x2x4_t(neon_ld1m4_32((__int32*)(ptr)))
#define vld1_p16_x4(ptr) \
    __n64x4_to_poly16x4x4_t(neon_ld1m4_16((__int16*)(ptr)))
#define vld1_p8_x4(ptr) \
    __n64x4_to_poly8x8x4_t(neon_ld1m4_8((__int8*)(ptr)))
#define vld1_s16_x4(ptr) \
    __n64x4_to_int16x4x4_t(neon_ld1m4_16((__int16*)(ptr)))
#define vld1_s32_x4(ptr) \
    __n64x4_to_int32x2x4_t(neon_ld1m4_32((__int32*)(ptr)))
#define vld1_s8_x4(ptr) \
    __n64x4_to_int8x8x4_t(neon_ld1m4_8((__int8*)(ptr)))
#define vld1_u16_x4(ptr) \
    __n64x4_to_uint16x4x4_t(neon_ld1m4_16((__int16*)(ptr)))
#define vld1_u32_x4(ptr) \
    __n64x4_to_uint32x2x4_t(neon_ld1m4_32((__int32*)(ptr)))
#define vld1_u8_x4(ptr) \
    __n64x4_to_uint8x8x4_t(neon_ld1m4_8((__int8*)(ptr)))
#define vld1_s64_x4(ptr) \
    __n64x4_to_int64x1x4_t(neon_ld1m4_64((__int64*)(ptr)))
#define vld1_u64_x4(ptr) \
    __n64x4_to_uint64x1x4_t(neon_ld1m4_64((__int64*)(ptr)))
#define vld1_p64_x4(ptr) \
    __n64x4_to_poly64x1x4_t(neon_ld1m4_64((__int64*)(ptr)))
#define vld1_f64_x4(ptr) \
    __n64x4_to_float64x1x4_t(neon_ld1m4_64((__int64*)(ptr)))
// vld1q_dup_<type>: cast the address to the element-width integer pointer,
// call neon_ld1r_q{8,16,32,64}, and wrap the call in the matching
// __n128_to_<type> conversion helper.
#define vld1q_dup_f32(ptr) \
    __n128_to_float32x4_t(neon_ld1r_q32((__int32*)(ptr)))
#define vld1q_dup_f64(ptr) \
    __n128_to_float64x2_t(neon_ld1r_q64((__int64*)(ptr)))
#define vld1q_dup_p64(ptr) \
    __n128_to_poly64x2_t(neon_ld1r_q64((__int64*)(ptr)))
#define vld1q_dup_p16(ptr) \
    __n128_to_poly16x8_t(neon_ld1r_q16((__int16*)(ptr)))
#define vld1q_dup_p8(ptr) \
    __n128_to_poly8x16_t(neon_ld1r_q8((__int8*)(ptr)))
#define vld1q_dup_s16(ptr) \
    __n128_to_int16x8_t(neon_ld1r_q16((__int16*)(ptr)))
#define vld1q_dup_s32(ptr) \
    __n128_to_int32x4_t(neon_ld1r_q32((__int32*)(ptr)))
#define vld1q_dup_s8(ptr) \
    __n128_to_int8x16_t(neon_ld1r_q8((__int8*)(ptr)))
#define vld1q_dup_u16(ptr) \
    __n128_to_uint16x8_t(neon_ld1r_q16((__int16*)(ptr)))
#define vld1q_dup_u32(ptr) \
    __n128_to_uint32x4_t(neon_ld1r_q32((__int32*)(ptr)))
#define vld1q_dup_u8(ptr) \
    __n128_to_uint8x16_t(neon_ld1r_q8((__int8*)(ptr)))
#define vld1q_dup_s64(ptr) \
    __n128_to_int64x2_t(neon_ld1r_q64((__int64*)(ptr)))
#define vld1q_dup_u64(ptr) \
    __n128_to_uint64x2_t(neon_ld1r_q64((__int64*)(ptr)))

// vld1q_<type>: same pattern through neon_ld1m_q{8,16,32,64}.
#define vld1q_f32(ptr) \
    __n128_to_float32x4_t(neon_ld1m_q32((__int32*)(ptr)))
#define vld1q_p16(ptr) \
    __n128_to_poly16x8_t(neon_ld1m_q16((__int16*)(ptr)))
#define vld1q_p8(ptr) \
    __n128_to_poly8x16_t(neon_ld1m_q8((__int8*)(ptr)))
#define vld1q_s16(ptr) \
    __n128_to_int16x8_t(neon_ld1m_q16((__int16*)(ptr)))
#define vld1q_s32(ptr) \
    __n128_to_int32x4_t(neon_ld1m_q32((__int32*)(ptr)))
#define vld1q_s8(ptr) \
    __n128_to_int8x16_t(neon_ld1m_q8((__int8*)(ptr)))
#define vld1q_u16(ptr) \
    __n128_to_uint16x8_t(neon_ld1m_q16((__int16*)(ptr)))
#define vld1q_u32(ptr) \
    __n128_to_uint32x4_t(neon_ld1m_q32((__int32*)(ptr)))
#define vld1q_u8(ptr) \
    __n128_to_uint8x16_t(neon_ld1m_q8((__int8*)(ptr)))
#define vld1q_s64(ptr) \
    __n128_to_int64x2_t(neon_ld1m_q64((__int64*)(ptr)))
#define vld1q_u64(ptr) \
    __n128_to_uint64x2_t(neon_ld1m_q64((__int64*)(ptr)))
#define vld1q_f64(ptr) \
    __n128_to_float64x2_t(neon_ld1m_q64((__int64*)(ptr)))
#define vld1q_p64(ptr) \
    __n128_to_poly64x2_t(neon_ld1m_q64((__int64*)(ptr)))
// vld1q_<type>_x2: cast the address to the element-width integer pointer,
// call neon_ld1m2_q{8,16,32,64}, and wrap the call in the matching
// __n128x2_to_<type> conversion helper.
#define vld1q_f32_x2(ptr) \
    __n128x2_to_float32x4x2_t(neon_ld1m2_q32((__int32*)(ptr)))
#define vld1q_p16_x2(ptr) \
    __n128x2_to_poly16x8x2_t(neon_ld1m2_q16((__int16*)(ptr)))
#define vld1q_p8_x2(ptr) \
    __n128x2_to_poly8x16x2_t(neon_ld1m2_q8((__int8*)(ptr)))
#define vld1q_s16_x2(ptr) \
    __n128x2_to_int16x8x2_t(neon_ld1m2_q16((__int16*)(ptr)))
#define vld1q_s32_x2(ptr) \
    __n128x2_to_int32x4x2_t(neon_ld1m2_q32((__int32*)(ptr)))
#define vld1q_s8_x2(ptr) \
    __n128x2_to_int8x16x2_t(neon_ld1m2_q8((__int8*)(ptr)))
#define vld1q_u16_x2(ptr) \
    __n128x2_to_uint16x8x2_t(neon_ld1m2_q16((__int16*)(ptr)))
#define vld1q_u32_x2(ptr) \
    __n128x2_to_uint32x4x2_t(neon_ld1m2_q32((__int32*)(ptr)))
#define vld1q_u8_x2(ptr) \
    __n128x2_to_uint8x16x2_t(neon_ld1m2_q8((__int8*)(ptr)))
#define vld1q_s64_x2(ptr) \
    __n128x2_to_int64x2x2_t(neon_ld1m2_q64((__int64*)(ptr)))
#define vld1q_u64_x2(ptr) \
    __n128x2_to_uint64x2x2_t(neon_ld1m2_q64((__int64*)(ptr)))
#define vld1q_f64_x2(ptr) \
    __n128x2_to_float64x2x2_t(neon_ld1m2_q64((__int64*)(ptr)))
#define vld1q_p64_x2(ptr) \
    __n128x2_to_poly64x2x2_t(neon_ld1m2_q64((__int64*)(ptr)))

// vld1q_<type>_x3: same pattern through neon_ld1m3_q* and __n128x3_to_<type>.
#define vld1q_f32_x3(ptr) \
    __n128x3_to_float32x4x3_t(neon_ld1m3_q32((__int32*)(ptr)))
#define vld1q_p16_x3(ptr) \
    __n128x3_to_poly16x8x3_t(neon_ld1m3_q16((__int16*)(ptr)))
#define vld1q_p8_x3(ptr) \
    __n128x3_to_poly8x16x3_t(neon_ld1m3_q8((__int8*)(ptr)))
#define vld1q_s16_x3(ptr) \
    __n128x3_to_int16x8x3_t(neon_ld1m3_q16((__int16*)(ptr)))
#define vld1q_s32_x3(ptr) \
    __n128x3_to_int32x4x3_t(neon_ld1m3_q32((__int32*)(ptr)))
#define vld1q_s8_x3(ptr) \
    __n128x3_to_int8x16x3_t(neon_ld1m3_q8((__int8*)(ptr)))
#define vld1q_u16_x3(ptr) \
    __n128x3_to_uint16x8x3_t(neon_ld1m3_q16((__int16*)(ptr)))
#define vld1q_u32_x3(ptr) \
    __n128x3_to_uint32x4x3_t(neon_ld1m3_q32((__int32*)(ptr)))
#define vld1q_u8_x3(ptr) \
    __n128x3_to_uint8x16x3_t(neon_ld1m3_q8((__int8*)(ptr)))
#define vld1q_s64_x3(ptr) \
    __n128x3_to_int64x2x3_t(neon_ld1m3_q64((__int64*)(ptr)))
#define vld1q_u64_x3(ptr) \
    __n128x3_to_uint64x2x3_t(neon_ld1m3_q64((__int64*)(ptr)))
#define vld1q_f64_x3(ptr) \
    __n128x3_to_float64x2x3_t(neon_ld1m3_q64((__int64*)(ptr)))
#define vld1q_p64_x3(ptr) \
    __n128x3_to_poly64x2x3_t(neon_ld1m3_q64((__int64*)(ptr)))

// vld1q_<type>_x4: same pattern through neon_ld1m4_q* and __n128x4_to_<type>.
#define vld1q_f32_x4(ptr) \
    __n128x4_to_float32x4x4_t(neon_ld1m4_q32((__int32*)(ptr)))
#define vld1q_p16_x4(ptr) \
    __n128x4_to_poly16x8x4_t(neon_ld1m4_q16((__int16*)(ptr)))
#define vld1q_p8_x4(ptr) \
    __n128x4_to_poly8x16x4_t(neon_ld1m4_q8((__int8*)(ptr)))
#define vld1q_s16_x4(ptr) \
    __n128x4_to_int16x8x4_t(neon_ld1m4_q16((__int16*)(ptr)))
#define vld1q_s32_x4(ptr) \
    __n128x4_to_int32x4x4_t(neon_ld1m4_q32((__int32*)(ptr)))
#define vld1q_s8_x4(ptr) \
    __n128x4_to_int8x16x4_t(neon_ld1m4_q8((__int8*)(ptr)))
#define vld1q_u16_x4(ptr) \
    __n128x4_to_uint16x8x4_t(neon_ld1m4_q16((__int16*)(ptr)))
#define vld1q_u32_x4(ptr) \
    __n128x4_to_uint32x4x4_t(neon_ld1m4_q32((__int32*)(ptr)))
#define vld1q_u8_x4(ptr) \
    __n128x4_to_uint8x16x4_t(neon_ld1m4_q8((__int8*)(ptr)))
#define vld1q_s64_x4(ptr) \
    __n128x4_to_int64x2x4_t(neon_ld1m4_q64((__int64*)(ptr)))
#define vld1q_u64_x4(ptr) \
    __n128x4_to_uint64x2x4_t(neon_ld1m4_q64((__int64*)(ptr)))
#define vld1q_f64_x4(ptr) \
    __n128x4_to_float64x2x4_t(neon_ld1m4_q64((__int64*)(ptr)))
#define vld1q_p64_x4(ptr) \
    __n128x4_to_poly64x2x4_t(neon_ld1m4_q64((__int64*)(ptr)))
// vld1_lane_<type>(ptr, vec, lane): casts ptr to the element-width integer
// pointer, converts vec with the matching __<type>_t_to_n64 helper, and
// forwards lane unchanged as the last argument of neon_ld1s_{8,16,32,64}.
// The result goes through the __n64_to_<type> helper.
// NOTE(review): the neon_ld1s_* prototypes are outside this section; the third
// argument is presumably a lane index -- confirm against the declarations.
#define vld1_lane_f32(ptr, vec, lane) \
    __n64_to_float32x2_t(neon_ld1s_32((__int32*)(ptr), __float32x2_t_to_n64(vec), (lane)))
#define vld1_lane_f64(ptr, vec, lane) \
    __n64_to_float64x1_t(neon_ld1s_64((__int64*)(ptr), __float64x1_t_to_n64(vec), (lane)))
#define vld1_lane_p64(ptr, vec, lane) \
    __n64_to_poly64x1_t(neon_ld1s_64((__int64*)(ptr), __poly64x1_t_to_n64(vec), (lane)))
#define vld1_lane_p16(ptr, vec, lane) \
    __n64_to_poly16x4_t(neon_ld1s_16((__int16*)(ptr), __poly16x4_t_to_n64(vec), (lane)))
#define vld1_lane_p8(ptr, vec, lane) \
    __n64_to_poly8x8_t(neon_ld1s_8((__int8*)(ptr), __poly8x8_t_to_n64(vec), (lane)))
#define vld1_lane_s16(ptr, vec, lane) \
    __n64_to_int16x4_t(neon_ld1s_16((__int16*)(ptr), __int16x4_t_to_n64(vec), (lane)))
#define vld1_lane_s32(ptr, vec, lane) \
    __n64_to_int32x2_t(neon_ld1s_32((__int32*)(ptr), __int32x2_t_to_n64(vec), (lane)))
#define vld1_lane_s64(ptr, vec, lane) \
    __n64_to_int64x1_t(neon_ld1s_64((__int64*)(ptr), __int64x1_t_to_n64(vec), (lane)))
#define vld1_lane_s8(ptr, vec, lane) \
    __n64_to_int8x8_t(neon_ld1s_8((__int8*)(ptr), __int8x8_t_to_n64(vec), (lane)))
#define vld1_lane_u16(ptr, vec, lane) \
    __n64_to_uint16x4_t(neon_ld1s_16((__int16*)(ptr), __uint16x4_t_to_n64(vec), (lane)))
#define vld1_lane_u32(ptr, vec, lane) \
    __n64_to_uint32x2_t(neon_ld1s_32((__int32*)(ptr), __uint32x2_t_to_n64(vec), (lane)))
#define vld1_lane_u64(ptr, vec, lane) \
    __n64_to_uint64x1_t(neon_ld1s_64((__int64*)(ptr), __uint64x1_t_to_n64(vec), (lane)))
#define vld1_lane_u8(ptr, vec, lane) \
    __n64_to_uint8x8_t(neon_ld1s_8((__int8*)(ptr), __uint8x8_t_to_n64(vec), (lane)))

// vld1q_lane_<type>: same pattern using the neon_ld1s_q{8,16,32,64}
// intrinsics and the __n128 conversion helpers.
#define vld1q_lane_f32(ptr, vec, lane) \
    __n128_to_float32x4_t(neon_ld1s_q32((__int32*)(ptr), __float32x4_t_to_n128(vec), (lane)))
#define vld1q_lane_f64(ptr, vec, lane) \
    __n128_to_float64x2_t(neon_ld1s_q64((__int64*)(ptr), __float64x2_t_to_n128(vec), (lane)))
#define vld1q_lane_p64(ptr, vec, lane) \
    __n128_to_poly64x2_t(neon_ld1s_q64((__int64*)(ptr), __poly64x2_t_to_n128(vec), (lane)))
#define vld1q_lane_p8(ptr, vec, lane) \
    __n128_to_poly8x16_t(neon_ld1s_q8((__int8*)(ptr), __poly8x16_t_to_n128(vec), (lane)))
#define vld1q_lane_p16(ptr, vec, lane) \
    __n128_to_poly16x8_t(neon_ld1s_q16((__int16*)(ptr), __poly16x8_t_to_n128(vec), (lane)))
#define vld1q_lane_s8(ptr, vec, lane) \
    __n128_to_int8x16_t(neon_ld1s_q8((__int8*)(ptr), __int8x16_t_to_n128(vec), (lane)))
#define vld1q_lane_s16(ptr, vec, lane) \
    __n128_to_int16x8_t(neon_ld1s_q16((__int16*)(ptr), __int16x8_t_to_n128(vec), (lane)))
#define vld1q_lane_s32(ptr, vec, lane) \
    __n128_to_int32x4_t(neon_ld1s_q32((__int32*)(ptr), __int32x4_t_to_n128(vec), (lane)))
#define vld1q_lane_s64(ptr, vec, lane) \
    __n128_to_int64x2_t(neon_ld1s_q64((__int64*)(ptr), __int64x2_t_to_n128(vec), (lane)))
#define vld1q_lane_u8(ptr, vec, lane) \
    __n128_to_uint8x16_t(neon_ld1s_q8((__int8*)(ptr), __uint8x16_t_to_n128(vec), (lane)))
#define vld1q_lane_u16(ptr, vec, lane) \
    __n128_to_uint16x8_t(neon_ld1s_q16((__int16*)(ptr), __uint16x8_t_to_n128(vec), (lane)))
#define vld1q_lane_u32(ptr, vec, lane) \
    __n128_to_uint32x4_t(neon_ld1s_q32((__int32*)(ptr), __uint32x4_t_to_n128(vec), (lane)))
#define vld1q_lane_u64(ptr, vec, lane) \
    __n128_to_uint64x2_t(neon_ld1s_q64((__int64*)(ptr), __uint64x2_t_to_n128(vec), (lane)))

// ST1/ST2/ST3/ST4
// neon_st4m_*: declarations taking a destination pointer and a four-vector
// aggregate (__n64x4, or __n128x4 for the _q forms). The numeric suffix matches
// the pointer's element type (__int8/__int16/__int32/__int64).
// No neon_st4m_64 is declared in this list; the vst4_{s64,u64,f64,p64} macros
// use neon_st1m4_64 instead.
// NOTE(review): presumably the "m" forms correspond to ST4 (multiple
// structures) and the "s" forms to ST4 (single structure) -- confirm against
// the compiler's intrinsic documentation.
void neon_st4m_8(__int8 * ptr, __n64x4 src);
void neon_st4m_q8(__int8 * ptr, __n128x4 src);
void neon_st4m_16(__int16 * ptr, __n64x4 src);
void neon_st4m_q16(__int16 * ptr, __n128x4 src);
void neon_st4m_32(__int32 * ptr, __n64x4 src);
void neon_st4m_q32(__int32 * ptr, __n128x4 src);
void neon_st4m_q64(__int64 * ptr, __n128x4 src);
// neon_st4s_*: same pointer/aggregate pair plus a `const int lane` argument.
void neon_st4s_8(__int8 * ptr, __n64x4 src, const int lane);
void neon_st4s_q8(__int8 * ptr, __n128x4 src, const int lane);
void neon_st4s_16(__int16 * ptr, __n64x4 src, const int lane);
void neon_st4s_q16(__int16 * ptr, __n128x4 src, const int lane);
void neon_st4s_32(__int32 * ptr, __n64x4 src, const int lane);
void neon_st4s_q32(__int32 * ptr, __n128x4 src, const int lane);
void neon_st4s_64(__int64 * ptr, __n64x4 src, const int lane);
void neon_st4s_q64(__int64 * ptr, __n128x4 src, const int lane);
// neon_st3m_*: declarations taking a destination pointer and a three-vector
// aggregate (__n64x3, or __n128x3 for the _q forms). The numeric suffix matches
// the pointer's element type. No neon_st3m_64 is declared in this list.
void neon_st3m_8(__int8 * ptr, __n64x3 src);
void neon_st3m_q8(__int8 * ptr, __n128x3 src);
void neon_st3m_16(__int16 * ptr, __n64x3 src);
void neon_st3m_q16(__int16 * ptr, __n128x3 src);
void neon_st3m_32(__int32 * ptr, __n64x3 src);
void neon_st3m_q32(__int32 * ptr, __n128x3 src);
void neon_st3m_q64(__int64 * ptr, __n128x3 src);
// neon_st3s_*: same pointer/aggregate pair plus a `const int lane` argument.
void neon_st3s_8(__int8 * ptr, __n64x3 src, const int lane);
void neon_st3s_q8(__int8 * ptr, __n128x3 src, const int lane);
void neon_st3s_16(__int16 * ptr, __n64x3 src, const int lane);
void neon_st3s_q16(__int16 * ptr, __n128x3 src, const int lane);
void neon_st3s_32(__int32 * ptr, __n64x3 src, const int lane);
void neon_st3s_q32(__int32 * ptr, __n128x3 src, const int lane);
void neon_st3s_64(__int64 * ptr, __n64x3 src, const int lane);
void neon_st3s_q64(__int64 * ptr, __n128x3 src, const int lane);
// neon_st2m_*: declarations taking a destination pointer and a two-vector
// aggregate (__n64x2, or __n128x2 for the _q forms). The numeric suffix matches
// the pointer's element type. No neon_st2m_64 is declared in this list.
void neon_st2m_8(__int8 * ptr, __n64x2 src);
void neon_st2m_q8(__int8 * ptr, __n128x2 src);
void neon_st2m_16(__int16 * ptr, __n64x2 src);
void neon_st2m_q16(__int16 * ptr, __n128x2 src);
void neon_st2m_32(__int32 * ptr, __n64x2 src);
void neon_st2m_q32(__int32 * ptr, __n128x2 src);
void neon_st2m_q64(__int64 * ptr, __n128x2 src);
// neon_st2s_*: same pointer/aggregate pair plus a `const int lane` argument.
void neon_st2s_8(__int8 * ptr, __n64x2 src, const int lane);
void neon_st2s_q8(__int8 * ptr, __n128x2 src, const int lane);
void neon_st2s_16(__int16 * ptr, __n64x2 src, const int lane);
void neon_st2s_q16(__int16 * ptr, __n128x2 src, const int lane);
void neon_st2s_32(__int32 * ptr, __n64x2 src, const int lane);
void neon_st2s_q32(__int32 * ptr, __n128x2 src, const int lane);
void neon_st2s_64(__int64 * ptr, __n64x2 src, const int lane);
void neon_st2s_q64(__int64 * ptr, __n128x2 src, const int lane);
// neon_st1m_*: declarations taking a destination pointer and a single vector
// (__n64, or __n128 for the _q forms). Unlike the st2m/st3m/st4m lists, a
// 64-bit non-q form (neon_st1m_64) is declared here.
void neon_st1m_8(__int8 * ptr, __n64 src);
void neon_st1m_q8(__int8 * ptr, __n128 src);
void neon_st1m_16(__int16 * ptr, __n64 src);
void neon_st1m_q16(__int16 * ptr, __n128 src);
void neon_st1m_32(__int32 * ptr, __n64 src);
void neon_st1m_q32(__int32 * ptr, __n128 src);
void neon_st1m_64(__int64 * ptr, __n64 src);
void neon_st1m_q64(__int64 * ptr, __n128 src);
// neon_st1m2_*: as above, but the source is a two-vector aggregate.
void neon_st1m2_8(__int8 * ptr, __n64x2 src);
void neon_st1m2_q8(__int8 * ptr, __n128x2 src);
void neon_st1m2_16(__int16 * ptr, __n64x2 src);
void neon_st1m2_q16(__int16 * ptr, __n128x2 src);
void neon_st1m2_32(__int32 * ptr, __n64x2 src);
void neon_st1m2_q32(__int32 * ptr, __n128x2 src);
void neon_st1m2_64(__int64 * ptr, __n64x2 src);
void neon_st1m2_q64(__int64 * ptr, __n128x2 src);
// neon_st1m3_*: the source is a three-vector aggregate.
void neon_st1m3_8(__int8 * ptr, __n64x3 src);
void neon_st1m3_q8(__int8 * ptr, __n128x3 src);
void neon_st1m3_16(__int16 * ptr, __n64x3 src);
void neon_st1m3_q16(__int16 * ptr, __n128x3 src);
void neon_st1m3_32(__int32 * ptr, __n64x3 src);
void neon_st1m3_q32(__int32 * ptr, __n128x3 src);
void neon_st1m3_64(__int64 * ptr, __n64x3 src);
void neon_st1m3_q64(__int64 * ptr, __n128x3 src);
// neon_st1m4_*: the source is a four-vector aggregate. neon_st1m4_64 is the
// intrinsic the vst4_{s64,u64,f64,p64} macros call.
void neon_st1m4_8(__int8 * ptr, __n64x4 src);
void neon_st1m4_q8(__int8 * ptr, __n128x4 src);
void neon_st1m4_16(__int16 * ptr, __n64x4 src);
void neon_st1m4_q16(__int16 * ptr, __n128x4 src);
void neon_st1m4_32(__int32 * ptr, __n64x4 src);
void neon_st1m4_q32(__int32 * ptr, __n128x4 src);
void neon_st1m4_64(__int64 * ptr, __n64x4 src);
void neon_st1m4_q64(__int64 * ptr, __n128x4 src);
// neon_st1s_*: single-vector source plus a `const int lane` argument.
void neon_st1s_8(__int8 * ptr, __n64 src, const int lane);
void neon_st1s_q8(__int8 * ptr, __n128 src, const int lane);
void neon_st1s_16(__int16 * ptr, __n64 src, const int lane);
void neon_st1s_q16(__int16 * ptr, __n128 src, const int lane);
void neon_st1s_32(__int32 * ptr, __n64 src, const int lane);
void neon_st1s_q32(__int32 * ptr, __n128 src, const int lane);
void neon_st1s_64(__int64 * ptr, __n64 src, const int lane);
void neon_st1s_q64(__int64 * ptr, __n128 src, const int lane);
// vst4_<type>(ptr, val): casts ptr to the element-width integer pointer and
// converts val with the matching __<type>x4_t_to_n64x4 helper before calling
// neon_st4m_{8,16,32}. The expansion is a void call with no trailing
// semicolon.
#define vst4_f32(ptr, val) \
    neon_st4m_32((__int32*)(ptr), __float32x2x4_t_to_n64x4(val))
#define vst4_p16(ptr, val) \
    neon_st4m_16((__int16*)(ptr), __poly16x4x4_t_to_n64x4(val))
#define vst4_p8(ptr, val) \
    neon_st4m_8((__int8*)(ptr), __poly8x8x4_t_to_n64x4(val))
#define vst4_s16(ptr, val) \
    neon_st4m_16((__int16*)(ptr), __int16x4x4_t_to_n64x4(val))
#define vst4_s32(ptr, val) \
    neon_st4m_32((__int32*)(ptr), __int32x2x4_t_to_n64x4(val))
#define vst4_s8(ptr, val) \
    neon_st4m_8((__int8*)(ptr), __int8x8x4_t_to_n64x4(val))
#define vst4_u16(ptr, val) \
    neon_st4m_16((__int16*)(ptr), __uint16x4x4_t_to_n64x4(val))
#define vst4_u32(ptr, val) \
    neon_st4m_32((__int32*)(ptr), __uint32x2x4_t_to_n64x4(val))
#define vst4_u8(ptr, val) \
    neon_st4m_8((__int8*)(ptr), __uint8x8x4_t_to_n64x4(val))

// The 64-bit element variants call neon_st1m4_64; no neon_st4m_64 is declared
// alongside the other neon_st4m_* prototypes.
// NOTE(review): presumably because ST4 (multiple structures) has no 1D
// arrangement -- confirm against the A64 instruction reference.
#define vst4_s64(ptr, val) \
    neon_st1m4_64((__int64*)(ptr), __int64x1x4_t_to_n64x4(val))
#define vst4_u64(ptr, val) \
    neon_st1m4_64((__int64*)(ptr), __uint64x1x4_t_to_n64x4(val))
#define vst4_f64(ptr, val) \
    neon_st1m4_64((__int64*)(ptr), __float64x1x4_t_to_n64x4(val))
#define vst4_p64(ptr, val) \
    neon_st1m4_64((__int64*)(ptr), __poly64x1x4_t_to_n64x4(val))

// vst4q_<type>: same pattern through neon_st4m_q{8,16,32,64} and the
// __<type>x4_t_to_n128x4 helpers.
#define vst4q_f32(ptr, val) \
    neon_st4m_q32((__int32*)(ptr), __float32x4x4_t_to_n128x4(val))
#define vst4q_p16(ptr, val) \
    neon_st4m_q16((__int16*)(ptr), __poly16x8x4_t_to_n128x4(val))
#define vst4q_p8(ptr, val) \
    neon_st4m_q8((__int8*)(ptr), __poly8x16x4_t_to_n128x4(val))
#define vst4q_s16(ptr, val) \
    neon_st4m_q16((__int16*)(ptr), __int16x8x4_t_to_n128x4(val))
#define vst4q_s32(ptr, val) \
    neon_st4m_q32((__int32*)(ptr), __int32x4x4_t_to_n128x4(val))
#define vst4q_s8(ptr, val) \
    neon_st4m_q8((__int8*)(ptr), __int8x16x4_t_to_n128x4(val))
#define vst4q_u16(ptr, val) \
    neon_st4m_q16((__int16*)(ptr), __uint16x8x4_t_to_n128x4(val))
#define vst4q_u32(ptr, val) \
    neon_st4m_q32((__int32*)(ptr), __uint32x4x4_t_to_n128x4(val))
#define vst4q_u8(ptr, val) \
    neon_st4m_q8((__int8*)(ptr), __uint8x16x4_t_to_n128x4(val))
#define vst4q_s64(ptr, val) \
    neon_st4m_q64((__int64*)(ptr), __int64x2x4_t_to_n128x4(val))
#define vst4q_u64(ptr, val) \
    neon_st4m_q64((__int64*)(ptr), __uint64x2x4_t_to_n128x4(val))
#define vst4q_f64(ptr, val) \
    neon_st4m_q64((__int64*)(ptr), __float64x2x4_t_to_n128x4(val))
#define vst4q_p64(ptr, val) \
    neon_st4m_q64((__int64*)(ptr), __poly64x2x4_t_to_n128x4(val))
// Single-lane 4-vector stores (vst4_lane_* on __n64x4, vst4q_lane_* on __n128x4).
//   ptr  - destination pointer; cast to the element-width integer pointer type.
//   val  - four-vector tuple; converted to the untyped container form.
//   lane - lane selector, forwarded (parenthesized) as the intrinsic's last argument.
#define vst4_lane_f32(ptr, val, lane) neon_st4s_32((__int32*)(ptr), __float32x2x4_t_to_n64x4(val), (lane))
#define vst4_lane_f64(ptr, val, lane) neon_st4s_64((__int64*)(ptr), __float64x1x4_t_to_n64x4(val), (lane))
#define vst4_lane_p64(ptr, val, lane) neon_st4s_64((__int64*)(ptr), __poly64x1x4_t_to_n64x4(val), (lane))
#define vst4_lane_p16(ptr, val, lane) neon_st4s_16((__int16*)(ptr), __poly16x4x4_t_to_n64x4(val), (lane))
#define vst4_lane_p8(ptr, val, lane) neon_st4s_8((__int8*)(ptr), __poly8x8x4_t_to_n64x4(val), (lane))
#define vst4_lane_s16(ptr, val, lane) neon_st4s_16((__int16*)(ptr), __int16x4x4_t_to_n64x4(val), (lane))
#define vst4_lane_s32(ptr, val, lane) neon_st4s_32((__int32*)(ptr), __int32x2x4_t_to_n64x4(val), (lane))
#define vst4_lane_s64(ptr, val, lane) neon_st4s_64((__int64*)(ptr), __int64x1x4_t_to_n64x4(val), (lane))
#define vst4_lane_s8(ptr, val, lane) neon_st4s_8((__int8*)(ptr), __int8x8x4_t_to_n64x4(val), (lane))
#define vst4_lane_u16(ptr, val, lane) neon_st4s_16((__int16*)(ptr), __uint16x4x4_t_to_n64x4(val), (lane))
#define vst4_lane_u32(ptr, val, lane) neon_st4s_32((__int32*)(ptr), __uint32x2x4_t_to_n64x4(val), (lane))
#define vst4_lane_u64(ptr, val, lane) neon_st4s_64((__int64*)(ptr), __uint64x1x4_t_to_n64x4(val), (lane))
#define vst4_lane_u8(ptr, val, lane) neon_st4s_8((__int8*)(ptr), __uint8x8x4_t_to_n64x4(val), (lane))
#define vst4q_lane_f32(ptr, val, lane) neon_st4s_q32((__int32*)(ptr), __float32x4x4_t_to_n128x4(val), (lane))
#define vst4q_lane_f64(ptr, val, lane) neon_st4s_q64((__int64*)(ptr), __float64x2x4_t_to_n128x4(val), (lane))
#define vst4q_lane_p64(ptr, val, lane) neon_st4s_q64((__int64*)(ptr), __poly64x2x4_t_to_n128x4(val), (lane))
#define vst4q_lane_p8(ptr, val, lane) neon_st4s_q8((__int8*)(ptr), __poly8x16x4_t_to_n128x4(val), (lane))
#define vst4q_lane_p16(ptr, val, lane) neon_st4s_q16((__int16*)(ptr), __poly16x8x4_t_to_n128x4(val), (lane))
#define vst4q_lane_s8(ptr, val, lane) neon_st4s_q8((__int8*)(ptr), __int8x16x4_t_to_n128x4(val), (lane))
#define vst4q_lane_s16(ptr, val, lane) neon_st4s_q16((__int16*)(ptr), __int16x8x4_t_to_n128x4(val), (lane))
#define vst4q_lane_s32(ptr, val, lane) neon_st4s_q32((__int32*)(ptr), __int32x4x4_t_to_n128x4(val), (lane))
#define vst4q_lane_s64(ptr, val, lane) neon_st4s_q64((__int64*)(ptr), __int64x2x4_t_to_n128x4(val), (lane))
#define vst4q_lane_u8(ptr, val, lane) neon_st4s_q8((__int8*)(ptr), __uint8x16x4_t_to_n128x4(val), (lane))
#define vst4q_lane_u16(ptr, val, lane) neon_st4s_q16((__int16*)(ptr), __uint16x8x4_t_to_n128x4(val), (lane))
#define vst4q_lane_u32(ptr, val, lane) neon_st4s_q32((__int32*)(ptr), __uint32x4x4_t_to_n128x4(val), (lane))
#define vst4q_lane_u64(ptr, val, lane) neon_st4s_q64((__int64*)(ptr), __uint64x2x4_t_to_n128x4(val), (lane))
// 3-vector stores (vst3_* on __n64x3, vst3q_* on __n128x3).
//   ptr - destination pointer; cast to the element-width integer pointer type.
//   val - three-vector tuple; converted to the untyped container form.
// The one-element 64-bit variants (vst3_s64 / _u64 / _f64 / _p64) expand to
// neon_st1m3_64, i.e. the same intrinsic the vst1_*64_x3 macros use, instead
// of a neon_st3m_* intrinsic.
#define vst3_f32(ptr, val) neon_st3m_32((__int32*)(ptr), __float32x2x3_t_to_n64x3(val))
#define vst3_p16(ptr, val) neon_st3m_16((__int16*)(ptr), __poly16x4x3_t_to_n64x3(val))
#define vst3_p8(ptr, val) neon_st3m_8((__int8*)(ptr), __poly8x8x3_t_to_n64x3(val))
#define vst3_s16(ptr, val) neon_st3m_16((__int16*)(ptr), __int16x4x3_t_to_n64x3(val))
#define vst3_s32(ptr, val) neon_st3m_32((__int32*)(ptr), __int32x2x3_t_to_n64x3(val))
#define vst3_s8(ptr, val) neon_st3m_8((__int8*)(ptr), __int8x8x3_t_to_n64x3(val))
#define vst3_u16(ptr, val) neon_st3m_16((__int16*)(ptr), __uint16x4x3_t_to_n64x3(val))
#define vst3_u32(ptr, val) neon_st3m_32((__int32*)(ptr), __uint32x2x3_t_to_n64x3(val))
#define vst3_u8(ptr, val) neon_st3m_8((__int8*)(ptr), __uint8x8x3_t_to_n64x3(val))
#define vst3_s64(ptr, val) neon_st1m3_64((__int64*)(ptr), __int64x1x3_t_to_n64x3(val))
#define vst3_u64(ptr, val) neon_st1m3_64((__int64*)(ptr), __uint64x1x3_t_to_n64x3(val))
#define vst3_f64(ptr, val) neon_st1m3_64((__int64*)(ptr), __float64x1x3_t_to_n64x3(val))
#define vst3_p64(ptr, val) neon_st1m3_64((__int64*)(ptr), __poly64x1x3_t_to_n64x3(val))
#define vst3q_f32(ptr, val) neon_st3m_q32((__int32*)(ptr), __float32x4x3_t_to_n128x3(val))
#define vst3q_p16(ptr, val) neon_st3m_q16((__int16*)(ptr), __poly16x8x3_t_to_n128x3(val))
#define vst3q_p8(ptr, val) neon_st3m_q8((__int8*)(ptr), __poly8x16x3_t_to_n128x3(val))
#define vst3q_s16(ptr, val) neon_st3m_q16((__int16*)(ptr), __int16x8x3_t_to_n128x3(val))
#define vst3q_s32(ptr, val) neon_st3m_q32((__int32*)(ptr), __int32x4x3_t_to_n128x3(val))
#define vst3q_s8(ptr, val) neon_st3m_q8((__int8*)(ptr), __int8x16x3_t_to_n128x3(val))
#define vst3q_u16(ptr, val) neon_st3m_q16((__int16*)(ptr), __uint16x8x3_t_to_n128x3(val))
#define vst3q_u32(ptr, val) neon_st3m_q32((__int32*)(ptr), __uint32x4x3_t_to_n128x3(val))
#define vst3q_u8(ptr, val) neon_st3m_q8((__int8*)(ptr), __uint8x16x3_t_to_n128x3(val))
#define vst3q_s64(ptr, val) neon_st3m_q64((__int64*)(ptr), __int64x2x3_t_to_n128x3(val))
#define vst3q_u64(ptr, val) neon_st3m_q64((__int64*)(ptr), __uint64x2x3_t_to_n128x3(val))
#define vst3q_f64(ptr, val) neon_st3m_q64((__int64*)(ptr), __float64x2x3_t_to_n128x3(val))
#define vst3q_p64(ptr, val) neon_st3m_q64((__int64*)(ptr), __poly64x2x3_t_to_n128x3(val))
// Single-lane 3-vector stores (vst3_lane_* on __n64x3, vst3q_lane_* on __n128x3).
//   ptr  - destination pointer; cast to the element-width integer pointer type.
//   val  - three-vector tuple; converted to the untyped container form.
//   lane - lane selector, forwarded (parenthesized) as the intrinsic's last argument.
#define vst3_lane_f32(ptr, val, lane) neon_st3s_32((__int32*)(ptr), __float32x2x3_t_to_n64x3(val), (lane))
#define vst3_lane_f64(ptr, val, lane) neon_st3s_64((__int64*)(ptr), __float64x1x3_t_to_n64x3(val), (lane))
#define vst3_lane_p64(ptr, val, lane) neon_st3s_64((__int64*)(ptr), __poly64x1x3_t_to_n64x3(val), (lane))
#define vst3_lane_p16(ptr, val, lane) neon_st3s_16((__int16*)(ptr), __poly16x4x3_t_to_n64x3(val), (lane))
#define vst3_lane_p8(ptr, val, lane) neon_st3s_8((__int8*)(ptr), __poly8x8x3_t_to_n64x3(val), (lane))
#define vst3_lane_s16(ptr, val, lane) neon_st3s_16((__int16*)(ptr), __int16x4x3_t_to_n64x3(val), (lane))
#define vst3_lane_s32(ptr, val, lane) neon_st3s_32((__int32*)(ptr), __int32x2x3_t_to_n64x3(val), (lane))
#define vst3_lane_s64(ptr, val, lane) neon_st3s_64((__int64*)(ptr), __int64x1x3_t_to_n64x3(val), (lane))
#define vst3_lane_s8(ptr, val, lane) neon_st3s_8((__int8*)(ptr), __int8x8x3_t_to_n64x3(val), (lane))
#define vst3_lane_u16(ptr, val, lane) neon_st3s_16((__int16*)(ptr), __uint16x4x3_t_to_n64x3(val), (lane))
#define vst3_lane_u32(ptr, val, lane) neon_st3s_32((__int32*)(ptr), __uint32x2x3_t_to_n64x3(val), (lane))
#define vst3_lane_u64(ptr, val, lane) neon_st3s_64((__int64*)(ptr), __uint64x1x3_t_to_n64x3(val), (lane))
#define vst3_lane_u8(ptr, val, lane) neon_st3s_8((__int8*)(ptr), __uint8x8x3_t_to_n64x3(val), (lane))
#define vst3q_lane_f32(ptr, val, lane) neon_st3s_q32((__int32*)(ptr), __float32x4x3_t_to_n128x3(val), (lane))
#define vst3q_lane_f64(ptr, val, lane) neon_st3s_q64((__int64*)(ptr), __float64x2x3_t_to_n128x3(val), (lane))
#define vst3q_lane_p64(ptr, val, lane) neon_st3s_q64((__int64*)(ptr), __poly64x2x3_t_to_n128x3(val), (lane))
#define vst3q_lane_p8(ptr, val, lane) neon_st3s_q8((__int8*)(ptr), __poly8x16x3_t_to_n128x3(val), (lane))
#define vst3q_lane_p16(ptr, val, lane) neon_st3s_q16((__int16*)(ptr), __poly16x8x3_t_to_n128x3(val), (lane))
#define vst3q_lane_s8(ptr, val, lane) neon_st3s_q8((__int8*)(ptr), __int8x16x3_t_to_n128x3(val), (lane))
#define vst3q_lane_s16(ptr, val, lane) neon_st3s_q16((__int16*)(ptr), __int16x8x3_t_to_n128x3(val), (lane))
#define vst3q_lane_s32(ptr, val, lane) neon_st3s_q32((__int32*)(ptr), __int32x4x3_t_to_n128x3(val), (lane))
#define vst3q_lane_s64(ptr, val, lane) neon_st3s_q64((__int64*)(ptr), __int64x2x3_t_to_n128x3(val), (lane))
#define vst3q_lane_u8(ptr, val, lane) neon_st3s_q8((__int8*)(ptr), __uint8x16x3_t_to_n128x3(val), (lane))
#define vst3q_lane_u16(ptr, val, lane) neon_st3s_q16((__int16*)(ptr), __uint16x8x3_t_to_n128x3(val), (lane))
#define vst3q_lane_u32(ptr, val, lane) neon_st3s_q32((__int32*)(ptr), __uint32x4x3_t_to_n128x3(val), (lane))
#define vst3q_lane_u64(ptr, val, lane) neon_st3s_q64((__int64*)(ptr), __uint64x2x3_t_to_n128x3(val), (lane))
// 2-vector stores (vst2_* on __n64x2, vst2q_* on __n128x2).
//   ptr - destination pointer; cast to the element-width integer pointer type.
//   val - two-vector tuple; converted to the untyped container form.
// The one-element 64-bit variants (vst2_s64 / _u64 / _f64 / _p64) expand to
// neon_st1m2_64, i.e. the same intrinsic the vst1_*64_x2 macros use, instead
// of a neon_st2m_* intrinsic.
#define vst2_f32(ptr, val) neon_st2m_32((__int32*)(ptr), __float32x2x2_t_to_n64x2(val))
#define vst2_p16(ptr, val) neon_st2m_16((__int16*)(ptr), __poly16x4x2_t_to_n64x2(val))
#define vst2_p8(ptr, val) neon_st2m_8((__int8*)(ptr), __poly8x8x2_t_to_n64x2(val))
#define vst2_s16(ptr, val) neon_st2m_16((__int16*)(ptr), __int16x4x2_t_to_n64x2(val))
#define vst2_s32(ptr, val) neon_st2m_32((__int32*)(ptr), __int32x2x2_t_to_n64x2(val))
#define vst2_s8(ptr, val) neon_st2m_8((__int8*)(ptr), __int8x8x2_t_to_n64x2(val))
#define vst2_u16(ptr, val) neon_st2m_16((__int16*)(ptr), __uint16x4x2_t_to_n64x2(val))
#define vst2_u32(ptr, val) neon_st2m_32((__int32*)(ptr), __uint32x2x2_t_to_n64x2(val))
#define vst2_u8(ptr, val) neon_st2m_8((__int8*)(ptr), __uint8x8x2_t_to_n64x2(val))
#define vst2_s64(ptr, val) neon_st1m2_64((__int64*)(ptr), __int64x1x2_t_to_n64x2(val))
#define vst2_u64(ptr, val) neon_st1m2_64((__int64*)(ptr), __uint64x1x2_t_to_n64x2(val))
#define vst2_f64(ptr, val) neon_st1m2_64((__int64*)(ptr), __float64x1x2_t_to_n64x2(val))
#define vst2_p64(ptr, val) neon_st1m2_64((__int64*)(ptr), __poly64x1x2_t_to_n64x2(val))
#define vst2q_f32(ptr, val) neon_st2m_q32((__int32*)(ptr), __float32x4x2_t_to_n128x2(val))
#define vst2q_p16(ptr, val) neon_st2m_q16((__int16*)(ptr), __poly16x8x2_t_to_n128x2(val))
#define vst2q_p8(ptr, val) neon_st2m_q8((__int8*)(ptr), __poly8x16x2_t_to_n128x2(val))
#define vst2q_s16(ptr, val) neon_st2m_q16((__int16*)(ptr), __int16x8x2_t_to_n128x2(val))
#define vst2q_s32(ptr, val) neon_st2m_q32((__int32*)(ptr), __int32x4x2_t_to_n128x2(val))
#define vst2q_s8(ptr, val) neon_st2m_q8((__int8*)(ptr), __int8x16x2_t_to_n128x2(val))
#define vst2q_u16(ptr, val) neon_st2m_q16((__int16*)(ptr), __uint16x8x2_t_to_n128x2(val))
#define vst2q_u32(ptr, val) neon_st2m_q32((__int32*)(ptr), __uint32x4x2_t_to_n128x2(val))
#define vst2q_u8(ptr, val) neon_st2m_q8((__int8*)(ptr), __uint8x16x2_t_to_n128x2(val))
#define vst2q_s64(ptr, val) neon_st2m_q64((__int64*)(ptr), __int64x2x2_t_to_n128x2(val))
#define vst2q_u64(ptr, val) neon_st2m_q64((__int64*)(ptr), __uint64x2x2_t_to_n128x2(val))
#define vst2q_f64(ptr, val) neon_st2m_q64((__int64*)(ptr), __float64x2x2_t_to_n128x2(val))
#define vst2q_p64(ptr, val) neon_st2m_q64((__int64*)(ptr), __poly64x2x2_t_to_n128x2(val))
// Single-lane 2-vector stores (vst2_lane_* on __n64x2, vst2q_lane_* on __n128x2).
//   ptr  - destination pointer; cast to the element-width integer pointer type.
//   val  - two-vector tuple; converted to the untyped container form.
//   lane - lane selector, forwarded (parenthesized) as the intrinsic's last argument.
#define vst2_lane_f32(ptr, val, lane) neon_st2s_32((__int32*)(ptr), __float32x2x2_t_to_n64x2(val), (lane))
#define vst2_lane_f64(ptr, val, lane) neon_st2s_64((__int64*)(ptr), __float64x1x2_t_to_n64x2(val), (lane))
#define vst2_lane_p64(ptr, val, lane) neon_st2s_64((__int64*)(ptr), __poly64x1x2_t_to_n64x2(val), (lane))
#define vst2_lane_p16(ptr, val, lane) neon_st2s_16((__int16*)(ptr), __poly16x4x2_t_to_n64x2(val), (lane))
#define vst2_lane_p8(ptr, val, lane) neon_st2s_8((__int8*)(ptr), __poly8x8x2_t_to_n64x2(val), (lane))
#define vst2_lane_s16(ptr, val, lane) neon_st2s_16((__int16*)(ptr), __int16x4x2_t_to_n64x2(val), (lane))
#define vst2_lane_s32(ptr, val, lane) neon_st2s_32((__int32*)(ptr), __int32x2x2_t_to_n64x2(val), (lane))
#define vst2_lane_s64(ptr, val, lane) neon_st2s_64((__int64*)(ptr), __int64x1x2_t_to_n64x2(val), (lane))
#define vst2_lane_s8(ptr, val, lane) neon_st2s_8((__int8*)(ptr), __int8x8x2_t_to_n64x2(val), (lane))
#define vst2_lane_u16(ptr, val, lane) neon_st2s_16((__int16*)(ptr), __uint16x4x2_t_to_n64x2(val), (lane))
#define vst2_lane_u32(ptr, val, lane) neon_st2s_32((__int32*)(ptr), __uint32x2x2_t_to_n64x2(val), (lane))
#define vst2_lane_u64(ptr, val, lane) neon_st2s_64((__int64*)(ptr), __uint64x1x2_t_to_n64x2(val), (lane))
#define vst2_lane_u8(ptr, val, lane) neon_st2s_8((__int8*)(ptr), __uint8x8x2_t_to_n64x2(val), (lane))
#define vst2q_lane_f32(ptr, val, lane) neon_st2s_q32((__int32*)(ptr), __float32x4x2_t_to_n128x2(val), (lane))
#define vst2q_lane_f64(ptr, val, lane) neon_st2s_q64((__int64*)(ptr), __float64x2x2_t_to_n128x2(val), (lane))
#define vst2q_lane_p64(ptr, val, lane) neon_st2s_q64((__int64*)(ptr), __poly64x2x2_t_to_n128x2(val), (lane))
#define vst2q_lane_p8(ptr, val, lane) neon_st2s_q8((__int8*)(ptr), __poly8x16x2_t_to_n128x2(val), (lane))
#define vst2q_lane_p16(ptr, val, lane) neon_st2s_q16((__int16*)(ptr), __poly16x8x2_t_to_n128x2(val), (lane))
#define vst2q_lane_s8(ptr, val, lane) neon_st2s_q8((__int8*)(ptr), __int8x16x2_t_to_n128x2(val), (lane))
#define vst2q_lane_s16(ptr, val, lane) neon_st2s_q16((__int16*)(ptr), __int16x8x2_t_to_n128x2(val), (lane))
#define vst2q_lane_s32(ptr, val, lane) neon_st2s_q32((__int32*)(ptr), __int32x4x2_t_to_n128x2(val), (lane))
#define vst2q_lane_s64(ptr, val, lane) neon_st2s_q64((__int64*)(ptr), __int64x2x2_t_to_n128x2(val), (lane))
#define vst2q_lane_u8(ptr, val, lane) neon_st2s_q8((__int8*)(ptr), __uint8x16x2_t_to_n128x2(val), (lane))
#define vst2q_lane_u16(ptr, val, lane) neon_st2s_q16((__int16*)(ptr), __uint16x8x2_t_to_n128x2(val), (lane))
#define vst2q_lane_u32(ptr, val, lane) neon_st2s_q32((__int32*)(ptr), __uint32x4x2_t_to_n128x2(val), (lane))
#define vst2q_lane_u64(ptr, val, lane) neon_st2s_q64((__int64*)(ptr), __uint64x2x2_t_to_n128x2(val), (lane))
// Single-vector stores (vst1_* on __n64, vst1q_* on __n128).
//   ptr - destination pointer; cast to the element-width integer pointer type.
//   val - typed vector; converted to the untyped __n64 / __n128 container.
// Signed, unsigned, float and poly variants of one element width all share
// the same neon_st1m_* intrinsic; only the conversion helper differs.
#define vst1_f32(ptr, val) neon_st1m_32((__int32*)(ptr), __float32x2_t_to_n64(val))
#define vst1_p16(ptr, val) neon_st1m_16((__int16*)(ptr), __poly16x4_t_to_n64(val))
#define vst1_p8(ptr, val) neon_st1m_8((__int8*)(ptr), __poly8x8_t_to_n64(val))
#define vst1_s16(ptr, val) neon_st1m_16((__int16*)(ptr), __int16x4_t_to_n64(val))
#define vst1_s32(ptr, val) neon_st1m_32((__int32*)(ptr), __int32x2_t_to_n64(val))
#define vst1_s8(ptr, val) neon_st1m_8((__int8*)(ptr), __int8x8_t_to_n64(val))
#define vst1_u16(ptr, val) neon_st1m_16((__int16*)(ptr), __uint16x4_t_to_n64(val))
#define vst1_u32(ptr, val) neon_st1m_32((__int32*)(ptr), __uint32x2_t_to_n64(val))
#define vst1_u8(ptr, val) neon_st1m_8((__int8*)(ptr), __uint8x8_t_to_n64(val))
#define vst1_s64(ptr, val) neon_st1m_64((__int64*)(ptr), __int64x1_t_to_n64(val))
#define vst1_u64(ptr, val) neon_st1m_64((__int64*)(ptr), __uint64x1_t_to_n64(val))
#define vst1_f64(ptr, val) neon_st1m_64((__int64*)(ptr), __float64x1_t_to_n64(val))
#define vst1_p64(ptr, val) neon_st1m_64((__int64*)(ptr), __poly64x1_t_to_n64(val))
#define vst1q_f32(ptr, val) neon_st1m_q32((__int32*)(ptr), __float32x4_t_to_n128(val))
#define vst1q_p16(ptr, val) neon_st1m_q16((__int16*)(ptr), __poly16x8_t_to_n128(val))
#define vst1q_p8(ptr, val) neon_st1m_q8((__int8*)(ptr), __poly8x16_t_to_n128(val))
#define vst1q_s16(ptr, val) neon_st1m_q16((__int16*)(ptr), __int16x8_t_to_n128(val))
#define vst1q_s32(ptr, val) neon_st1m_q32((__int32*)(ptr), __int32x4_t_to_n128(val))
#define vst1q_s8(ptr, val) neon_st1m_q8((__int8*)(ptr), __int8x16_t_to_n128(val))
#define vst1q_u16(ptr, val) neon_st1m_q16((__int16*)(ptr), __uint16x8_t_to_n128(val))
#define vst1q_u32(ptr, val) neon_st1m_q32((__int32*)(ptr), __uint32x4_t_to_n128(val))
#define vst1q_u8(ptr, val) neon_st1m_q8((__int8*)(ptr), __uint8x16_t_to_n128(val))
#define vst1q_s64(ptr, val) neon_st1m_q64((__int64*)(ptr), __int64x2_t_to_n128(val))
#define vst1q_u64(ptr, val) neon_st1m_q64((__int64*)(ptr), __uint64x2_t_to_n128(val))
#define vst1q_f64(ptr, val) neon_st1m_q64((__int64*)(ptr), __float64x2_t_to_n128(val))
#define vst1q_p64(ptr, val) neon_st1m_q64((__int64*)(ptr), __poly64x2_t_to_n128(val))
// vst1_*_x2 / vst1q_*_x2: store a two-vector tuple through neon_st1m2_*.
//   ptr - destination pointer; cast to the element-width integer pointer type.
//   val - two-vector tuple; converted to the untyped __n64x2 / __n128x2 form.
#define vst1_f32_x2(ptr, val) neon_st1m2_32((__int32*)(ptr), __float32x2x2_t_to_n64x2(val))
#define vst1_p16_x2(ptr, val) neon_st1m2_16((__int16*)(ptr), __poly16x4x2_t_to_n64x2(val))
#define vst1_p8_x2(ptr, val) neon_st1m2_8((__int8*)(ptr), __poly8x8x2_t_to_n64x2(val))
#define vst1_s16_x2(ptr, val) neon_st1m2_16((__int16*)(ptr), __int16x4x2_t_to_n64x2(val))
#define vst1_s32_x2(ptr, val) neon_st1m2_32((__int32*)(ptr), __int32x2x2_t_to_n64x2(val))
#define vst1_s8_x2(ptr, val) neon_st1m2_8((__int8*)(ptr), __int8x8x2_t_to_n64x2(val))
#define vst1_u16_x2(ptr, val) neon_st1m2_16((__int16*)(ptr), __uint16x4x2_t_to_n64x2(val))
#define vst1_u32_x2(ptr, val) neon_st1m2_32((__int32*)(ptr), __uint32x2x2_t_to_n64x2(val))
#define vst1_u8_x2(ptr, val) neon_st1m2_8((__int8*)(ptr), __uint8x8x2_t_to_n64x2(val))
#define vst1_s64_x2(ptr, val) neon_st1m2_64((__int64*)(ptr), __int64x1x2_t_to_n64x2(val))
#define vst1_u64_x2(ptr, val) neon_st1m2_64((__int64*)(ptr), __uint64x1x2_t_to_n64x2(val))
#define vst1_f64_x2(ptr, val) neon_st1m2_64((__int64*)(ptr), __float64x1x2_t_to_n64x2(val))
#define vst1_p64_x2(ptr, val) neon_st1m2_64((__int64*)(ptr), __poly64x1x2_t_to_n64x2(val))
#define vst1q_f32_x2(ptr, val) neon_st1m2_q32((__int32*)(ptr), __float32x4x2_t_to_n128x2(val))
#define vst1q_p16_x2(ptr, val) neon_st1m2_q16((__int16*)(ptr), __poly16x8x2_t_to_n128x2(val))
#define vst1q_p8_x2(ptr, val) neon_st1m2_q8((__int8*)(ptr), __poly8x16x2_t_to_n128x2(val))
#define vst1q_s16_x2(ptr, val) neon_st1m2_q16((__int16*)(ptr), __int16x8x2_t_to_n128x2(val))
#define vst1q_s32_x2(ptr, val) neon_st1m2_q32((__int32*)(ptr), __int32x4x2_t_to_n128x2(val))
#define vst1q_s8_x2(ptr, val) neon_st1m2_q8((__int8*)(ptr), __int8x16x2_t_to_n128x2(val))
#define vst1q_u16_x2(ptr, val) neon_st1m2_q16((__int16*)(ptr), __uint16x8x2_t_to_n128x2(val))
#define vst1q_u32_x2(ptr, val) neon_st1m2_q32((__int32*)(ptr), __uint32x4x2_t_to_n128x2(val))
#define vst1q_u8_x2(ptr, val) neon_st1m2_q8((__int8*)(ptr), __uint8x16x2_t_to_n128x2(val))
#define vst1q_s64_x2(ptr, val) neon_st1m2_q64((__int64*)(ptr), __int64x2x2_t_to_n128x2(val))
#define vst1q_u64_x2(ptr, val) neon_st1m2_q64((__int64*)(ptr), __uint64x2x2_t_to_n128x2(val))
#define vst1q_f64_x2(ptr, val) neon_st1m2_q64((__int64*)(ptr), __float64x2x2_t_to_n128x2(val))
#define vst1q_p64_x2(ptr, val) neon_st1m2_q64((__int64*)(ptr), __poly64x2x2_t_to_n128x2(val))
// vst1_*_x3 / vst1q_*_x3: store a three-vector tuple through neon_st1m3_*.
//   ptr - destination pointer; cast to the element-width integer pointer type.
//   val - three-vector tuple; converted to the untyped __n64x3 / __n128x3 form.
#define vst1_f32_x3(ptr, val) neon_st1m3_32((__int32*)(ptr), __float32x2x3_t_to_n64x3(val))
#define vst1_p16_x3(ptr, val) neon_st1m3_16((__int16*)(ptr), __poly16x4x3_t_to_n64x3(val))
#define vst1_p8_x3(ptr, val) neon_st1m3_8((__int8*)(ptr), __poly8x8x3_t_to_n64x3(val))
#define vst1_s16_x3(ptr, val) neon_st1m3_16((__int16*)(ptr), __int16x4x3_t_to_n64x3(val))
#define vst1_s32_x3(ptr, val) neon_st1m3_32((__int32*)(ptr), __int32x2x3_t_to_n64x3(val))
#define vst1_s8_x3(ptr, val) neon_st1m3_8((__int8*)(ptr), __int8x8x3_t_to_n64x3(val))
#define vst1_u16_x3(ptr, val) neon_st1m3_16((__int16*)(ptr), __uint16x4x3_t_to_n64x3(val))
#define vst1_u32_x3(ptr, val) neon_st1m3_32((__int32*)(ptr), __uint32x2x3_t_to_n64x3(val))
#define vst1_u8_x3(ptr, val) neon_st1m3_8((__int8*)(ptr), __uint8x8x3_t_to_n64x3(val))
#define vst1_s64_x3(ptr, val) neon_st1m3_64((__int64*)(ptr), __int64x1x3_t_to_n64x3(val))
#define vst1_u64_x3(ptr, val) neon_st1m3_64((__int64*)(ptr), __uint64x1x3_t_to_n64x3(val))
#define vst1_p64_x3(ptr, val) neon_st1m3_64((__int64*)(ptr), __poly64x1x3_t_to_n64x3(val))
#define vst1_f64_x3(ptr, val) neon_st1m3_64((__int64*)(ptr), __float64x1x3_t_to_n64x3(val))
#define vst1q_f32_x3(ptr, val) neon_st1m3_q32((__int32*)(ptr), __float32x4x3_t_to_n128x3(val))
#define vst1q_p16_x3(ptr, val) neon_st1m3_q16((__int16*)(ptr), __poly16x8x3_t_to_n128x3(val))
#define vst1q_p8_x3(ptr, val) neon_st1m3_q8((__int8*)(ptr), __poly8x16x3_t_to_n128x3(val))
#define vst1q_s16_x3(ptr, val) neon_st1m3_q16((__int16*)(ptr), __int16x8x3_t_to_n128x3(val))
#define vst1q_s32_x3(ptr, val) neon_st1m3_q32((__int32*)(ptr), __int32x4x3_t_to_n128x3(val))
#define vst1q_s8_x3(ptr, val) neon_st1m3_q8((__int8*)(ptr), __int8x16x3_t_to_n128x3(val))
#define vst1q_u16_x3(ptr, val) neon_st1m3_q16((__int16*)(ptr), __uint16x8x3_t_to_n128x3(val))
#define vst1q_u32_x3(ptr, val) neon_st1m3_q32((__int32*)(ptr), __uint32x4x3_t_to_n128x3(val))
#define vst1q_u8_x3(ptr, val) neon_st1m3_q8((__int8*)(ptr), __uint8x16x3_t_to_n128x3(val))
#define vst1q_s64_x3(ptr, val) neon_st1m3_q64((__int64*)(ptr), __int64x2x3_t_to_n128x3(val))
#define vst1q_u64_x3(ptr, val) neon_st1m3_q64((__int64*)(ptr), __uint64x2x3_t_to_n128x3(val))
#define vst1q_p64_x3(ptr, val) neon_st1m3_q64((__int64*)(ptr), __poly64x2x3_t_to_n128x3(val))
#define vst1q_f64_x3(ptr, val) neon_st1m3_q64((__int64*)(ptr), __float64x2x3_t_to_n128x3(val))
// vst1_*_x4 / vst1q_*_x4: store a four-vector tuple through neon_st1m4_*.
//   ptr - destination pointer; cast to the element-width integer pointer type.
//   val - four-vector tuple; converted to the untyped __n64x4 / __n128x4 form.
#define vst1_f32_x4(ptr, val) neon_st1m4_32((__int32*)(ptr), __float32x2x4_t_to_n64x4(val))
#define vst1_p16_x4(ptr, val) neon_st1m4_16((__int16*)(ptr), __poly16x4x4_t_to_n64x4(val))
#define vst1_p8_x4(ptr, val) neon_st1m4_8((__int8*)(ptr), __poly8x8x4_t_to_n64x4(val))
#define vst1_s16_x4(ptr, val) neon_st1m4_16((__int16*)(ptr), __int16x4x4_t_to_n64x4(val))
#define vst1_s32_x4(ptr, val) neon_st1m4_32((__int32*)(ptr), __int32x2x4_t_to_n64x4(val))
#define vst1_s8_x4(ptr, val) neon_st1m4_8((__int8*)(ptr), __int8x8x4_t_to_n64x4(val))
#define vst1_u16_x4(ptr, val) neon_st1m4_16((__int16*)(ptr), __uint16x4x4_t_to_n64x4(val))
#define vst1_u32_x4(ptr, val) neon_st1m4_32((__int32*)(ptr), __uint32x2x4_t_to_n64x4(val))
#define vst1_u8_x4(ptr, val) neon_st1m4_8((__int8*)(ptr), __uint8x8x4_t_to_n64x4(val))
#define vst1_s64_x4(ptr, val) neon_st1m4_64((__int64*)(ptr), __int64x1x4_t_to_n64x4(val))
#define vst1_u64_x4(ptr, val) neon_st1m4_64((__int64*)(ptr), __uint64x1x4_t_to_n64x4(val))
#define vst1_p64_x4(ptr, val) neon_st1m4_64((__int64*)(ptr), __poly64x1x4_t_to_n64x4(val))
#define vst1_f64_x4(ptr, val) neon_st1m4_64((__int64*)(ptr), __float64x1x4_t_to_n64x4(val))
#define vst1q_f32_x4(ptr, val) neon_st1m4_q32((__int32*)(ptr), __float32x4x4_t_to_n128x4(val))
#define vst1q_p16_x4(ptr, val) neon_st1m4_q16((__int16*)(ptr), __poly16x8x4_t_to_n128x4(val))
#define vst1q_p8_x4(ptr, val) neon_st1m4_q8((__int8*)(ptr), __poly8x16x4_t_to_n128x4(val))
#define vst1q_s16_x4(ptr, val) neon_st1m4_q16((__int16*)(ptr), __int16x8x4_t_to_n128x4(val))
#define vst1q_s32_x4(ptr, val) neon_st1m4_q32((__int32*)(ptr), __int32x4x4_t_to_n128x4(val))
#define vst1q_s8_x4(ptr, val) neon_st1m4_q8((__int8*)(ptr), __int8x16x4_t_to_n128x4(val))
#define vst1q_u16_x4(ptr, val) neon_st1m4_q16((__int16*)(ptr), __uint16x8x4_t_to_n128x4(val))
#define vst1q_u32_x4(ptr, val) neon_st1m4_q32((__int32*)(ptr), __uint32x4x4_t_to_n128x4(val))
#define vst1q_u8_x4(ptr, val) neon_st1m4_q8((__int8*)(ptr), __uint8x16x4_t_to_n128x4(val))
#define vst1q_s64_x4(ptr, val) neon_st1m4_q64((__int64*)(ptr), __int64x2x4_t_to_n128x4(val))
#define vst1q_u64_x4(ptr, val) neon_st1m4_q64((__int64*)(ptr), __uint64x2x4_t_to_n128x4(val))
#define vst1q_p64_x4(ptr, val) neon_st1m4_q64((__int64*)(ptr), __poly64x2x4_t_to_n128x4(val))
#define vst1q_f64_x4(ptr, val) neon_st1m4_q64((__int64*)(ptr), __float64x2x4_t_to_n128x4(val))
// Single-lane, single-vector stores (vst1_lane_* on __n64, vst1q_lane_* on __n128).
//   ptr  - destination pointer; cast to the element-width integer pointer type.
//   val  - typed vector; converted to the untyped __n64 / __n128 container.
//   lane - lane selector, forwarded (parenthesized) as the intrinsic's last argument.
#define vst1_lane_f32(ptr, val, lane) neon_st1s_32((__int32*)(ptr), __float32x2_t_to_n64(val), (lane))
#define vst1_lane_f64(ptr, val, lane) neon_st1s_64((__int64*)(ptr), __float64x1_t_to_n64(val), (lane))
#define vst1_lane_p64(ptr, val, lane) neon_st1s_64((__int64*)(ptr), __poly64x1_t_to_n64(val), (lane))
#define vst1_lane_p16(ptr, val, lane) neon_st1s_16((__int16*)(ptr), __poly16x4_t_to_n64(val), (lane))
#define vst1_lane_p8(ptr, val, lane) neon_st1s_8((__int8*)(ptr), __poly8x8_t_to_n64(val), (lane))
#define vst1_lane_s16(ptr, val, lane) neon_st1s_16((__int16*)(ptr), __int16x4_t_to_n64(val), (lane))
#define vst1_lane_s32(ptr, val, lane) neon_st1s_32((__int32*)(ptr), __int32x2_t_to_n64(val), (lane))
#define vst1_lane_s64(ptr, val, lane) neon_st1s_64((__int64*)(ptr), __int64x1_t_to_n64(val), (lane))
#define vst1_lane_s8(ptr, val, lane) neon_st1s_8((__int8*)(ptr), __int8x8_t_to_n64(val), (lane))
#define vst1_lane_u16(ptr, val, lane) neon_st1s_16((__int16*)(ptr), __uint16x4_t_to_n64(val), (lane))
#define vst1_lane_u32(ptr, val, lane) neon_st1s_32((__int32*)(ptr), __uint32x2_t_to_n64(val), (lane))
#define vst1_lane_u64(ptr, val, lane) neon_st1s_64((__int64*)(ptr), __uint64x1_t_to_n64(val), (lane))
#define vst1_lane_u8(ptr, val, lane) neon_st1s_8((__int8*)(ptr), __uint8x8_t_to_n64(val), (lane))
#define vst1q_lane_f32(ptr, val, lane) neon_st1s_q32((__int32*)(ptr), __float32x4_t_to_n128(val), (lane))
#define vst1q_lane_f64(ptr, val, lane) neon_st1s_q64((__int64*)(ptr), __float64x2_t_to_n128(val), (lane))
#define vst1q_lane_p64(ptr, val, lane) neon_st1s_q64((__int64*)(ptr), __poly64x2_t_to_n128(val), (lane))
#define vst1q_lane_p8(ptr, val, lane) neon_st1s_q8((__int8*)(ptr), __poly8x16_t_to_n128(val), (lane))
#define vst1q_lane_p16(ptr, val, lane) neon_st1s_q16((__int16*)(ptr), __poly16x8_t_to_n128(val), (lane))
#define vst1q_lane_s8(ptr, val, lane) neon_st1s_q8((__int8*)(ptr), __int8x16_t_to_n128(val), (lane))
#define vst1q_lane_s16(ptr, val, lane) neon_st1s_q16((__int16*)(ptr), __int16x8_t_to_n128(val), (lane))
#define vst1q_lane_s32(ptr, val, lane) neon_st1s_q32((__int32*)(ptr), __int32x4_t_to_n128(val), (lane))
#define vst1q_lane_s64(ptr, val, lane) neon_st1s_q64((__int64*)(ptr), __int64x2_t_to_n128(val), (lane))
#define vst1q_lane_u8(ptr, val, lane) neon_st1s_q8((__int8*)(ptr), __uint8x16_t_to_n128(val), (lane))
#define vst1q_lane_u16(ptr, val, lane) neon_st1s_q16((__int16*)(ptr), __uint16x8_t_to_n128(val), (lane))
#define vst1q_lane_u32(ptr, val, lane) neon_st1s_q32((__int32*)(ptr), __uint32x4_t_to_n128(val), (lane))
#define vst1q_lane_u64(ptr, val, lane) neon_st1s_q64((__int64*)(ptr), __uint64x2_t_to_n128(val), (lane))

// FCVTL/FCVTL2/FCVTN/FCVTN2/FCVTXN/FCVTXN2
//
// Prototypes for the neon_fcvt* intrinsics. Operands and results use the
// untyped __n64 / __n128 containers; the typed vcvt* macros that follow this
// list do the conversion to and from float32x2_t / float32x4_t / float64x2_t.
// Only the _64 variants are referenced by the vcvt* macros in this section.
//
// Widening forms: __n64 or __n128 in, __n128 out.
__n128 neon_fcvtl_32(__n64);
__n128 neon_fcvtl2_32(__n128);
__n128 neon_fcvtl_64(__n64);
__n128 neon_fcvtl2_64(__n128);
// Narrowing forms: the plain form returns __n64; the "2" form takes an extra
// leading __n64 operand and returns __n128 (the vcvt_high_f32_f64 wrapper
// passes a float32x2_t there and a float64x2_t as the second operand).
__n64  neon_fcvtn_32(__n128);
__n128 neon_fcvtn2_32(__n64, __n128);
__n64  neon_fcvtn_64(__n128);
__n128 neon_fcvtn2_64(__n64, __n128);
// FCVTXN forms, same operand shapes as the narrowing forms above.
__n64  neon_fcvtxn_64(__n128);
__n128 neon_fcvtxn2_64(__n64, __n128);
// Scalar form: double in, float out (wrapped by vcvtxd_f32_f64).
float  neon_fcvtxns_64(double);
// Typed float32 <-> float64 conversion macros layered over neon_fcvt*_64.
//   a - source vector (or a double, for vcvtxd_f32_f64).
//   r - for the *_high_f32_f64 forms, the float32x2_t passed as the
//       intrinsic's first operand.
// Each macro converts its typed operand(s) to __n64 / __n128, calls the
// intrinsic, and converts the result back to the typed vector.
#define vcvt_f64_f32(a) __n128_to_float64x2_t(neon_fcvtl_64(__float32x2_t_to_n64(a)))
#define vcvt_high_f64_f32(a) __n128_to_float64x2_t(neon_fcvtl2_64(__float32x4_t_to_n128(a)))
#define vcvt_f32_f64(a) __n64_to_float32x2_t(neon_fcvtn_64(__float64x2_t_to_n128(a)))
#define vcvt_high_f32_f64(r, a) __n128_to_float32x4_t(neon_fcvtn2_64(__float32x2_t_to_n64(r), __float64x2_t_to_n128(a)))
#define vcvtx_f32_f64(a) __n64_to_float32x2_t(neon_fcvtxn_64(__float64x2_t_to_n128(a)))
#define vcvtx_high_f32_f64(r, a) __n128_to_float32x4_t(neon_fcvtxn2_64(__float32x2_t_to_n64(r), __float64x2_t_to_n128(a)))
// Scalar form: no container conversion, the argument is passed straight through.
#define vcvtxd_f32_f64(a) neon_fcvtxns_64(a)

// SQXTN/SQXTUN/UQXTN/XTN
//
// Prototypes for the narrowing intrinsics. For each mnemonic:
//   neon_<op>_<w>   takes __n128 and returns __n64;
//   neon_<op>2_<w>  takes (__n64, __n128) and returns __n128 (the *_high_*
//                   wrapper macros pass the narrow vector first and the wide
//                   source second);
//   neon_<op>s_<w>  is the scalar form (declared for SQXTN, SQXTUN and UQXTN;
//                   no scalar XTN prototype appears in this list).
// The numeric suffix matches the wide source type in the wrapper macros below,
// e.g. neon_sqxtn_16 is fed an int16x8_t and yields an int8x8_t.
//
// NOTE(review): the scalar _32 / _64 forms use `float` as the carrier for a
// 32-bit value; the wrappers move integers in and out through
// _CopyFloatFromInt32 / _CopyInt32FromFloat / _CopyUInt32FromFloat. Presumably
// those are bit-for-bit copies rather than numeric conversions - confirm
// against their definitions.

// SQXTN
__n64 neon_sqxtn_16(__n128);
__n64 neon_sqxtn_32(__n128);
__n64 neon_sqxtn_64(__n128);
__n128 neon_sqxtn2_16(__n64, __n128);
__n128 neon_sqxtn2_32(__n64, __n128);
__n128 neon_sqxtn2_64(__n64, __n128);
__n8  neon_sqxtns_16(__n16);
__n16 neon_sqxtns_32(float);
float neon_sqxtns_64(__n64);
// SQXTUN
__n64 neon_sqxtun_16(__n128);
__n64 neon_sqxtun_32(__n128);
__n64 neon_sqxtun_64(__n128);
__n128 neon_sqxtun2_16(__n64, __n128);
__n128 neon_sqxtun2_32(__n64, __n128);
__n128 neon_sqxtun2_64(__n64, __n128);
__n8  neon_sqxtuns_16(__n16);
__n16 neon_sqxtuns_32(float);
float neon_sqxtuns_64(__n64);
// UQXTN
__n64 neon_uqxtn_16(__n128);
__n64 neon_uqxtn_32(__n128);
__n64 neon_uqxtn_64(__n128);
__n128 neon_uqxtn2_16(__n64, __n128);
__n128 neon_uqxtn2_32(__n64, __n128);
__n128 neon_uqxtn2_64(__n64, __n128);
__n8  neon_uqxtns_16(__n16);
__n16 neon_uqxtns_32(float);
float neon_uqxtns_64(__n64);
// XTN (shared by the signed and unsigned vmovn_* wrappers)
__n64 neon_xtn_16(__n128);
__n64 neon_xtn_32(__n128);
__n64 neon_xtn_64(__n128);
__n128 neon_xtn2_16(__n64, __n128);
__n128 neon_xtn2_32(__n64, __n128);
__n128 neon_xtn2_64(__n64, __n128);
// Typed narrowing macros layered over the neon_*xtn* intrinsics.
//   a - wide source operand (vector, or a scalar for the h/s/d forms).
//   r - for the *_high_* forms, the narrow vector passed as the intrinsic's
//       first operand.

// vqmovn_s*: neon_sqxtn* on signed sources, signed results.
#define vqmovn_s16(a) __n64_to_int8x8_t(neon_sqxtn_16(__int16x8_t_to_n128(a)))
#define vqmovn_s32(a) __n64_to_int16x4_t(neon_sqxtn_32(__int32x4_t_to_n128(a)))
#define vqmovn_s64(a) __n64_to_int32x2_t(neon_sqxtn_64(__int64x2_t_to_n128(a)))
#define vqmovn_high_s16(r, a) __n128_to_int8x16_t(neon_sqxtn2_16(__int8x8_t_to_n64(r), __int16x8_t_to_n128(a)))
#define vqmovn_high_s32(r, a) __n128_to_int16x8_t(neon_sqxtn2_32(__int16x4_t_to_n64(r), __int32x4_t_to_n128(a)))
#define vqmovn_high_s64(r, a) __n128_to_int32x4_t(neon_sqxtn2_64(__int32x2_t_to_n64(r), __int64x2_t_to_n128(a)))
// Scalar forms: the result is read from element 0 of the returned container,
// or copied out of the float carrier for the 64-bit source.
#define vqmovnh_s16(a) neon_sqxtns_16(__int16ToN16_v(a)).n8_i8[0]
#define vqmovns_s32(a) neon_sqxtns_32(_CopyFloatFromInt32(a)).n16_i16[0]
#define vqmovnd_s64(a) _CopyInt32FromFloat(neon_sqxtns_64(__int64ToN64_v(a)))

// vqmovun_s*: neon_sqxtun* on signed sources, unsigned results.
#define vqmovun_s16(a) __n64_to_uint8x8_t(neon_sqxtun_16(__int16x8_t_to_n128(a)))
#define vqmovun_s32(a) __n64_to_uint16x4_t(neon_sqxtun_32(__int32x4_t_to_n128(a)))
#define vqmovun_s64(a) __n64_to_uint32x2_t(neon_sqxtun_64(__int64x2_t_to_n128(a)))
#define vqmovun_high_s16(r, a) __n128_to_uint8x16_t(neon_sqxtun2_16(__uint8x8_t_to_n64(r), __int16x8_t_to_n128(a)))
#define vqmovun_high_s32(r, a) __n128_to_uint16x8_t(neon_sqxtun2_32(__uint16x4_t_to_n64(r), __int32x4_t_to_n128(a)))
#define vqmovun_high_s64(r, a) __n128_to_uint32x4_t(neon_sqxtun2_64(__uint32x2_t_to_n64(r), __int64x2_t_to_n128(a)))
#define vqmovunh_s16(a) neon_sqxtuns_16(__int16ToN16_v(a)).n8_u8[0]
#define vqmovuns_s32(a) neon_sqxtuns_32(_CopyFloatFromInt32(a)).n16_u16[0]
#define vqmovund_s64(a) _CopyUInt32FromFloat(neon_sqxtuns_64(__int64ToN64_v(a)))

// vqmovn_u*: neon_uqxtn* on unsigned sources, unsigned results.
#define vqmovn_u16(a) __n64_to_uint8x8_t(neon_uqxtn_16(__uint16x8_t_to_n128(a)))
#define vqmovn_u32(a) __n64_to_uint16x4_t(neon_uqxtn_32(__uint32x4_t_to_n128(a)))
#define vqmovn_u64(a) __n64_to_uint32x2_t(neon_uqxtn_64(__uint64x2_t_to_n128(a)))
#define vqmovn_high_u16(r, a) __n128_to_uint8x16_t(neon_uqxtn2_16(__uint8x8_t_to_n64(r), __uint16x8_t_to_n128(a)))
#define vqmovn_high_u32(r, a) __n128_to_uint16x8_t(neon_uqxtn2_32(__uint16x4_t_to_n64(r), __uint32x4_t_to_n128(a)))
#define vqmovn_high_u64(r, a) __n128_to_uint32x4_t(neon_uqxtn2_64(__uint32x2_t_to_n64(r), __uint64x2_t_to_n128(a)))
#define vqmovnh_u16(a) neon_uqxtns_16(__uint16ToN16_v(a)).n8_u8[0]
#define vqmovns_u32(a) neon_uqxtns_32(_CopyFloatFromInt32(a)).n16_u16[0]
#define vqmovnd_u64(a) _CopyUInt32FromFloat(neon_uqxtns_64(__uint64ToN64_v(a)))

// vmovn_*: neon_xtn*; signed and unsigned variants share the same intrinsic
// and differ only in the conversion helpers.
#define vmovn_s16(a) __n64_to_int8x8_t(neon_xtn_16(__int16x8_t_to_n128(a)))
#define vmovn_s32(a) __n64_to_int16x4_t(neon_xtn_32(__int32x4_t_to_n128(a)))
#define vmovn_s64(a) __n64_to_int32x2_t(neon_xtn_64(__int64x2_t_to_n128(a)))
#define vmovn_u16(a) __n64_to_uint8x8_t(neon_xtn_16(__uint16x8_t_to_n128(a)))
#define vmovn_u32(a) __n64_to_uint16x4_t(neon_xtn_32(__uint32x4_t_to_n128(a)))
#define vmovn_u64(a) __n64_to_uint32x2_t(neon_xtn_64(__uint64x2_t_to_n128(a)))
#define vmovn_high_s16(r, a) __n128_to_int8x16_t(neon_xtn2_16(__int8x8_t_to_n64(r), __int16x8_t_to_n128(a)))
#define vmovn_high_s32(r, a) __n128_to_int16x8_t(neon_xtn2_32(__int16x4_t_to_n64(r), __int32x4_t_to_n128(a)))
#define vmovn_high_s64(r, a) __n128_to_int32x4_t(neon_xtn2_64(__int32x2_t_to_n64(r), __int64x2_t_to_n128(a)))
#define vmovn_high_u16(r, a) __n128_to_uint8x16_t(neon_xtn2_16(__uint8x8_t_to_n64(r), __uint16x8_t_to_n128(a)))
#define vmovn_high_u32(r, a) __n128_to_uint16x8_t(neon_xtn2_32(__uint16x4_t_to_n64(r), __uint32x4_t_to_n128(a)))
#define vmovn_high_u64(r, a) __n128_to_uint32x4_t(neon_xtn2_64(__uint32x2_t_to_n64(r), __uint64x2_t_to_n128(a)))

// SHLL/SSHLL/USHLL
//
// Prototypes for the neon_sshll* / neon_ushll* intrinsics. The plain form takes
// an __n64 source and the "2" form an __n128 source; both return __n128.
// The numeric suffix matches the source element width used by the wrapper
// macros below (e.g. neon_sshll_8 is fed an int8x8_t and yields an int16x8_t).
// The `const int` operand is the shift amount: the vshll_n_* wrappers forward
// their second argument, and the vmovl_* wrappers pass a literal 0.
// Only sshll/ushll prototypes appear in this list; no neon_shll_* prototype is
// declared here.
__n128 neon_sshll_8  (__n64, const int);
__n128 neon_sshll2_8 (__n128, const int);
__n128 neon_sshll_16 (__n64, const int);
__n128 neon_sshll2_16(__n128, const int);
__n128 neon_sshll_32 (__n64, const int);
__n128 neon_sshll2_32(__n128, const int);
__n128 neon_ushll_8  (__n64, const int);
__n128 neon_ushll2_8 (__n128, const int);
__n128 neon_ushll_16 (__n64, const int);
__n128 neon_ushll2_16(__n128, const int);
__n128 neon_ushll_32 (__n64, const int);
__n128 neon_ushll2_32(__n128, const int);
// vshll_n_<type>: forward a 64-bit vector and a shift operand to
// neon_sshll_* / neon_ushll_* and retype the 128-bit result.
#define vshll_n_s8(vec, shift) __n128_to_int16x8_t(neon_sshll_8(__int8x8_t_to_n64(vec), (shift)))
#define vshll_n_s16(vec, shift) __n128_to_int32x4_t(neon_sshll_16(__int16x4_t_to_n64(vec), (shift)))
#define vshll_n_s32(vec, shift) __n128_to_int64x2_t(neon_sshll_32(__int32x2_t_to_n64(vec), (shift)))
#define vshll_n_u8(vec, shift) __n128_to_uint16x8_t(neon_ushll_8(__uint8x8_t_to_n64(vec), (shift)))
#define vshll_n_u16(vec, shift) __n128_to_uint32x4_t(neon_ushll_16(__uint16x4_t_to_n64(vec), (shift)))
#define vshll_n_u32(vec, shift) __n128_to_uint64x2_t(neon_ushll_32(__uint32x2_t_to_n64(vec), (shift)))
// vshll_high_n_<type>: same, but the operand is a 128-bit vector handed to
// the "2" form of the intrinsic.
#define vshll_high_n_s8(vec, shift) __n128_to_int16x8_t(neon_sshll2_8(__int8x16_t_to_n128(vec), (shift)))
#define vshll_high_n_s16(vec, shift) __n128_to_int32x4_t(neon_sshll2_16(__int16x8_t_to_n128(vec), (shift)))
#define vshll_high_n_s32(vec, shift) __n128_to_int64x2_t(neon_sshll2_32(__int32x4_t_to_n128(vec), (shift)))
#define vshll_high_n_u8(vec, shift) __n128_to_uint16x8_t(neon_ushll2_8(__uint8x16_t_to_n128(vec), (shift)))
#define vshll_high_n_u16(vec, shift) __n128_to_uint32x4_t(neon_ushll2_16(__uint16x8_t_to_n128(vec), (shift)))
#define vshll_high_n_u32(vec, shift) __n128_to_uint64x2_t(neon_ushll2_32(__uint32x4_t_to_n128(vec), (shift)))
// vmovl_<type> / vmovl_high_<type>: the same intrinsics called with a
// constant 0 as the shift operand.
#define vmovl_s8(vec) __n128_to_int16x8_t(neon_sshll_8(__int8x8_t_to_n64(vec), 0))
#define vmovl_s16(vec) __n128_to_int32x4_t(neon_sshll_16(__int16x4_t_to_n64(vec), 0))
#define vmovl_s32(vec) __n128_to_int64x2_t(neon_sshll_32(__int32x2_t_to_n64(vec), 0))
#define vmovl_u8(vec) __n128_to_uint16x8_t(neon_ushll_8(__uint8x8_t_to_n64(vec), 0))
#define vmovl_u16(vec) __n128_to_uint32x4_t(neon_ushll_16(__uint16x4_t_to_n64(vec), 0))
#define vmovl_u32(vec) __n128_to_uint64x2_t(neon_ushll_32(__uint32x2_t_to_n64(vec), 0))
#define vmovl_high_s8(vec) __n128_to_int16x8_t(neon_sshll2_8(__int8x16_t_to_n128(vec), 0))
#define vmovl_high_s16(vec) __n128_to_int32x4_t(neon_sshll2_16(__int16x8_t_to_n128(vec), 0))
#define vmovl_high_s32(vec) __n128_to_int64x2_t(neon_sshll2_32(__int32x4_t_to_n128(vec), 0))
#define vmovl_high_u8(vec) __n128_to_uint16x8_t(neon_ushll2_8(__uint8x16_t_to_n128(vec), 0))
#define vmovl_high_u16(vec) __n128_to_uint32x4_t(neon_ushll2_16(__uint16x8_t_to_n128(vec), 0))
#define vmovl_high_u32(vec) __n128_to_uint64x2_t(neon_ushll2_32(__uint32x4_t_to_n128(vec), 0))

// SHRN/RSHRN/SQSHRN/SQRSHRN/UQSHRN/UQRSHRN/SQSHRUN/SQRSHRUN
__n64  neon_shrn_16     (__n128, const int);
__n128 neon_shrn2_16    (__n64, __n128, const int);
__n64  neon_shrn_32     (__n128, const int);
__n128 neon_shrn2_32    (__n64, __n128, const int);
__n64  neon_shrn_64     (__n128, const int);
__n128 neon_shrn2_64    (__n64, __n128, const int);
__n64  neon_rshrn_16    (__n128, const int);
__n128 neon_rshrn2_16   (__n64, __n128, const int);
__n64  neon_rshrn_32    (__n128, const int);
__n128 neon_rshrn2_32   (__n64, __n128, const int);
__n64  neon_rshrn_64    (__n128, const int);
__n128 neon_rshrn2_64   (__n64, __n128, const int);
__n64  neon_sqshrn_16   (__n128, const int);
__n128 neon_sqshrn2_16  (__n64, __n128, const int);
__n64  neon_sqshrn_32   (__n128, const int);
__n128 neon_sqshrn2_32  (__n64, __n128, const int);
__n64  neon_sqshrn_64   (__n128, const int);
__n128 neon_sqshrn2_64  (__n64, __n128, const int);
__n64  neon_sqrshrn_16  (__n128, const int);
__n128 neon_sqrshrn2_16 (__n64, __n128, const int);
__n64  neon_sqrshrn_32  (__n128, const int);
__n128 neon_sqrshrn2_32 (__n64, __n128, const int);
__n64  neon_sqrshrn_64  (__n128, const int);
__n128 neon_sqrshrn2_64 (__n64, __n128, const int);
__n64  neon_uqshrn_16   (__n128, const int);
__n128 neon_uqshrn2_16  (__n64, __n128, const int);
__n64  neon_uqshrn_32   (__n128, const int);
__n128 neon_uqshrn2_32  (__n64, __n128, const int);
__n64  neon_uqshrn_64   (__n128, const int);
__n128 neon_uqshrn2_64  (__n64, __n128, const int);
__n64  neon_uqrshrn_16  (__n128, const int);
__n128 neon_uqrshrn2_16 (__n64, __n128, const int);
__n64  neon_uqrshrn_32  (__n128, const int);
__n128 neon_uqrshrn2_32 (__n64, __n128, const int);
__n64  neon_uqrshrn_64  (__n128, const int);
__n128 neon_uqrshrn2_64 (__n64, __n128, const int);
__n64  neon_sqshrun_16  (__n128, const int);
__n128 neon_sqshrun2_16 (__n64, __n128, const int);
__n64  neon_sqshrun_32  (__n128, const int);
__n128 neon_sqshrun2_32 (__n64, __n128, const int);
__n64  neon_sqshrun_64  (__n128, const int);
__n128 neon_sqshrun2_64 (__n64, __n128, const int);
__n64  neon_sqrshrun_16 (__n128, const int);
__n128 neon_sqrshrun2_16(__n64, __n128, const int);
__n64  neon_sqrshrun_32 (__n128, const int);
__n128 neon_sqrshrun2_32(__n64, __n128, const int);
__n64  neon_sqrshrun_64 (__n128, const int);
__n128 neon_sqrshrun2_64(__n64, __n128, const int);
__n8   neon_sqshrn_s16  (__n16, const int);
__n16  neon_sqshrn_s32  (float, const int);
float  neon_sqshrn_s64  (__n64, const int);
__n8   neon_sqrshrn_s16 (__n16, const int);
__n16  neon_sqrshrn_s32 (float, const int);
float  neon_sqrshrn_s64 (__n64, const int);
__n8   neon_uqshrn_s16  (__n16, const int);
__n16  neon_uqshrn_s32  (float, const int);
float  neon_uqshrn_s64  (__n64, const int);
__n8   neon_uqrshrn_s16 (__n16, const int);
__n16  neon_uqrshrn_s32 (float, const int);
float  neon_uqrshrn_s64 (__n64, const int);
__n8   neon_sqshrun_s16 (__n16, const int);
__n16  neon_sqshrun_s32 (float, const int);
float  neon_sqshrun_s64 (__n64, const int);
__n8   neon_sqrshrun_s16(__n16, const int);
__n16  neon_sqrshrun_s32(float, const int);
float  neon_sqrshrun_s64(__n64, const int);
// Vector narrowing right-shift wrappers. The two-operand macros pass a
// 128-bit vector and a shift operand to the plain intrinsic and retype the
// 64-bit result; the *_high_* macros pass an extra 64-bit vector first and
// retype the 128-bit result of the "2" intrinsic.

// vshrn_n / vshrn_high_n -> neon_shrn_* / neon_shrn2_*
#define vshrn_n_s16(wide, shift) __n64_to_int8x8_t(neon_shrn_16(__int16x8_t_to_n128(wide), (shift)))
#define vshrn_n_s32(wide, shift) __n64_to_int16x4_t(neon_shrn_32(__int32x4_t_to_n128(wide), (shift)))
#define vshrn_n_s64(wide, shift) __n64_to_int32x2_t(neon_shrn_64(__int64x2_t_to_n128(wide), (shift)))
#define vshrn_n_u16(wide, shift) __n64_to_uint8x8_t(neon_shrn_16(__uint16x8_t_to_n128(wide), (shift)))
#define vshrn_n_u32(wide, shift) __n64_to_uint16x4_t(neon_shrn_32(__uint32x4_t_to_n128(wide), (shift)))
#define vshrn_n_u64(wide, shift) __n64_to_uint32x2_t(neon_shrn_64(__uint64x2_t_to_n128(wide), (shift)))
#define vshrn_high_n_s16(narrow, wide, shift) __n128_to_int8x16_t(neon_shrn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(wide), (shift)))
#define vshrn_high_n_s32(narrow, wide, shift) __n128_to_int16x8_t(neon_shrn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(wide), (shift)))
#define vshrn_high_n_s64(narrow, wide, shift) __n128_to_int32x4_t(neon_shrn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(wide), (shift)))
#define vshrn_high_n_u16(narrow, wide, shift) __n128_to_uint8x16_t(neon_shrn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(wide), (shift)))
#define vshrn_high_n_u32(narrow, wide, shift) __n128_to_uint16x8_t(neon_shrn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(wide), (shift)))
#define vshrn_high_n_u64(narrow, wide, shift) __n128_to_uint32x4_t(neon_shrn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(wide), (shift)))
// vrshrn_n / vrshrn_high_n -> neon_rshrn_* / neon_rshrn2_*
#define vrshrn_n_s16(wide, shift) __n64_to_int8x8_t(neon_rshrn_16(__int16x8_t_to_n128(wide), (shift)))
#define vrshrn_n_s32(wide, shift) __n64_to_int16x4_t(neon_rshrn_32(__int32x4_t_to_n128(wide), (shift)))
#define vrshrn_n_s64(wide, shift) __n64_to_int32x2_t(neon_rshrn_64(__int64x2_t_to_n128(wide), (shift)))
#define vrshrn_n_u16(wide, shift) __n64_to_uint8x8_t(neon_rshrn_16(__uint16x8_t_to_n128(wide), (shift)))
#define vrshrn_n_u32(wide, shift) __n64_to_uint16x4_t(neon_rshrn_32(__uint32x4_t_to_n128(wide), (shift)))
#define vrshrn_n_u64(wide, shift) __n64_to_uint32x2_t(neon_rshrn_64(__uint64x2_t_to_n128(wide), (shift)))
#define vrshrn_high_n_s16(narrow, wide, shift) __n128_to_int8x16_t(neon_rshrn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(wide), (shift)))
#define vrshrn_high_n_s32(narrow, wide, shift) __n128_to_int16x8_t(neon_rshrn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(wide), (shift)))
#define vrshrn_high_n_s64(narrow, wide, shift) __n128_to_int32x4_t(neon_rshrn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(wide), (shift)))
#define vrshrn_high_n_u16(narrow, wide, shift) __n128_to_uint8x16_t(neon_rshrn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(wide), (shift)))
#define vrshrn_high_n_u32(narrow, wide, shift) __n128_to_uint16x8_t(neon_rshrn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(wide), (shift)))
#define vrshrn_high_n_u64(narrow, wide, shift) __n128_to_uint32x4_t(neon_rshrn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(wide), (shift)))
// vqshrn_n / vqshrn_high_n -> neon_sqshrn* for signed types, neon_uqshrn* for unsigned
#define vqshrn_n_s16(wide, shift) __n64_to_int8x8_t(neon_sqshrn_16(__int16x8_t_to_n128(wide), (shift)))
#define vqshrn_n_s32(wide, shift) __n64_to_int16x4_t(neon_sqshrn_32(__int32x4_t_to_n128(wide), (shift)))
#define vqshrn_n_s64(wide, shift) __n64_to_int32x2_t(neon_sqshrn_64(__int64x2_t_to_n128(wide), (shift)))
#define vqshrn_n_u16(wide, shift) __n64_to_uint8x8_t(neon_uqshrn_16(__uint16x8_t_to_n128(wide), (shift)))
#define vqshrn_n_u32(wide, shift) __n64_to_uint16x4_t(neon_uqshrn_32(__uint32x4_t_to_n128(wide), (shift)))
#define vqshrn_n_u64(wide, shift) __n64_to_uint32x2_t(neon_uqshrn_64(__uint64x2_t_to_n128(wide), (shift)))
#define vqshrn_high_n_s16(narrow, wide, shift) __n128_to_int8x16_t(neon_sqshrn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(wide), (shift)))
#define vqshrn_high_n_s32(narrow, wide, shift) __n128_to_int16x8_t(neon_sqshrn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(wide), (shift)))
#define vqshrn_high_n_s64(narrow, wide, shift) __n128_to_int32x4_t(neon_sqshrn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(wide), (shift)))
#define vqshrn_high_n_u16(narrow, wide, shift) __n128_to_uint8x16_t(neon_uqshrn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(wide), (shift)))
#define vqshrn_high_n_u32(narrow, wide, shift) __n128_to_uint16x8_t(neon_uqshrn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(wide), (shift)))
#define vqshrn_high_n_u64(narrow, wide, shift) __n128_to_uint32x4_t(neon_uqshrn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(wide), (shift)))
// vqrshrn_n / vqrshrn_high_n -> neon_sqrshrn* for signed types, neon_uqrshrn* for unsigned
#define vqrshrn_n_s16(wide, shift) __n64_to_int8x8_t(neon_sqrshrn_16(__int16x8_t_to_n128(wide), (shift)))
#define vqrshrn_n_s32(wide, shift) __n64_to_int16x4_t(neon_sqrshrn_32(__int32x4_t_to_n128(wide), (shift)))
#define vqrshrn_n_s64(wide, shift) __n64_to_int32x2_t(neon_sqrshrn_64(__int64x2_t_to_n128(wide), (shift)))
#define vqrshrn_n_u16(wide, shift) __n64_to_uint8x8_t(neon_uqrshrn_16(__uint16x8_t_to_n128(wide), (shift)))
#define vqrshrn_n_u32(wide, shift) __n64_to_uint16x4_t(neon_uqrshrn_32(__uint32x4_t_to_n128(wide), (shift)))
#define vqrshrn_n_u64(wide, shift) __n64_to_uint32x2_t(neon_uqrshrn_64(__uint64x2_t_to_n128(wide), (shift)))
#define vqrshrn_high_n_s16(narrow, wide, shift) __n128_to_int8x16_t(neon_sqrshrn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(wide), (shift)))
#define vqrshrn_high_n_s32(narrow, wide, shift) __n128_to_int16x8_t(neon_sqrshrn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(wide), (shift)))
#define vqrshrn_high_n_s64(narrow, wide, shift) __n128_to_int32x4_t(neon_sqrshrn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(wide), (shift)))
#define vqrshrn_high_n_u16(narrow, wide, shift) __n128_to_uint8x16_t(neon_uqrshrn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(wide), (shift)))
#define vqrshrn_high_n_u32(narrow, wide, shift) __n128_to_uint16x8_t(neon_uqrshrn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(wide), (shift)))
#define vqrshrn_high_n_u64(narrow, wide, shift) __n128_to_uint32x4_t(neon_uqrshrn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(wide), (shift)))
// vqshrun_n / vqshrun_high_n -> neon_sqshrun*: signed 128-bit operand, unsigned result type
#define vqshrun_n_s16(wide, shift) __n64_to_uint8x8_t(neon_sqshrun_16(__int16x8_t_to_n128(wide), (shift)))
#define vqshrun_n_s32(wide, shift) __n64_to_uint16x4_t(neon_sqshrun_32(__int32x4_t_to_n128(wide), (shift)))
#define vqshrun_n_s64(wide, shift) __n64_to_uint32x2_t(neon_sqshrun_64(__int64x2_t_to_n128(wide), (shift)))
#define vqshrun_high_n_s16(narrow, wide, shift) __n128_to_uint8x16_t(neon_sqshrun2_16(__uint8x8_t_to_n64(narrow), __int16x8_t_to_n128(wide), (shift)))
#define vqshrun_high_n_s32(narrow, wide, shift) __n128_to_uint16x8_t(neon_sqshrun2_32(__uint16x4_t_to_n64(narrow), __int32x4_t_to_n128(wide), (shift)))
#define vqshrun_high_n_s64(narrow, wide, shift) __n128_to_uint32x4_t(neon_sqshrun2_64(__uint32x2_t_to_n64(narrow), __int64x2_t_to_n128(wide), (shift)))
// vqrshrun_n / vqrshrun_high_n -> neon_sqrshrun*: signed 128-bit operand, unsigned result type
#define vqrshrun_n_s16(wide, shift) __n64_to_uint8x8_t(neon_sqrshrun_16(__int16x8_t_to_n128(wide), (shift)))
#define vqrshrun_n_s32(wide, shift) __n64_to_uint16x4_t(neon_sqrshrun_32(__int32x4_t_to_n128(wide), (shift)))
#define vqrshrun_n_s64(wide, shift) __n64_to_uint32x2_t(neon_sqrshrun_64(__int64x2_t_to_n128(wide), (shift)))
#define vqrshrun_high_n_s16(narrow, wide, shift) __n128_to_uint8x16_t(neon_sqrshrun2_16(__uint8x8_t_to_n64(narrow), __int16x8_t_to_n128(wide), (shift)))
#define vqrshrun_high_n_s32(narrow, wide, shift) __n128_to_uint16x8_t(neon_sqrshrun2_32(__uint16x4_t_to_n64(narrow), __int32x4_t_to_n128(wide), (shift)))
#define vqrshrun_high_n_s64(narrow, wide, shift) __n128_to_uint32x4_t(neon_sqrshrun2_64(__uint32x2_t_to_n64(narrow), __int64x2_t_to_n128(wide), (shift)))
// Scalar narrowing right-shift wrappers (vq[r]shr[u]n{h,s,d}_n_*).
// Each macro wraps the scalar operand src1, calls the matching scalar
// intrinsic with the shift operand src2, and extracts the single result:
//   16-bit source -> element 0 of the __n8 result
//   32-bit source -> element 0 of the __n16 result
//   64-bit source -> integer copy of the float-typed result
//
// Signed forms (*_s16/_s32/_s64 of vqshrn / vqrshrn) use the signed views.
// Unsigned forms (*_u16/_u32/_u64) and the vq[r]shrun forms use the unsigned
// views (n8_u8, n16_u16, _CopyUInt32FromFloat) so that a result with its top
// bit set is not turned into a negative value when promoted or widened. The
// unsigned forms also wrap their operand with the unsigned helpers, matching
// vqmovnh_u16 / vqmovnd_u64. (_CopyFloatFromInt32 is kept for the 32-bit
// operand, as vqmovns_u32 does.)
#define vqshrnh_n_s16(src1, src2) neon_sqshrn_s16(__int16ToN16_v(src1), (src2)).n8_i8[0]
#define vqshrns_n_s32(src1, src2) neon_sqshrn_s32(_CopyFloatFromInt32(src1), (src2)).n16_i16[0]
#define vqshrnd_n_s64(src1, src2) _CopyInt32FromFloat(neon_sqshrn_s64(__int64ToN64_v(src1), (src2)))
#define vqshrnh_n_u16(src1, src2) neon_uqshrn_s16(__uint16ToN16_v(src1), (src2)).n8_u8[0]
#define vqshrns_n_u32(src1, src2) neon_uqshrn_s32(_CopyFloatFromInt32(src1), (src2)).n16_u16[0]
#define vqshrnd_n_u64(src1, src2) _CopyUInt32FromFloat(neon_uqshrn_s64(__uint64ToN64_v(src1), (src2)))
#define vqshrunh_n_s16(src1, src2) neon_sqshrun_s16(__int16ToN16_v(src1), (src2)).n8_u8[0]
#define vqshruns_n_s32(src1, src2) neon_sqshrun_s32(_CopyFloatFromInt32(src1), (src2)).n16_u16[0]
#define vqshrund_n_s64(src1, src2) _CopyUInt32FromFloat(neon_sqshrun_s64(__int64ToN64_v(src1), (src2)))
#define vqrshrnh_n_s16(src1, src2) neon_sqrshrn_s16(__int16ToN16_v(src1), (src2)).n8_i8[0]
#define vqrshrns_n_s32(src1, src2) neon_sqrshrn_s32(_CopyFloatFromInt32(src1), (src2)).n16_i16[0]
#define vqrshrnd_n_s64(src1, src2) _CopyInt32FromFloat(neon_sqrshrn_s64(__int64ToN64_v(src1), (src2)))
#define vqrshrnh_n_u16(src1, src2) neon_uqrshrn_s16(__uint16ToN16_v(src1), (src2)).n8_u8[0]
#define vqrshrns_n_u32(src1, src2) neon_uqrshrn_s32(_CopyFloatFromInt32(src1), (src2)).n16_u16[0]
#define vqrshrnd_n_u64(src1, src2) _CopyUInt32FromFloat(neon_uqrshrn_s64(__uint64ToN64_v(src1), (src2)))
#define vqrshrunh_n_s16(src1, src2) neon_sqrshrun_s16(__int16ToN16_v(src1), (src2)).n8_u8[0]
#define vqrshruns_n_s32(src1, src2) neon_sqrshrun_s32(_CopyFloatFromInt32(src1), (src2)).n16_u16[0]
#define vqrshrund_n_s64(src1, src2) _CopyUInt32FromFloat(neon_sqrshrun_s64(__int64ToN64_v(src1), (src2)))

// ADDHN/RADDHN/SADDW/UADDW/SADDL/UADDL
__n64  neon_addhn_16   (__n128, __n128);
__n128 neon_addhn2_16  (__n64, __n128, __n128);
__n64  neon_addhn_32   (__n128, __n128);
__n128 neon_addhn2_32  (__n64, __n128, __n128);
__n64  neon_addhn_64   (__n128, __n128);
__n128 neon_addhn2_64  (__n64, __n128, __n128);
__n64  neon_raddhn_16  (__n128, __n128);
__n128 neon_raddhn2_16 (__n64, __n128, __n128);
__n64  neon_raddhn_32  (__n128, __n128);
__n128 neon_raddhn2_32 (__n64, __n128, __n128);
__n64  neon_raddhn_64  (__n128, __n128);
__n128 neon_raddhn2_64 (__n64, __n128, __n128);
__n128 neon_saddw_8    (__n128, __n64);
__n128 neon_saddw2_8   (__n128, __n128);
__n128 neon_saddw_16   (__n128, __n64);
__n128 neon_saddw2_16  (__n128, __n128);
__n128 neon_saddw_32   (__n128, __n64);
__n128 neon_saddw2_32  (__n128, __n128);
__n128 neon_uaddw_8    (__n128, __n64);
__n128 neon_uaddw2_8   (__n128, __n128);
__n128 neon_uaddw_16   (__n128, __n64);
__n128 neon_uaddw2_16  (__n128, __n128);
__n128 neon_uaddw_32   (__n128, __n64);
__n128 neon_uaddw2_32  (__n128, __n128);
__n128 neon_saddl_8    (__n64, __n64);
__n128 neon_saddl2_8   (__n128, __n128);
__n128 neon_saddl_16   (__n64, __n64);
__n128 neon_saddl2_16  (__n128, __n128);
__n128 neon_saddl_32   (__n64, __n64);
__n128 neon_saddl2_32  (__n128, __n128);
__n128 neon_uaddl_8    (__n64, __n64);
__n128 neon_uaddl2_8   (__n128, __n128);
__n128 neon_uaddl_16   (__n64, __n64);
__n128 neon_uaddl2_16  (__n128, __n128);
__n128 neon_uaddl_32   (__n64, __n64);
__n128 neon_uaddl2_32  (__n128, __n128);
// vaddhn_<type>: two 128-bit operands -> neon_addhn_<width> -> retyped 64-bit result.
#define vaddhn_s16(lhs, rhs) __n64_to_int8x8_t(neon_addhn_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vaddhn_s32(lhs, rhs) __n64_to_int16x4_t(neon_addhn_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vaddhn_s64(lhs, rhs) __n64_to_int32x2_t(neon_addhn_64(__int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vaddhn_u16(lhs, rhs) __n64_to_uint8x8_t(neon_addhn_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vaddhn_u32(lhs, rhs) __n64_to_uint16x4_t(neon_addhn_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vaddhn_u64(lhs, rhs) __n64_to_uint32x2_t(neon_addhn_64(__uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vaddhn_high_<type>: extra 64-bit first operand, neon_addhn2_<width>, 128-bit result.
#define vaddhn_high_s16(narrow, lhs, rhs) __n128_to_int8x16_t(neon_addhn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vaddhn_high_s32(narrow, lhs, rhs) __n128_to_int16x8_t(neon_addhn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vaddhn_high_s64(narrow, lhs, rhs) __n128_to_int32x4_t(neon_addhn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vaddhn_high_u16(narrow, lhs, rhs) __n128_to_uint8x16_t(neon_addhn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vaddhn_high_u32(narrow, lhs, rhs) __n128_to_uint16x8_t(neon_addhn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vaddhn_high_u64(narrow, lhs, rhs) __n128_to_uint32x4_t(neon_addhn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vraddhn_<type> / vraddhn_high_<type>: same shapes, using neon_raddhn_* / neon_raddhn2_*.
#define vraddhn_s16(lhs, rhs) __n64_to_int8x8_t(neon_raddhn_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vraddhn_s32(lhs, rhs) __n64_to_int16x4_t(neon_raddhn_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vraddhn_s64(lhs, rhs) __n64_to_int32x2_t(neon_raddhn_64(__int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vraddhn_u16(lhs, rhs) __n64_to_uint8x8_t(neon_raddhn_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vraddhn_u32(lhs, rhs) __n64_to_uint16x4_t(neon_raddhn_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vraddhn_u64(lhs, rhs) __n64_to_uint32x2_t(neon_raddhn_64(__uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
#define vraddhn_high_s16(narrow, lhs, rhs) __n128_to_int8x16_t(neon_raddhn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vraddhn_high_s32(narrow, lhs, rhs) __n128_to_int16x8_t(neon_raddhn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vraddhn_high_s64(narrow, lhs, rhs) __n128_to_int32x4_t(neon_raddhn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vraddhn_high_u16(narrow, lhs, rhs) __n128_to_uint8x16_t(neon_raddhn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vraddhn_high_u32(narrow, lhs, rhs) __n128_to_uint16x8_t(neon_raddhn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vraddhn_high_u64(narrow, lhs, rhs) __n128_to_uint32x4_t(neon_raddhn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vaddw_<type>: 128-bit first operand, 64-bit second operand, 128-bit result.
#define vaddw_s8(lhs, rhs) __n128_to_int16x8_t(neon_saddw_8(__int16x8_t_to_n128(lhs), __int8x8_t_to_n64(rhs)))
#define vaddw_s16(lhs, rhs) __n128_to_int32x4_t(neon_saddw_16(__int32x4_t_to_n128(lhs), __int16x4_t_to_n64(rhs)))
#define vaddw_s32(lhs, rhs) __n128_to_int64x2_t(neon_saddw_32(__int64x2_t_to_n128(lhs), __int32x2_t_to_n64(rhs)))
#define vaddw_u8(lhs, rhs) __n128_to_uint16x8_t(neon_uaddw_8(__uint16x8_t_to_n128(lhs), __uint8x8_t_to_n64(rhs)))
#define vaddw_u16(lhs, rhs) __n128_to_uint32x4_t(neon_uaddw_16(__uint32x4_t_to_n128(lhs), __uint16x4_t_to_n64(rhs)))
#define vaddw_u32(lhs, rhs) __n128_to_uint64x2_t(neon_uaddw_32(__uint64x2_t_to_n128(lhs), __uint32x2_t_to_n64(rhs)))
// vaddl_<type>: two 64-bit operands, 128-bit result.
#define vaddl_s8(lhs, rhs) __n128_to_int16x8_t(neon_saddl_8(__int8x8_t_to_n64(lhs), __int8x8_t_to_n64(rhs)))
#define vaddl_s16(lhs, rhs) __n128_to_int32x4_t(neon_saddl_16(__int16x4_t_to_n64(lhs), __int16x4_t_to_n64(rhs)))
#define vaddl_s32(lhs, rhs) __n128_to_int64x2_t(neon_saddl_32(__int32x2_t_to_n64(lhs), __int32x2_t_to_n64(rhs)))
#define vaddl_u8(lhs, rhs) __n128_to_uint16x8_t(neon_uaddl_8(__uint8x8_t_to_n64(lhs), __uint8x8_t_to_n64(rhs)))
#define vaddl_u16(lhs, rhs) __n128_to_uint32x4_t(neon_uaddl_16(__uint16x4_t_to_n64(lhs), __uint16x4_t_to_n64(rhs)))
#define vaddl_u32(lhs, rhs) __n128_to_uint64x2_t(neon_uaddl_32(__uint32x2_t_to_n64(lhs), __uint32x2_t_to_n64(rhs)))
// vaddw_high_<type>: both operands 128-bit, forwarded to the "2" intrinsic.
#define vaddw_high_s8(lhs, rhs) __n128_to_int16x8_t(neon_saddw2_8(__int16x8_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vaddw_high_s16(lhs, rhs) __n128_to_int32x4_t(neon_saddw2_16(__int32x4_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vaddw_high_s32(lhs, rhs) __n128_to_int64x2_t(neon_saddw2_32(__int64x2_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vaddw_high_u8(lhs, rhs) __n128_to_uint16x8_t(neon_uaddw2_8(__uint16x8_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vaddw_high_u16(lhs, rhs) __n128_to_uint32x4_t(neon_uaddw2_16(__uint32x4_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vaddw_high_u32(lhs, rhs) __n128_to_uint64x2_t(neon_uaddw2_32(__uint64x2_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
// vaddl_high_<type>: both operands 128-bit, forwarded to the "2" intrinsic.
#define vaddl_high_s8(lhs, rhs) __n128_to_int16x8_t(neon_saddl2_8(__int8x16_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vaddl_high_s16(lhs, rhs) __n128_to_int32x4_t(neon_saddl2_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vaddl_high_s32(lhs, rhs) __n128_to_int64x2_t(neon_saddl2_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vaddl_high_u8(lhs, rhs) __n128_to_uint16x8_t(neon_uaddl2_8(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vaddl_high_u16(lhs, rhs) __n128_to_uint32x4_t(neon_uaddl2_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vaddl_high_u32(lhs, rhs) __n128_to_uint64x2_t(neon_uaddl2_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))

// SUBHN/RSUBHN/SSUBW/USUBW/SSUBL/USUBL
__n64  neon_subhn_16(__n128, __n128);
__n128 neon_subhn2_16(__n64, __n128, __n128);
__n64  neon_subhn_32(__n128, __n128);
__n128 neon_subhn2_32(__n64, __n128, __n128);
__n64  neon_subhn_64(__n128, __n128);
__n128 neon_subhn2_64(__n64, __n128, __n128);
__n64  neon_rsubhn_16(__n128, __n128);
__n128 neon_rsubhn2_16(__n64, __n128, __n128);
__n64  neon_rsubhn_32(__n128, __n128);
__n128 neon_rsubhn2_32(__n64, __n128, __n128);
__n64  neon_rsubhn_64(__n128, __n128);
__n128 neon_rsubhn2_64(__n64, __n128, __n128);
__n128 neon_ssubw_8(__n128, __n64);
__n128 neon_ssubw2_8(__n128, __n128);
__n128 neon_ssubw_16(__n128, __n64);
__n128 neon_ssubw2_16(__n128, __n128);
__n128 neon_ssubw_32(__n128, __n64);
__n128 neon_ssubw2_32(__n128, __n128);
__n128 neon_usubw_8(__n128, __n64);
__n128 neon_usubw2_8(__n128, __n128);
__n128 neon_usubw_16(__n128, __n64);
__n128 neon_usubw2_16(__n128, __n128);
__n128 neon_usubw_32(__n128, __n64);
__n128 neon_usubw2_32(__n128, __n128);
__n128 neon_ssubl_8(__n64, __n64);
__n128 neon_ssubl2_8(__n128, __n128);
__n128 neon_ssubl_16(__n64, __n64);
__n128 neon_ssubl2_16(__n128, __n128);
__n128 neon_ssubl_32(__n64, __n64);
__n128 neon_ssubl2_32(__n128, __n128);
__n128 neon_usubl_8(__n64, __n64);
__n128 neon_usubl2_8(__n128, __n128);
__n128 neon_usubl_16(__n64, __n64);
__n128 neon_usubl2_16(__n128, __n128);
__n128 neon_usubl_32(__n64, __n64);
__n128 neon_usubl2_32(__n128, __n128);
// vsubhn_<type>: two 128-bit operands -> neon_subhn_<width> -> retyped 64-bit result.
#define vsubhn_s16(lhs, rhs) __n64_to_int8x8_t(neon_subhn_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vsubhn_s32(lhs, rhs) __n64_to_int16x4_t(neon_subhn_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vsubhn_s64(lhs, rhs) __n64_to_int32x2_t(neon_subhn_64(__int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vsubhn_u16(lhs, rhs) __n64_to_uint8x8_t(neon_subhn_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vsubhn_u32(lhs, rhs) __n64_to_uint16x4_t(neon_subhn_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vsubhn_u64(lhs, rhs) __n64_to_uint32x2_t(neon_subhn_64(__uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vsubhn_high_<type>: extra 64-bit first operand, neon_subhn2_<width>, 128-bit result.
#define vsubhn_high_s16(narrow, lhs, rhs) __n128_to_int8x16_t(neon_subhn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vsubhn_high_s32(narrow, lhs, rhs) __n128_to_int16x8_t(neon_subhn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vsubhn_high_s64(narrow, lhs, rhs) __n128_to_int32x4_t(neon_subhn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vsubhn_high_u16(narrow, lhs, rhs) __n128_to_uint8x16_t(neon_subhn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vsubhn_high_u32(narrow, lhs, rhs) __n128_to_uint16x8_t(neon_subhn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vsubhn_high_u64(narrow, lhs, rhs) __n128_to_uint32x4_t(neon_subhn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vrsubhn_<type> / vrsubhn_high_<type>: same shapes, using neon_rsubhn_* / neon_rsubhn2_*.
#define vrsubhn_s16(lhs, rhs) __n64_to_int8x8_t(neon_rsubhn_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vrsubhn_s32(lhs, rhs) __n64_to_int16x4_t(neon_rsubhn_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vrsubhn_s64(lhs, rhs) __n64_to_int32x2_t(neon_rsubhn_64(__int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vrsubhn_u16(lhs, rhs) __n64_to_uint8x8_t(neon_rsubhn_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vrsubhn_u32(lhs, rhs) __n64_to_uint16x4_t(neon_rsubhn_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vrsubhn_u64(lhs, rhs) __n64_to_uint32x2_t(neon_rsubhn_64(__uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
#define vrsubhn_high_s16(narrow, lhs, rhs) __n128_to_int8x16_t(neon_rsubhn2_16(__int8x8_t_to_n64(narrow), __int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vrsubhn_high_s32(narrow, lhs, rhs) __n128_to_int16x8_t(neon_rsubhn2_32(__int16x4_t_to_n64(narrow), __int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vrsubhn_high_s64(narrow, lhs, rhs) __n128_to_int32x4_t(neon_rsubhn2_64(__int32x2_t_to_n64(narrow), __int64x2_t_to_n128(lhs), __int64x2_t_to_n128(rhs)))
#define vrsubhn_high_u16(narrow, lhs, rhs) __n128_to_uint8x16_t(neon_rsubhn2_16(__uint8x8_t_to_n64(narrow), __uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vrsubhn_high_u32(narrow, lhs, rhs) __n128_to_uint16x8_t(neon_rsubhn2_32(__uint16x4_t_to_n64(narrow), __uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
#define vrsubhn_high_u64(narrow, lhs, rhs) __n128_to_uint32x4_t(neon_rsubhn2_64(__uint32x2_t_to_n64(narrow), __uint64x2_t_to_n128(lhs), __uint64x2_t_to_n128(rhs)))
// vsubw_<type>: 128-bit first operand, 64-bit second operand, 128-bit result.
#define vsubw_s8(lhs, rhs) __n128_to_int16x8_t(neon_ssubw_8(__int16x8_t_to_n128(lhs), __int8x8_t_to_n64(rhs)))
#define vsubw_s16(lhs, rhs) __n128_to_int32x4_t(neon_ssubw_16(__int32x4_t_to_n128(lhs), __int16x4_t_to_n64(rhs)))
#define vsubw_s32(lhs, rhs) __n128_to_int64x2_t(neon_ssubw_32(__int64x2_t_to_n128(lhs), __int32x2_t_to_n64(rhs)))
#define vsubw_u8(lhs, rhs) __n128_to_uint16x8_t(neon_usubw_8(__uint16x8_t_to_n128(lhs), __uint8x8_t_to_n64(rhs)))
#define vsubw_u16(lhs, rhs) __n128_to_uint32x4_t(neon_usubw_16(__uint32x4_t_to_n128(lhs), __uint16x4_t_to_n64(rhs)))
#define vsubw_u32(lhs, rhs) __n128_to_uint64x2_t(neon_usubw_32(__uint64x2_t_to_n128(lhs), __uint32x2_t_to_n64(rhs)))
// vsubl_<type>: two 64-bit operands, 128-bit result.
#define vsubl_s8(lhs, rhs) __n128_to_int16x8_t(neon_ssubl_8(__int8x8_t_to_n64(lhs), __int8x8_t_to_n64(rhs)))
#define vsubl_s16(lhs, rhs) __n128_to_int32x4_t(neon_ssubl_16(__int16x4_t_to_n64(lhs), __int16x4_t_to_n64(rhs)))
#define vsubl_s32(lhs, rhs) __n128_to_int64x2_t(neon_ssubl_32(__int32x2_t_to_n64(lhs), __int32x2_t_to_n64(rhs)))
#define vsubl_u8(lhs, rhs) __n128_to_uint16x8_t(neon_usubl_8(__uint8x8_t_to_n64(lhs), __uint8x8_t_to_n64(rhs)))
#define vsubl_u16(lhs, rhs) __n128_to_uint32x4_t(neon_usubl_16(__uint16x4_t_to_n64(lhs), __uint16x4_t_to_n64(rhs)))
#define vsubl_u32(lhs, rhs) __n128_to_uint64x2_t(neon_usubl_32(__uint32x2_t_to_n64(lhs), __uint32x2_t_to_n64(rhs)))
// vsubw_high_<type>: both operands 128-bit, forwarded to the "2" intrinsic.
#define vsubw_high_s8(lhs, rhs) __n128_to_int16x8_t(neon_ssubw2_8(__int16x8_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vsubw_high_s16(lhs, rhs) __n128_to_int32x4_t(neon_ssubw2_16(__int32x4_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vsubw_high_s32(lhs, rhs) __n128_to_int64x2_t(neon_ssubw2_32(__int64x2_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vsubw_high_u8(lhs, rhs) __n128_to_uint16x8_t(neon_usubw2_8(__uint16x8_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vsubw_high_u16(lhs, rhs) __n128_to_uint32x4_t(neon_usubw2_16(__uint32x4_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vsubw_high_u32(lhs, rhs) __n128_to_uint64x2_t(neon_usubw2_32(__uint64x2_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
// vsubl_high_<type>: both operands 128-bit, forwarded to the "2" intrinsic.
#define vsubl_high_s8(lhs, rhs) __n128_to_int16x8_t(neon_ssubl2_8(__int8x16_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vsubl_high_s16(lhs, rhs) __n128_to_int32x4_t(neon_ssubl2_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vsubl_high_s32(lhs, rhs) __n128_to_int64x2_t(neon_ssubl2_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vsubl_high_u8(lhs, rhs) __n128_to_uint16x8_t(neon_usubl2_8(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vsubl_high_u16(lhs, rhs) __n128_to_uint32x4_t(neon_usubl2_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vsubl_high_u32(lhs, rhs) __n128_to_uint64x2_t(neon_usubl2_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))

// SABAL/UABAL/SABDL/UABDL
__n128 neon_sabal_8  (__n128, __n64, __n64);
__n128 neon_sabal2_8 (__n128, __n128, __n128);
__n128 neon_sabal_16 (__n128, __n64, __n64);
__n128 neon_sabal2_16(__n128, __n128, __n128);
__n128 neon_sabal_32 (__n128, __n64, __n64);
__n128 neon_sabal2_32(__n128, __n128, __n128);
__n128 neon_uabal_8  (__n128, __n64, __n64);
__n128 neon_uabal2_8 (__n128, __n128, __n128);
__n128 neon_uabal_16 (__n128, __n64, __n64);
__n128 neon_uabal2_16(__n128, __n128, __n128);
__n128 neon_uabal_32 (__n128, __n64, __n64);
__n128 neon_uabal2_32(__n128, __n128, __n128);
__n128 neon_sabdl_8  (__n64, __n64);
__n128 neon_sabdl2_8 (__n128, __n128);
__n128 neon_sabdl_16 (__n64, __n64);
__n128 neon_sabdl2_16(__n128, __n128);
__n128 neon_sabdl_32 (__n64, __n64);
__n128 neon_sabdl2_32(__n128, __n128);
__n128 neon_uabdl_8  (__n64, __n64);
__n128 neon_uabdl2_8 (__n128, __n128);
__n128 neon_uabdl_16 (__n64, __n64);
__n128 neon_uabdl2_16(__n128, __n128);
__n128 neon_uabdl_32 (__n64, __n64);
__n128 neon_uabdl2_32(__n128, __n128);
// vabal_<type>(acc, lhs, rhs) / vabal_high_<type>(acc, lhs, rhs):
// `acc` is the 128-bit vector with the wider element type; `lhs`/`rhs` are the
// narrower-element sources (64-bit vectors for the plain form, 128-bit vectors
// for the _high form). Operands are converted to __n128/__n64, handed to
// neon_[su]abal[2]_<N>, and the result is converted back to acc's type.
#define vabal_s8(acc, lhs, rhs) __n128_to_int16x8_t(neon_sabal_8(__int16x8_t_to_n128(acc), __int8x8_t_to_n64(lhs), __int8x8_t_to_n64(rhs)))
#define vabal_s16(acc, lhs, rhs) __n128_to_int32x4_t(neon_sabal_16(__int32x4_t_to_n128(acc), __int16x4_t_to_n64(lhs), __int16x4_t_to_n64(rhs)))
#define vabal_s32(acc, lhs, rhs) __n128_to_int64x2_t(neon_sabal_32(__int64x2_t_to_n128(acc), __int32x2_t_to_n64(lhs), __int32x2_t_to_n64(rhs)))
#define vabal_u8(acc, lhs, rhs) __n128_to_uint16x8_t(neon_uabal_8(__uint16x8_t_to_n128(acc), __uint8x8_t_to_n64(lhs), __uint8x8_t_to_n64(rhs)))
#define vabal_u16(acc, lhs, rhs) __n128_to_uint32x4_t(neon_uabal_16(__uint32x4_t_to_n128(acc), __uint16x4_t_to_n64(lhs), __uint16x4_t_to_n64(rhs)))
#define vabal_u32(acc, lhs, rhs) __n128_to_uint64x2_t(neon_uabal_32(__uint64x2_t_to_n128(acc), __uint32x2_t_to_n64(lhs), __uint32x2_t_to_n64(rhs)))
#define vabal_high_s8(acc, lhs, rhs) __n128_to_int16x8_t(neon_sabal2_8(__int16x8_t_to_n128(acc), __int8x16_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vabal_high_s16(acc, lhs, rhs) __n128_to_int32x4_t(neon_sabal2_16(__int32x4_t_to_n128(acc), __int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vabal_high_s32(acc, lhs, rhs) __n128_to_int64x2_t(neon_sabal2_32(__int64x2_t_to_n128(acc), __int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vabal_high_u8(acc, lhs, rhs) __n128_to_uint16x8_t(neon_uabal2_8(__uint16x8_t_to_n128(acc), __uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vabal_high_u16(acc, lhs, rhs) __n128_to_uint32x4_t(neon_uabal2_16(__uint32x4_t_to_n128(acc), __uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vabal_high_u32(acc, lhs, rhs) __n128_to_uint64x2_t(neon_uabal2_32(__uint64x2_t_to_n128(acc), __uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))
// vabdl_<type>(lhs, rhs) / vabdl_high_<type>(lhs, rhs):
// two-operand wrappers around neon_[su]abdl[2]_<N>. The plain form converts
// 64-bit typed vectors to __n64; the _high form converts 128-bit typed vectors
// to __n128. The __n128 result is converted to the wider-element vector type.
#define vabdl_s8(lhs, rhs) __n128_to_int16x8_t(neon_sabdl_8(__int8x8_t_to_n64(lhs), __int8x8_t_to_n64(rhs)))
#define vabdl_s16(lhs, rhs) __n128_to_int32x4_t(neon_sabdl_16(__int16x4_t_to_n64(lhs), __int16x4_t_to_n64(rhs)))
#define vabdl_s32(lhs, rhs) __n128_to_int64x2_t(neon_sabdl_32(__int32x2_t_to_n64(lhs), __int32x2_t_to_n64(rhs)))
#define vabdl_u8(lhs, rhs) __n128_to_uint16x8_t(neon_uabdl_8(__uint8x8_t_to_n64(lhs), __uint8x8_t_to_n64(rhs)))
#define vabdl_u16(lhs, rhs) __n128_to_uint32x4_t(neon_uabdl_16(__uint16x4_t_to_n64(lhs), __uint16x4_t_to_n64(rhs)))
#define vabdl_u32(lhs, rhs) __n128_to_uint64x2_t(neon_uabdl_32(__uint32x2_t_to_n64(lhs), __uint32x2_t_to_n64(rhs)))
#define vabdl_high_s8(lhs, rhs) __n128_to_int16x8_t(neon_sabdl2_8(__int8x16_t_to_n128(lhs), __int8x16_t_to_n128(rhs)))
#define vabdl_high_s16(lhs, rhs) __n128_to_int32x4_t(neon_sabdl2_16(__int16x8_t_to_n128(lhs), __int16x8_t_to_n128(rhs)))
#define vabdl_high_s32(lhs, rhs) __n128_to_int64x2_t(neon_sabdl2_32(__int32x4_t_to_n128(lhs), __int32x4_t_to_n128(rhs)))
#define vabdl_high_u8(lhs, rhs) __n128_to_uint16x8_t(neon_uabdl2_8(__uint8x16_t_to_n128(lhs), __uint8x16_t_to_n128(rhs)))
#define vabdl_high_u16(lhs, rhs) __n128_to_uint32x4_t(neon_uabdl2_16(__uint16x8_t_to_n128(lhs), __uint16x8_t_to_n128(rhs)))
#define vabdl_high_u32(lhs, rhs) __n128_to_uint64x2_t(neon_uabdl2_32(__uint32x4_t_to_n128(lhs), __uint32x4_t_to_n128(rhs)))

// FCADD/FCMLA
//
// Declarations of the neon_fcadd*/neon_fcmla* functions used by the
// vcadd*_rot*/vcmla* wrapper macros in this section.
//   - "q" in the name: __n128 operands/result; otherwise __n64.
//   - neon_fcadd*: two vector operands plus one `const int`; the wrappers pass
//     the literal 90 or 270 there (the rotation named in the macro).
//   - neon_fcmla*: three vector operands plus one `const int`; the wrappers
//     pass the literal 0, 90, 180 or 270 there.
//   - *_lane / *_laneq: the third vector operand is __n64 / __n128
//     respectively and two `const int`s follow; the wrappers pass the
//     caller's lane argument first and the rotation literal second.
// The f16 declarations have no wrapper macro in this section of the header.
__n64 neon_fcadd_f16(__n64, __n64, const int);
__n128 neon_fcaddq_f16(__n128, __n128, const int);
__n64 neon_fcadd_f32(__n64, __n64, const int);
__n128 neon_fcaddq_f32(__n128, __n128, const int);
__n128 neon_fcaddq_f64(__n128, __n128, const int);
__n64 neon_fcmla_f16(__n64, __n64, __n64, const int);
__n64 neon_fcmla_lane_f16(__n64, __n64, __n64, const int, const int);
__n64 neon_fcmla_laneq_f16(__n64, __n64, __n128, const int, const int);
__n64 neon_fcmla_f32(__n64, __n64, __n64, const int);
__n64 neon_fcmla_lane_f32(__n64, __n64, __n64, const int, const int);
__n64 neon_fcmla_laneq_f32(__n64, __n64, __n128, const int, const int);
__n128 neon_fcmlaq_f16(__n128, __n128, __n128, const int);
__n128 neon_fcmlaq_f32(__n128, __n128, __n128, const int);
__n128 neon_fcmlaq_f64(__n128, __n128, __n128, const int);
__n128 neon_fcmlaq_lane_f16(__n128, __n128, __n64, const int, const int);
__n128 neon_fcmlaq_lane_f32(__n128, __n128, __n64, const int, const int);
__n128 neon_fcmlaq_laneq_f16(__n128, __n128, __n128, const int, const int);
__n128 neon_fcmlaq_laneq_f32(__n128, __n128, __n128, const int, const int);
// vcadd[q]_rot<R>_<type>(lhs, rhs): convert both operands to __n64/__n128,
// call neon_fcadd[q]_<type> with the rotation literal R (90 or 270) as the
// last argument, and convert the result back to the operand vector type.
#define vcadd_rot90_f32(lhs, rhs) __n64_to_float32x2_t(neon_fcadd_f32(__float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), 90))
#define vcaddq_rot90_f32(lhs, rhs) __n128_to_float32x4_t(neon_fcaddq_f32(__float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), 90))
#define vcaddq_rot90_f64(lhs, rhs) __n128_to_float64x2_t(neon_fcaddq_f64(__float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs), 90))
#define vcadd_rot270_f32(lhs, rhs) __n64_to_float32x2_t(neon_fcadd_f32(__float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), 270))
#define vcaddq_rot270_f32(lhs, rhs) __n128_to_float32x4_t(neon_fcaddq_f32(__float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), 270))
#define vcaddq_rot270_f64(lhs, rhs) __n128_to_float64x2_t(neon_fcaddq_f64(__float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs), 270))
// vcmla[q][_rot<R>][_lane|_laneq]_<type>(acc, lhs, rhs[, lane]):
// wrappers around neon_fcmla[q][_lane|_laneq]_<type>. All vector operands are
// converted to __n64/__n128 (for _lane the third operand is a 64-bit vector,
// for _laneq a 128-bit vector). The final argument passed to the neon_*
// function is the rotation literal: 0 for the un-suffixed macros, otherwise
// the R in _rot<R>. For the lane forms `(lane)` is passed just before it.

// Rotation 0.
#define vcmla_f32(acc, lhs, rhs) __n64_to_float32x2_t(neon_fcmla_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), 0))
#define vcmla_lane_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_lane_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), (lane), 0))
#define vcmla_laneq_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_laneq_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x4_t_to_n128(rhs), (lane), 0))
#define vcmlaq_f32(acc, lhs, rhs) __n128_to_float32x4_t(neon_fcmlaq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), 0))
#define vcmlaq_f64(acc, lhs, rhs) __n128_to_float64x2_t(neon_fcmlaq_f64(__float64x2_t_to_n128(acc), __float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs), 0))
#define vcmlaq_lane_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_lane_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x2_t_to_n64(rhs), (lane), 0))
#define vcmlaq_laneq_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_laneq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), (lane), 0))
// Rotation 90.
#define vcmla_rot90_f32(acc, lhs, rhs) __n64_to_float32x2_t(neon_fcmla_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), 90))
#define vcmla_rot90_lane_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_lane_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), (lane), 90))
#define vcmla_rot90_laneq_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_laneq_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x4_t_to_n128(rhs), (lane), 90))
#define vcmlaq_rot90_f32(acc, lhs, rhs) __n128_to_float32x4_t(neon_fcmlaq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), 90))
#define vcmlaq_rot90_f64(acc, lhs, rhs) __n128_to_float64x2_t(neon_fcmlaq_f64(__float64x2_t_to_n128(acc), __float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs), 90))
#define vcmlaq_rot90_lane_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_lane_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x2_t_to_n64(rhs), (lane), 90))
#define vcmlaq_rot90_laneq_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_laneq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), (lane), 90))
// Rotation 180.
#define vcmla_rot180_f32(acc, lhs, rhs) __n64_to_float32x2_t(neon_fcmla_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), 180))
#define vcmla_rot180_lane_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_lane_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), (lane), 180))
#define vcmla_rot180_laneq_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_laneq_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x4_t_to_n128(rhs), (lane), 180))
#define vcmlaq_rot180_f32(acc, lhs, rhs) __n128_to_float32x4_t(neon_fcmlaq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), 180))
#define vcmlaq_rot180_f64(acc, lhs, rhs) __n128_to_float64x2_t(neon_fcmlaq_f64(__float64x2_t_to_n128(acc), __float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs), 180))
#define vcmlaq_rot180_lane_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_lane_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x2_t_to_n64(rhs), (lane), 180))
#define vcmlaq_rot180_laneq_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_laneq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), (lane), 180))
// Rotation 270.
#define vcmla_rot270_f32(acc, lhs, rhs) __n64_to_float32x2_t(neon_fcmla_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), 270))
#define vcmla_rot270_lane_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_lane_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x2_t_to_n64(rhs), (lane), 270))
#define vcmla_rot270_laneq_f32(acc, lhs, rhs, lane) __n64_to_float32x2_t(neon_fcmla_laneq_f32(__float32x2_t_to_n64(acc), __float32x2_t_to_n64(lhs), __float32x4_t_to_n128(rhs), (lane), 270))
#define vcmlaq_rot270_f32(acc, lhs, rhs) __n128_to_float32x4_t(neon_fcmlaq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), 270))
#define vcmlaq_rot270_f64(acc, lhs, rhs) __n128_to_float64x2_t(neon_fcmlaq_f64(__float64x2_t_to_n128(acc), __float64x2_t_to_n128(lhs), __float64x2_t_to_n128(rhs), 270))
#define vcmlaq_rot270_lane_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_lane_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x2_t_to_n64(rhs), (lane), 270))
#define vcmlaq_rot270_laneq_f32(acc, lhs, rhs, lane) __n128_to_float32x4_t(neon_fcmlaq_laneq_f32(__float32x4_t_to_n128(acc), __float32x4_t_to_n128(lhs), __float32x4_t_to_n128(rhs), (lane), 270))

// vget_low/vget_high/vcombine
//
// neon_combine takes two __n64 values and returns an __n128; the vcombine_*
// wrappers in this section pass their (low, high) arguments in that order.
// The vget_low_*/vget_high_* wrappers use neon_dups64q instead, which is not
// declared in this section.
__n128 neon_combine(__n64, __n64);
// vget_high_<type>(vec) / vget_low_<type>(vec): convert the 128-bit typed
// vector to __n128, call neon_dups64q with index 1 (high) or 0 (low), and
// convert the __n64 result to the matching 64-bit vector type.
#define vget_high_u8(vec) __n64_to_uint8x8_t(neon_dups64q(__uint8x16_t_to_n128(vec), 1))
#define vget_high_s8(vec) __n64_to_int8x8_t(neon_dups64q(__int8x16_t_to_n128(vec), 1))
#define vget_low_u8(vec) __n64_to_uint8x8_t(neon_dups64q(__uint8x16_t_to_n128(vec), 0))
#define vget_low_s8(vec) __n64_to_int8x8_t(neon_dups64q(__int8x16_t_to_n128(vec), 0))
#define vget_high_u16(vec) __n64_to_uint16x4_t(neon_dups64q(__uint16x8_t_to_n128(vec), 1))
#define vget_high_s16(vec) __n64_to_int16x4_t(neon_dups64q(__int16x8_t_to_n128(vec), 1))
#define vget_low_u16(vec) __n64_to_uint16x4_t(neon_dups64q(__uint16x8_t_to_n128(vec), 0))
#define vget_low_s16(vec) __n64_to_int16x4_t(neon_dups64q(__int16x8_t_to_n128(vec), 0))
#define vget_high_u32(vec) __n64_to_uint32x2_t(neon_dups64q(__uint32x4_t_to_n128(vec), 1))
#define vget_high_s32(vec) __n64_to_int32x2_t(neon_dups64q(__int32x4_t_to_n128(vec), 1))
#define vget_low_u32(vec) __n64_to_uint32x2_t(neon_dups64q(__uint32x4_t_to_n128(vec), 0))
#define vget_low_s32(vec) __n64_to_int32x2_t(neon_dups64q(__int32x4_t_to_n128(vec), 0))
#define vget_high_u64(vec) __n64_to_uint64x1_t(neon_dups64q(__uint64x2_t_to_n128(vec), 1))
#define vget_high_s64(vec) __n64_to_int64x1_t(neon_dups64q(__int64x2_t_to_n128(vec), 1))
#define vget_low_u64(vec) __n64_to_uint64x1_t(neon_dups64q(__uint64x2_t_to_n128(vec), 0))
#define vget_low_s64(vec) __n64_to_int64x1_t(neon_dups64q(__int64x2_t_to_n128(vec), 0))
#define vget_high_p8(vec) __n64_to_poly8x8_t(neon_dups64q(__poly8x16_t_to_n128(vec), 1))
#define vget_high_p16(vec) __n64_to_poly16x4_t(neon_dups64q(__poly16x8_t_to_n128(vec), 1))
#define vget_high_p64(vec) __n64_to_poly64x1_t(neon_dups64q(__poly64x2_t_to_n128(vec), 1))
#define vget_low_p8(vec) __n64_to_poly8x8_t(neon_dups64q(__poly8x16_t_to_n128(vec), 0))
#define vget_low_p16(vec) __n64_to_poly16x4_t(neon_dups64q(__poly16x8_t_to_n128(vec), 0))
#define vget_low_p64(vec) __n64_to_poly64x1_t(neon_dups64q(__poly64x2_t_to_n128(vec), 0))
#define vget_high_f32(vec) __n64_to_float32x2_t(neon_dups64q(__float32x4_t_to_n128(vec), 1))
#define vget_high_f64(vec) __n64_to_float64x1_t(neon_dups64q(__float64x2_t_to_n128(vec), 1))
#define vget_low_f32(vec) __n64_to_float32x2_t(neon_dups64q(__float32x4_t_to_n128(vec), 0))
#define vget_low_f64(vec) __n64_to_float64x1_t(neon_dups64q(__float64x2_t_to_n128(vec), 0))
// vcombine_<type>(lo, hi): convert both 64-bit typed vectors to __n64, pass
// them to neon_combine in (lo, hi) order, and convert the __n128 result to the
// matching 128-bit vector type.
#define vcombine_u8(lo, hi) __n128_to_uint8x16_t(neon_combine(__uint8x8_t_to_n64(lo), __uint8x8_t_to_n64(hi)))
#define vcombine_s8(lo, hi) __n128_to_int8x16_t(neon_combine(__int8x8_t_to_n64(lo), __int8x8_t_to_n64(hi)))
#define vcombine_p8(lo, hi) __n128_to_poly8x16_t(neon_combine(__poly8x8_t_to_n64(lo), __poly8x8_t_to_n64(hi)))
#define vcombine_u16(lo, hi) __n128_to_uint16x8_t(neon_combine(__uint16x4_t_to_n64(lo), __uint16x4_t_to_n64(hi)))
#define vcombine_s16(lo, hi) __n128_to_int16x8_t(neon_combine(__int16x4_t_to_n64(lo), __int16x4_t_to_n64(hi)))
#define vcombine_p16(lo, hi) __n128_to_poly16x8_t(neon_combine(__poly16x4_t_to_n64(lo), __poly16x4_t_to_n64(hi)))
#define vcombine_u32(lo, hi) __n128_to_uint32x4_t(neon_combine(__uint32x2_t_to_n64(lo), __uint32x2_t_to_n64(hi)))
#define vcombine_s32(lo, hi) __n128_to_int32x4_t(neon_combine(__int32x2_t_to_n64(lo), __int32x2_t_to_n64(hi)))
#define vcombine_f32(lo, hi) __n128_to_float32x4_t(neon_combine(__float32x2_t_to_n64(lo), __float32x2_t_to_n64(hi)))
#define vcombine_u64(lo, hi) __n128_to_uint64x2_t(neon_combine(__uint64x1_t_to_n64(lo), __uint64x1_t_to_n64(hi)))
#define vcombine_s64(lo, hi) __n128_to_int64x2_t(neon_combine(__int64x1_t_to_n64(lo), __int64x1_t_to_n64(hi)))
#define vcombine_p64(lo, hi) __n128_to_poly64x2_t(neon_combine(__poly64x1_t_to_n64(lo), __poly64x1_t_to_n64(hi)))
#define vcombine_f64(lo, hi) __n128_to_float64x2_t(neon_combine(__float64x1_t_to_n64(lo), __float64x1_t_to_n64(hi)))

// VCREATE
//
// vcreate takes one unsigned 64-bit integer and returns an __n64. The typed
// vcreate_<type> wrappers in this section forward their argument to it
// unchanged and convert the __n64 result to the requested 64-bit vector type.
__n64 vcreate(unsigned __int64 src);
// vcreate_<type>(bits): forward `bits` to vcreate() and convert the returned
// __n64 to the 64-bit vector type named in the macro.
#define vcreate_s8(bits) __n64_to_int8x8_t(vcreate(bits))
#define vcreate_s16(bits) __n64_to_int16x4_t(vcreate(bits))
#define vcreate_s32(bits) __n64_to_int32x2_t(vcreate(bits))
#define vcreate_s64(bits) __n64_to_int64x1_t(vcreate(bits))
#define vcreate_u8(bits) __n64_to_uint8x8_t(vcreate(bits))
#define vcreate_u16(bits) __n64_to_uint16x4_t(vcreate(bits))
#define vcreate_u32(bits) __n64_to_uint32x2_t(vcreate(bits))
#define vcreate_u64(bits) __n64_to_uint64x1_t(vcreate(bits))
#define vcreate_p64(bits) __n64_to_poly64x1_t(vcreate(bits))
#define vcreate_p16(bits) __n64_to_poly16x4_t(vcreate(bits))
#define vcreate_p8(bits) __n64_to_poly8x8_t(vcreate(bits))
#define vcreate_f32(bits) __n64_to_float32x2_t(vcreate(bits))
#define vcreate_f64(bits) __n64_to_float64x1_t(vcreate(bits))

#if !defined(_ARM64_DISTINCT_NEON_TYPES)
// Pass-through wrappers: arguments go to the neon_* function unchanged (only
// parenthesised) and the result is returned without any typed conversion.
// The f16 load/store forms additionally cast the pointer to __int16*.
#define vget_lane_f16(vec, lane)    neon_dups16((vec), (lane))
#define vgetq_lane_f16(vec, lane)   neon_dups16q((vec), (lane))
#define vmull_p64(lhs, rhs) neon_pmull_64((lhs), (rhs))
#define vmull_high_p64(lhs, rhs) neon_pmull2_64((lhs), (rhs))
#define vld1_dup_f16(ptr) neon_ld1r_16((__int16*)(ptr))
#define vld1_f16(ptr) neon_ld1m_16((__int16*)(ptr))
#define vld1_lane_f16(ptr, vec, lane) neon_ld1s_16((__int16*)(ptr), (vec), (lane))
#define vst1_f16(ptr, vec) neon_st1m_16((__int16*)(ptr), (vec))
#define vst1_lane_f16(ptr, vec, lane) neon_st1s_16((__int16*)(ptr), (vec), (lane))

// f16 <-> f32 conversions and f16 half-select / combine / create wrappers.
// vcvt_high_f16_f32 takes a 64-bit f16 vector first and a 128-bit f32 vector
// second; the vget/vcombine/vcreate forms mirror the other element types
// (neon_dups64q index 1 = high, 0 = low; neon_combine(low, high); vcreate).
#define vcvt_f32_f16(vec)               __n128_to_float32x4_t(neon_fcvtl_32(__float16x4_t_to_n64(vec)))
#define vcvt_high_f32_f16(vec)          __n128_to_float32x4_t(neon_fcvtl2_32(__float16x8_t_to_n128(vec)))
#define vcvt_f16_f32(vec)               __n64_to_float16x4_t(neon_fcvtn_32(__float32x4_t_to_n128(vec)))
#define vcvt_high_f16_f32(low, vec)     __n128_to_float16x8_t(neon_fcvtn2_32(__float16x4_t_to_n64(low), __float32x4_t_to_n128(vec)))
#define vget_high_f16(vec)              __n64_to_float16x4_t(neon_dups64q(__float16x8_t_to_n128(vec), 1))
#define vget_low_f16(vec)               __n64_to_float16x4_t(neon_dups64q(__float16x8_t_to_n128(vec), 0))
#define vcombine_f16(lo, hi)            __n128_to_float16x8_t(neon_combine(__float16x4_t_to_n64(lo), __float16x4_t_to_n64(hi)))
#define vcreate_f16(bits)               __n64_to_float16x4_t(vcreate(bits))
// One-operand f16 wrappers. Each converts `vec` to __n64 (float16x4_t /
// 16x4 integer types) or __n128 ("q" forms), calls the neon_* function named
// in the body, and converts the result to the vector type named in the macro.

// Absolute value.
#define vabs_f16(vec)                   __n64_to_float16x4_t(neon_fabs16(__float16x4_t_to_n64(vec)))
#define vabsq_f16(vec)                  __n128_to_float16x8_t(neon_fabsq16(__float16x8_t_to_n128(vec)))
// Compare against zero; results use unsigned 16-bit element vector types.
#define vceqz_f16(vec)                  __n64_to_uint16x4_t(neon_fcmeqz16(__float16x4_t_to_n64(vec)))
#define vceqzq_f16(vec)                 __n128_to_uint16x8_t(neon_fcmeqzq16(__float16x8_t_to_n128(vec)))
#define vcgez_f16(vec)                  __n64_to_uint16x4_t(neon_fcmgez16(__float16x4_t_to_n64(vec)))
#define vcgezq_f16(vec)                 __n128_to_uint16x8_t(neon_fcmgezq16(__float16x8_t_to_n128(vec)))
#define vcgtz_f16(vec)                  __n64_to_uint16x4_t(neon_fcmgtz16(__float16x4_t_to_n64(vec)))
#define vcgtzq_f16(vec)                 __n128_to_uint16x8_t(neon_fcmgtzq16(__float16x8_t_to_n128(vec)))
#define vclez_f16(vec)                  __n64_to_uint16x4_t(neon_fcmlez16(__float16x4_t_to_n64(vec)))
#define vclezq_f16(vec)                 __n128_to_uint16x8_t(neon_fcmlezq16(__float16x8_t_to_n128(vec)))
#define vcltz_f16(vec)                  __n64_to_uint16x4_t(neon_fcmltz16(__float16x4_t_to_n64(vec)))
#define vcltzq_f16(vec)                 __n128_to_uint16x8_t(neon_fcmltzq16(__float16x8_t_to_n128(vec)))
// Integer <-> f16 conversions.
#define vcvt_f16_s16(vec)               __n64_to_float16x4_t(neon_scvtf16(__int16x4_t_to_n64(vec)))
#define vcvt_f16_u16(vec)               __n64_to_float16x4_t(neon_ucvtf16(__uint16x4_t_to_n64(vec)))
#define vcvt_s16_f16(vec)               __n64_to_int16x4_t(neon_fcvtzs16(__float16x4_t_to_n64(vec)))
#define vcvt_u16_f16(vec)               __n64_to_uint16x4_t(neon_fcvtzu16(__float16x4_t_to_n64(vec)))
#define vcvtq_f16_s16(vec)              __n128_to_float16x8_t(neon_scvtfq16(__int16x8_t_to_n128(vec)))
#define vcvtq_f16_u16(vec)              __n128_to_float16x8_t(neon_ucvtfq16(__uint16x8_t_to_n128(vec)))
#define vcvtq_s16_f16(vec)              __n128_to_int16x8_t(neon_fcvtzsq16(__float16x8_t_to_n128(vec)))
#define vcvtq_u16_f16(vec)              __n128_to_uint16x8_t(neon_fcvtzuq16(__float16x8_t_to_n128(vec)))
#define vcvta_s16_f16(vec)              __n64_to_int16x4_t(neon_fcvtas16(__float16x4_t_to_n64(vec)))
#define vcvta_u16_f16(vec)              __n64_to_uint16x4_t(neon_fcvtau16(__float16x4_t_to_n64(vec)))
#define vcvtm_s16_f16(vec)              __n64_to_int16x4_t(neon_fcvtms16(__float16x4_t_to_n64(vec)))
#define vcvtm_u16_f16(vec)              __n64_to_uint16x4_t(neon_fcvtmu16(__float16x4_t_to_n64(vec)))
#define vcvtn_s16_f16(vec)              __n64_to_int16x4_t(neon_fcvtns16(__float16x4_t_to_n64(vec)))
#define vcvtn_u16_f16(vec)              __n64_to_uint16x4_t(neon_fcvtnu16(__float16x4_t_to_n64(vec)))
#define vcvtp_s16_f16(vec)              __n64_to_int16x4_t(neon_fcvtps16(__float16x4_t_to_n64(vec)))
#define vcvtp_u16_f16(vec)              __n64_to_uint16x4_t(neon_fcvtpu16(__float16x4_t_to_n64(vec)))
#define vcvtaq_s16_f16(vec)             __n128_to_int16x8_t(neon_fcvtasq16(__float16x8_t_to_n128(vec)))
#define vcvtaq_u16_f16(vec)             __n128_to_uint16x8_t(neon_fcvtauq16(__float16x8_t_to_n128(vec)))
#define vcvtmq_s16_f16(vec)             __n128_to_int16x8_t(neon_fcvtmsq16(__float16x8_t_to_n128(vec)))
#define vcvtmq_u16_f16(vec)             __n128_to_uint16x8_t(neon_fcvtmuq16(__float16x8_t_to_n128(vec)))
#define vcvtnq_s16_f16(vec)             __n128_to_int16x8_t(neon_fcvtnsq16(__float16x8_t_to_n128(vec)))
#define vcvtnq_u16_f16(vec)             __n128_to_uint16x8_t(neon_fcvtnuq16(__float16x8_t_to_n128(vec)))
#define vcvtpq_s16_f16(vec)             __n128_to_int16x8_t(neon_fcvtpsq16(__float16x8_t_to_n128(vec)))
#define vcvtpq_u16_f16(vec)             __n128_to_uint16x8_t(neon_fcvtpuq16(__float16x8_t_to_n128(vec)))
// Negate and reciprocal estimate.
#define vneg_f16(vec)                   __n64_to_float16x4_t(neon_fneg16(__float16x4_t_to_n64(vec)))
#define vnegq_f16(vec)                  __n128_to_float16x8_t(neon_fnegq16(__float16x8_t_to_n128(vec)))
#define vrecpe_f16(vec)                 __n64_to_float16x4_t(neon_frecpe16(__float16x4_t_to_n64(vec)))
#define vrecpeq_f16(vec)                __n128_to_float16x8_t(neon_frecpeq16(__float16x8_t_to_n128(vec)))
// Rounding: vrnd -> frintz, vrnd<x> -> frint<x>.
#define vrnd_f16(vec)                   __n64_to_float16x4_t(neon_frintz_16(__float16x4_t_to_n64(vec)))
#define vrndq_f16(vec)                  __n128_to_float16x8_t(neon_frintz_q16(__float16x8_t_to_n128(vec)))
#define vrnda_f16(vec)                  __n64_to_float16x4_t(neon_frinta_16(__float16x4_t_to_n64(vec)))
#define vrndaq_f16(vec)                 __n128_to_float16x8_t(neon_frinta_q16(__float16x8_t_to_n128(vec)))
#define vrndi_f16(vec)                  __n64_to_float16x4_t(neon_frinti_16(__float16x4_t_to_n64(vec)))
#define vrndiq_f16(vec)                 __n128_to_float16x8_t(neon_frinti_q16(__float16x8_t_to_n128(vec)))
#define vrndm_f16(vec)                  __n64_to_float16x4_t(neon_frintm_16(__float16x4_t_to_n64(vec)))
#define vrndmq_f16(vec)                 __n128_to_float16x8_t(neon_frintm_q16(__float16x8_t_to_n128(vec)))
#define vrndn_f16(vec)                  __n64_to_float16x4_t(neon_frintn_16(__float16x4_t_to_n64(vec)))
#define vrndnq_f16(vec)                 __n128_to_float16x8_t(neon_frintn_q16(__float16x8_t_to_n128(vec)))
#define vrndp_f16(vec)                  __n64_to_float16x4_t(neon_frintp_16(__float16x4_t_to_n64(vec)))
#define vrndpq_f16(vec)                 __n128_to_float16x8_t(neon_frintp_q16(__float16x8_t_to_n128(vec)))
#define vrndx_f16(vec)                  __n64_to_float16x4_t(neon_frintx_16(__float16x4_t_to_n64(vec)))
#define vrndxq_f16(vec)                 __n128_to_float16x8_t(neon_frintx_q16(__float16x8_t_to_n128(vec)))
// Square root and reciprocal square-root estimate.
#define vsqrt_f16(vec)                  __n64_to_float16x4_t(neon_fsqrt16(__float16x4_t_to_n64(vec)))
#define vsqrtq_f16(vec)                 __n128_to_float16x8_t(neon_fsqrtq16(__float16x8_t_to_n128(vec)))
#define vrsqrte_f16(vec)                __n64_to_float16x4_t(neon_frsqrte16(__float16x4_t_to_n64(vec)))
#define vrsqrteq_f16(vec)               __n128_to_float16x8_t(neon_frsqrteq16(__float16x8_t_to_n128(vec)))
// Two-operand f16 wrappers. Operands are converted to __n64 / __n128 ("q"
// forms), passed to the neon_* function named in the body, and the result is
// converted to the vector type named in the macro.

// Add and absolute difference.
#define vadd_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fadd16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vaddq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_faddq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vabd_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fabd16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vabdq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_fabdq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
// Absolute compares. The "le"/"lt" forms reuse the "ge"/"gt" functions with
// the operands swapped (rhs first).
#define vcage_f16(lhs, rhs)             __n64_to_uint16x4_t(neon_facge16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vcageq_f16(lhs, rhs)            __n128_to_uint16x8_t(neon_facgeq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vcagt_f16(lhs, rhs)             __n64_to_uint16x4_t(neon_facgt16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vcagtq_f16(lhs, rhs)            __n128_to_uint16x8_t(neon_facgtq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vcale_f16(lhs, rhs)             __n64_to_uint16x4_t(neon_facge16(__float16x4_t_to_n64(rhs), __float16x4_t_to_n64(lhs)))
#define vcaleq_f16(lhs, rhs)            __n128_to_uint16x8_t(neon_facgeq16(__float16x8_t_to_n128(rhs), __float16x8_t_to_n128(lhs)))
#define vcalt_f16(lhs, rhs)             __n64_to_uint16x4_t(neon_facgt16(__float16x4_t_to_n64(rhs), __float16x4_t_to_n64(lhs)))
#define vcaltq_f16(lhs, rhs)            __n128_to_uint16x8_t(neon_facgtq16(__float16x8_t_to_n128(rhs), __float16x8_t_to_n128(lhs)))
// Compares. As above, "le"/"lt" are "ge"/"gt" with the operands swapped.
#define vceq_f16(lhs, rhs)              __n64_to_uint16x4_t(neon_fcmeq16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vceqq_f16(lhs, rhs)             __n128_to_uint16x8_t(neon_fcmeqq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vcge_f16(lhs, rhs)              __n64_to_uint16x4_t(neon_fcmge16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vcgeq_f16(lhs, rhs)             __n128_to_uint16x8_t(neon_fcmgeq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vcgt_f16(lhs, rhs)              __n64_to_uint16x4_t(neon_fcmgt16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vcgtq_f16(lhs, rhs)             __n128_to_uint16x8_t(neon_fcmgtq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vcle_f16(lhs, rhs)              __n64_to_uint16x4_t(neon_fcmge16(__float16x4_t_to_n64(rhs), __float16x4_t_to_n64(lhs)))
#define vcleq_f16(lhs, rhs)             __n128_to_uint16x8_t(neon_fcmgeq16(__float16x8_t_to_n128(rhs), __float16x8_t_to_n128(lhs)))
#define vclt_f16(lhs, rhs)              __n64_to_uint16x4_t(neon_fcmgt16(__float16x4_t_to_n64(rhs), __float16x4_t_to_n64(lhs)))
#define vcltq_f16(lhs, rhs)             __n128_to_uint16x8_t(neon_fcmgtq16(__float16x8_t_to_n128(rhs), __float16x8_t_to_n128(lhs)))
// Conversions with an extra integer argument, forwarded as `(fbits)`.
#define vcvt_n_f16_s16(vec, fbits)      __n64_to_float16x4_t(neon_scvtffp16(__int16x4_t_to_n64(vec), (fbits)))
#define vcvt_n_f16_u16(vec, fbits)      __n64_to_float16x4_t(neon_ucvtffp16(__uint16x4_t_to_n64(vec), (fbits)))
#define vcvtq_n_f16_s16(vec, fbits)     __n128_to_float16x8_t(neon_scvtffpq16(__int16x8_t_to_n128(vec), (fbits)))
#define vcvtq_n_f16_u16(vec, fbits)     __n128_to_float16x8_t(neon_ucvtffpq16(__uint16x8_t_to_n128(vec), (fbits)))
#define vcvt_n_s16_f16(vec, fbits)      __n64_to_int16x4_t(neon_fcvtzsfp16(__float16x4_t_to_n64(vec), (fbits)))
#define vcvt_n_u16_f16(vec, fbits)      __n64_to_uint16x4_t(neon_fcvtzufp16(__float16x4_t_to_n64(vec), (fbits)))
#define vcvtq_n_s16_f16(vec, fbits)     __n128_to_int16x8_t(neon_fcvtzsfpq16(__float16x8_t_to_n128(vec), (fbits)))
#define vcvtq_n_u16_f16(vec, fbits)     __n128_to_uint16x8_t(neon_fcvtzufpq16(__float16x8_t_to_n128(vec), (fbits)))
// Divide, max/min, multiply.
#define vdiv_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fdiv16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vdivq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_fdivq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vmax_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fmax16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vmaxnm_f16(lhs, rhs)            __n64_to_float16x4_t(neon_fmaxnm16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vmaxq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_fmaxq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vmaxnmq_f16(lhs, rhs)           __n128_to_float16x8_t(neon_fmaxnmq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vmin_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fmin16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vminnm_f16(lhs, rhs)            __n64_to_float16x4_t(neon_fminnm16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vminq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_fminq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vminnmq_f16(lhs, rhs)           __n128_to_float16x8_t(neon_fminnmq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vmul_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fmul16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vmulq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_fmulq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vmulx_f16(lhs, rhs)             __n64_to_float16x4_t(neon_fmulx16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vmulxq_f16(lhs, rhs)            __n128_to_float16x8_t(neon_fmulxq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
// Pairwise add / max / min.
#define vpadd_f16(lhs, rhs)             __n64_to_float16x4_t(neon_faddp16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vpaddq_f16(lhs, rhs)            __n128_to_float16x8_t(neon_faddpq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vpmax_f16(lhs, rhs)             __n64_to_float16x4_t(neon_fmaxp16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vpmaxq_f16(lhs, rhs)            __n128_to_float16x8_t(neon_fmaxpq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vpmaxnm_f16(lhs, rhs)           __n64_to_float16x4_t(neon_fmaxnmp16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vpmaxnmq_f16(lhs, rhs)          __n128_to_float16x8_t(neon_fmaxnmpq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vpmin_f16(lhs, rhs)             __n64_to_float16x4_t(neon_fminp16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vpminq_f16(lhs, rhs)            __n128_to_float16x8_t(neon_fminpq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vpminnm_f16(lhs, rhs)           __n64_to_float16x4_t(neon_fminnmp16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vpminnmq_f16(lhs, rhs)          __n128_to_float16x8_t(neon_fminnmpq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
// Reciprocal / reciprocal-sqrt step and subtract.
#define vrecps_f16(lhs, rhs)            __n64_to_float16x4_t(neon_frecps16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vrecpsq_f16(lhs, rhs)           __n128_to_float16x8_t(neon_frecpsq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vrsqrts_f16(lhs, rhs)           __n64_to_float16x4_t(neon_frsqrts16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vrsqrtsq_f16(lhs, rhs)          __n128_to_float16x8_t(neon_frsqrtsq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vsub_f16(lhs, rhs)              __n64_to_float16x4_t(neon_fsub16(__float16x4_t_to_n64(lhs), __float16x4_t_to_n64(rhs)))
#define vsubq_f16(lhs, rhs)             __n128_to_float16x8_t(neon_fsubq16(__float16x8_t_to_n128(lhs), __float16x8_t_to_n128(rhs)))
#define vfma_f16(src1, src2, src3)      __n64_to_float16x4_t(neon_fmla16(__float16x4_t_to_n64(src1), __float16x4_t_to_n64(src2), __float16x4_t_to_n64(src3)))
#define vfmaq_f16(src1, src2, src3)     __n128_to_float16x8_t(neon_fmlaq16(__float16x8_t_to_n128(src1), __float16x8_t_to_n128(src2), __float16x8_t_to_n128(src3)))
#define vfms_f16(src1, src2, src3)      __n64_to_float16x4_t(neon_fmls16(__float16x4_t_to_n64(src1), __float16x4_t_to_n64(src2), __float16x4_t_to_n64(src3)))
#define vfmsq_f16(src1, src2, src3)     __n128_to_float16x8_t(neon_fmlsq16(__float16x8_t_to_n128(src1), __float16x8_t_to_n128(src2), __float16x8_t_to_n128(src3)))
// f16 vfma/vfms "by lane" wrappers. The last macro argument is forwarded to
// the neon_fml{a,s}[q]vind16[q] intrinsic as (lane). The third operand is
// converted with __float16x4_t_to_n64 in the _lane forms and with
// __float16x8_t_to_n128 in the _laneq forms.
#define vfma_lane_f16(a, b, v, lane) \
    __n64_to_float16x4_t(neon_fmlavind16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(v), (lane)))
#define vfmaq_lane_f16(a, b, v, lane) \
    __n128_to_float16x8_t(neon_fmlaqvind16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(v), (lane)))
#define vfma_laneq_f16(a, b, v, lane) \
    __n64_to_float16x4_t(neon_fmlavind16q(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(v), (lane)))
#define vfmaq_laneq_f16(a, b, v, lane) \
    __n128_to_float16x8_t(neon_fmlaqvind16q(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(v), (lane)))
#define vfms_lane_f16(a, b, v, lane) \
    __n64_to_float16x4_t(neon_fmlsvind16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(v), (lane)))
#define vfms_laneq_f16(a, b, v, lane) \
    __n64_to_float16x4_t(neon_fmlsvind16q(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(v), (lane)))
#define vfmsq_lane_f16(a, b, v, lane) \
    __n128_to_float16x8_t(neon_fmlsqvind16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(v), (lane)))
#define vfmsq_laneq_f16(a, b, v, lane) \
    __n128_to_float16x8_t(neon_fmlsqvind16q(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(v), (lane)))
// f16 vmul/vmulx "by lane" wrappers around neon_fmul[x][q]vind16[q].
// The second operand is converted with __float16x4_t_to_n64 in the _lane
// forms and with __float16x8_t_to_n128 in the _laneq forms; (lane) is passed
// through as the last intrinsic argument.
#define vmul_lane_f16(a, v, lane) \
    __n64_to_float16x4_t(neon_fmulvind16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(v), (lane)))
#define vmulq_lane_f16(a, v, lane) \
    __n128_to_float16x8_t(neon_fmulqvind16(__float16x8_t_to_n128(a), __float16x4_t_to_n64(v), (lane)))
#define vmul_laneq_f16(a, v, lane) \
    __n64_to_float16x4_t(neon_fmulvind16q(__float16x4_t_to_n64(a), __float16x8_t_to_n128(v), (lane)))
#define vmulq_laneq_f16(a, v, lane) \
    __n128_to_float16x8_t(neon_fmulqvind16q(__float16x8_t_to_n128(a), __float16x8_t_to_n128(v), (lane)))
#define vmulx_lane_f16(a, v, lane) \
    __n64_to_float16x4_t(neon_fmulxvind16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(v), (lane)))
#define vmulxq_lane_f16(a, v, lane) \
    __n128_to_float16x8_t(neon_fmulxqvind16(__float16x8_t_to_n128(a), __float16x4_t_to_n64(v), (lane)))
#define vmulx_laneq_f16(a, v, lane) \
    __n64_to_float16x4_t(neon_fmulxvind16q(__float16x4_t_to_n64(a), __float16x8_t_to_n128(v), (lane)))
#define vmulxq_laneq_f16(a, v, lane) \
    __n128_to_float16x8_t(neon_fmulxqvind16q(__float16x8_t_to_n128(a), __float16x8_t_to_n128(v), (lane)))
// f16 vbsl wrappers around neon_bsl / neon_bslq. The first operand is
// converted with the __uint16x4_t / __uint16x8_t helper, the other two with
// the __float16x4_t / __float16x8_t helpers.
#define vbsl_f16(mask, a, b) \
    __n64_to_float16x4_t(neon_bsl(__uint16x4_t_to_n64(mask), __float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vbslq_f16(mask, a, b) \
    __n128_to_float16x8_t(neon_bslq(__uint16x8_t_to_n128(mask), __float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
// f16 vzip/vuzp/vtrn wrappers: the neon_{zip,uzp,trn}_[q]16 result is passed
// through the __n64x2 / __n128x2 "x2" conversion helper.
#define vzip_f16(a, b) \
    __n64x2_to_float16x4x2_t(neon_zip_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vzipq_f16(a, b) \
    __n128x2_to_float16x8x2_t(neon_zip_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vuzp_f16(a, b) \
    __n64x2_to_float16x4x2_t(neon_uzp_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vuzpq_f16(a, b) \
    __n128x2_to_float16x8x2_t(neon_uzp_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vtrn_f16(a, b) \
    __n64x2_to_float16x4x2_t(neon_trn_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vtrnq_f16(a, b) \
    __n128x2_to_float16x8x2_t(neon_trn_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))

// f16 vdup_lane / vdupq_lane: both take a float16x4_t-converted source.
#define vdup_lane_f16(vec, idx) \
    __n64_to_float16x4_t(neon_dupe16(__float16x4_t_to_n64(vec), (idx)))
#define vdupq_lane_f16(vec, idx) \
    __n128_to_float16x8_t(neon_dupqe16(__float16x4_t_to_n64(vec), (idx)))

// f16 vext and vrev64 wrappers.
#define vext_f16(a, b, n) \
    __n64_to_float16x4_t(neon_ext16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), (n)))
#define vextq_f16(a, b, n) \
    __n128_to_float16x8_t(neon_extq16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), (n)))
#define vrev64_f16(v) \
    __n64_to_float16x4_t(neon_rev64_16(__float16x4_t_to_n64(v)))
#define vrev64q_f16(v) \
    __n128_to_float16x8_t(neon_rev64q_16(__float16x8_t_to_n128(v)))
// f16 single-result permute wrappers (zip1/zip2, uzp1/uzp2, trn1/trn2), each
// forwarding to the like-named neon_*_16 / neon_*_q16 intrinsic.
#define vzip1_f16(a, b) \
    __n64_to_float16x4_t(neon_zip1_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vzip1q_f16(a, b) \
    __n128_to_float16x8_t(neon_zip1_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vzip2_f16(a, b) \
    __n64_to_float16x4_t(neon_zip2_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vzip2q_f16(a, b) \
    __n128_to_float16x8_t(neon_zip2_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vuzp1_f16(a, b) \
    __n64_to_float16x4_t(neon_uzp1_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vuzp1q_f16(a, b) \
    __n128_to_float16x8_t(neon_uzp1_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vuzp2_f16(a, b) \
    __n64_to_float16x4_t(neon_uzp2_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vuzp2q_f16(a, b) \
    __n128_to_float16x8_t(neon_uzp2_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vtrn1_f16(a, b) \
    __n64_to_float16x4_t(neon_trn1_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vtrn1q_f16(a, b) \
    __n128_to_float16x8_t(neon_trn1_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))
#define vtrn2_f16(a, b) \
    __n64_to_float16x4_t(neon_trn2_16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b)))
#define vtrn2q_f16(a, b) \
    __n128_to_float16x8_t(neon_trn2_q16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b)))

// f16 vdup_laneq / vdupq_laneq: both take a float16x8_t-converted source.
#define vdup_laneq_f16(vec, idx) \
    __n64_to_float16x4_t(neon_dupe16q(__float16x8_t_to_n128(vec), (idx)))
#define vdupq_laneq_f16(vec, idx) \
    __n128_to_float16x8_t(neon_dupqe16q(__float16x8_t_to_n128(vec), (idx)))
// vfmlal/vfmlsl low/high wrappers. The first operand and the result use the
// float32x2_t / float32x4_t conversion helpers; the other two operands use
// the float16x4_t / float16x8_t helpers. The _high forms map to the
// neon_fml{a,s}l2_* intrinsics.
#define vfmlal_low_f16(a, b, c) \
    __n64_to_float32x2_t(neon_fmlal_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c)))
#define vfmlsl_low_f16(a, b, c) \
    __n64_to_float32x2_t(neon_fmlsl_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c)))
#define vfmlalq_low_f16(a, b, c) \
    __n128_to_float32x4_t(neon_fmlal_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c)))
#define vfmlslq_low_f16(a, b, c) \
    __n128_to_float32x4_t(neon_fmlsl_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c)))
#define vfmlal_high_f16(a, b, c) \
    __n64_to_float32x2_t(neon_fmlal2_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c)))
#define vfmlsl_high_f16(a, b, c) \
    __n64_to_float32x2_t(neon_fmlsl2_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c)))
#define vfmlalq_high_f16(a, b, c) \
    __n128_to_float32x4_t(neon_fmlal2_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c)))
#define vfmlslq_high_f16(a, b, c) \
    __n128_to_float32x4_t(neon_fmlsl2_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c)))
// vfmlal/vfmlsl "by lane" wrappers, low and high halves.
// Operand conversions: first operand float32x2_t/float32x4_t, second operand
// float16x4_t/float16x8_t, third operand float16x4_t for _lane and
// float16x8_t for _laneq. (lane) is the last intrinsic argument.

// --- low ---
#define vfmlal_lane_low_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlalvind_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlal_laneq_low_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlalvind_16q(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(v), (lane)))
#define vfmlalq_lane_low_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlalqvind_16(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlalq_laneq_low_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlalqvind_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(v), (lane)))
#define vfmlsl_lane_low_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlslvind_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlsl_laneq_low_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlslvind_16q(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(v), (lane)))
#define vfmlslq_lane_low_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlslqvind_16(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlslq_laneq_low_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlslqvind_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(v), (lane)))

// --- high ---
#define vfmlal_lane_high_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlal2vind_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlal_laneq_high_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlal2vind_16q(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(v), (lane)))
#define vfmlalq_lane_high_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlal2qvind_16(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlalq_laneq_high_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlal2qvind_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(v), (lane)))
#define vfmlsl_lane_high_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlsl2vind_16(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlsl_laneq_high_f16(a, b, v, lane) \
    __n64_to_float32x2_t(neon_fmlsl2vind_16q(__float32x2_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(v), (lane)))
#define vfmlslq_lane_high_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlsl2qvind_16(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(v), (lane)))
#define vfmlslq_laneq_high_f16(a, b, v, lane) \
    __n128_to_float32x4_t(neon_fmlsl2qvind_16q(__float32x4_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(v), (lane)))

// f16 vcadd wrappers: the rotation named in the macro (90 or 270) is passed
// as the literal last argument of neon_fcadd_f16 / neon_fcaddq_f16.
#define vcadd_rot90_f16(a, b) \
    __n64_to_float16x4_t(neon_fcadd_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), 90))
#define vcaddq_rot90_f16(a, b) \
    __n128_to_float16x8_t(neon_fcaddq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), 90))
#define vcadd_rot270_f16(a, b) \
    __n64_to_float16x4_t(neon_fcadd_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), 270))
#define vcaddq_rot270_f16(a, b) \
    __n128_to_float16x8_t(neon_fcaddq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), 270))
// f16 vcmla wrappers. For each rotation (0 for the unsuffixed names, then 90,
// 180, 270) there are six forms: plain / _lane / _laneq, each in a 64-bit
// and a q flavour. The rotation is the literal last intrinsic argument; in
// the lane forms (lane) is passed immediately before it. The third operand
// is converted as float16x4_t for _lane and as float16x8_t for _laneq.

// --- rotation 0 ---
#define vcmla_f16(a, b, c) \
    __n64_to_float16x4_t(neon_fcmla_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), 0))
#define vcmla_lane_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_lane_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), (lane), 0))
#define vcmla_laneq_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_laneq_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(c), (lane), 0))
#define vcmlaq_f16(a, b, c) \
    __n128_to_float16x8_t(neon_fcmlaq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), 0))
#define vcmlaq_lane_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_lane_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(c), (lane), 0))
#define vcmlaq_laneq_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_laneq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), (lane), 0))

// --- rotation 90 ---
#define vcmla_rot90_f16(a, b, c) \
    __n64_to_float16x4_t(neon_fcmla_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), 90))
#define vcmla_rot90_lane_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_lane_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), (lane), 90))
#define vcmla_rot90_laneq_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_laneq_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(c), (lane), 90))
#define vcmlaq_rot90_f16(a, b, c) \
    __n128_to_float16x8_t(neon_fcmlaq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), 90))
#define vcmlaq_rot90_lane_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_lane_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(c), (lane), 90))
#define vcmlaq_rot90_laneq_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_laneq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), (lane), 90))

// --- rotation 180 ---
#define vcmla_rot180_f16(a, b, c) \
    __n64_to_float16x4_t(neon_fcmla_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), 180))
#define vcmla_rot180_lane_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_lane_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), (lane), 180))
#define vcmla_rot180_laneq_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_laneq_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(c), (lane), 180))
#define vcmlaq_rot180_f16(a, b, c) \
    __n128_to_float16x8_t(neon_fcmlaq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), 180))
#define vcmlaq_rot180_lane_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_lane_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(c), (lane), 180))
#define vcmlaq_rot180_laneq_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_laneq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), (lane), 180))

// --- rotation 270 ---
#define vcmla_rot270_f16(a, b, c) \
    __n64_to_float16x4_t(neon_fcmla_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), 270))
#define vcmla_rot270_lane_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_lane_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x4_t_to_n64(c), (lane), 270))
#define vcmla_rot270_laneq_f16(a, b, c, lane) \
    __n64_to_float16x4_t(neon_fcmla_laneq_f16(__float16x4_t_to_n64(a), __float16x4_t_to_n64(b), __float16x8_t_to_n128(c), (lane), 270))
#define vcmlaq_rot270_f16(a, b, c) \
    __n128_to_float16x8_t(neon_fcmlaq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), 270))
#define vcmlaq_rot270_lane_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_lane_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x4_t_to_n64(c), (lane), 270))
#define vcmlaq_rot270_laneq_f16(a, b, c, lane) \
    __n128_to_float16x8_t(neon_fcmlaq_laneq_f16(__float16x8_t_to_n128(a), __float16x8_t_to_n128(b), __float16x8_t_to_n128(c), (lane), 270))

// needs convert:
// 32x2 32x4

#endif  /* !_ARM64_DISTINCT_NEON_TYPES */

#if defined(_ARM64_EXTENDED_INTRINSICS)
// movs<bits>[q]: two-argument aliases for the neon_dups<bits>[q] intrinsics.
#define movs8(vec, idx)     neon_dups8((vec), (idx))
#define movs16(vec, idx)    neon_dups16((vec), (idx))
#define movs32(vec, idx)    neon_dups32((vec), (idx))
#define movs64(vec, idx)    neon_dups64((vec), (idx))
#define movs8q(vec, idx)    neon_dups8q((vec), (idx))
#define movs16q(vec, idx)   neon_dups16q((vec), (idx))
#define movs32q(vec, idx)   neon_dups32q((vec), (idx))
#define movs64q(vec, idx)   neon_dups64q((vec), (idx))
// movr* / movqr*: three-argument aliases for the neon_insr* / neon_insqr*
// intrinsics. Arguments are forwarded in order, each parenthesised.
#define movr8(vreg, idx, creg)      neon_insr8((vreg), (idx), (creg))
#define movr16(vreg, idx, creg)     neon_insr16((vreg), (idx), (creg))
#define movr32(vreg, idx, creg)     neon_insr32((vreg), (idx), (creg))
#define movr64(vreg, idx, creg)     neon_insr64((vreg), (idx), (creg))
#define movrf32(vreg, idx, creg)    neon_insrf32((vreg), (idx), (creg))
#define movrf64(vreg, idx, creg)    neon_insrf64((vreg), (idx), (creg))
#define movqr8(vreg, idx, creg)     neon_insqr8((vreg), (idx), (creg))
#define movqr16(vreg, idx, creg)    neon_insqr16((vreg), (idx), (creg))
#define movqr32(vreg, idx, creg)    neon_insqr32((vreg), (idx), (creg))
#define movqr64(vreg, idx, creg)    neon_insqr64((vreg), (idx), (creg))
#define movqrf32(vreg, idx, creg)   neon_insqrf32((vreg), (idx), (creg))
#define movqrf64(vreg, idx, creg)   neon_insqrf64((vreg), (idx), (creg))
// move* / movqe*: four-argument aliases for the neon_ins[q]e<bits>[q]
// intrinsics. All four macro arguments are forwarded in order:
//   opeqneonreg - first intrinsic argument
//   laneDst     - second intrinsic argument
//   neonSrc     - third intrinsic argument
//   laneSrc     - fourth intrinsic argument
// The expansion must spell the first parameter exactly as `opeqneonreg`.
// A misspelling there is not substituted by the preprocessor, so the caller's
// first argument would be dropped and the expansion would refer to whatever
// identifier of the misspelled name is (or is not) in scope at the call site.
#define move8(opeqneonreg, laneDst, neonSrc, laneSrc)    neon_inse8((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe8(opeqneonreg, laneDst, neonSrc, laneSrc)   neon_insqe8((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move8q(opeqneonreg, laneDst, neonSrc, laneSrc)   neon_inse8q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe8q(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_insqe8q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move16(opeqneonreg, laneDst, neonSrc, laneSrc)   neon_inse16((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe16(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_insqe16((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move16q(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_inse16q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe16q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe16q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move32(opeqneonreg, laneDst, neonSrc, laneSrc)   neon_inse32((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe32(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_insqe32((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move32q(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_inse32q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe32q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe32q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move64(opeqneonreg, laneDst, neonSrc, laneSrc)   neon_inse64((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe64(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_insqe64((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define move64q(opeqneonreg, laneDst, neonSrc, laneSrc)  neon_inse64q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
#define movqe64q(opeqneonreg, laneDst, neonSrc, laneSrc) neon_insqe64q((opeqneonreg), (laneDst), (neonSrc), (laneSrc))
// mvn / vmvn_p16 aliases: all four names expand to neon_not (64-bit forms)
// or neon_notq (q forms) applied to the single argument.
#define mvn(v)          neon_not(v)
#define mvnq(v)         neon_notq(v)
#define vmvn_p16(v)     neon_not(v)
#define vmvnq_p16(v)    neon_notq(v)
// AES aliases. The _p8/_s8/_u8 suffixes of each name all expand to the same
// untyped intrinsic: neon_aese, neon_aesd (two operands) and neon_aesmc,
// neon_aesimc (one operand).
#define aese_p8(a, b)   neon_aese((a), (b))
#define aese_s8(a, b)   neon_aese((a), (b))
#define aese_u8(a, b)   neon_aese((a), (b))
#define aesd_p8(a, b)   neon_aesd((a), (b))
#define aesd_s8(a, b)   neon_aesd((a), (b))
#define aesd_u8(a, b)   neon_aesd((a), (b))
#define aesmc_p8(v)     neon_aesmc(v)
#define aesmc_s8(v)     neon_aesmc(v)
#define aesmc_u8(v)     neon_aesmc(v)
#define aesimc_p8(v)    neon_aesimc(v)
#define aesimc_s8(v)    neon_aesimc(v)
#define aesimc_u8(v)    neon_aesimc(v)
// vbif / vbit aliases. Every element-type suffix expands to the same untyped
// intrinsic: neon_bif / neon_bifq and neon_bit / neon_bitq, with the three
// arguments forwarded in order.

// vbif (64-bit forms)
#define vbif_s8(a, b, c)    neon_bif((a), (b), (c))
#define vbif_u8(a, b, c)    neon_bif((a), (b), (c))
#define vbif_s16(a, b, c)   neon_bif((a), (b), (c))
#define vbif_u16(a, b, c)   neon_bif((a), (b), (c))
#define vbif_s32(a, b, c)   neon_bif((a), (b), (c))
#define vbif_u32(a, b, c)   neon_bif((a), (b), (c))
#define vbif_s64(a, b, c)   neon_bif((a), (b), (c))
#define vbif_u64(a, b, c)   neon_bif((a), (b), (c))
// vbifq (q forms)
#define vbifq_s8(a, b, c)   neon_bifq((a), (b), (c))
#define vbifq_u8(a, b, c)   neon_bifq((a), (b), (c))
#define vbifq_s16(a, b, c)  neon_bifq((a), (b), (c))
#define vbifq_u16(a, b, c)  neon_bifq((a), (b), (c))
#define vbifq_s32(a, b, c)  neon_bifq((a), (b), (c))
#define vbifq_u32(a, b, c)  neon_bifq((a), (b), (c))
#define vbifq_s64(a, b, c)  neon_bifq((a), (b), (c))
#define vbifq_u64(a, b, c)  neon_bifq((a), (b), (c))
// vbit (64-bit forms)
#define vbit_s8(a, b, c)    neon_bit((a), (b), (c))
#define vbit_u8(a, b, c)    neon_bit((a), (b), (c))
#define vbit_s16(a, b, c)   neon_bit((a), (b), (c))
#define vbit_u16(a, b, c)   neon_bit((a), (b), (c))
#define vbit_s32(a, b, c)   neon_bit((a), (b), (c))
#define vbit_u32(a, b, c)   neon_bit((a), (b), (c))
#define vbit_s64(a, b, c)   neon_bit((a), (b), (c))
#define vbit_u64(a, b, c)   neon_bit((a), (b), (c))
// vbitq (q forms)
#define vbitq_s8(a, b, c)   neon_bitq((a), (b), (c))
#define vbitq_u8(a, b, c)   neon_bitq((a), (b), (c))
#define vbitq_s16(a, b, c)  neon_bitq((a), (b), (c))
#define vbitq_u16(a, b, c)  neon_bitq((a), (b), (c))
#define vbitq_s32(a, b, c)  neon_bitq((a), (b), (c))
#define vbitq_u32(a, b, c)  neon_bitq((a), (b), (c))
#define vbitq_s64(a, b, c)  neon_bitq((a), (b), (c))
#define vbitq_u64(a, b, c)  neon_bitq((a), (b), (c))
// vmullq_p8 / vmullq_p64 forward to neon_pmull_q8 / neon_pmull_q64;
// vmlaq_laneq_f64 forwards its four arguments to neon_fmlaqvind64q.
#define vmullq_p8(a, b)                 neon_pmull_q8((a), (b))
#define vmullq_p64(a, b)                neon_pmull_q64((a), (b))
#define vmlaq_laneq_f64(a, b, v, lane)  neon_fmlaqvind64q((a), (b), (v), (lane))
// Integer vfma/vfms "by lane" aliases. These names forward to the
// neon_mla*/neon_mls* "vind" intrinsics (no 'f' prefix); the signed and
// unsigned names of a given width share one intrinsic.
#define vfma_lane_s16(a, b, v, lane)    neon_mlavind16((a), (b), (v), (lane))
#define vfma_lane_s32(a, b, v, lane)    neon_mlavind32((a), (b), (v), (lane))
#define vfma_lane_u16(a, b, v, lane)    neon_mlavind16((a), (b), (v), (lane))
#define vfma_lane_u32(a, b, v, lane)    neon_mlavind32((a), (b), (v), (lane))
#define vfms_lane_s16(a, b, v, lane)    neon_mlsvind16((a), (b), (v), (lane))
#define vfms_lane_s32(a, b, v, lane)    neon_mlsvind32((a), (b), (v), (lane))
#define vfms_lane_u16(a, b, v, lane)    neon_mlsvind16((a), (b), (v), (lane))
#define vfms_lane_u32(a, b, v, lane)    neon_mlsvind32((a), (b), (v), (lane))
#define vfmaq_lane_s16(a, b, v, lane)   neon_mlaqvind16((a), (b), (v), (lane))
#define vfmaq_lane_s32(a, b, v, lane)   neon_mlaqvind32((a), (b), (v), (lane))
#define vfmaq_lane_u16(a, b, v, lane)   neon_mlaqvind16((a), (b), (v), (lane))
#define vfmaq_lane_u32(a, b, v, lane)   neon_mlaqvind32((a), (b), (v), (lane))
#define vfmsq_lane_s16(a, b, v, lane)   neon_mlsqvind16((a), (b), (v), (lane))
#define vfmsq_lane_s32(a, b, v, lane)   neon_mlsqvind32((a), (b), (v), (lane))
#define vfmsq_lane_u16(a, b, v, lane)   neon_mlsqvind16((a), (b), (v), (lane))
#define vfmsq_lane_u32(a, b, v, lane)   neon_mlsqvind32((a), (b), (v), (lane))
// Integer vfma/vfms aliases. These names forward to neon_mla*/neon_mls*
// (64-bit forms) and neon_mlaq*/neon_mlsq* (q forms); the signed and
// unsigned names of a given width share one intrinsic.
#define vfma_s16(a, b, c)   neon_mla16((a), (b), (c))
#define vfma_s32(a, b, c)   neon_mla32((a), (b), (c))
#define vfma_s8(a, b, c)    neon_mla8((a), (b), (c))
#define vfma_u16(a, b, c)   neon_mla16((a), (b), (c))
#define vfma_u32(a, b, c)   neon_mla32((a), (b), (c))
#define vfma_u8(a, b, c)    neon_mla8((a), (b), (c))
#define vfms_s16(a, b, c)   neon_mls16((a), (b), (c))
#define vfms_s32(a, b, c)   neon_mls32((a), (b), (c))
#define vfms_s8(a, b, c)    neon_mls8((a), (b), (c))
#define vfms_u16(a, b, c)   neon_mls16((a), (b), (c))
#define vfms_u32(a, b, c)   neon_mls32((a), (b), (c))
#define vfms_u8(a, b, c)    neon_mls8((a), (b), (c))
#define vfmaq_s16(a, b, c)  neon_mlaq16((a), (b), (c))
#define vfmaq_s32(a, b, c)  neon_mlaq32((a), (b), (c))
#define vfmaq_s8(a, b, c)   neon_mlaq8((a), (b), (c))
#define vfmaq_u16(a, b, c)  neon_mlaq16((a), (b), (c))
#define vfmaq_u32(a, b, c)  neon_mlaq32((a), (b), (c))
#define vfmaq_u8(a, b, c)   neon_mlaq8((a), (b), (c))
#define vfmsq_s16(a, b, c)  neon_mlsq16((a), (b), (c))
#define vfmsq_s32(a, b, c)  neon_mlsq32((a), (b), (c))
#define vfmsq_s8(a, b, c)   neon_mlsq8((a), (b), (c))
#define vfmsq_u16(a, b, c)  neon_mlsq16((a), (b), (c))
#define vfmsq_u32(a, b, c)  neon_mlsq32((a), (b), (c))
#define vfmsq_u8(a, b, c)   neon_mlsq8((a), (b), (c))
// vceq[q]_z_*_ex: one-operand aliases for the neon_[f]cmeqz* intrinsics.
// The f32 names use the fcmeqz form; the signed and unsigned integer names
// of a given width share one cmeqz intrinsic.
#define vceq_z_f32_ex(v)    neon_fcmeqz32(v)
#define vceq_z_s16_ex(v)    neon_cmeqz16(v)
#define vceq_z_s32_ex(v)    neon_cmeqz32(v)
#define vceq_z_s8_ex(v)     neon_cmeqz8(v)
#define vceq_z_u16_ex(v)    neon_cmeqz16(v)
#define vceq_z_u32_ex(v)    neon_cmeqz32(v)
#define vceq_z_u8_ex(v)     neon_cmeqz8(v)
#define vceqq_z_f32_ex(v)   neon_fcmeqzq32(v)
#define vceqq_z_s16_ex(v)   neon_cmeqzq16(v)
#define vceqq_z_s32_ex(v)   neon_cmeqzq32(v)
#define vceqq_z_s8_ex(v)    neon_cmeqzq8(v)
#define vceqq_z_u16_ex(v)   neon_cmeqzq16(v)
#define vceqq_z_u32_ex(v)   neon_cmeqzq32(v)
#define vceqq_z_u8_ex(v)    neon_cmeqzq8(v)
// vcge[q]_z_*_ex: one-operand aliases for the neon_[f]cmgez* intrinsics.
// NOTE(review): the _u* names expand to the same neon_cmgez* intrinsic as
// the _s* names - confirm that sharing is intended for unsigned inputs.
#define vcge_z_f32_ex(v)    neon_fcmgez32(v)
#define vcge_z_s16_ex(v)    neon_cmgez16(v)
#define vcge_z_s32_ex(v)    neon_cmgez32(v)
#define vcge_z_s8_ex(v)     neon_cmgez8(v)
#define vcge_z_u16_ex(v)    neon_cmgez16(v)
#define vcge_z_u32_ex(v)    neon_cmgez32(v)
#define vcge_z_u8_ex(v)     neon_cmgez8(v)
#define vcgeq_z_f32_ex(v)   neon_fcmgezq32(v)
#define vcgeq_z_s16_ex(v)   neon_cmgezq16(v)
#define vcgeq_z_s32_ex(v)   neon_cmgezq32(v)
#define vcgeq_z_s8_ex(v)    neon_cmgezq8(v)
#define vcgeq_z_u16_ex(v)   neon_cmgezq16(v)
#define vcgeq_z_u32_ex(v)   neon_cmgezq32(v)
#define vcgeq_z_u8_ex(v)    neon_cmgezq8(v)
// Two-operand p8 compares: operands forwarded in order to neon_cmge[q]8.
#define vcge_p8(a, b)       neon_cmge8((a), (b))
#define vcgeq_p8(a, b)      neon_cmgeq8((a), (b))
// vcle[q]_z_*_ex: one-operand aliases for the neon_[f]cmlez* intrinsics.
#define vcle_z_f32_ex(v)    neon_fcmlez32(v)
#define vcle_z_s16_ex(v)    neon_cmlez16(v)
#define vcle_z_s32_ex(v)    neon_cmlez32(v)
#define vcle_z_s8_ex(v)     neon_cmlez8(v)
#define vcleq_z_f32_ex(v)   neon_fcmlezq32(v)
#define vcleq_z_s16_ex(v)   neon_cmlezq16(v)
#define vcleq_z_s32_ex(v)   neon_cmlezq32(v)
#define vcleq_z_s8_ex(v)    neon_cmlezq8(v)
// Two-operand p8 "less or equal": expressed as neon_cmge[q]8 with the two
// operands swapped.
#define vcle_p8(a, b)       neon_cmge8((b), (a))
#define vcleq_p8(a, b)      neon_cmgeq8((b), (a))
// vcgt[q]_z_*_ex: one-operand aliases for the neon_[f]cmgtz* intrinsics.
// NOTE(review): the _u* names expand to the same neon_cmgtz* intrinsic as
// the _s* names - confirm that sharing is intended for unsigned inputs.
#define vcgt_z_f32_ex(v)    neon_fcmgtz32(v)
#define vcgt_z_s16_ex(v)    neon_cmgtz16(v)
#define vcgt_z_s32_ex(v)    neon_cmgtz32(v)
#define vcgt_z_s8_ex(v)     neon_cmgtz8(v)
#define vcgt_z_u16_ex(v)    neon_cmgtz16(v)
#define vcgt_z_u32_ex(v)    neon_cmgtz32(v)
#define vcgt_z_u8_ex(v)     neon_cmgtz8(v)
#define vcgtq_z_f32_ex(v)   neon_fcmgtzq32(v)
#define vcgtq_z_s16_ex(v)   neon_cmgtzq16(v)
#define vcgtq_z_s32_ex(v)   neon_cmgtzq32(v)
#define vcgtq_z_s8_ex(v)    neon_cmgtzq8(v)
#define vcgtq_z_u16_ex(v)   neon_cmgtzq16(v)
#define vcgtq_z_u32_ex(v)   neon_cmgtzq32(v)
#define vcgtq_z_u8_ex(v)    neon_cmgtzq8(v)
// Two-operand p8 compares: operands forwarded in order to neon_cmgt[q]8.
#define vcgt_p8(a, b)       neon_cmgt8((a), (b))
#define vcgtq_p8(a, b)      neon_cmgtq8((a), (b))
// vclt[q]_z_*_ex: one-operand aliases for the neon_[f]cmltz* intrinsics.
#define vclt_z_f32_ex(v)    neon_fcmltz32(v)
#define vclt_z_s16_ex(v)    neon_cmltz16(v)
#define vclt_z_s32_ex(v)    neon_cmltz32(v)
#define vclt_z_s8_ex(v)     neon_cmltz8(v)
#define vcltq_z_f32_ex(v)   neon_fcmltzq32(v)
#define vcltq_z_s16_ex(v)   neon_cmltzq16(v)
#define vcltq_z_s32_ex(v)   neon_cmltzq32(v)
#define vcltq_z_s8_ex(v)    neon_cmltzq8(v)
// Two-operand p8 "less than": expressed as neon_cmgt[q]8 with the two
// operands swapped.
#define vclt_p8(a, b)       neon_cmgt8((b), (a))
#define vcltq_p8(a, b)      neon_cmgtq8((b), (a))
// p16 vtst aliases for neon_cmtst16 / neon_cmtstq16.
#define vtst_p16(a, b)      neon_cmtst16((a), (b))
#define vtstq_p16(a, b)     neon_cmtstq16((a), (b))
// SHA aliases. The _f32/_s32/_u32 suffixes of each name all expand to the
// same untyped neon_sha* intrinsic, with arguments forwarded in order.

// three-operand forms
#define sha1c_f32(a, b, c)      neon_sha1c((a), (b), (c))
#define sha1c_s32(a, b, c)      neon_sha1c((a), (b), (c))
#define sha1c_u32(a, b, c)      neon_sha1c((a), (b), (c))
#define sha1p_f32(a, b, c)      neon_sha1p((a), (b), (c))
#define sha1p_s32(a, b, c)      neon_sha1p((a), (b), (c))
#define sha1p_u32(a, b, c)      neon_sha1p((a), (b), (c))
#define sha1m_f32(a, b, c)      neon_sha1m((a), (b), (c))
#define sha1m_s32(a, b, c)      neon_sha1m((a), (b), (c))
#define sha1m_u32(a, b, c)      neon_sha1m((a), (b), (c))
// two-operand forms
#define sha1su1_f32(a, b)       neon_sha1su1((a), (b))
#define sha1su1_s32(a, b)       neon_sha1su1((a), (b))
#define sha1su1_u32(a, b)       neon_sha1su1((a), (b))
#define sha256su0_f32(a, b)     neon_sha256su0((a), (b))
#define sha256su0_s32(a, b)     neon_sha256su0((a), (b))
#define sha256su0_u32(a, b)     neon_sha256su0((a), (b))
// three-operand forms
#define sha1su0_f32(a, b, c)    neon_sha1su0((a), (b), (c))
#define sha1su0_s32(a, b, c)    neon_sha1su0((a), (b), (c))
#define sha1su0_u32(a, b, c)    neon_sha1su0((a), (b), (c))
#define sha256h_f32(a, b, c)    neon_sha256h((a), (b), (c))
#define sha256h_s32(a, b, c)    neon_sha256h((a), (b), (c))
#define sha256h_u32(a, b, c)    neon_sha256h((a), (b), (c))
#define sha256h2_f32(a, b, c)   neon_sha256h2((a), (b), (c))
#define sha256h2_s32(a, b, c)   neon_sha256h2((a), (b), (c))
#define sha256h2_u32(a, b, c)   neon_sha256h2((a), (b), (c))
#define sha256su1_f32(a, b, c)  neon_sha256su1((a), (b), (c))
#define sha256su1_s32(a, b, c)  neon_sha256su1((a), (b), (c))
#define sha256su1_u32(a, b, c)  neon_sha256su1((a), (b), (c))
// one-operand form
#define sha1h_f32(v)            neon_sha1h(v)
#define sha1h_s32(v)            neon_sha1h(v)
#define sha1h_u32(v)            neon_sha1h(v)
// vld4_dup_*_ex: the pointer argument is cast to the __int<bits>* type that
// matches the element width and passed to neon_ld4r_<bits>. The `align`
// argument is accepted but does not appear in the expansion.
#define vld4_dup_f32_ex(ptr, align)     neon_ld4r_32((__int32*)(ptr))
#define vld4_dup_p16_ex(ptr, align)     neon_ld4r_16((__int16*)(ptr))
#define vld4_dup_p8_ex(ptr, align)      neon_ld4r_8((__int8*)(ptr))
#define vld4_dup_s16_ex(ptr, align)     neon_ld4r_16((__int16*)(ptr))
#define vld4_dup_s32_ex(ptr, align)     neon_ld4r_32((__int32*)(ptr))
#define vld4_dup_s8_ex(ptr, align)      neon_ld4r_8((__int8*)(ptr))
#define vld4_dup_u16_ex(ptr, align)     neon_ld4r_16((__int16*)(ptr))
#define vld4_dup_u32_ex(ptr, align)     neon_ld4r_32((__int32*)(ptr))
#define vld4_dup_u8_ex(ptr, align)      neon_ld4r_8((__int8*)(ptr))
#define vld4_dup_s64_ex(ptr, align)     neon_ld4r_64((__int64*)(ptr))
#define vld4_dup_u64_ex(ptr, align)     neon_ld4r_64((__int64*)(ptr))

// vld4_*_ex: same shape, forwarding to neon_ld4m_<bits>. The 64-bit element
// names are the exception: they forward to neon_ld1m4_64.
#define vld4_f32_ex(ptr, align)         neon_ld4m_32((__int32*)(ptr))
#define vld4_p16_ex(ptr, align)         neon_ld4m_16((__int16*)(ptr))
#define vld4_p8_ex(ptr, align)          neon_ld4m_8((__int8*)(ptr))
#define vld4_s16_ex(ptr, align)         neon_ld4m_16((__int16*)(ptr))
#define vld4_s32_ex(ptr, align)         neon_ld4m_32((__int32*)(ptr))
#define vld4_s8_ex(ptr, align)          neon_ld4m_8((__int8*)(ptr))
#define vld4_u16_ex(ptr, align)         neon_ld4m_16((__int16*)(ptr))
#define vld4_u32_ex(ptr, align)         neon_ld4m_32((__int32*)(ptr))
#define vld4_u8_ex(ptr, align)          neon_ld4m_8((__int8*)(ptr))
#define vld4_s64_ex(ptr, align)         neon_ld1m4_64((__int64*)(ptr))
#define vld4_u64_ex(ptr, align)         neon_ld1m4_64((__int64*)(ptr))
// vld4q_dup_*_ex: the pointer argument is cast to the __int<bits>* type that
// matches the element width and passed to neon_ld4r_q<bits>. The `align`
// argument is accepted but does not appear in the expansion.
#define vld4q_dup_f32_ex(ptr, align)    neon_ld4r_q32((__int32*)(ptr))
#define vld4q_dup_p16_ex(ptr, align)    neon_ld4r_q16((__int16*)(ptr))
#define vld4q_dup_p8_ex(ptr, align)     neon_ld4r_q8((__int8*)(ptr))
#define vld4q_dup_s16_ex(ptr, align)    neon_ld4r_q16((__int16*)(ptr))
#define vld4q_dup_s32_ex(ptr, align)    neon_ld4r_q32((__int32*)(ptr))
#define vld4q_dup_s8_ex(ptr, align)     neon_ld4r_q8((__int8*)(ptr))
#define vld4q_dup_u16_ex(ptr, align)    neon_ld4r_q16((__int16*)(ptr))
#define vld4q_dup_u32_ex(ptr, align)    neon_ld4r_q32((__int32*)(ptr))
#define vld4q_dup_u8_ex(ptr, align)     neon_ld4r_q8((__int8*)(ptr))
#define vld4q_dup_s64_ex(ptr, align)    neon_ld4r_q64((__int64*)(ptr))
#define vld4q_dup_u64_ex(ptr, align)    neon_ld4r_q64((__int64*)(ptr))

// vld4q_*_ex: same shape, forwarding to neon_ld4m_q<bits>.
#define vld4q_f32_ex(ptr, align)        neon_ld4m_q32((__int32*)(ptr))
#define vld4q_p16_ex(ptr, align)        neon_ld4m_q16((__int16*)(ptr))
#define vld4q_p8_ex(ptr, align)         neon_ld4m_q8((__int8*)(ptr))
#define vld4q_s16_ex(ptr, align)        neon_ld4m_q16((__int16*)(ptr))
#define vld4q_s32_ex(ptr, align)        neon_ld4m_q32((__int32*)(ptr))
#define vld4q_s8_ex(ptr, align)         neon_ld4m_q8((__int8*)(ptr))
#define vld4q_u16_ex(ptr, align)        neon_ld4m_q16((__int16*)(ptr))
#define vld4q_u32_ex(ptr, align)        neon_ld4m_q32((__int32*)(ptr))
#define vld4q_u8_ex(ptr, align)         neon_ld4m_q8((__int8*)(ptr))
#define vld4q_s64_ex(ptr, align)        neon_ld4m_q64((__int64*)(ptr))
#define vld4q_u64_ex(ptr, align)        neon_ld4m_q64((__int64*)(ptr))
// vld4_lane_*_ex / vld4q_lane_*_ex.
// The first argument is cast to the same-width integer pointer; the other
// two are forwarded unchanged, in order, to neon_ld4s_N / neon_ld4s_qN.
// Parameter names follow the ACLE vld4_lane prototypes (ptr, src, lane).
// Note: these macros declare no trailing `align` parameter, unlike the
// vst4_lane_*_ex macros in this header.
#define vld4_lane_f32_ex(ptr, src, lane) neon_ld4s_32((__int32 *)(ptr), (src), (lane))
#define vld4_lane_p16_ex(ptr, src, lane) neon_ld4s_16((__int16 *)(ptr), (src), (lane))
#define vld4_lane_p8_ex(ptr, src, lane) neon_ld4s_8((__int8 *)(ptr), (src), (lane))
#define vld4_lane_s16_ex(ptr, src, lane) neon_ld4s_16((__int16 *)(ptr), (src), (lane))
#define vld4_lane_s32_ex(ptr, src, lane) neon_ld4s_32((__int32 *)(ptr), (src), (lane))
#define vld4_lane_s64_ex(ptr, src, lane) neon_ld4s_64((__int64 *)(ptr), (src), (lane))
#define vld4_lane_s8_ex(ptr, src, lane) neon_ld4s_8((__int8 *)(ptr), (src), (lane))
#define vld4_lane_u16_ex(ptr, src, lane) neon_ld4s_16((__int16 *)(ptr), (src), (lane))
#define vld4_lane_u32_ex(ptr, src, lane) neon_ld4s_32((__int32 *)(ptr), (src), (lane))
#define vld4_lane_u8_ex(ptr, src, lane) neon_ld4s_8((__int8 *)(ptr), (src), (lane))
#define vld4q_lane_f32_ex(ptr, src, lane) neon_ld4s_q32((__int32 *)(ptr), (src), (lane))
#define vld4q_lane_p8_ex(ptr, src, lane) neon_ld4s_q8((__int8 *)(ptr), (src), (lane))
#define vld4q_lane_p16_ex(ptr, src, lane) neon_ld4s_q16((__int16 *)(ptr), (src), (lane))
#define vld4q_lane_s16_ex(ptr, src, lane) neon_ld4s_q16((__int16 *)(ptr), (src), (lane))
#define vld4q_lane_s32_ex(ptr, src, lane) neon_ld4s_q32((__int32 *)(ptr), (src), (lane))
#define vld4q_lane_s64_ex(ptr, src, lane) neon_ld4s_q64((__int64 *)(ptr), (src), (lane))
#define vld4q_lane_u16_ex(ptr, src, lane) neon_ld4s_q16((__int16 *)(ptr), (src), (lane))
#define vld4q_lane_u32_ex(ptr, src, lane) neon_ld4s_q32((__int32 *)(ptr), (src), (lane))
// vld3_dup_*_ex / vld3_*_ex (non-q forms).
// Each macro casts `ptr` to the integer pointer type of the matching element
// width and forwards it to neon_ld3r_N (dup) or neon_ld3m_N (whole load).
// `align` is accepted but does not appear in the expansion.
#define vld3_dup_f32_ex(ptr, align) neon_ld3r_32((__int32 *)(ptr))
#define vld3_dup_p16_ex(ptr, align) neon_ld3r_16((__int16 *)(ptr))
#define vld3_dup_p8_ex(ptr, align) neon_ld3r_8((__int8 *)(ptr))
#define vld3_dup_s16_ex(ptr, align) neon_ld3r_16((__int16 *)(ptr))
#define vld3_dup_s32_ex(ptr, align) neon_ld3r_32((__int32 *)(ptr))
#define vld3_dup_s8_ex(ptr, align) neon_ld3r_8((__int8 *)(ptr))
#define vld3_dup_u16_ex(ptr, align) neon_ld3r_16((__int16 *)(ptr))
#define vld3_dup_u32_ex(ptr, align) neon_ld3r_32((__int32 *)(ptr))
#define vld3_dup_u8_ex(ptr, align) neon_ld3r_8((__int8 *)(ptr))
#define vld3_dup_s64_ex(ptr, align) neon_ld3r_64((__int64 *)(ptr))
#define vld3_dup_u64_ex(ptr, align) neon_ld3r_64((__int64 *)(ptr))
#define vld3_f32_ex(ptr, align) neon_ld3m_32((__int32 *)(ptr))
#define vld3_p16_ex(ptr, align) neon_ld3m_16((__int16 *)(ptr))
#define vld3_p8_ex(ptr, align) neon_ld3m_8((__int8 *)(ptr))
#define vld3_s16_ex(ptr, align) neon_ld3m_16((__int16 *)(ptr))
#define vld3_s32_ex(ptr, align) neon_ld3m_32((__int32 *)(ptr))
#define vld3_s8_ex(ptr, align) neon_ld3m_8((__int8 *)(ptr))
#define vld3_u16_ex(ptr, align) neon_ld3m_16((__int16 *)(ptr))
#define vld3_u32_ex(ptr, align) neon_ld3m_32((__int32 *)(ptr))
#define vld3_u8_ex(ptr, align) neon_ld3m_8((__int8 *)(ptr))
// The 64-bit element forms use neon_ld1m3_64 rather than a neon_ld3m_* name.
#define vld3_s64_ex(ptr, align) neon_ld1m3_64((__int64 *)(ptr))
#define vld3_u64_ex(ptr, align) neon_ld1m3_64((__int64 *)(ptr))
// vld3q_dup_*_ex / vld3q_*_ex (q forms).
// `ptr` is cast to the same-width integer pointer and forwarded to
// neon_ld3r_qN (dup) or neon_ld3m_qN (whole load); `align` is not used
// in the expansion.
#define vld3q_dup_f32_ex(ptr, align) neon_ld3r_q32((__int32 *)(ptr))
#define vld3q_dup_p16_ex(ptr, align) neon_ld3r_q16((__int16 *)(ptr))
#define vld3q_dup_p8_ex(ptr, align) neon_ld3r_q8((__int8 *)(ptr))
#define vld3q_dup_s16_ex(ptr, align) neon_ld3r_q16((__int16 *)(ptr))
#define vld3q_dup_s32_ex(ptr, align) neon_ld3r_q32((__int32 *)(ptr))
#define vld3q_dup_s8_ex(ptr, align) neon_ld3r_q8((__int8 *)(ptr))
#define vld3q_dup_u16_ex(ptr, align) neon_ld3r_q16((__int16 *)(ptr))
#define vld3q_dup_u32_ex(ptr, align) neon_ld3r_q32((__int32 *)(ptr))
#define vld3q_dup_u8_ex(ptr, align) neon_ld3r_q8((__int8 *)(ptr))
#define vld3q_dup_s64_ex(ptr, align) neon_ld3r_q64((__int64 *)(ptr))
#define vld3q_dup_u64_ex(ptr, align) neon_ld3r_q64((__int64 *)(ptr))
#define vld3q_f32_ex(ptr, align) neon_ld3m_q32((__int32 *)(ptr))
#define vld3q_p16_ex(ptr, align) neon_ld3m_q16((__int16 *)(ptr))
#define vld3q_p8_ex(ptr, align) neon_ld3m_q8((__int8 *)(ptr))
#define vld3q_s16_ex(ptr, align) neon_ld3m_q16((__int16 *)(ptr))
#define vld3q_s32_ex(ptr, align) neon_ld3m_q32((__int32 *)(ptr))
#define vld3q_s8_ex(ptr, align) neon_ld3m_q8((__int8 *)(ptr))
#define vld3q_u16_ex(ptr, align) neon_ld3m_q16((__int16 *)(ptr))
#define vld3q_u32_ex(ptr, align) neon_ld3m_q32((__int32 *)(ptr))
#define vld3q_u8_ex(ptr, align) neon_ld3m_q8((__int8 *)(ptr))
#define vld3q_s64_ex(ptr, align) neon_ld3m_q64((__int64 *)(ptr))
#define vld3q_u64_ex(ptr, align) neon_ld3m_q64((__int64 *)(ptr))
// vld3_lane_*_ex / vld3q_lane_*_ex.
// The first argument is cast to the same-width integer pointer; the other
// two are forwarded unchanged, in order, to neon_ld3s_N / neon_ld3s_qN.
// Parameter names follow the ACLE vld3_lane prototypes (ptr, src, lane).
// Note: these macros declare no trailing `align` parameter, unlike the
// vst3_lane_*_ex macros in this header.
#define vld3_lane_f32_ex(ptr, src, lane) neon_ld3s_32((__int32 *)(ptr), (src), (lane))
#define vld3_lane_p16_ex(ptr, src, lane) neon_ld3s_16((__int16 *)(ptr), (src), (lane))
#define vld3_lane_p8_ex(ptr, src, lane) neon_ld3s_8((__int8 *)(ptr), (src), (lane))
#define vld3_lane_s16_ex(ptr, src, lane) neon_ld3s_16((__int16 *)(ptr), (src), (lane))
#define vld3_lane_s32_ex(ptr, src, lane) neon_ld3s_32((__int32 *)(ptr), (src), (lane))
#define vld3_lane_s64_ex(ptr, src, lane) neon_ld3s_64((__int64 *)(ptr), (src), (lane))
#define vld3_lane_s8_ex(ptr, src, lane) neon_ld3s_8((__int8 *)(ptr), (src), (lane))
#define vld3_lane_u16_ex(ptr, src, lane) neon_ld3s_16((__int16 *)(ptr), (src), (lane))
#define vld3_lane_u32_ex(ptr, src, lane) neon_ld3s_32((__int32 *)(ptr), (src), (lane))
#define vld3_lane_u8_ex(ptr, src, lane) neon_ld3s_8((__int8 *)(ptr), (src), (lane))
#define vld3q_lane_f32_ex(ptr, src, lane) neon_ld3s_q32((__int32 *)(ptr), (src), (lane))
#define vld3q_lane_p8_ex(ptr, src, lane) neon_ld3s_q8((__int8 *)(ptr), (src), (lane))
#define vld3q_lane_p16_ex(ptr, src, lane) neon_ld3s_q16((__int16 *)(ptr), (src), (lane))
#define vld3q_lane_s16_ex(ptr, src, lane) neon_ld3s_q16((__int16 *)(ptr), (src), (lane))
#define vld3q_lane_s32_ex(ptr, src, lane) neon_ld3s_q32((__int32 *)(ptr), (src), (lane))
#define vld3q_lane_s64_ex(ptr, src, lane) neon_ld3s_q64((__int64 *)(ptr), (src), (lane))
#define vld3q_lane_u16_ex(ptr, src, lane) neon_ld3s_q16((__int16 *)(ptr), (src), (lane))
#define vld3q_lane_u32_ex(ptr, src, lane) neon_ld3s_q32((__int32 *)(ptr), (src), (lane))
// vld2_dup_*_ex / vld2_*_ex (non-q forms).
// Each macro casts `ptr` to the integer pointer type of the matching element
// width and forwards it to neon_ld2r_N (dup) or neon_ld2m_N (whole load).
// `align` is accepted but does not appear in the expansion.
#define vld2_dup_f32_ex(ptr, align) neon_ld2r_32((__int32 *)(ptr))
#define vld2_dup_p16_ex(ptr, align) neon_ld2r_16((__int16 *)(ptr))
#define vld2_dup_p8_ex(ptr, align) neon_ld2r_8((__int8 *)(ptr))
#define vld2_dup_s16_ex(ptr, align) neon_ld2r_16((__int16 *)(ptr))
#define vld2_dup_s32_ex(ptr, align) neon_ld2r_32((__int32 *)(ptr))
#define vld2_dup_s8_ex(ptr, align) neon_ld2r_8((__int8 *)(ptr))
#define vld2_dup_u16_ex(ptr, align) neon_ld2r_16((__int16 *)(ptr))
#define vld2_dup_u32_ex(ptr, align) neon_ld2r_32((__int32 *)(ptr))
#define vld2_dup_u8_ex(ptr, align) neon_ld2r_8((__int8 *)(ptr))
#define vld2_dup_s64_ex(ptr, align) neon_ld2r_64((__int64 *)(ptr))
#define vld2_dup_u64_ex(ptr, align) neon_ld2r_64((__int64 *)(ptr))
#define vld2_f32_ex(ptr, align) neon_ld2m_32((__int32 *)(ptr))
#define vld2_p16_ex(ptr, align) neon_ld2m_16((__int16 *)(ptr))
#define vld2_p8_ex(ptr, align) neon_ld2m_8((__int8 *)(ptr))
#define vld2_s16_ex(ptr, align) neon_ld2m_16((__int16 *)(ptr))
#define vld2_s32_ex(ptr, align) neon_ld2m_32((__int32 *)(ptr))
#define vld2_s8_ex(ptr, align) neon_ld2m_8((__int8 *)(ptr))
#define vld2_u16_ex(ptr, align) neon_ld2m_16((__int16 *)(ptr))
#define vld2_u32_ex(ptr, align) neon_ld2m_32((__int32 *)(ptr))
#define vld2_u8_ex(ptr, align) neon_ld2m_8((__int8 *)(ptr))
// The 64-bit element forms use neon_ld1m2_64 rather than a neon_ld2m_* name.
#define vld2_s64_ex(ptr, align) neon_ld1m2_64((__int64 *)(ptr))
#define vld2_u64_ex(ptr, align) neon_ld1m2_64((__int64 *)(ptr))
// vld2q_dup_*_ex / vld2q_*_ex (q forms).
// `ptr` is cast to the same-width integer pointer and forwarded to
// neon_ld2r_qN (dup) or neon_ld2m_qN (whole load); `align` is not used
// in the expansion.
#define vld2q_dup_f32_ex(ptr, align) neon_ld2r_q32((__int32 *)(ptr))
#define vld2q_dup_p16_ex(ptr, align) neon_ld2r_q16((__int16 *)(ptr))
#define vld2q_dup_p8_ex(ptr, align) neon_ld2r_q8((__int8 *)(ptr))
#define vld2q_dup_s16_ex(ptr, align) neon_ld2r_q16((__int16 *)(ptr))
#define vld2q_dup_s32_ex(ptr, align) neon_ld2r_q32((__int32 *)(ptr))
#define vld2q_dup_s8_ex(ptr, align) neon_ld2r_q8((__int8 *)(ptr))
#define vld2q_dup_u16_ex(ptr, align) neon_ld2r_q16((__int16 *)(ptr))
#define vld2q_dup_u32_ex(ptr, align) neon_ld2r_q32((__int32 *)(ptr))
#define vld2q_dup_u8_ex(ptr, align) neon_ld2r_q8((__int8 *)(ptr))
#define vld2q_dup_s64_ex(ptr, align) neon_ld2r_q64((__int64 *)(ptr))
#define vld2q_dup_u64_ex(ptr, align) neon_ld2r_q64((__int64 *)(ptr))
#define vld2q_f32_ex(ptr, align) neon_ld2m_q32((__int32 *)(ptr))
#define vld2q_p16_ex(ptr, align) neon_ld2m_q16((__int16 *)(ptr))
#define vld2q_p8_ex(ptr, align) neon_ld2m_q8((__int8 *)(ptr))
#define vld2q_s16_ex(ptr, align) neon_ld2m_q16((__int16 *)(ptr))
#define vld2q_s32_ex(ptr, align) neon_ld2m_q32((__int32 *)(ptr))
#define vld2q_s8_ex(ptr, align) neon_ld2m_q8((__int8 *)(ptr))
#define vld2q_u16_ex(ptr, align) neon_ld2m_q16((__int16 *)(ptr))
#define vld2q_u32_ex(ptr, align) neon_ld2m_q32((__int32 *)(ptr))
#define vld2q_u8_ex(ptr, align) neon_ld2m_q8((__int8 *)(ptr))
#define vld2q_s64_ex(ptr, align) neon_ld2m_q64((__int64 *)(ptr))
#define vld2q_u64_ex(ptr, align) neon_ld2m_q64((__int64 *)(ptr))
// vld2_lane_*_ex / vld2q_lane_*_ex.
// The first argument is cast to the same-width integer pointer; the other
// two are forwarded unchanged, in order, to neon_ld2s_N / neon_ld2s_qN.
// Parameter names follow the ACLE vld2_lane prototypes (ptr, src, lane).
// Note: these macros declare no trailing `align` parameter, unlike the
// vst2_lane_*_ex macros in this header.
#define vld2_lane_f32_ex(ptr, src, lane) neon_ld2s_32((__int32 *)(ptr), (src), (lane))
#define vld2_lane_p16_ex(ptr, src, lane) neon_ld2s_16((__int16 *)(ptr), (src), (lane))
#define vld2_lane_p8_ex(ptr, src, lane) neon_ld2s_8((__int8 *)(ptr), (src), (lane))
#define vld2_lane_s16_ex(ptr, src, lane) neon_ld2s_16((__int16 *)(ptr), (src), (lane))
#define vld2_lane_s32_ex(ptr, src, lane) neon_ld2s_32((__int32 *)(ptr), (src), (lane))
#define vld2_lane_s64_ex(ptr, src, lane) neon_ld2s_64((__int64 *)(ptr), (src), (lane))
#define vld2_lane_s8_ex(ptr, src, lane) neon_ld2s_8((__int8 *)(ptr), (src), (lane))
#define vld2_lane_u16_ex(ptr, src, lane) neon_ld2s_16((__int16 *)(ptr), (src), (lane))
#define vld2_lane_u32_ex(ptr, src, lane) neon_ld2s_32((__int32 *)(ptr), (src), (lane))
#define vld2_lane_u8_ex(ptr, src, lane) neon_ld2s_8((__int8 *)(ptr), (src), (lane))
#define vld2q_lane_f32_ex(ptr, src, lane) neon_ld2s_q32((__int32 *)(ptr), (src), (lane))
#define vld2q_lane_p8_ex(ptr, src, lane) neon_ld2s_q8((__int8 *)(ptr), (src), (lane))
#define vld2q_lane_p16_ex(ptr, src, lane) neon_ld2s_q16((__int16 *)(ptr), (src), (lane))
#define vld2q_lane_s16_ex(ptr, src, lane) neon_ld2s_q16((__int16 *)(ptr), (src), (lane))
#define vld2q_lane_s32_ex(ptr, src, lane) neon_ld2s_q32((__int32 *)(ptr), (src), (lane))
#define vld2q_lane_s64_ex(ptr, src, lane) neon_ld2s_q64((__int64 *)(ptr), (src), (lane))
#define vld2q_lane_u16_ex(ptr, src, lane) neon_ld2s_q16((__int16 *)(ptr), (src), (lane))
#define vld2q_lane_u32_ex(ptr, src, lane) neon_ld2s_q32((__int32 *)(ptr), (src), (lane))
// vld1_dup_*_ex / vld1_*_ex (non-q forms).
// Each macro casts `ptr` to the integer pointer type of the matching element
// width (f16 uses __int16, f64 uses __int64) and forwards it to
// neon_ld1r_N (dup) or neon_ld1m_N (whole load).
// `align` is accepted but does not appear in the expansion.
#define vld1_dup_f16_ex(ptr, align) neon_ld1r_16((__int16 *)(ptr))
#define vld1_dup_f32_ex(ptr, align) neon_ld1r_32((__int32 *)(ptr))
#define vld1_dup_p16_ex(ptr, align) neon_ld1r_16((__int16 *)(ptr))
#define vld1_dup_p8_ex(ptr, align) neon_ld1r_8((__int8 *)(ptr))
#define vld1_dup_s16_ex(ptr, align) neon_ld1r_16((__int16 *)(ptr))
#define vld1_dup_s32_ex(ptr, align) neon_ld1r_32((__int32 *)(ptr))
#define vld1_dup_s8_ex(ptr, align) neon_ld1r_8((__int8 *)(ptr))
#define vld1_dup_u16_ex(ptr, align) neon_ld1r_16((__int16 *)(ptr))
#define vld1_dup_u32_ex(ptr, align) neon_ld1r_32((__int32 *)(ptr))
#define vld1_dup_u8_ex(ptr, align) neon_ld1r_8((__int8 *)(ptr))
#define vld1_dup_s64_ex(ptr, align) neon_ld1r_64((__int64 *)(ptr))
#define vld1_dup_u64_ex(ptr, align) neon_ld1r_64((__int64 *)(ptr))
#define vld1_f16_ex(ptr, align) neon_ld1m_16((__int16 *)(ptr))
#define vld1_f32_ex(ptr, align) neon_ld1m_32((__int32 *)(ptr))
#define vld1_p16_ex(ptr, align) neon_ld1m_16((__int16 *)(ptr))
#define vld1_p8_ex(ptr, align) neon_ld1m_8((__int8 *)(ptr))
#define vld1_s16_ex(ptr, align) neon_ld1m_16((__int16 *)(ptr))
#define vld1_s32_ex(ptr, align) neon_ld1m_32((__int32 *)(ptr))
#define vld1_s8_ex(ptr, align) neon_ld1m_8((__int8 *)(ptr))
#define vld1_u16_ex(ptr, align) neon_ld1m_16((__int16 *)(ptr))
#define vld1_u32_ex(ptr, align) neon_ld1m_32((__int32 *)(ptr))
#define vld1_u8_ex(ptr, align) neon_ld1m_8((__int8 *)(ptr))
#define vld1_s64_ex(ptr, align) neon_ld1m_64((__int64 *)(ptr))
#define vld1_u64_ex(ptr, align) neon_ld1m_64((__int64 *)(ptr))
#define vld1_f64_ex(ptr, align) neon_ld1m_64((__int64 *)(ptr))
// vld1_*_x2_ex / _x3_ex / _x4_ex (non-q forms).
// `ptr` is cast to the same-width integer pointer (f64 uses __int64) and
// forwarded to neon_ld1m2_N, neon_ld1m3_N or neon_ld1m4_N respectively.
// `align` is accepted but does not appear in the expansion.
#define vld1_f32_x2_ex(ptr, align) neon_ld1m2_32((__int32 *)(ptr))
#define vld1_p16_x2_ex(ptr, align) neon_ld1m2_16((__int16 *)(ptr))
#define vld1_p8_x2_ex(ptr, align) neon_ld1m2_8((__int8 *)(ptr))
#define vld1_s16_x2_ex(ptr, align) neon_ld1m2_16((__int16 *)(ptr))
#define vld1_s32_x2_ex(ptr, align) neon_ld1m2_32((__int32 *)(ptr))
#define vld1_s8_x2_ex(ptr, align) neon_ld1m2_8((__int8 *)(ptr))
#define vld1_u16_x2_ex(ptr, align) neon_ld1m2_16((__int16 *)(ptr))
#define vld1_u32_x2_ex(ptr, align) neon_ld1m2_32((__int32 *)(ptr))
#define vld1_u8_x2_ex(ptr, align) neon_ld1m2_8((__int8 *)(ptr))
#define vld1_s64_x2_ex(ptr, align) neon_ld1m2_64((__int64 *)(ptr))
#define vld1_u64_x2_ex(ptr, align) neon_ld1m2_64((__int64 *)(ptr))
#define vld1_f64_x2_ex(ptr, align) neon_ld1m2_64((__int64 *)(ptr))
#define vld1_f32_x3_ex(ptr, align) neon_ld1m3_32((__int32 *)(ptr))
#define vld1_p16_x3_ex(ptr, align) neon_ld1m3_16((__int16 *)(ptr))
#define vld1_p8_x3_ex(ptr, align) neon_ld1m3_8((__int8 *)(ptr))
#define vld1_s16_x3_ex(ptr, align) neon_ld1m3_16((__int16 *)(ptr))
#define vld1_s32_x3_ex(ptr, align) neon_ld1m3_32((__int32 *)(ptr))
#define vld1_s8_x3_ex(ptr, align) neon_ld1m3_8((__int8 *)(ptr))
#define vld1_u16_x3_ex(ptr, align) neon_ld1m3_16((__int16 *)(ptr))
#define vld1_u32_x3_ex(ptr, align) neon_ld1m3_32((__int32 *)(ptr))
#define vld1_u8_x3_ex(ptr, align) neon_ld1m3_8((__int8 *)(ptr))
#define vld1_s64_x3_ex(ptr, align) neon_ld1m3_64((__int64 *)(ptr))
#define vld1_u64_x3_ex(ptr, align) neon_ld1m3_64((__int64 *)(ptr))
#define vld1_f64_x3_ex(ptr, align) neon_ld1m3_64((__int64 *)(ptr))
#define vld1_f32_x4_ex(ptr, align) neon_ld1m4_32((__int32 *)(ptr))
#define vld1_p16_x4_ex(ptr, align) neon_ld1m4_16((__int16 *)(ptr))
#define vld1_p8_x4_ex(ptr, align) neon_ld1m4_8((__int8 *)(ptr))
#define vld1_s16_x4_ex(ptr, align) neon_ld1m4_16((__int16 *)(ptr))
#define vld1_s32_x4_ex(ptr, align) neon_ld1m4_32((__int32 *)(ptr))
#define vld1_s8_x4_ex(ptr, align) neon_ld1m4_8((__int8 *)(ptr))
#define vld1_u16_x4_ex(ptr, align) neon_ld1m4_16((__int16 *)(ptr))
#define vld1_u32_x4_ex(ptr, align) neon_ld1m4_32((__int32 *)(ptr))
#define vld1_u8_x4_ex(ptr, align) neon_ld1m4_8((__int8 *)(ptr))
#define vld1_s64_x4_ex(ptr, align) neon_ld1m4_64((__int64 *)(ptr))
#define vld1_u64_x4_ex(ptr, align) neon_ld1m4_64((__int64 *)(ptr))
#define vld1_f64_x4_ex(ptr, align) neon_ld1m4_64((__int64 *)(ptr))
// vld1q_dup_*_ex / vld1q_*_ex (q forms).
// `ptr` is cast to the same-width integer pointer and forwarded to
// neon_ld1r_qN (dup) or neon_ld1m_qN (whole load); `align` is not used
// in the expansion.
#define vld1q_dup_f32_ex(ptr, align) neon_ld1r_q32((__int32 *)(ptr))
#define vld1q_dup_p16_ex(ptr, align) neon_ld1r_q16((__int16 *)(ptr))
#define vld1q_dup_p8_ex(ptr, align) neon_ld1r_q8((__int8 *)(ptr))
#define vld1q_dup_s16_ex(ptr, align) neon_ld1r_q16((__int16 *)(ptr))
#define vld1q_dup_s32_ex(ptr, align) neon_ld1r_q32((__int32 *)(ptr))
#define vld1q_dup_s8_ex(ptr, align) neon_ld1r_q8((__int8 *)(ptr))
#define vld1q_dup_u16_ex(ptr, align) neon_ld1r_q16((__int16 *)(ptr))
#define vld1q_dup_u32_ex(ptr, align) neon_ld1r_q32((__int32 *)(ptr))
#define vld1q_dup_u8_ex(ptr, align) neon_ld1r_q8((__int8 *)(ptr))
#define vld1q_dup_s64_ex(ptr, align) neon_ld1r_q64((__int64 *)(ptr))
#define vld1q_dup_u64_ex(ptr, align) neon_ld1r_q64((__int64 *)(ptr))
#define vld1q_f32_ex(ptr, align) neon_ld1m_q32((__int32 *)(ptr))
#define vld1q_p16_ex(ptr, align) neon_ld1m_q16((__int16 *)(ptr))
#define vld1q_p8_ex(ptr, align) neon_ld1m_q8((__int8 *)(ptr))
#define vld1q_s16_ex(ptr, align) neon_ld1m_q16((__int16 *)(ptr))
#define vld1q_s32_ex(ptr, align) neon_ld1m_q32((__int32 *)(ptr))
#define vld1q_s8_ex(ptr, align) neon_ld1m_q8((__int8 *)(ptr))
#define vld1q_u16_ex(ptr, align) neon_ld1m_q16((__int16 *)(ptr))
#define vld1q_u32_ex(ptr, align) neon_ld1m_q32((__int32 *)(ptr))
#define vld1q_u8_ex(ptr, align) neon_ld1m_q8((__int8 *)(ptr))
#define vld1q_s64_ex(ptr, align) neon_ld1m_q64((__int64 *)(ptr))
#define vld1q_u64_ex(ptr, align) neon_ld1m_q64((__int64 *)(ptr))
// vld1q_*_x2_ex / _x3_ex / _x4_ex (q forms).
// `ptr` is cast to the same-width integer pointer and forwarded to
// neon_ld1m2_qN, neon_ld1m3_qN or neon_ld1m4_qN respectively.
// `align` is accepted but does not appear in the expansion.
#define vld1q_f32_x2_ex(ptr, align) neon_ld1m2_q32((__int32 *)(ptr))
#define vld1q_p16_x2_ex(ptr, align) neon_ld1m2_q16((__int16 *)(ptr))
#define vld1q_p8_x2_ex(ptr, align) neon_ld1m2_q8((__int8 *)(ptr))
#define vld1q_s16_x2_ex(ptr, align) neon_ld1m2_q16((__int16 *)(ptr))
#define vld1q_s32_x2_ex(ptr, align) neon_ld1m2_q32((__int32 *)(ptr))
#define vld1q_s8_x2_ex(ptr, align) neon_ld1m2_q8((__int8 *)(ptr))
#define vld1q_u16_x2_ex(ptr, align) neon_ld1m2_q16((__int16 *)(ptr))
#define vld1q_u32_x2_ex(ptr, align) neon_ld1m2_q32((__int32 *)(ptr))
#define vld1q_u8_x2_ex(ptr, align) neon_ld1m2_q8((__int8 *)(ptr))
#define vld1q_s64_x2_ex(ptr, align) neon_ld1m2_q64((__int64 *)(ptr))
#define vld1q_u64_x2_ex(ptr, align) neon_ld1m2_q64((__int64 *)(ptr))
#define vld1q_f32_x3_ex(ptr, align) neon_ld1m3_q32((__int32 *)(ptr))
#define vld1q_p16_x3_ex(ptr, align) neon_ld1m3_q16((__int16 *)(ptr))
#define vld1q_p8_x3_ex(ptr, align) neon_ld1m3_q8((__int8 *)(ptr))
#define vld1q_s16_x3_ex(ptr, align) neon_ld1m3_q16((__int16 *)(ptr))
#define vld1q_s32_x3_ex(ptr, align) neon_ld1m3_q32((__int32 *)(ptr))
#define vld1q_s8_x3_ex(ptr, align) neon_ld1m3_q8((__int8 *)(ptr))
#define vld1q_u16_x3_ex(ptr, align) neon_ld1m3_q16((__int16 *)(ptr))
#define vld1q_u32_x3_ex(ptr, align) neon_ld1m3_q32((__int32 *)(ptr))
#define vld1q_u8_x3_ex(ptr, align) neon_ld1m3_q8((__int8 *)(ptr))
#define vld1q_s64_x3_ex(ptr, align) neon_ld1m3_q64((__int64 *)(ptr))
#define vld1q_u64_x3_ex(ptr, align) neon_ld1m3_q64((__int64 *)(ptr))
#define vld1q_f32_x4_ex(ptr, align) neon_ld1m4_q32((__int32 *)(ptr))
#define vld1q_p16_x4_ex(ptr, align) neon_ld1m4_q16((__int16 *)(ptr))
#define vld1q_p8_x4_ex(ptr, align) neon_ld1m4_q8((__int8 *)(ptr))
#define vld1q_s16_x4_ex(ptr, align) neon_ld1m4_q16((__int16 *)(ptr))
#define vld1q_s32_x4_ex(ptr, align) neon_ld1m4_q32((__int32 *)(ptr))
#define vld1q_s8_x4_ex(ptr, align) neon_ld1m4_q8((__int8 *)(ptr))
#define vld1q_u16_x4_ex(ptr, align) neon_ld1m4_q16((__int16 *)(ptr))
#define vld1q_u32_x4_ex(ptr, align) neon_ld1m4_q32((__int32 *)(ptr))
#define vld1q_u8_x4_ex(ptr, align) neon_ld1m4_q8((__int8 *)(ptr))
#define vld1q_s64_x4_ex(ptr, align) neon_ld1m4_q64((__int64 *)(ptr))
#define vld1q_u64_x4_ex(ptr, align) neon_ld1m4_q64((__int64 *)(ptr))
// vld1_lane_*_ex / vld1q_lane_*_ex.
// The first argument is cast to the same-width integer pointer (f16 uses
// __int16); the other two are forwarded unchanged, in order, to
// neon_ld1s_N / neon_ld1s_qN.
// Parameter names follow the ACLE vld1_lane prototypes (ptr, src, lane).
// Note: these macros declare no trailing `align` parameter.
#define vld1_lane_f16_ex(ptr, src, lane) neon_ld1s_16((__int16 *)(ptr), (src), (lane))
#define vld1_lane_f32_ex(ptr, src, lane) neon_ld1s_32((__int32 *)(ptr), (src), (lane))
#define vld1_lane_p16_ex(ptr, src, lane) neon_ld1s_16((__int16 *)(ptr), (src), (lane))
#define vld1_lane_p8_ex(ptr, src, lane) neon_ld1s_8((__int8 *)(ptr), (src), (lane))
#define vld1_lane_s16_ex(ptr, src, lane) neon_ld1s_16((__int16 *)(ptr), (src), (lane))
#define vld1_lane_s32_ex(ptr, src, lane) neon_ld1s_32((__int32 *)(ptr), (src), (lane))
#define vld1_lane_s64_ex(ptr, src, lane) neon_ld1s_64((__int64 *)(ptr), (src), (lane))
#define vld1_lane_s8_ex(ptr, src, lane) neon_ld1s_8((__int8 *)(ptr), (src), (lane))
#define vld1_lane_u16_ex(ptr, src, lane) neon_ld1s_16((__int16 *)(ptr), (src), (lane))
#define vld1_lane_u32_ex(ptr, src, lane) neon_ld1s_32((__int32 *)(ptr), (src), (lane))
#define vld1_lane_u8_ex(ptr, src, lane) neon_ld1s_8((__int8 *)(ptr), (src), (lane))
#define vld1q_lane_f32_ex(ptr, src, lane) neon_ld1s_q32((__int32 *)(ptr), (src), (lane))
#define vld1q_lane_p8_ex(ptr, src, lane) neon_ld1s_q8((__int8 *)(ptr), (src), (lane))
#define vld1q_lane_p16_ex(ptr, src, lane) neon_ld1s_q16((__int16 *)(ptr), (src), (lane))
#define vld1q_lane_s16_ex(ptr, src, lane) neon_ld1s_q16((__int16 *)(ptr), (src), (lane))
#define vld1q_lane_s32_ex(ptr, src, lane) neon_ld1s_q32((__int32 *)(ptr), (src), (lane))
#define vld1q_lane_s64_ex(ptr, src, lane) neon_ld1s_q64((__int64 *)(ptr), (src), (lane))
#define vld1q_lane_u16_ex(ptr, src, lane) neon_ld1s_q16((__int16 *)(ptr), (src), (lane))
#define vld1q_lane_u32_ex(ptr, src, lane) neon_ld1s_q32((__int32 *)(ptr), (src), (lane))
// vst4_*_ex / vst4q_*_ex.
// `ptr` is cast to the same-width integer pointer and passed, together with
// `val`, to neon_st4m_N (non-q) or neon_st4m_qN (q).
// `align` is accepted but does not appear in the expansion.
#define vst4_f32_ex(ptr, val, align) neon_st4m_32((__int32 *)(ptr), (val))
#define vst4_p16_ex(ptr, val, align) neon_st4m_16((__int16 *)(ptr), (val))
#define vst4_p8_ex(ptr, val, align) neon_st4m_8((__int8 *)(ptr), (val))
#define vst4_s16_ex(ptr, val, align) neon_st4m_16((__int16 *)(ptr), (val))
#define vst4_s32_ex(ptr, val, align) neon_st4m_32((__int32 *)(ptr), (val))
#define vst4_s8_ex(ptr, val, align) neon_st4m_8((__int8 *)(ptr), (val))
#define vst4_u16_ex(ptr, val, align) neon_st4m_16((__int16 *)(ptr), (val))
#define vst4_u32_ex(ptr, val, align) neon_st4m_32((__int32 *)(ptr), (val))
#define vst4_u8_ex(ptr, val, align) neon_st4m_8((__int8 *)(ptr), (val))
// The non-q 64-bit element forms use neon_st1m4_64 rather than a neon_st4m_* name.
#define vst4_s64_ex(ptr, val, align) neon_st1m4_64((__int64 *)(ptr), (val))
#define vst4_u64_ex(ptr, val, align) neon_st1m4_64((__int64 *)(ptr), (val))
#define vst4q_f32_ex(ptr, val, align) neon_st4m_q32((__int32 *)(ptr), (val))
#define vst4q_p16_ex(ptr, val, align) neon_st4m_q16((__int16 *)(ptr), (val))
#define vst4q_p8_ex(ptr, val, align) neon_st4m_q8((__int8 *)(ptr), (val))
#define vst4q_s16_ex(ptr, val, align) neon_st4m_q16((__int16 *)(ptr), (val))
#define vst4q_s32_ex(ptr, val, align) neon_st4m_q32((__int32 *)(ptr), (val))
#define vst4q_s8_ex(ptr, val, align) neon_st4m_q8((__int8 *)(ptr), (val))
#define vst4q_u16_ex(ptr, val, align) neon_st4m_q16((__int16 *)(ptr), (val))
#define vst4q_u32_ex(ptr, val, align) neon_st4m_q32((__int32 *)(ptr), (val))
#define vst4q_u8_ex(ptr, val, align) neon_st4m_q8((__int8 *)(ptr), (val))
#define vst4q_s64_ex(ptr, val, align) neon_st4m_q64((__int64 *)(ptr), (val))
#define vst4q_u64_ex(ptr, val, align) neon_st4m_q64((__int64 *)(ptr), (val))
// vst4_lane_*_ex / vst4q_lane_*_ex.
// `ptr` is cast to the same-width integer pointer; `val` and `lane` are
// forwarded unchanged, in order, to neon_st4s_N / neon_st4s_qN.
// Parameter names follow the ACLE vst4_lane prototypes (ptr, val, lane).
// `align` is accepted but does not appear in the expansion.
#define vst4_lane_f32_ex(ptr, val, lane, align) neon_st4s_32((__int32 *)(ptr), (val), (lane))
#define vst4_lane_p16_ex(ptr, val, lane, align) neon_st4s_16((__int16 *)(ptr), (val), (lane))
#define vst4_lane_p8_ex(ptr, val, lane, align) neon_st4s_8((__int8 *)(ptr), (val), (lane))
#define vst4_lane_s16_ex(ptr, val, lane, align) neon_st4s_16((__int16 *)(ptr), (val), (lane))
#define vst4_lane_s32_ex(ptr, val, lane, align) neon_st4s_32((__int32 *)(ptr), (val), (lane))
#define vst4_lane_s64_ex(ptr, val, lane, align) neon_st4s_64((__int64 *)(ptr), (val), (lane))
#define vst4_lane_s8_ex(ptr, val, lane, align) neon_st4s_8((__int8 *)(ptr), (val), (lane))
#define vst4_lane_u16_ex(ptr, val, lane, align) neon_st4s_16((__int16 *)(ptr), (val), (lane))
#define vst4_lane_u32_ex(ptr, val, lane, align) neon_st4s_32((__int32 *)(ptr), (val), (lane))
#define vst4_lane_u8_ex(ptr, val, lane, align) neon_st4s_8((__int8 *)(ptr), (val), (lane))
#define vst4q_lane_f32_ex(ptr, val, lane, align) neon_st4s_q32((__int32 *)(ptr), (val), (lane))
#define vst4q_lane_p8_ex(ptr, val, lane, align) neon_st4s_q8((__int8 *)(ptr), (val), (lane))
#define vst4q_lane_p16_ex(ptr, val, lane, align) neon_st4s_q16((__int16 *)(ptr), (val), (lane))
#define vst4q_lane_s16_ex(ptr, val, lane, align) neon_st4s_q16((__int16 *)(ptr), (val), (lane))
#define vst4q_lane_s32_ex(ptr, val, lane, align) neon_st4s_q32((__int32 *)(ptr), (val), (lane))
#define vst4q_lane_s64_ex(ptr, val, lane, align) neon_st4s_q64((__int64 *)(ptr), (val), (lane))
#define vst4q_lane_u16_ex(ptr, val, lane, align) neon_st4s_q16((__int16 *)(ptr), (val), (lane))
#define vst4q_lane_u32_ex(ptr, val, lane, align) neon_st4s_q32((__int32 *)(ptr), (val), (lane))
// vst3_*_ex / vst3q_*_ex.
// `ptr` is cast to the same-width integer pointer and passed, together with
// `val`, to neon_st3m_N (non-q) or neon_st3m_qN (q).
// `align` is accepted but does not appear in the expansion.
#define vst3_f32_ex(ptr, val, align) neon_st3m_32((__int32 *)(ptr), (val))
#define vst3_p16_ex(ptr, val, align) neon_st3m_16((__int16 *)(ptr), (val))
#define vst3_p8_ex(ptr, val, align) neon_st3m_8((__int8 *)(ptr), (val))
#define vst3_s16_ex(ptr, val, align) neon_st3m_16((__int16 *)(ptr), (val))
#define vst3_s32_ex(ptr, val, align) neon_st3m_32((__int32 *)(ptr), (val))
#define vst3_s8_ex(ptr, val, align) neon_st3m_8((__int8 *)(ptr), (val))
#define vst3_u16_ex(ptr, val, align) neon_st3m_16((__int16 *)(ptr), (val))
#define vst3_u32_ex(ptr, val, align) neon_st3m_32((__int32 *)(ptr), (val))
#define vst3_u8_ex(ptr, val, align) neon_st3m_8((__int8 *)(ptr), (val))
// The non-q 64-bit element forms use neon_st1m3_64 rather than a neon_st3m_* name.
#define vst3_s64_ex(ptr, val, align) neon_st1m3_64((__int64 *)(ptr), (val))
#define vst3_u64_ex(ptr, val, align) neon_st1m3_64((__int64 *)(ptr), (val))
#define vst3q_f32_ex(ptr, val, align) neon_st3m_q32((__int32 *)(ptr), (val))
#define vst3q_p16_ex(ptr, val, align) neon_st3m_q16((__int16 *)(ptr), (val))
#define vst3q_p8_ex(ptr, val, align) neon_st3m_q8((__int8 *)(ptr), (val))
#define vst3q_s16_ex(ptr, val, align) neon_st3m_q16((__int16 *)(ptr), (val))
#define vst3q_s32_ex(ptr, val, align) neon_st3m_q32((__int32 *)(ptr), (val))
#define vst3q_s8_ex(ptr, val, align) neon_st3m_q8((__int8 *)(ptr), (val))
#define vst3q_u16_ex(ptr, val, align) neon_st3m_q16((__int16 *)(ptr), (val))
#define vst3q_u32_ex(ptr, val, align) neon_st3m_q32((__int32 *)(ptr), (val))
#define vst3q_u8_ex(ptr, val, align) neon_st3m_q8((__int8 *)(ptr), (val))
#define vst3q_s64_ex(ptr, val, align) neon_st3m_q64((__int64 *)(ptr), (val))
#define vst3q_u64_ex(ptr, val, align) neon_st3m_q64((__int64 *)(ptr), (val))
// vst3_lane_*_ex / vst3q_lane_*_ex.
// `ptr` is cast to the same-width integer pointer; `val` and `lane` are
// forwarded unchanged, in order, to neon_st3s_N / neon_st3s_qN.
// Parameter names follow the ACLE vst3_lane prototypes (ptr, val, lane).
// `align` is accepted but does not appear in the expansion.
#define vst3_lane_f32_ex(ptr, val, lane, align) neon_st3s_32((__int32 *)(ptr), (val), (lane))
#define vst3_lane_p16_ex(ptr, val, lane, align) neon_st3s_16((__int16 *)(ptr), (val), (lane))
#define vst3_lane_p8_ex(ptr, val, lane, align) neon_st3s_8((__int8 *)(ptr), (val), (lane))
#define vst3_lane_s16_ex(ptr, val, lane, align) neon_st3s_16((__int16 *)(ptr), (val), (lane))
#define vst3_lane_s32_ex(ptr, val, lane, align) neon_st3s_32((__int32 *)(ptr), (val), (lane))
#define vst3_lane_s64_ex(ptr, val, lane, align) neon_st3s_64((__int64 *)(ptr), (val), (lane))
#define vst3_lane_s8_ex(ptr, val, lane, align) neon_st3s_8((__int8 *)(ptr), (val), (lane))
#define vst3_lane_u16_ex(ptr, val, lane, align) neon_st3s_16((__int16 *)(ptr), (val), (lane))
#define vst3_lane_u32_ex(ptr, val, lane, align) neon_st3s_32((__int32 *)(ptr), (val), (lane))
#define vst3_lane_u8_ex(ptr, val, lane, align) neon_st3s_8((__int8 *)(ptr), (val), (lane))
#define vst3q_lane_f32_ex(ptr, val, lane, align) neon_st3s_q32((__int32 *)(ptr), (val), (lane))
#define vst3q_lane_p8_ex(ptr, val, lane, align) neon_st3s_q8((__int8 *)(ptr), (val), (lane))
#define vst3q_lane_p16_ex(ptr, val, lane, align) neon_st3s_q16((__int16 *)(ptr), (val), (lane))
#define vst3q_lane_s16_ex(ptr, val, lane, align) neon_st3s_q16((__int16 *)(ptr), (val), (lane))
#define vst3q_lane_s32_ex(ptr, val, lane, align) neon_st3s_q32((__int32 *)(ptr), (val), (lane))
#define vst3q_lane_s64_ex(ptr, val, lane, align) neon_st3s_q64((__int64 *)(ptr), (val), (lane))
#define vst3q_lane_u16_ex(ptr, val, lane, align) neon_st3s_q16((__int16 *)(ptr), (val), (lane))
#define vst3q_lane_u32_ex(ptr, val, lane, align) neon_st3s_q32((__int32 *)(ptr), (val), (lane))
// vst2_*_ex / vst2q_*_ex.
// `ptr` is cast to the same-width integer pointer and passed, together with
// `val`, to neon_st2m_N (non-q) or neon_st2m_qN (q).
// `align` is accepted but does not appear in the expansion.
#define vst2_f32_ex(ptr, val, align) neon_st2m_32((__int32 *)(ptr), (val))
#define vst2_p16_ex(ptr, val, align) neon_st2m_16((__int16 *)(ptr), (val))
#define vst2_p8_ex(ptr, val, align) neon_st2m_8((__int8 *)(ptr), (val))
#define vst2_s16_ex(ptr, val, align) neon_st2m_16((__int16 *)(ptr), (val))
#define vst2_s32_ex(ptr, val, align) neon_st2m_32((__int32 *)(ptr), (val))
#define vst2_s8_ex(ptr, val, align) neon_st2m_8((__int8 *)(ptr), (val))
#define vst2_u16_ex(ptr, val, align) neon_st2m_16((__int16 *)(ptr), (val))
#define vst2_u32_ex(ptr, val, align) neon_st2m_32((__int32 *)(ptr), (val))
#define vst2_u8_ex(ptr, val, align) neon_st2m_8((__int8 *)(ptr), (val))
// The non-q 64-bit element forms use neon_st1m2_64 rather than a neon_st2m_* name.
#define vst2_s64_ex(ptr, val, align) neon_st1m2_64((__int64 *)(ptr), (val))
#define vst2_u64_ex(ptr, val, align) neon_st1m2_64((__int64 *)(ptr), (val))
#define vst2q_f32_ex(ptr, val, align) neon_st2m_q32((__int32 *)(ptr), (val))
#define vst2q_p16_ex(ptr, val, align) neon_st2m_q16((__int16 *)(ptr), (val))
#define vst2q_p8_ex(ptr, val, align) neon_st2m_q8((__int8 *)(ptr), (val))
#define vst2q_s16_ex(ptr, val, align) neon_st2m_q16((__int16 *)(ptr), (val))
#define vst2q_s32_ex(ptr, val, align) neon_st2m_q32((__int32 *)(ptr), (val))
#define vst2q_s8_ex(ptr, val, align) neon_st2m_q8((__int8 *)(ptr), (val))
#define vst2q_u16_ex(ptr, val, align) neon_st2m_q16((__int16 *)(ptr), (val))
#define vst2q_u32_ex(ptr, val, align) neon_st2m_q32((__int32 *)(ptr), (val))
#define vst2q_u8_ex(ptr, val, align) neon_st2m_q8((__int8 *)(ptr), (val))
#define vst2q_s64_ex(ptr, val, align) neon_st2m_q64((__int64 *)(ptr), (val))
#define vst2q_u64_ex(ptr, val, align) neon_st2m_q64((__int64 *)(ptr), (val))
// vst2_lane_*_ex / vst2q_lane_*_ex.
// `ptr` is cast to the same-width integer pointer; `val` and `lane` are
// forwarded unchanged, in order, to neon_st2s_N / neon_st2s_qN.
// Parameter names follow the ACLE vst2_lane prototypes (ptr, val, lane).
// `align` is accepted but does not appear in the expansion.
#define vst2_lane_f32_ex(ptr, val, lane, align) neon_st2s_32((__int32 *)(ptr), (val), (lane))
#define vst2_lane_p16_ex(ptr, val, lane, align) neon_st2s_16((__int16 *)(ptr), (val), (lane))
#define vst2_lane_p8_ex(ptr, val, lane, align) neon_st2s_8((__int8 *)(ptr), (val), (lane))
#define vst2_lane_s16_ex(ptr, val, lane, align) neon_st2s_16((__int16 *)(ptr), (val), (lane))
#define vst2_lane_s32_ex(ptr, val, lane, align) neon_st2s_32((__int32 *)(ptr), (val), (lane))
#define vst2_lane_s64_ex(ptr, val, lane, align) neon_st2s_64((__int64 *)(ptr), (val), (lane))
#define vst2_lane_s8_ex(ptr, val, lane, align) neon_st2s_8((__int8 *)(ptr), (val), (lane))
#define vst2_lane_u16_ex(ptr, val, lane, align) neon_st2s_16((__int16 *)(ptr), (val), (lane))
#define vst2_lane_u32_ex(ptr, val, lane, align) neon_st2s_32((__int32 *)(ptr), (val), (lane))
#define vst2_lane_u8_ex(ptr, val, lane, align) neon_st2s_8((__int8 *)(ptr), (val), (lane))
#define vst2q_lane_f32_ex(ptr, val, lane, align) neon_st2s_q32((__int32 *)(ptr), (val), (lane))
#define vst2q_lane_p8_ex(ptr, val, lane, align) neon_st2s_q8((__int8 *)(ptr), (val), (lane))
#define vst2q_lane_p16_ex(ptr, val, lane, align) neon_st2s_q16((__int16 *)(ptr), (val), (lane))
#define vst2q_lane_s16_ex(ptr, val, lane, align) neon_st2s_q16((__int16 *)(ptr), (val), (lane))
#define vst2q_lane_s32_ex(ptr, val, lane, align) neon_st2s_q32((__int32 *)(ptr), (val), (lane))
#define vst2q_lane_s64_ex(ptr, val, lane, align) neon_st2s_q64((__int64 *)(ptr), (val), (lane))
#define vst2q_lane_u16_ex(ptr, val, lane, align) neon_st2s_q16((__int16 *)(ptr), (val), (lane))
#define vst2q_lane_u32_ex(ptr, val, lane, align) neon_st2s_q32((__int32 *)(ptr), (val), (lane))
// vst1_*_ex / vst1q_*_ex.
// `ptr` is cast to the same-width integer pointer (f16 uses __int16) and
// passed, together with `val`, to neon_st1m_N (non-q) or neon_st1m_qN (q).
// `align` is accepted but does not appear in the expansion.
#define vst1_f16_ex(ptr, val, align) neon_st1m_16((__int16 *)(ptr), (val))
#define vst1_f32_ex(ptr, val, align) neon_st1m_32((__int32 *)(ptr), (val))
#define vst1_p16_ex(ptr, val, align) neon_st1m_16((__int16 *)(ptr), (val))
#define vst1_p8_ex(ptr, val, align) neon_st1m_8((__int8 *)(ptr), (val))
#define vst1_s16_ex(ptr, val, align) neon_st1m_16((__int16 *)(ptr), (val))
#define vst1_s32_ex(ptr, val, align) neon_st1m_32((__int32 *)(ptr), (val))
#define vst1_s8_ex(ptr, val, align) neon_st1m_8((__int8 *)(ptr), (val))
#define vst1_u16_ex(ptr, val, align) neon_st1m_16((__int16 *)(ptr), (val))
#define vst1_u32_ex(ptr, val, align) neon_st1m_32((__int32 *)(ptr), (val))
#define vst1_u8_ex(ptr, val, align) neon_st1m_8((__int8 *)(ptr), (val))
#define vst1_s64_ex(ptr, val, align) neon_st1m_64((__int64 *)(ptr), (val))
#define vst1_u64_ex(ptr, val, align) neon_st1m_64((__int64 *)(ptr), (val))
#define vst1q_f32_ex(ptr, val, align) neon_st1m_q32((__int32 *)(ptr), (val))
#define vst1q_p16_ex(ptr, val, align) neon_st1m_q16((__int16 *)(ptr), (val))
#define vst1q_p8_ex(ptr, val, align) neon_st1m_q8((__int8 *)(ptr), (val))
#define vst1q_s16_ex(ptr, val, align) neon_st1m_q16((__int16 *)(ptr), (val))
#define vst1q_s32_ex(ptr, val, align) neon_st1m_q32((__int32 *)(ptr), (val))
#define vst1q_s8_ex(ptr, val, align) neon_st1m_q8((__int8 *)(ptr), (val))
#define vst1q_u16_ex(ptr, val, align) neon_st1m_q16((__int16 *)(ptr), (val))
#define vst1q_u32_ex(ptr, val, align) neon_st1m_q32((__int32 *)(ptr), (val))
#define vst1q_u8_ex(ptr, val, align) neon_st1m_q8((__int8 *)(ptr), (val))
#define vst1q_s64_ex(ptr, val, align) neon_st1m_q64((__int64 *)(ptr), (val))
#define vst1q_u64_ex(ptr, val, align) neon_st1m_q64((__int64 *)(ptr), (val))
/*
 * vst1 / vst1q "_x2_ex" store wrappers.
 *
 * Same shape as the single-vector wrappers, but forwarding to the
 * neon_st1m2_* intrinsics. The destination pointer is cast to the signed
 * integer pointer type of the element width; `align` is accepted but never
 * expanded.
 */
#define vst1_f32_x2_ex(ptr, val, align) neon_st1m2_32((__int32 *)(ptr), (val))
#define vst1_p16_x2_ex(ptr, val, align) neon_st1m2_16((__int16 *)(ptr), (val))
#define vst1_p8_x2_ex(ptr, val, align) neon_st1m2_8((__int8 *)(ptr), (val))
#define vst1_s16_x2_ex(ptr, val, align) neon_st1m2_16((__int16 *)(ptr), (val))
#define vst1_s32_x2_ex(ptr, val, align) neon_st1m2_32((__int32 *)(ptr), (val))
#define vst1_s8_x2_ex(ptr, val, align) neon_st1m2_8((__int8 *)(ptr), (val))
#define vst1_u16_x2_ex(ptr, val, align) neon_st1m2_16((__int16 *)(ptr), (val))
#define vst1_u32_x2_ex(ptr, val, align) neon_st1m2_32((__int32 *)(ptr), (val))
#define vst1_u8_x2_ex(ptr, val, align) neon_st1m2_8((__int8 *)(ptr), (val))
#define vst1_s64_x2_ex(ptr, val, align) neon_st1m2_64((__int64 *)(ptr), (val))
#define vst1_u64_x2_ex(ptr, val, align) neon_st1m2_64((__int64 *)(ptr), (val))

#define vst1q_f32_x2_ex(ptr, val, align) neon_st1m2_q32((__int32 *)(ptr), (val))
#define vst1q_p16_x2_ex(ptr, val, align) neon_st1m2_q16((__int16 *)(ptr), (val))
#define vst1q_p8_x2_ex(ptr, val, align) neon_st1m2_q8((__int8 *)(ptr), (val))
#define vst1q_s16_x2_ex(ptr, val, align) neon_st1m2_q16((__int16 *)(ptr), (val))
#define vst1q_s32_x2_ex(ptr, val, align) neon_st1m2_q32((__int32 *)(ptr), (val))
#define vst1q_s8_x2_ex(ptr, val, align) neon_st1m2_q8((__int8 *)(ptr), (val))
#define vst1q_u16_x2_ex(ptr, val, align) neon_st1m2_q16((__int16 *)(ptr), (val))
#define vst1q_u32_x2_ex(ptr, val, align) neon_st1m2_q32((__int32 *)(ptr), (val))
#define vst1q_u8_x2_ex(ptr, val, align) neon_st1m2_q8((__int8 *)(ptr), (val))
#define vst1q_s64_x2_ex(ptr, val, align) neon_st1m2_q64((__int64 *)(ptr), (val))
#define vst1q_u64_x2_ex(ptr, val, align) neon_st1m2_q64((__int64 *)(ptr), (val))
/*
 * vst1 / vst1q "_x3_ex" store wrappers.
 *
 * Forward to the neon_st1m3_* intrinsics after casting the destination
 * pointer to the signed integer pointer type of the element width. `align`
 * is accepted but never expanded.
 */
#define vst1_f32_x3_ex(ptr, val, align) neon_st1m3_32((__int32 *)(ptr), (val))
#define vst1_p16_x3_ex(ptr, val, align) neon_st1m3_16((__int16 *)(ptr), (val))
#define vst1_p8_x3_ex(ptr, val, align) neon_st1m3_8((__int8 *)(ptr), (val))
#define vst1_s16_x3_ex(ptr, val, align) neon_st1m3_16((__int16 *)(ptr), (val))
#define vst1_s32_x3_ex(ptr, val, align) neon_st1m3_32((__int32 *)(ptr), (val))
#define vst1_s8_x3_ex(ptr, val, align) neon_st1m3_8((__int8 *)(ptr), (val))
#define vst1_u16_x3_ex(ptr, val, align) neon_st1m3_16((__int16 *)(ptr), (val))
#define vst1_u32_x3_ex(ptr, val, align) neon_st1m3_32((__int32 *)(ptr), (val))
#define vst1_u8_x3_ex(ptr, val, align) neon_st1m3_8((__int8 *)(ptr), (val))
#define vst1_s64_x3_ex(ptr, val, align) neon_st1m3_64((__int64 *)(ptr), (val))
#define vst1_u64_x3_ex(ptr, val, align) neon_st1m3_64((__int64 *)(ptr), (val))

#define vst1q_f32_x3_ex(ptr, val, align) neon_st1m3_q32((__int32 *)(ptr), (val))
#define vst1q_p16_x3_ex(ptr, val, align) neon_st1m3_q16((__int16 *)(ptr), (val))
#define vst1q_p8_x3_ex(ptr, val, align) neon_st1m3_q8((__int8 *)(ptr), (val))
#define vst1q_s16_x3_ex(ptr, val, align) neon_st1m3_q16((__int16 *)(ptr), (val))
#define vst1q_s32_x3_ex(ptr, val, align) neon_st1m3_q32((__int32 *)(ptr), (val))
#define vst1q_s8_x3_ex(ptr, val, align) neon_st1m3_q8((__int8 *)(ptr), (val))
#define vst1q_u16_x3_ex(ptr, val, align) neon_st1m3_q16((__int16 *)(ptr), (val))
#define vst1q_u32_x3_ex(ptr, val, align) neon_st1m3_q32((__int32 *)(ptr), (val))
#define vst1q_u8_x3_ex(ptr, val, align) neon_st1m3_q8((__int8 *)(ptr), (val))
#define vst1q_s64_x3_ex(ptr, val, align) neon_st1m3_q64((__int64 *)(ptr), (val))
#define vst1q_u64_x3_ex(ptr, val, align) neon_st1m3_q64((__int64 *)(ptr), (val))
/*
 * vst1 / vst1q "_x4_ex" store wrappers.
 *
 * Forward to the neon_st1m4_* intrinsics after casting the destination
 * pointer to the signed integer pointer type of the element width. `align`
 * is accepted but never expanded.
 */
#define vst1_f32_x4_ex(ptr, val, align) neon_st1m4_32((__int32 *)(ptr), (val))
#define vst1_p16_x4_ex(ptr, val, align) neon_st1m4_16((__int16 *)(ptr), (val))
#define vst1_p8_x4_ex(ptr, val, align) neon_st1m4_8((__int8 *)(ptr), (val))
#define vst1_s16_x4_ex(ptr, val, align) neon_st1m4_16((__int16 *)(ptr), (val))
#define vst1_s32_x4_ex(ptr, val, align) neon_st1m4_32((__int32 *)(ptr), (val))
#define vst1_s8_x4_ex(ptr, val, align) neon_st1m4_8((__int8 *)(ptr), (val))
#define vst1_u16_x4_ex(ptr, val, align) neon_st1m4_16((__int16 *)(ptr), (val))
#define vst1_u32_x4_ex(ptr, val, align) neon_st1m4_32((__int32 *)(ptr), (val))
#define vst1_u8_x4_ex(ptr, val, align) neon_st1m4_8((__int8 *)(ptr), (val))
#define vst1_s64_x4_ex(ptr, val, align) neon_st1m4_64((__int64 *)(ptr), (val))
#define vst1_u64_x4_ex(ptr, val, align) neon_st1m4_64((__int64 *)(ptr), (val))

#define vst1q_f32_x4_ex(ptr, val, align) neon_st1m4_q32((__int32 *)(ptr), (val))
#define vst1q_p16_x4_ex(ptr, val, align) neon_st1m4_q16((__int16 *)(ptr), (val))
#define vst1q_p8_x4_ex(ptr, val, align) neon_st1m4_q8((__int8 *)(ptr), (val))
#define vst1q_s16_x4_ex(ptr, val, align) neon_st1m4_q16((__int16 *)(ptr), (val))
#define vst1q_s32_x4_ex(ptr, val, align) neon_st1m4_q32((__int32 *)(ptr), (val))
#define vst1q_s8_x4_ex(ptr, val, align) neon_st1m4_q8((__int8 *)(ptr), (val))
#define vst1q_u16_x4_ex(ptr, val, align) neon_st1m4_q16((__int16 *)(ptr), (val))
#define vst1q_u32_x4_ex(ptr, val, align) neon_st1m4_q32((__int32 *)(ptr), (val))
#define vst1q_u8_x4_ex(ptr, val, align) neon_st1m4_q8((__int8 *)(ptr), (val))
#define vst1q_s64_x4_ex(ptr, val, align) neon_st1m4_q64((__int64 *)(ptr), (val))
#define vst1q_u64_x4_ex(ptr, val, align) neon_st1m4_q64((__int64 *)(ptr), (val))
/*
 * vst1 / vst1q "_lane_*_ex" store wrappers.
 *
 * Each macro casts the destination pointer to the signed integer pointer
 * type of the element width and forwards pointer, value and lane index to
 * neon_st1s_<width> (64-bit vector forms) or neon_st1s_q<width> ("q"
 * forms). `align` is accepted but never expanded.
 */
#define vst1_lane_f16_ex(ptr, val, lane, align) neon_st1s_16((__int16 *)(ptr), (val), (lane))
#define vst1_lane_f32_ex(ptr, val, lane, align) neon_st1s_32((__int32 *)(ptr), (val), (lane))
#define vst1_lane_p16_ex(ptr, val, lane, align) neon_st1s_16((__int16 *)(ptr), (val), (lane))
#define vst1_lane_p8_ex(ptr, val, lane, align) neon_st1s_8((__int8 *)(ptr), (val), (lane))
#define vst1_lane_s16_ex(ptr, val, lane, align) neon_st1s_16((__int16 *)(ptr), (val), (lane))
#define vst1_lane_s32_ex(ptr, val, lane, align) neon_st1s_32((__int32 *)(ptr), (val), (lane))
#define vst1_lane_s64_ex(ptr, val, lane, align) neon_st1s_64((__int64 *)(ptr), (val), (lane))
#define vst1_lane_s8_ex(ptr, val, lane, align) neon_st1s_8((__int8 *)(ptr), (val), (lane))
#define vst1_lane_u16_ex(ptr, val, lane, align) neon_st1s_16((__int16 *)(ptr), (val), (lane))
#define vst1_lane_u32_ex(ptr, val, lane, align) neon_st1s_32((__int32 *)(ptr), (val), (lane))
#define vst1_lane_u8_ex(ptr, val, lane, align) neon_st1s_8((__int8 *)(ptr), (val), (lane))

#define vst1q_lane_f32_ex(ptr, val, lane, align) neon_st1s_q32((__int32 *)(ptr), (val), (lane))
#define vst1q_lane_p8_ex(ptr, val, lane, align) neon_st1s_q8((__int8 *)(ptr), (val), (lane))
#define vst1q_lane_p16_ex(ptr, val, lane, align) neon_st1s_q16((__int16 *)(ptr), (val), (lane))
#define vst1q_lane_s8_ex(ptr, val, lane, align) neon_st1s_q8((__int8 *)(ptr), (val), (lane))
#define vst1q_lane_s16_ex(ptr, val, lane, align) neon_st1s_q16((__int16 *)(ptr), (val), (lane))
#define vst1q_lane_s32_ex(ptr, val, lane, align) neon_st1s_q32((__int32 *)(ptr), (val), (lane))
#define vst1q_lane_s64_ex(ptr, val, lane, align) neon_st1s_q64((__int64 *)(ptr), (val), (lane))
#define vst1q_lane_u8_ex(ptr, val, lane, align) neon_st1s_q8((__int8 *)(ptr), (val), (lane))
#define vst1q_lane_u16_ex(ptr, val, lane, align) neon_st1s_q16((__int16 *)(ptr), (val), (lane))
#define vst1q_lane_u32_ex(ptr, val, lane, align) neon_st1s_q32((__int32 *)(ptr), (val), (lane))
/*
 * Scalar wrappers over the neon_sq[r]shr[u]n_* intrinsics.
 *
 * src1 is the value to shift and src2 the shift amount.
 *
 * The *_s32 forms pass src1 through _CopyFloatFromInt32 before calling the
 * intrinsic, then select element 0 of the result: n16_i16[0] for the
 * signed-result forms and n16_u16[0] for the "un" forms. vqshrnh_n_s32
 * selects n16_i16[0] like vqrshrnh_n_s32 so that all four *_s32 wrappers
 * yield a scalar rather than the intrinsic's vector-typed result.
 *
 * The *_s64 forms forward both arguments unchanged and use the intrinsic's
 * result as is.
 */
#define vqshrnh_n_s32(src1, src2) neon_sqshrn_s32(_CopyFloatFromInt32(src1), (src2)).n16_i16[0]
#define vqshrnh_n_s64(src1, src2) neon_sqshrn_s64((src1), (src2))
#define vqrshrnh_n_s32(src1, src2) neon_sqrshrn_s32(_CopyFloatFromInt32(src1), (src2)).n16_i16[0]
#define vqrshrnh_n_s64(src1, src2) neon_sqrshrn_s64((src1), (src2))
#define vqshrunh_n_s32(src1, src2) neon_sqshrun_s32(_CopyFloatFromInt32(src1), (src2)).n16_u16[0]
#define vqshrunh_n_s64(src1, src2) neon_sqshrun_s64((src1), (src2))
#define vqrshrunh_n_s32(src1, src2) neon_sqrshrun_s32(_CopyFloatFromInt32(src1), (src2)).n16_u16[0]
#define vqrshrunh_n_s64(src1, src2) neon_sqrshrun_s64((src1), (src2))
#endif  /* _ARM64_EXTENDED_INTRINSICS */

//vreinterpret
#if !defined(_ARM64_DISTINCT_NEON_TYPES)
/*
 * vreinterpret_<dst>_<src> for the f32 / s8..s64 / p8 / p16 / u8..u64 forms.
 *
 * In this branch (_ARM64_DISTINCT_NEON_TYPES not defined) every macro is an
 * identity: it expands to its parenthesized argument and performs no
 * conversion.
 */

/* destination: f32 */
#define vreinterpret_f32_s8(v) (v)
#define vreinterpret_f32_s16(v) (v)
#define vreinterpret_f32_s32(v) (v)
#define vreinterpret_f32_s64(v) (v)
#define vreinterpret_f32_p8(v) (v)
#define vreinterpret_f32_p16(v) (v)
#define vreinterpret_f32_u8(v) (v)
#define vreinterpret_f32_u16(v) (v)
#define vreinterpret_f32_u32(v) (v)
#define vreinterpret_f32_u64(v) (v)

/* destination: s8 */
#define vreinterpret_s8_f32(v) (v)
#define vreinterpret_s8_s16(v) (v)
#define vreinterpret_s8_s32(v) (v)
#define vreinterpret_s8_s64(v) (v)
#define vreinterpret_s8_p8(v) (v)
#define vreinterpret_s8_p16(v) (v)
#define vreinterpret_s8_u8(v) (v)
#define vreinterpret_s8_u16(v) (v)
#define vreinterpret_s8_u32(v) (v)
#define vreinterpret_s8_u64(v) (v)

/* destination: s16 */
#define vreinterpret_s16_f32(v) (v)
#define vreinterpret_s16_s8(v) (v)
#define vreinterpret_s16_s32(v) (v)
#define vreinterpret_s16_s64(v) (v)
#define vreinterpret_s16_p8(v) (v)
#define vreinterpret_s16_p16(v) (v)
#define vreinterpret_s16_u8(v) (v)
#define vreinterpret_s16_u16(v) (v)
#define vreinterpret_s16_u32(v) (v)
#define vreinterpret_s16_u64(v) (v)

/* destination: s32 */
#define vreinterpret_s32_f32(v) (v)
#define vreinterpret_s32_s8(v) (v)
#define vreinterpret_s32_s16(v) (v)
#define vreinterpret_s32_s64(v) (v)
#define vreinterpret_s32_p8(v) (v)
#define vreinterpret_s32_p16(v) (v)
#define vreinterpret_s32_u8(v) (v)
#define vreinterpret_s32_u16(v) (v)
#define vreinterpret_s32_u32(v) (v)
#define vreinterpret_s32_u64(v) (v)

/* destination: s64 */
#define vreinterpret_s64_f32(v) (v)
#define vreinterpret_s64_s8(v) (v)
#define vreinterpret_s64_s16(v) (v)
#define vreinterpret_s64_s32(v) (v)
#define vreinterpret_s64_p8(v) (v)
#define vreinterpret_s64_p16(v) (v)
#define vreinterpret_s64_u8(v) (v)
#define vreinterpret_s64_u16(v) (v)
#define vreinterpret_s64_u32(v) (v)
#define vreinterpret_s64_u64(v) (v)

/* destination: p8 */
#define vreinterpret_p8_f32(v) (v)
#define vreinterpret_p8_s8(v) (v)
#define vreinterpret_p8_s16(v) (v)
#define vreinterpret_p8_s32(v) (v)
#define vreinterpret_p8_s64(v) (v)
#define vreinterpret_p8_p16(v) (v)
#define vreinterpret_p8_u8(v) (v)
#define vreinterpret_p8_u16(v) (v)
#define vreinterpret_p8_u32(v) (v)
#define vreinterpret_p8_u64(v) (v)

/* destination: p16 */
#define vreinterpret_p16_f32(v) (v)
#define vreinterpret_p16_s8(v) (v)
#define vreinterpret_p16_s16(v) (v)
#define vreinterpret_p16_s32(v) (v)
#define vreinterpret_p16_s64(v) (v)
#define vreinterpret_p16_p8(v) (v)
#define vreinterpret_p16_u8(v) (v)
#define vreinterpret_p16_u16(v) (v)
#define vreinterpret_p16_u32(v) (v)
#define vreinterpret_p16_u64(v) (v)

/* destination: u8 */
#define vreinterpret_u8_f32(v) (v)
#define vreinterpret_u8_s8(v) (v)
#define vreinterpret_u8_s16(v) (v)
#define vreinterpret_u8_s32(v) (v)
#define vreinterpret_u8_s64(v) (v)
#define vreinterpret_u8_p8(v) (v)
#define vreinterpret_u8_p16(v) (v)
#define vreinterpret_u8_u16(v) (v)
#define vreinterpret_u8_u32(v) (v)
#define vreinterpret_u8_u64(v) (v)

/* destination: u16 */
#define vreinterpret_u16_f32(v) (v)
#define vreinterpret_u16_s8(v) (v)
#define vreinterpret_u16_s16(v) (v)
#define vreinterpret_u16_s32(v) (v)
#define vreinterpret_u16_s64(v) (v)
#define vreinterpret_u16_p8(v) (v)
#define vreinterpret_u16_p16(v) (v)
#define vreinterpret_u16_u8(v) (v)
#define vreinterpret_u16_u32(v) (v)
#define vreinterpret_u16_u64(v) (v)

/* destination: u32 */
#define vreinterpret_u32_f32(v) (v)
#define vreinterpret_u32_s8(v) (v)
#define vreinterpret_u32_s16(v) (v)
#define vreinterpret_u32_s32(v) (v)
#define vreinterpret_u32_s64(v) (v)
#define vreinterpret_u32_p8(v) (v)
#define vreinterpret_u32_p16(v) (v)
#define vreinterpret_u32_u8(v) (v)
#define vreinterpret_u32_u16(v) (v)
#define vreinterpret_u32_u64(v) (v)

/* destination: u64 */
#define vreinterpret_u64_f32(v) (v)
#define vreinterpret_u64_s8(v) (v)
#define vreinterpret_u64_s16(v) (v)
#define vreinterpret_u64_s32(v) (v)
#define vreinterpret_u64_s64(v) (v)
#define vreinterpret_u64_p8(v) (v)
#define vreinterpret_u64_p16(v) (v)
#define vreinterpret_u64_u8(v) (v)
#define vreinterpret_u64_u16(v) (v)
#define vreinterpret_u64_u32(v) (v)
/*
 * vreinterpretq_<dst>_<src> for the f32 / s8..s64 / p8 / p16 / u8..u64 forms.
 *
 * In this branch (_ARM64_DISTINCT_NEON_TYPES not defined) every macro is an
 * identity: it expands to its parenthesized argument and performs no
 * conversion.
 */

/* destination: f32 */
#define vreinterpretq_f32_s8(v) (v)
#define vreinterpretq_f32_s16(v) (v)
#define vreinterpretq_f32_s32(v) (v)
#define vreinterpretq_f32_s64(v) (v)
#define vreinterpretq_f32_p8(v) (v)
#define vreinterpretq_f32_p16(v) (v)
#define vreinterpretq_f32_u8(v) (v)
#define vreinterpretq_f32_u16(v) (v)
#define vreinterpretq_f32_u32(v) (v)
#define vreinterpretq_f32_u64(v) (v)

/* destination: s8 */
#define vreinterpretq_s8_f32(v) (v)
#define vreinterpretq_s8_s16(v) (v)
#define vreinterpretq_s8_s32(v) (v)
#define vreinterpretq_s8_s64(v) (v)
#define vreinterpretq_s8_p8(v) (v)
#define vreinterpretq_s8_p16(v) (v)
#define vreinterpretq_s8_u8(v) (v)
#define vreinterpretq_s8_u16(v) (v)
#define vreinterpretq_s8_u32(v) (v)
#define vreinterpretq_s8_u64(v) (v)

/* destination: s16 */
#define vreinterpretq_s16_f32(v) (v)
#define vreinterpretq_s16_s8(v) (v)
#define vreinterpretq_s16_s32(v) (v)
#define vreinterpretq_s16_s64(v) (v)
#define vreinterpretq_s16_p8(v) (v)
#define vreinterpretq_s16_p16(v) (v)
#define vreinterpretq_s16_u8(v) (v)
#define vreinterpretq_s16_u16(v) (v)
#define vreinterpretq_s16_u32(v) (v)
#define vreinterpretq_s16_u64(v) (v)

/* destination: s32 */
#define vreinterpretq_s32_f32(v) (v)
#define vreinterpretq_s32_s8(v) (v)
#define vreinterpretq_s32_s16(v) (v)
#define vreinterpretq_s32_s64(v) (v)
#define vreinterpretq_s32_p8(v) (v)
#define vreinterpretq_s32_p16(v) (v)
#define vreinterpretq_s32_u8(v) (v)
#define vreinterpretq_s32_u16(v) (v)
#define vreinterpretq_s32_u32(v) (v)
#define vreinterpretq_s32_u64(v) (v)

/* destination: s64 */
#define vreinterpretq_s64_f32(v) (v)
#define vreinterpretq_s64_s8(v) (v)
#define vreinterpretq_s64_s16(v) (v)
#define vreinterpretq_s64_s32(v) (v)
#define vreinterpretq_s64_p8(v) (v)
#define vreinterpretq_s64_p16(v) (v)
#define vreinterpretq_s64_u8(v) (v)
#define vreinterpretq_s64_u16(v) (v)
#define vreinterpretq_s64_u32(v) (v)
#define vreinterpretq_s64_u64(v) (v)

/* destination: p8 */
#define vreinterpretq_p8_f32(v) (v)
#define vreinterpretq_p8_s8(v) (v)
#define vreinterpretq_p8_s16(v) (v)
#define vreinterpretq_p8_s32(v) (v)
#define vreinterpretq_p8_s64(v) (v)
#define vreinterpretq_p8_p16(v) (v)
#define vreinterpretq_p8_u8(v) (v)
#define vreinterpretq_p8_u16(v) (v)
#define vreinterpretq_p8_u32(v) (v)
#define vreinterpretq_p8_u64(v) (v)

/* destination: p16 */
#define vreinterpretq_p16_f32(v) (v)
#define vreinterpretq_p16_s8(v) (v)
#define vreinterpretq_p16_s16(v) (v)
#define vreinterpretq_p16_s32(v) (v)
#define vreinterpretq_p16_s64(v) (v)
#define vreinterpretq_p16_p8(v) (v)
#define vreinterpretq_p16_u8(v) (v)
#define vreinterpretq_p16_u16(v) (v)
#define vreinterpretq_p16_u32(v) (v)
#define vreinterpretq_p16_u64(v) (v)

/* destination: u8 */
#define vreinterpretq_u8_f32(v) (v)
#define vreinterpretq_u8_s8(v) (v)
#define vreinterpretq_u8_s16(v) (v)
#define vreinterpretq_u8_s32(v) (v)
#define vreinterpretq_u8_s64(v) (v)
#define vreinterpretq_u8_p8(v) (v)
#define vreinterpretq_u8_p16(v) (v)
#define vreinterpretq_u8_u16(v) (v)
#define vreinterpretq_u8_u32(v) (v)
#define vreinterpretq_u8_u64(v) (v)

/* destination: u16 */
#define vreinterpretq_u16_f32(v) (v)
#define vreinterpretq_u16_s8(v) (v)
#define vreinterpretq_u16_s16(v) (v)
#define vreinterpretq_u16_s32(v) (v)
#define vreinterpretq_u16_s64(v) (v)
#define vreinterpretq_u16_p8(v) (v)
#define vreinterpretq_u16_p16(v) (v)
#define vreinterpretq_u16_u8(v) (v)
#define vreinterpretq_u16_u32(v) (v)
#define vreinterpretq_u16_u64(v) (v)

/* destination: u32 */
#define vreinterpretq_u32_f32(v) (v)
#define vreinterpretq_u32_s8(v) (v)
#define vreinterpretq_u32_s16(v) (v)
#define vreinterpretq_u32_s32(v) (v)
#define vreinterpretq_u32_s64(v) (v)
#define vreinterpretq_u32_p8(v) (v)
#define vreinterpretq_u32_p16(v) (v)
#define vreinterpretq_u32_u8(v) (v)
#define vreinterpretq_u32_u16(v) (v)
#define vreinterpretq_u32_u64(v) (v)

/* destination: u64 */
#define vreinterpretq_u64_f32(v) (v)
#define vreinterpretq_u64_s8(v) (v)
#define vreinterpretq_u64_s16(v) (v)
#define vreinterpretq_u64_s32(v) (v)
#define vreinterpretq_u64_s64(v) (v)
#define vreinterpretq_u64_p8(v) (v)
#define vreinterpretq_u64_p16(v) (v)
#define vreinterpretq_u64_u8(v) (v)
#define vreinterpretq_u64_u16(v) (v)
#define vreinterpretq_u64_u32(v) (v)
/*
 * vreinterpret_<dst>_<src> forms that involve f64, p64 or f16.
 *
 * Identity macros in this branch (_ARM64_DISTINCT_NEON_TYPES not defined):
 * each expands to its parenthesized argument.
 *
 * NOTE(review): this run defines vreinterpret_u64_p64 between the *_s64
 * entries but no vreinterpret_p64_s64, whereas the "q" group defines
 * vreinterpretq_p64_s64. Confirm whether vreinterpret_p64_s64 is provided
 * elsewhere or is intentionally omitted.
 */

/* to f64 / p64 / f16, grouped by source type */
#define vreinterpret_f64_s8(v) (v)
#define vreinterpret_p64_s8(v) (v)
#define vreinterpret_f16_s8(v) (v)
#define vreinterpret_f64_s16(v) (v)
#define vreinterpret_p64_s16(v) (v)
#define vreinterpret_f16_s16(v) (v)
#define vreinterpret_f64_s32(v) (v)
#define vreinterpret_p64_s32(v) (v)
#define vreinterpret_f16_s32(v) (v)
#define vreinterpret_f64_f32(v) (v)
#define vreinterpret_p64_f32(v) (v)
#define vreinterpret_p64_f64(v) (v)
#define vreinterpret_f16_f32(v) (v)
#define vreinterpret_f64_u8(v) (v)
#define vreinterpret_p64_u8(v) (v)
#define vreinterpret_f16_u8(v) (v)
#define vreinterpret_f64_u16(v) (v)
#define vreinterpret_p64_u16(v) (v)
#define vreinterpret_f16_u16(v) (v)
#define vreinterpret_f64_u32(v) (v)
#define vreinterpret_p64_u32(v) (v)
#define vreinterpret_f16_u32(v) (v)
#define vreinterpret_f64_p8(v) (v)
#define vreinterpret_p64_p8(v) (v)
#define vreinterpret_f16_p8(v) (v)
#define vreinterpret_f64_p16(v) (v)
#define vreinterpret_p64_p16(v) (v)
#define vreinterpret_f16_p16(v) (v)
#define vreinterpret_f64_u64(v) (v)
#define vreinterpret_p64_u64(v) (v)
#define vreinterpret_f16_u64(v) (v)
#define vreinterpret_f64_s64(v) (v)
#define vreinterpret_u64_p64(v) (v)
#define vreinterpret_f16_s64(v) (v)

/* source: f16 */
#define vreinterpret_s8_f16(v) (v)
#define vreinterpret_s16_f16(v) (v)
#define vreinterpret_s32_f16(v) (v)
#define vreinterpret_f32_f16(v) (v)
#define vreinterpret_u8_f16(v) (v)
#define vreinterpret_u16_f16(v) (v)
#define vreinterpret_u32_f16(v) (v)
#define vreinterpret_p8_f16(v) (v)
#define vreinterpret_p16_f16(v) (v)
#define vreinterpret_u64_f16(v) (v)
#define vreinterpret_s64_f16(v) (v)
#define vreinterpret_f64_f16(v) (v)
#define vreinterpret_p64_f16(v) (v)
/*
 * vreinterpretq_<dst>_<src> forms that involve f64, p64, p128 or f16.
 *
 * Identity macros in this branch (_ARM64_DISTINCT_NEON_TYPES not defined):
 * each expands to its parenthesized argument.
 */

/* to f64 / p64 / p128 / f16, grouped by source type */
#define vreinterpretq_f64_s8(v) (v)
#define vreinterpretq_p64_s8(v) (v)
#define vreinterpretq_p128_s8(v) (v)
#define vreinterpretq_f16_s8(v) (v)
#define vreinterpretq_f64_s16(v) (v)
#define vreinterpretq_p64_s16(v) (v)
#define vreinterpretq_p128_s16(v) (v)
#define vreinterpretq_f16_s16(v) (v)
#define vreinterpretq_f64_s32(v) (v)
#define vreinterpretq_p64_s32(v) (v)
#define vreinterpretq_p128_s32(v) (v)
#define vreinterpretq_f16_s32(v) (v)
#define vreinterpretq_f64_f32(v) (v)
#define vreinterpretq_p64_f32(v) (v)
#define vreinterpretq_p128_f32(v) (v)
#define vreinterpretq_p64_f64(v) (v)
#define vreinterpretq_p128_f64(v) (v)
#define vreinterpretq_f16_f32(v) (v)
#define vreinterpretq_f64_u8(v) (v)
#define vreinterpretq_p64_u8(v) (v)
#define vreinterpretq_p128_u8(v) (v)
#define vreinterpretq_f16_u8(v) (v)
#define vreinterpretq_f64_u16(v) (v)
#define vreinterpretq_p64_u16(v) (v)
#define vreinterpretq_p128_u16(v) (v)
#define vreinterpretq_f16_u16(v) (v)
#define vreinterpretq_f64_u32(v) (v)
#define vreinterpretq_p64_u32(v) (v)
#define vreinterpretq_p128_u32(v) (v)
#define vreinterpretq_f16_u32(v) (v)
#define vreinterpretq_f64_p8(v) (v)
#define vreinterpretq_p64_p8(v) (v)
#define vreinterpretq_p128_p8(v) (v)
#define vreinterpretq_f16_p8(v) (v)
#define vreinterpretq_f64_p16(v) (v)
#define vreinterpretq_p64_p16(v) (v)
#define vreinterpretq_p128_p16(v) (v)
#define vreinterpretq_f16_p16(v) (v)
#define vreinterpretq_f64_u64(v) (v)
#define vreinterpretq_f64_s64(v) (v)
#define vreinterpretq_p64_s64(v) (v)
#define vreinterpretq_p128_s64(v) (v)
#define vreinterpretq_p64_u64(v) (v)
#define vreinterpretq_p128_u64(v) (v)
#define vreinterpretq_f16_u64(v) (v)
#define vreinterpretq_u64_p64(v) (v)
#define vreinterpretq_f16_s64(v) (v)

/* source: f16 */
#define vreinterpretq_s8_f16(v) (v)
#define vreinterpretq_s16_f16(v) (v)
#define vreinterpretq_s32_f16(v) (v)
#define vreinterpretq_f32_f16(v) (v)
#define vreinterpretq_u8_f16(v) (v)
#define vreinterpretq_u16_f16(v) (v)
#define vreinterpretq_u32_f16(v) (v)
#define vreinterpretq_p8_f16(v) (v)
#define vreinterpretq_p16_f16(v) (v)
#define vreinterpretq_u64_f16(v) (v)
#define vreinterpretq_s64_f16(v) (v)
#define vreinterpretq_f64_f16(v) (v)
#define vreinterpretq_p64_f16(v) (v)
#define vreinterpretq_p128_f16(v) (v)
/*
 * vreinterpret_<dst>_f64 and vreinterpretq_<dst>_f64.
 *
 * Identity macros in this branch (_ARM64_DISTINCT_NEON_TYPES not defined):
 * each expands to its parenthesized argument.
 */
#define vreinterpret_s8_f64(v) (v)
#define vreinterpret_s16_f64(v) (v)
#define vreinterpret_s32_f64(v) (v)
#define vreinterpret_u8_f64(v) (v)
#define vreinterpret_u16_f64(v) (v)
#define vreinterpret_u32_f64(v) (v)
#define vreinterpret_p8_f64(v) (v)
#define vreinterpret_p16_f64(v) (v)
#define vreinterpret_u64_f64(v) (v)
#define vreinterpret_s64_f64(v) (v)
#define vreinterpret_f16_f64(v) (v)

#define vreinterpretq_s8_f64(v) (v)
#define vreinterpretq_s16_f64(v) (v)
#define vreinterpretq_s32_f64(v) (v)
#define vreinterpretq_u8_f64(v) (v)
#define vreinterpretq_u16_f64(v) (v)
#define vreinterpretq_u32_f64(v) (v)
#define vreinterpretq_p8_f64(v) (v)
#define vreinterpretq_p16_f64(v) (v)
#define vreinterpretq_u64_f64(v) (v)
#define vreinterpretq_s64_f64(v) (v)
#define vreinterpretq_f16_f64(v) (v)
/*
 * vreinterpret_<dst>_p64 and vreinterpretq_<dst>_p64.
 *
 * Identity macros in this branch (_ARM64_DISTINCT_NEON_TYPES not defined):
 * each expands to its parenthesized argument.
 */
#define vreinterpret_s8_p64(v) (v)
#define vreinterpret_s16_p64(v) (v)
#define vreinterpret_s32_p64(v) (v)
#define vreinterpret_u8_p64(v) (v)
#define vreinterpret_u16_p64(v) (v)
#define vreinterpret_u32_p64(v) (v)
#define vreinterpret_p8_p64(v) (v)
#define vreinterpret_p16_p64(v) (v)
#define vreinterpret_s64_p64(v) (v)
#define vreinterpret_f64_p64(v) (v)
#define vreinterpret_f16_p64(v) (v)

#define vreinterpretq_s8_p64(v) (v)
#define vreinterpretq_s16_p64(v) (v)
#define vreinterpretq_s32_p64(v) (v)
#define vreinterpretq_u8_p64(v) (v)
#define vreinterpretq_u16_p64(v) (v)
#define vreinterpretq_u32_p64(v) (v)
#define vreinterpretq_p8_p64(v) (v)
#define vreinterpretq_p16_p64(v) (v)
#define vreinterpretq_s64_p64(v) (v)
#define vreinterpretq_f64_p64(v) (v)
#define vreinterpretq_f16_p64(v) (v)
/*
 * vreinterpretq_<dst>_p128, followed by the f64 -> f32 forms.
 *
 * Identity macros in this branch (_ARM64_DISTINCT_NEON_TYPES not defined):
 * each expands to its parenthesized argument.
 */
#define vreinterpretq_s8_p128(v) (v)
#define vreinterpretq_s16_p128(v) (v)
#define vreinterpretq_s32_p128(v) (v)
#define vreinterpretq_u8_p128(v) (v)
#define vreinterpretq_u16_p128(v) (v)
#define vreinterpretq_u32_p128(v) (v)
#define vreinterpretq_p8_p128(v) (v)
#define vreinterpretq_p16_p128(v) (v)
#define vreinterpretq_u64_p128(v) (v)
#define vreinterpretq_s64_p128(v) (v)
#define vreinterpretq_f64_p128(v) (v)
#define vreinterpretq_f16_p128(v) (v)

#define vreinterpret_f32_f64(v) (v)
#define vreinterpretq_f32_f64(v) (v)
#else
// vreinterpret_f32_<src>: return the argument's storage reread as float32x2_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline float32x2_t vreinterpret_f32_s8(int8x8_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_s16(int16x4_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_s32(int32x2_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_s64(int64x1_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_p8(poly8x8_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_p16(poly16x4_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_u8(uint8x8_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_u16(uint16x4_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_u32(uint32x2_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x2_t vreinterpret_f32_u64(uint64x1_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
// vreinterpret_s8_<src>: return the argument's storage reread as int8x8_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int8x8_t vreinterpret_s8_f32(float32x2_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_s16(int16x4_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_s32(int32x2_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_s64(int64x1_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_p8(poly8x8_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_p16(poly16x4_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_u8(uint8x8_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_u16(uint16x4_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_u32(uint32x2_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int8x8_t vreinterpret_s8_u64(uint64x1_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
// vreinterpret_s16_<src>: return the argument's storage reread as int16x4_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int16x4_t vreinterpret_s16_f32(float32x2_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_s8(int8x8_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_s32(int32x2_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_s64(int64x1_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_p8(poly8x8_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_p16(poly16x4_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_u8(uint8x8_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_u16(uint16x4_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_u32(uint32x2_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_u64(uint64x1_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
// vreinterpret_s32_<src>: return the argument's storage reread as int32x2_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int32x2_t vreinterpret_s32_f32(float32x2_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_s8(int8x8_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_s16(int16x4_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_s64(int64x1_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_p8(poly8x8_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_p16(poly16x4_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_u8(uint8x8_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_u16(uint16x4_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_u32(uint32x2_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_u64(uint64x1_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
// vreinterpret_s64_<src>: return the argument's storage reread as int64x1_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int64x1_t vreinterpret_s64_f32(float32x2_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_s8(int8x8_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_s16(int16x4_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_s32(int32x2_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_p8(poly8x8_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_p16(poly16x4_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_u8(uint8x8_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_u16(uint16x4_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_u32(uint32x2_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_u64(uint64x1_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
// vreinterpret_p8_<src>: return the argument's storage reread as poly8x8_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline poly8x8_t vreinterpret_p8_f32(float32x2_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_s8(int8x8_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_s16(int16x4_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_s32(int32x2_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_s64(int64x1_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_p16(poly16x4_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_u8(uint8x8_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_u16(uint16x4_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_u32(uint32x2_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_u64(uint64x1_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
// vreinterpret_p16_<src>: return the argument's storage reread as poly16x4_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline poly16x4_t vreinterpret_p16_f32(float32x2_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_s8(int8x8_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_s16(int16x4_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_s32(int32x2_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_s64(int64x1_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_p8(poly8x8_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_u8(uint8x8_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_u16(uint16x4_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_u32(uint32x2_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_u64(uint64x1_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
// vreinterpret_u8_<src>: return the argument's storage reread as uint8x8_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint8x8_t vreinterpret_u8_f32(float32x2_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_s8(int8x8_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_s16(int16x4_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_s32(int32x2_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_s64(int64x1_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_p8(poly8x8_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_p16(poly16x4_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_u16(uint16x4_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_u32(uint32x2_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_u64(uint64x1_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
// vreinterpret_u16_<src>: return the argument's storage reread as uint16x4_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint16x4_t vreinterpret_u16_f32(float32x2_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_s8(int8x8_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_s16(int16x4_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_s32(int32x2_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_s64(int64x1_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_p8(poly8x8_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_p16(poly16x4_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_u8(uint8x8_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_u32(uint32x2_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_u64(uint64x1_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
// vreinterpret_u32_<src>: return the argument's storage reread as uint32x2_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint32x2_t vreinterpret_u32_f32(float32x2_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_s8(int8x8_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_s16(int16x4_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_s32(int32x2_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_s64(int64x1_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_p8(poly8x8_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_p16(poly16x4_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_u8(uint8x8_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_u16(uint16x4_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_u64(uint64x1_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
// vreinterpret_u64_<src>: return the argument's storage reread as uint64x1_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint64x1_t vreinterpret_u64_f32(float32x2_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_s8(int8x8_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_s16(int16x4_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_s32(int32x2_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_s64(int64x1_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_p8(poly8x8_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_p16(poly16x4_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_u8(uint8x8_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_u16(uint16x4_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_u32(uint32x2_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
// vreinterpretq_f32_<src>: return the argument's storage reread as float32x4_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline float32x4_t vreinterpretq_f32_s8(int8x16_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_s16(int16x8_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_s32(int32x4_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_s64(int64x2_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_p8(poly8x16_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_p16(poly16x8_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_u8(uint8x16_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_u16(uint16x8_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_u32(uint32x4_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_u64(uint64x2_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
// vreinterpretq_s8_<src>: return the argument's storage reread as int8x16_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int8x16_t vreinterpretq_s8_f32(float32x4_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_s16(int16x8_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_s32(int32x4_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_s64(int64x2_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_p8(poly8x16_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_p16(poly16x8_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_u8(uint8x16_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_u16(uint16x8_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_u32(uint32x4_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int8x16_t vreinterpretq_s8_u64(uint64x2_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
// vreinterpretq_s16_<src>: return the argument's storage reread as int16x8_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int16x8_t vreinterpretq_s16_f32(float32x4_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_s8(int8x16_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_s32(int32x4_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_s64(int64x2_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_p8(poly8x16_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_p16(poly16x8_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_u8(uint8x16_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_u16(uint16x8_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_u32(uint32x4_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_u64(uint64x2_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
// vreinterpretq_s32_<src>: return the argument's storage reread as int32x4_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int32x4_t vreinterpretq_s32_f32(float32x4_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_s8(int8x16_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_s16(int16x8_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_s64(int64x2_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_p8(poly8x16_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_p16(poly16x8_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_u8(uint8x16_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_u16(uint16x8_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_u32(uint32x4_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_u64(uint64x2_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
// vreinterpretq_s64_<src>: return the argument's storage reread as int64x2_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline int64x2_t vreinterpretq_s64_f32(float32x4_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_s8(int8x16_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_s16(int16x8_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_s32(int32x4_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_p8(poly8x16_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_p16(poly16x8_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_u8(uint8x16_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_u16(uint16x8_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_u32(uint32x4_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_u64(uint64x2_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
// vreinterpretq_p8_<src>: return the argument's storage reread as poly8x16_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline poly8x16_t vreinterpretq_p8_f32(float32x4_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_s8(int8x16_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_s16(int16x8_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_s32(int32x4_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_s64(int64x2_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_p16(poly16x8_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_u8(uint8x16_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_u16(uint16x8_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_u32(uint32x4_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_u64(uint64x2_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
// vreinterpretq_p16_<src>: return the argument's storage reread as poly16x8_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline poly16x8_t vreinterpretq_p16_f32(float32x4_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_s8(int8x16_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_s16(int16x8_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_s32(int32x4_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_s64(int64x2_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_p8(poly8x16_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_u8(uint8x16_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_u16(uint16x8_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_u32(uint32x4_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_u64(uint64x2_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
// vreinterpretq_u8_<src>: return the argument's storage reread as uint8x16_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint8x16_t vreinterpretq_u8_f32(float32x4_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_s8(int8x16_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_s16(int16x8_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_s32(int32x4_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_s64(int64x2_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_p8(poly8x16_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_p16(poly16x8_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_u16(uint16x8_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_u32(uint32x4_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_u64(uint64x2_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
// vreinterpretq_u16_<src>: return the argument's storage reread as uint16x8_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint16x8_t vreinterpretq_u16_f32(float32x4_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_s8(int8x16_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_s16(int16x8_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_s32(int32x4_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_s64(int64x2_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_p8(poly8x16_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_p16(poly16x8_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_u8(uint8x16_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_u32(uint32x4_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_u64(uint64x2_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
// vreinterpretq_u32_<src>: return the argument's storage reread as uint32x4_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint32x4_t vreinterpretq_u32_f32(float32x4_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_s8(int8x16_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_s16(int16x8_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_s32(int32x4_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_s64(int64x2_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_p8(poly8x16_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_p16(poly16x8_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_u8(uint8x16_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_u16(uint16x8_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_u64(uint64x2_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
// vreinterpretq_u64_<src>: return the argument's storage reread as uint64x2_t.
// The by-value argument is viewed through a pointer of the result type; no element values are converted.
__forceinline uint64x2_t vreinterpretq_u64_f32(float32x4_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_s8(int8x16_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_s16(int16x8_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_s32(int32x4_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_s64(int64x2_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_p8(poly8x16_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_p16(poly16x8_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_u8(uint8x16_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_u16(uint16x8_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_u32(uint32x4_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
// Non-q reinterpretations whose result is float64x1_t or poly64x1_t, plus vreinterpret_u64_p64.
// Each returns the argument's storage reread through a pointer of the result type; no values are converted.
// NOTE(review): this run has no vreinterpret_p64_s64, although the q run defines vreinterpretq_p64_s64 —
// confirm whether the omission is intentional.
__forceinline float64x1_t vreinterpret_f64_s8(int8x8_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_s8(int8x8_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_s16(int16x4_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_s16(int16x4_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_s32(int32x2_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_s32(int32x2_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_f32(float32x2_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_f32(float32x2_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_f64(float64x1_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_u8(uint8x8_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_u8(uint8x8_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_u16(uint16x4_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_u16(uint16x4_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_u32(uint32x2_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_u32(uint32x2_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_p8(poly8x8_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_p8(poly8x8_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_p16(poly16x4_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_p16(poly16x4_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_u64(uint64x1_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline poly64x1_t vreinterpret_p64_u64(uint64x1_t a) { poly64x1_t *_Ptr = (poly64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_s64(int64x1_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_p64(poly64x1_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline float64x2_t vreinterpretq_f64_s8(int8x16_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_s8(int8x16_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_s16(int16x8_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_s16(int16x8_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_s32(int32x4_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_s32(int32x4_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_f32(float32x4_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_f32(float32x4_t a) { return *(poly64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_f64(float64x2_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_u8(uint8x16_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_u8(uint8x16_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_u16(uint16x8_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_u16(uint16x8_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_u32(uint32x4_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_u32(uint32x4_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_p8(poly8x16_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_p8(poly8x16_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_p16(poly16x8_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_p16(poly16x8_t a) { return *(poly64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_u64(uint64x2_t a) { return *(float64x2_t *)(&a); }
__forceinline float64x2_t vreinterpretq_f64_s64(int64x2_t a) { return *(float64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_s64(int64x2_t a) { return *(poly64x2_t *)(&a); }
__forceinline poly64x2_t vreinterpretq_p64_u64(uint64x2_t a) { return *(poly64x2_t *)(&a); }
__forceinline uint64x2_t vreinterpretq_u64_p64(poly64x2_t a) { return *(uint64x2_t *)(&a); }
// vreinterpret_<dst>_f64: take a float64x1_t by value and return its storage reread as the destination type.
// No element values are converted.
__forceinline int8x8_t vreinterpret_s8_f64(float64x1_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_f64(float64x1_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_f64(float64x1_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_f64(float64x1_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_f64(float64x1_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_f64(float64x1_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_f64(float64x1_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_f64(float64x1_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline uint64x1_t vreinterpret_u64_f64(float64x1_t a) { uint64x1_t *_Ptr = (uint64x1_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_f64(float64x1_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
// vreinterpretq_<dst>_f64: take a float64x2_t by value and return its storage reread as the destination type.
// No element values are converted.
__forceinline int8x16_t vreinterpretq_s8_f64(float64x2_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_f64(float64x2_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_f64(float64x2_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_f64(float64x2_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_f64(float64x2_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_f64(float64x2_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_f64(float64x2_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_f64(float64x2_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline uint64x2_t vreinterpretq_u64_f64(float64x2_t a) { uint64x2_t *_Ptr = (uint64x2_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_f64(float64x2_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
// vreinterpret_<dst>_p64: take a poly64x1_t by value and return its storage reread as the destination type.
// No element values are converted.
__forceinline int8x8_t vreinterpret_s8_p64(poly64x1_t a) { int8x8_t *_Ptr = (int8x8_t *)&a; return *_Ptr; }
__forceinline int16x4_t vreinterpret_s16_p64(poly64x1_t a) { int16x4_t *_Ptr = (int16x4_t *)&a; return *_Ptr; }
__forceinline int32x2_t vreinterpret_s32_p64(poly64x1_t a) { int32x2_t *_Ptr = (int32x2_t *)&a; return *_Ptr; }
__forceinline uint8x8_t vreinterpret_u8_p64(poly64x1_t a) { uint8x8_t *_Ptr = (uint8x8_t *)&a; return *_Ptr; }
__forceinline uint16x4_t vreinterpret_u16_p64(poly64x1_t a) { uint16x4_t *_Ptr = (uint16x4_t *)&a; return *_Ptr; }
__forceinline uint32x2_t vreinterpret_u32_p64(poly64x1_t a) { uint32x2_t *_Ptr = (uint32x2_t *)&a; return *_Ptr; }
__forceinline poly8x8_t vreinterpret_p8_p64(poly64x1_t a) { poly8x8_t *_Ptr = (poly8x8_t *)&a; return *_Ptr; }
__forceinline poly16x4_t vreinterpret_p16_p64(poly64x1_t a) { poly16x4_t *_Ptr = (poly16x4_t *)&a; return *_Ptr; }
__forceinline int64x1_t vreinterpret_s64_p64(poly64x1_t a) { int64x1_t *_Ptr = (int64x1_t *)&a; return *_Ptr; }
__forceinline float64x1_t vreinterpret_f64_p64(poly64x1_t a) { float64x1_t *_Ptr = (float64x1_t *)&a; return *_Ptr; }
// vreinterpretq_<dst>_p64: take a poly64x2_t by value and return its storage reread as the destination type.
// No element values are converted.
__forceinline int8x16_t vreinterpretq_s8_p64(poly64x2_t a) { int8x16_t *_Ptr = (int8x16_t *)&a; return *_Ptr; }
__forceinline int16x8_t vreinterpretq_s16_p64(poly64x2_t a) { int16x8_t *_Ptr = (int16x8_t *)&a; return *_Ptr; }
__forceinline int32x4_t vreinterpretq_s32_p64(poly64x2_t a) { int32x4_t *_Ptr = (int32x4_t *)&a; return *_Ptr; }
__forceinline uint8x16_t vreinterpretq_u8_p64(poly64x2_t a) { uint8x16_t *_Ptr = (uint8x16_t *)&a; return *_Ptr; }
__forceinline uint16x8_t vreinterpretq_u16_p64(poly64x2_t a) { uint16x8_t *_Ptr = (uint16x8_t *)&a; return *_Ptr; }
__forceinline uint32x4_t vreinterpretq_u32_p64(poly64x2_t a) { uint32x4_t *_Ptr = (uint32x4_t *)&a; return *_Ptr; }
__forceinline poly8x16_t vreinterpretq_p8_p64(poly64x2_t a) { poly8x16_t *_Ptr = (poly8x16_t *)&a; return *_Ptr; }
__forceinline poly16x8_t vreinterpretq_p16_p64(poly64x2_t a) { poly16x8_t *_Ptr = (poly16x8_t *)&a; return *_Ptr; }
__forceinline int64x2_t vreinterpretq_s64_p64(poly64x2_t a) { int64x2_t *_Ptr = (int64x2_t *)&a; return *_Ptr; }
__forceinline float64x2_t vreinterpretq_f64_p64(poly64x2_t a) { float64x2_t *_Ptr = (float64x2_t *)&a; return *_Ptr; }
// f64 -> f32 reinterpretations (non-q and q forms): return the argument's storage reread as the
// float32 vector type; no element values are converted.
__forceinline float32x2_t vreinterpret_f32_f64(float64x1_t a) { float32x2_t *_Ptr = (float32x2_t *)&a; return *_Ptr; }
__forceinline float32x4_t vreinterpretq_f32_f64(float64x2_t a) { float32x4_t *_Ptr = (float32x4_t *)&a; return *_Ptr; }
#endif  /* !_ARM64_DISTINCT_NEON_TYPES */

// NEON-typed __iso_volatile load prototypes. Each takes a pointer to
// const volatile storage and returns the pointed-to vector (or vector pair
// struct) by value. Only prototypes appear here; no body accompanies them.
// NOTE(review): the "_p" / "_np" suffixes presumably select pair and
// non-temporal-pair access forms — confirm against the compiler's intrinsic
// documentation.
uint8x16_t __iso_volatile_neon_load128(const volatile uint8x16_t *);
uint8x16x2_t __iso_volatile_neon_load128_p(const volatile uint8x16x2_t *);
float32x1x2_t __iso_volatile_neon_load32_np(const volatile float32x1x2_t *);
uint8x8x2_t __iso_volatile_neon_load64_np(const volatile uint8x8x2_t *);
uint8x16x2_t __iso_volatile_neon_load128_np(const volatile uint8x16x2_t *);
// NEON-typed __iso_volatile store prototypes. Each takes a pointer to
// volatile storage followed by the vector (or vector pair struct) to write,
// passed by value, and returns nothing. Only prototypes appear here; no body
// accompanies them.
// NOTE(review): the "_p" / "_np" suffixes presumably select pair and
// non-temporal-pair access forms — confirm against the compiler's intrinsic
// documentation.
void __iso_volatile_neon_store128(volatile uint8x16_t *, uint8x16_t);
void __iso_volatile_neon_store128_p(volatile uint8x16x2_t *, uint8x16x2_t);
void __iso_volatile_neon_store32_np(volatile float32x1x2_t *, float32x1x2_t);
void __iso_volatile_neon_store64_np(volatile uint8x8x2_t *, uint8x8x2_t);
void __iso_volatile_neon_store128_np(volatile uint8x16x2_t *, uint8x16x2_t);

#if defined (__cplusplus)
}
#endif  /* defined (__cplusplus) */


///////////////////////////////////////////////////////////////////////////////
//
// VLDx/VSTx alignment specifications
//


// Maps an alignment argument to an integer code: 8 -> 0, 16 -> 1, any other value -> -1.
// The argument can be evaluated twice, so it should be free of side effects.
#define _NEON_ALIGN16(a) \
    (((a) == 8) ? 0 : (((a) == 16) ? 1 : -1))

// Maps an alignment argument to an integer code: 8 -> 0, 32 -> 1, any other value -> -1.
// The argument can be evaluated twice, so it should be free of side effects.
#define _NEON_ALIGN32(a) \
    (((a) == 8) ? 0 : (((a) == 32) ? 1 : -1))

// Maps an alignment argument to an integer code: 8 -> 0, 64 -> 1, any other value -> -1.
// The argument can be evaluated twice, so it should be free of side effects.
#define _NEON_ALIGN64(a) \
    (((a) == 8) ? 0 : (((a) == 64) ? 1 : -1))

// Maps an alignment argument to an integer code: 8 -> 0, 64 -> 1, 128 -> 2, any other value -> -1.
// The argument can be evaluated up to three times, so it should be free of side effects.
#define _NEON_ALIGN64_128(a)  \
    (((a) == 8)   ? 0 :       \
    (((a) == 64)  ? 1 :       \
    (((a) == 128) ? 2 : -1)))


// Maps an alignment argument to an integer code: 8 -> 0, 64 -> 1, 128 -> 2, 256 -> 3,
// any other value -> -1.
// The argument can be evaluated up to four times, so it should be free of side effects.
#define _NEON_ALIGN64_128_256(a) \
    (((a) == 8)   ? 0 :          \
    (((a) == 64)  ? 1 :          \
    (((a) == 128) ? 2 :          \
    (((a) == 256) ? 3 : -1))))

#pragma warning(pop) // _VCRUNTIME_DISABLED_WARNINGS