XenonRecomp/PowerUtils/ppc_context.h

667 lines
20 KiB
C
Raw Permalink Normal View History

2024-09-30 13:39:47 +06:00
#ifndef PPC_CONTEXT_H_INCLUDED
#define PPC_CONTEXT_H_INCLUDED
#ifndef PPC_CONFIG_H_INCLUDED
#error "ppc_config.h must be included before ppc_context.h"
#endif
#include <climits>
#include <cmath>
#include <csetjmp>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <intrin.h>
#include <x86intrin.h>
2024-09-30 18:45:07 +06:00
// Token helpers. PPC_JOIN pastes its two arguments verbatim. PPC_STRINGIFY
// macro-expands its argument first and then stringifies it; PPC_XSTRINGIFY
// stringifies the argument exactly as written.
#define PPC_JOIN(x, y) x##y
#define PPC_XSTRINGIFY(x) #x
#define PPC_STRINGIFY(x) PPC_XSTRINGIFY(x)

// Declarator shared by every recompiled function: it receives the register
// context by reference and a byte pointer `base` that the load/store macros
// in this header add guest offsets to.
#define PPC_FUNC(x) void x(PPCContext& __restrict ctx, uint8_t* base)

// Linkage variants of PPC_FUNC: C linkage definition, extern declaration,
// and a weak, never-inlined definition.
#define PPC_FUNC_IMPL(x) extern "C" PPC_FUNC(x)
#define PPC_EXTERN_FUNC(x) extern PPC_FUNC(x)
#define PPC_WEAK_FUNC(x) __attribute__((weak,noinline)) PPC_FUNC(x)

// Optimizer hint: promises that the low 32 bits of `base` are zero, i.e. the
// pointer is 4 GiB-aligned. Behaviour is undefined if the promise is false.
#define PPC_FUNC_PROLOGUE() __builtin_assume(((size_t)base & 0xFFFFFFFF) == 0)
// Guest memory loads. Each macro reads from `base + x` through a volatile
// pointer; the 16/32/64-bit forms byte-swap the value that was read, so the
// bytes in memory are interpreted in the opposite order to the host's
// (big-endian data on a little-endian host).
// `base` must be in scope at the expansion site. Every macro can be replaced
// by defining it before this header is included.
#ifndef PPC_LOAD_U8
#define PPC_LOAD_U8(x) *(volatile uint8_t*)(base + (x))
#endif

#ifndef PPC_LOAD_U16
#define PPC_LOAD_U16(x) __builtin_bswap16(*(volatile uint16_t*)(base + (x)))
#endif

#ifndef PPC_LOAD_U32
#define PPC_LOAD_U32(x) __builtin_bswap32(*(volatile uint32_t*)(base + (x)))
#endif

#ifndef PPC_LOAD_U64
#define PPC_LOAD_U64(x) __builtin_bswap64(*(volatile uint64_t*)(base + (x)))
#endif

// TODO: Implement.
// These are currently unused. However, MMIO loads could possibly be handled statically with some profiling and a fallback.
// The fallback would be a runtime exception handler which will intercept reads from MMIO regions
// and log the PC for compiling to static code later.
// Until then the MMIO variants simply forward to the plain loads above.
#ifndef PPC_MM_LOAD_U8
#define PPC_MM_LOAD_U8(x) PPC_LOAD_U8(x)
#endif

#ifndef PPC_MM_LOAD_U16
#define PPC_MM_LOAD_U16(x) PPC_LOAD_U16(x)
#endif

#ifndef PPC_MM_LOAD_U32
#define PPC_MM_LOAD_U32(x) PPC_LOAD_U32(x)
#endif

#ifndef PPC_MM_LOAD_U64
#define PPC_MM_LOAD_U64(x) PPC_LOAD_U64(x)
#endif
2024-11-24 15:36:50 +06:00
// Guest memory stores. Each macro writes `y` to `base + x` through a volatile
// pointer; the 16/32/64-bit forms byte-swap the value before it is written.
// `base` must be in scope at the expansion site. Every macro can be replaced
// by defining it before this header is included.
#ifndef PPC_STORE_U8
#define PPC_STORE_U8(x, y) *(volatile uint8_t*)(base + (x)) = (y)
#endif

#ifndef PPC_STORE_U16
#define PPC_STORE_U16(x, y) *(volatile uint16_t*)(base + (x)) = __builtin_bswap16(y)
#endif

#ifndef PPC_STORE_U32
#define PPC_STORE_U32(x, y) *(volatile uint32_t*)(base + (x)) = __builtin_bswap32(y)
#endif

#ifndef PPC_STORE_U64
#define PPC_STORE_U64(x, y) *(volatile uint64_t*)(base + (x)) = __builtin_bswap64(y)
#endif

// MMIO Store handling is completely reliant on being preceded by eieio.
// TODO: Verify if that's always the case.
// By default the MMIO variants simply forward to the plain stores above.
#ifndef PPC_MM_STORE_U8
#define PPC_MM_STORE_U8(x, y) PPC_STORE_U8(x, y)
#endif

#ifndef PPC_MM_STORE_U16
#define PPC_MM_STORE_U16(x, y) PPC_STORE_U16(x, y)
#endif

#ifndef PPC_MM_STORE_U32
#define PPC_MM_STORE_U32(x, y) PPC_STORE_U32(x, y)
#endif

#ifndef PPC_MM_STORE_U64
#define PPC_MM_STORE_U64(x, y) PPC_STORE_U64(x, y)
#endif
2024-11-24 15:36:50 +06:00
// Direct call: invokes `x`, forwarding the `ctx` and `base` that are in scope
// at the expansion site. Can be replaced by defining PPC_CALL_FUNC before this
// header is included.
#ifndef PPC_CALL_FUNC
#define PPC_CALL_FUNC(x) x(ctx, base)
#endif
// Indirect call: reads a PPCFunc* stored at byte offset `x * 2` from ctx.fn
// (x is widened to 64 bits before the multiply) and invokes it with `ctx` and
// `base`. The pointer that was read is not checked for null.
// NOTE(review): presumably `x` is a guest code address and ctx.fn points at a
// host function-pointer table keyed by it - confirm against the runtime.
#ifndef PPC_CALL_INDIRECT_FUNC
#define PPC_CALL_INDIRECT_FUNC(x) (*(PPCFunc**)(ctx.fn + uint64_t(x) * 2))(ctx, base)
#endif
2024-09-22 19:58:22 +03:00
// Function type of a recompiled routine: takes the register context by
// reference and the `base` byte pointer, and returns nothing. It has the same
// shape as the declarator produced by PPC_FUNC.
typedef void PPCFunc(struct PPCContext& __restrict__ ctx, uint8_t* base);
2024-09-18 13:31:37 +03:00
// Pairs a guest-side value with the host function that implements it.
// NOTE(review): `guest` is presumably the guest address of the function -
// confirm against the code that fills the table.
struct PPCFuncMapping
{
size_t guest;
PPCFunc* host;
};
// Table of mappings, defined in another translation unit. Its length is not
// visible from this header.
extern PPCFuncMapping PPCFuncMappings[];
// One 64-bit scalar register slot. The anonymous union overlays signed and
// unsigned integer views of 8/16/32/64 bits plus float and double views on
// the same storage, so the narrower members alias the start of the wider ones.
struct PPCRegister
{
union
{
int8_t s8;
uint8_t u8;
int16_t s16;
uint16_t u16;
int32_t s32;
uint32_t u32;
int64_t s64;
uint64_t u64;
float f32;
double f64;
};
};
2024-09-08 08:32:31 +06:00
2024-09-12 17:41:39 +03:00
// XER state kept as one byte per flag.
// NOTE(review): field names follow the PowerPC XER bits SO (summary
// overflow), OV (overflow) and CA (carry).
struct PPCXERRegister
{
    uint8_t so;
    uint8_t ov;
    uint8_t ca;
};

// One condition-register field, kept as one byte per flag.
// `so` and `un` share storage: integer compares write `so`, floating-point
// compares write `un` (unordered).
struct PPCCRRegister
{
    uint8_t lt;
    uint8_t gt;
    uint8_t eq;
    union
    {
        uint8_t so;
        uint8_t un;
    };

    // Integer compare. Sets lt/gt/eq from the relation of `left` to `right`
    // using T's own ordering (so T decides signed vs. unsigned), and copies
    // xer.so into `so`.
    template<typename T>
    inline void compare(T left, T right, const PPCXERRegister& xer) noexcept
    {
        lt = left < right;
        gt = left > right;
        eq = left == right;
        so = xer.so;
    }

    // Floating-point compare. `un` is set when either operand is NaN, in
    // which case lt/gt/eq are all cleared.
    inline void compare(double left, double right) noexcept
    {
        un = __builtin_isnan(left) || __builtin_isnan(right);
        lt = !un && (left < right);
        gt = !un && (left > right);
        eq = !un && (left == right);
    }

    // Sets the field from a 4-lane float comparison mask. `imm` is the
    // movemask value that means "every lane set"; gt and so are cleared.
    inline void setFromMask(__m128 mask, int imm) noexcept
    {
        int m = _mm_movemask_ps(mask);
        lt = m == imm; // all equal
        gt = 0;
        eq = m == 0; // none equal
        so = 0;
    }

    // Same as above for a 16-byte integer comparison mask (one movemask bit
    // per byte).
    inline void setFromMask(__m128i mask, int imm) noexcept
    {
        int m = _mm_movemask_epi8(mask);
        lt = m == imm; // all equal
        gt = 0;
        eq = m == 0; // none equal
        so = 0;
    }
};
2024-09-14 23:17:51 +03:00
// One 128-bit vector register slot, aligned to 16 bytes. The anonymous union
// overlays lane views of the same 16 bytes: 16 x 8-bit, 8 x 16-bit,
// 4 x 32-bit and 2 x 64-bit integers (signed and unsigned), 4 x float and
// 2 x double.
struct alignas(0x10) PPCVRegister
{
union
{
int8_t s8[16];
uint8_t u8[16];
int16_t s16[8];
uint16_t u16[8];
int32_t s32[4];
uint32_t u32[4];
int64_t s64[2];
uint64_t u64[2];
float f32[4];
double f64[2];
};
};
2024-09-08 08:32:31 +06:00
// Guest rounding-mode encodings (two-bit field, selected with PPC_ROUND_MASK).
#define PPC_ROUND_NEAREST 0x00
#define PPC_ROUND_TOWARD_ZERO 0x01
#define PPC_ROUND_UP 0x02
#define PPC_ROUND_DOWN 0x03
#define PPC_ROUND_MASK 0x03

// Bridges the guest rounding mode and flush-to-zero state onto the host
// MXCSR register.
//
// `csr` is a cached copy of MXCSR. Only loadFromHost() refreshes it from the
// hardware; every other method edits the cached value and writes it back.
// Call loadFromHost() at least once before any of them, otherwise an
// uninitialized value is written to MXCSR.
struct PPCFPSCRRegister
{
    uint32_t csr;

    // Guest encoding -> host MXCSR rounding bits, indexed by the guest value.
    static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN };
    // Host MXCSR rounding field (shifted down to 0..3) -> guest encoding.
    static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO };

    // Refreshes `csr` from the host MXCSR and returns the current rounding
    // mode in guest encoding.
    inline uint32_t loadFromHost() noexcept
    {
        csr = _mm_getcsr();
        // The rounding-control field occupies MXCSR bits 13-14.
        return HostToGuest[(csr & _MM_ROUND_MASK) >> 13];
    }

    // Applies the guest rounding mode held in the low two bits of `value`
    // to the cached MXCSR and writes it to the host. Other bits of `value`
    // are ignored.
    inline void storeFromGuest(uint32_t value) noexcept
    {
        csr &= ~_MM_ROUND_MASK;
        csr |= GuestToHost[value & PPC_ROUND_MASK];
        _mm_setcsr(csr);
    }

    // MXCSR bits that together select flush-to-zero and denormals-are-zero.
    static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;

    // Sets both flush bits and always writes MXCSR.
    inline void enableFlushModeUnconditional() noexcept
    {
        csr |= FlushMask;
        _mm_setcsr(csr);
    }

    // Clears both flush bits and always writes MXCSR.
    inline void disableFlushModeUnconditional() noexcept
    {
        csr &= ~FlushMask;
        _mm_setcsr(csr);
    }

    // Sets both flush bits, skipping the MXCSR write when the cached value
    // already has them set.
    inline void enableFlushMode() noexcept
    {
        if ((csr & FlushMask) != FlushMask) [[unlikely]]
        {
            csr |= FlushMask;
            _mm_setcsr(csr);
        }
    }

    // Clears both flush bits, skipping the MXCSR write when the cached value
    // already has them clear.
    inline void disableFlushMode() noexcept
    {
        if ((csr & FlushMask) != 0) [[unlikely]]
        {
            csr &= ~FlushMask;
            _mm_setcsr(csr);
        }
    }
};
2024-09-08 08:32:31 +06:00
struct PPCContext
{
uint8_t* fn;
#ifndef PPC_CONFIG_SKIP_LR
2024-09-08 08:32:31 +06:00
uint64_t lr;
#endif
#ifndef PPC_CONFIG_CTR_AS_LOCAL
2024-09-19 13:54:13 +03:00
PPCRegister ctr;
#endif
#ifndef PPC_CONFIG_XER_AS_LOCAL
2024-09-12 17:41:39 +03:00
PPCXERRegister xer;
#endif
#ifndef PPC_CONFIG_RESERVED_AS_LOCAL
2024-09-15 15:32:29 +03:00
PPCRegister reserved;
#endif
#ifndef PPC_CONFIG_SKIP_MSR
2024-09-22 19:58:22 +03:00
uint32_t msr = 0x200A000;
#endif
2024-09-08 08:32:31 +06:00
#ifndef PPC_CONFIG_CR_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCCRRegister cr0;
PPCCRRegister cr1;
PPCCRRegister cr2;
PPCCRRegister cr3;
PPCCRRegister cr4;
PPCCRRegister cr5;
PPCCRRegister cr6;
PPCCRRegister cr7;
#endif
#ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCRegister r0;
#endif
2024-09-22 19:58:22 +03:00
PPCRegister r1;
#ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCRegister r2;
#endif
2024-09-22 19:58:22 +03:00
PPCRegister r3;
PPCRegister r4;
PPCRegister r5;
PPCRegister r6;
PPCRegister r7;
PPCRegister r8;
PPCRegister r9;
PPCRegister r10;
#ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCRegister r11;
PPCRegister r12;
#endif
2024-09-22 19:58:22 +03:00
PPCRegister r13;
#ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCRegister r14;
PPCRegister r15;
PPCRegister r16;
PPCRegister r17;
PPCRegister r18;
PPCRegister r19;
PPCRegister r20;
PPCRegister r21;
PPCRegister r22;
PPCRegister r23;
PPCRegister r24;
PPCRegister r25;
PPCRegister r26;
PPCRegister r27;
PPCRegister r28;
PPCRegister r29;
PPCRegister r30;
PPCRegister r31;
#endif
2024-09-22 19:58:22 +03:00
PPCFPSCRRegister fpscr;
#ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCRegister f0;
#endif
2024-09-22 19:58:22 +03:00
PPCRegister f1;
PPCRegister f2;
PPCRegister f3;
PPCRegister f4;
PPCRegister f5;
PPCRegister f6;
PPCRegister f7;
PPCRegister f8;
PPCRegister f9;
PPCRegister f10;
PPCRegister f11;
PPCRegister f12;
PPCRegister f13;
#ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCRegister f14;
PPCRegister f15;
PPCRegister f16;
PPCRegister f17;
PPCRegister f18;
PPCRegister f19;
PPCRegister f20;
PPCRegister f21;
PPCRegister f22;
PPCRegister f23;
PPCRegister f24;
PPCRegister f25;
PPCRegister f26;
PPCRegister f27;
PPCRegister f28;
PPCRegister f29;
PPCRegister f30;
PPCRegister f31;
#endif
2024-09-22 19:58:22 +03:00
PPCVRegister v0;
PPCVRegister v1;
PPCVRegister v2;
PPCVRegister v3;
PPCVRegister v4;
PPCVRegister v5;
PPCVRegister v6;
PPCVRegister v7;
PPCVRegister v8;
PPCVRegister v9;
PPCVRegister v10;
PPCVRegister v11;
PPCVRegister v12;
PPCVRegister v13;
#ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCVRegister v14;
PPCVRegister v15;
PPCVRegister v16;
PPCVRegister v17;
PPCVRegister v18;
PPCVRegister v19;
PPCVRegister v20;
PPCVRegister v21;
PPCVRegister v22;
PPCVRegister v23;
PPCVRegister v24;
PPCVRegister v25;
PPCVRegister v26;
PPCVRegister v27;
PPCVRegister v28;
PPCVRegister v29;
PPCVRegister v30;
PPCVRegister v31;
#endif
#ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCVRegister v32;
PPCVRegister v33;
PPCVRegister v34;
PPCVRegister v35;
PPCVRegister v36;
PPCVRegister v37;
PPCVRegister v38;
PPCVRegister v39;
PPCVRegister v40;
PPCVRegister v41;
PPCVRegister v42;
PPCVRegister v43;
PPCVRegister v44;
PPCVRegister v45;
PPCVRegister v46;
PPCVRegister v47;
PPCVRegister v48;
PPCVRegister v49;
PPCVRegister v50;
PPCVRegister v51;
PPCVRegister v52;
PPCVRegister v53;
PPCVRegister v54;
PPCVRegister v55;
PPCVRegister v56;
PPCVRegister v57;
PPCVRegister v58;
PPCVRegister v59;
PPCVRegister v60;
PPCVRegister v61;
PPCVRegister v62;
PPCVRegister v63;
#endif
#ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL
2024-09-22 19:58:22 +03:00
PPCVRegister v64;
PPCVRegister v65;
PPCVRegister v66;
PPCVRegister v67;
PPCVRegister v68;
PPCVRegister v69;
PPCVRegister v70;
PPCVRegister v71;
PPCVRegister v72;
PPCVRegister v73;
PPCVRegister v74;
PPCVRegister v75;
PPCVRegister v76;
PPCVRegister v77;
PPCVRegister v78;
PPCVRegister v79;
PPCVRegister v80;
PPCVRegister v81;
PPCVRegister v82;
PPCVRegister v83;
PPCVRegister v84;
PPCVRegister v85;
PPCVRegister v86;
PPCVRegister v87;
PPCVRegister v88;
PPCVRegister v89;
PPCVRegister v90;
PPCVRegister v91;
PPCVRegister v92;
PPCVRegister v93;
PPCVRegister v94;
PPCVRegister v95;
PPCVRegister v96;
PPCVRegister v97;
PPCVRegister v98;
PPCVRegister v99;
PPCVRegister v100;
PPCVRegister v101;
PPCVRegister v102;
PPCVRegister v103;
PPCVRegister v104;
PPCVRegister v105;
PPCVRegister v106;
PPCVRegister v107;
PPCVRegister v108;
PPCVRegister v109;
PPCVRegister v110;
PPCVRegister v111;
PPCVRegister v112;
PPCVRegister v113;
PPCVRegister v114;
PPCVRegister v115;
PPCVRegister v116;
PPCVRegister v117;
PPCVRegister v118;
PPCVRegister v119;
PPCVRegister v120;
PPCVRegister v121;
PPCVRegister v122;
PPCVRegister v123;
PPCVRegister v124;
PPCVRegister v125;
PPCVRegister v126;
PPCVRegister v127;
#endif
2024-09-08 08:32:31 +06:00
};
2024-09-16 20:33:46 +03:00
// 16 rows of 16 bytes. Row k starts with k bytes of 0xFF, followed by the
// byte indices 0x0F, 0x0E, ... counting down to k.
// NOTE(review): presumably each row is a byte-shuffle control mask (where
// 0xFF zeroes the lane) selected by the low four bits of an address for
// left-justified unaligned vector loads/stores - confirm against the callers.
inline uint8_t VectorMaskL[] =
{
0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F,
};
// 16 rows of 16 bytes. Row 0 is all 0xFF; row k (k >= 1) starts with the byte
// indices k-1, k-2, ... 0 and is padded with 0xFF.
// NOTE(review): presumably the right-justified counterpart of VectorMaskL,
// used as a byte-shuffle control mask - confirm against the callers.
inline uint8_t VectorMaskR[] =
{
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF,
0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF,
0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF,
0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF,
};
// 16 rows of 16 bytes. Byte j of row k holds 0x0F + k - j, so row 0 runs
// 0x0F..0x00 and each following row is the previous one plus one.
// NOTE(review): presumably per-shift-amount permute control vectors for
// emulating a "load vector for shift left" style instruction - confirm
// against the callers.
inline uint8_t VectorShiftTableL[] =
{
0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C,
0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D,
0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E,
0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F,
};
// 16 rows of 16 bytes. Byte j of row k holds 0x1F - k - j, so row 0 runs
// 0x1F..0x10 and each following row is the previous one minus one.
// NOTE(review): presumably per-shift-amount permute control vectors for
// emulating a "load vector for shift right" style instruction - confirm
// against the callers.
inline uint8_t VectorShiftTableR[] =
{
0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F,
0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E,
0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D,
0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C,
0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B,
0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A,
0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09,
0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07,
0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06,
0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05,
0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03,
0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02,
0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
};
// Unsigned saturating add of four 32-bit lanes: each lane of the result is
// min(a + b, 0xFFFFFFFF).
inline __m128i _mm_adds_epu32(__m128i a, __m128i b)
{
    // ~a is the largest value that can be added to a without wrapping.
    const __m128i allOnes = _mm_cmpeq_epi32(a, a);
    const __m128i headroom = _mm_xor_si128(a, allOnes);
    // Clamp b to that headroom, then a plain add can no longer overflow.
    const __m128i addend = _mm_min_epu32(headroom, b);
    return _mm_add_epi32(a, addend);
}
// Signed rounding average of sixteen 8-bit lanes: (a + b + 1) >> 1 computed
// without overflow.
inline __m128i _mm_avg_epi8(__m128i a, __m128i b)
{
    // XOR with 0x80 maps signed values onto unsigned ones order-preservingly,
    // so the unsigned average can be used and the bias removed afterwards.
    __m128i c = _mm_set1_epi8(char(128));
    return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
}
// Signed rounding average of eight 16-bit lanes: (a + b + 1) >> 1 computed
// without overflow.
inline __m128i _mm_avg_epi16(__m128i a, __m128i b)
{
    // XOR with 0x8000 maps signed values onto unsigned ones
    // order-preservingly, so the unsigned average can be used and the bias
    // removed afterwards.
    __m128i c = _mm_set1_epi16(short(32768));
    return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b)));
}
// Converts four unsigned 32-bit integers to single-precision floats.
inline __m128 _mm_cvtepu32_ps_(__m128i src1)
{
    // Lanes below 2^31 convert correctly through the signed conversion.
    const __m128 viaSigned = _mm_cvtepi32_ps(src1);

    // Lanes with bit 31 set have their float bit pattern assembled by hand:
    // add a rounding bias of 127 plus bit 8 of the input, drop the low eight
    // bits with an arithmetic shift, and add the bit pattern of 2^32
    // (0x4F800000).
    const __m128i bit8 = _mm_srli_epi32(_mm_slli_epi32(src1, 31 - 8), 31);
    const __m128i biased = _mm_add_epi32(src1, _mm_set1_epi32(127));
    __m128i bits = _mm_add_epi32(bit8, biased);
    bits = _mm_srai_epi32(bits, 8);
    bits = _mm_add_epi32(bits, _mm_set1_epi32(0x4F800000));

    // blendv keys on each lane's sign bit, i.e. on bit 31 of the input.
    return _mm_blendv_ps(viaSigned, _mm_castsi128_ps(bits), _mm_castsi128_ps(src1));
}
// Byte permute across two source vectors. For every byte of `c`, the low
// four bits pick a source byte (counted from the opposite end of the
// register, i.e. index 15 - n) and bit 4 chooses between `a` (clear) and
// `b` (set).
inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c)
{
    const __m128i nibbleMask = _mm_set1_epi8(0xF);
    // Reverse the index within the register: 15 - (c & 15).
    const __m128i index = _mm_sub_epi8(nibbleMask, _mm_and_si128(c, nibbleMask));
    const __m128i fromA = _mm_shuffle_epi8(a, index);
    const __m128i fromB = _mm_shuffle_epi8(b, index);
    // Shifting left by three moves bit 4 of every byte into its bit 7,
    // which is the bit blendv_epi8 tests.
    const __m128i pickB = _mm_slli_epi32(c, 3);
    return _mm_blendv_epi8(fromA, fromB, pickB);
}
2024-09-17 21:32:13 +03:00
// Unsigned greater-than on sixteen 8-bit lanes. Each result lane is 0xFF
// where a > b and 0x00 otherwise.
inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
{
    // Flipping the top bit maps unsigned order onto signed order, which lets
    // the signed compare do the work.
    __m128i c = _mm_set1_epi8(char(128));
    return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
}
// Unsigned greater-than on eight 16-bit lanes. Each result lane is 0xFFFF
// where a > b and 0x0000 otherwise.
inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b)
{
    // Flipping the top bit maps unsigned order onto signed order, which lets
    // the signed compare do the work.
    __m128i c = _mm_set1_epi16(short(32768));
    return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c));
}
// Converts four floats to signed 32-bit integers with truncation and
// saturation: values above the int range become INT_MAX, values below it
// become INT_MIN, and NaN becomes 0.
inline __m128i _mm_vctsxs(__m128 src1)
{
    // All-ones in lanes that hold NaN.
    __m128 xmm2 = _mm_cmpunord_ps(src1, src1);
    // The truncating conversion yields INT_MIN for anything out of range.
    __m128i xmm0 = _mm_cvttps_epi32(src1);
    __m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN));
    // Keep only lanes whose input sign bit is clear: those are positive
    // overflows, the rest already hold the right INT_MIN.
    xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1);
    __m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1));
    // Zero the NaN lanes.
    return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest));
}
2024-09-21 21:47:34 +03:00
2024-11-03 21:27:33 +01:00
// Converts four floats to unsigned 32-bit integers with truncation and
// saturation: negative values and NaN become 0, values of 2^32 or more
// become 0xFFFFFFFF.
//
// Every float/integer reinterpretation is spelled with an explicit cast so
// the function does not depend on the compiler permitting implicit
// conversions between __m128 and __m128i.
inline __m128i _mm_vctuxs(__m128 src1)
{
    const __m128 twoPow31 = _mm_set1_ps((float)0x80000000);
    const __m128i signBit = _mm_set1_epi32(INT_MIN);

    // Clamp negatives to zero. max_ps returns its second operand when the
    // first is NaN, so NaN lanes become zero here as well.
    const __m128 clamped = _mm_max_ps(src1, _mm_setzero_ps());

    // Lanes at or above 2^31 do not fit the signed conversion; move them
    // down by 2^31 first and add it back as an integer afterwards.
    const __m128 isLarge = _mm_cmpge_ps(clamped, twoPow31);
    const __m128 reduced = _mm_blendv_ps(clamped, _mm_sub_ps(clamped, twoPow31), isLarge);

    __m128i dest = _mm_cvttps_epi32(reduced);

    // The conversion returns INT_MIN only when the reduced value was still
    // out of range, i.e. the input was 2^32 or more.
    const __m128i overflow = _mm_cmpeq_epi32(dest, signBit);

    const __m128i bias = _mm_and_si128(_mm_castps_si128(isLarge), signBit);
    dest = _mm_add_epi32(dest, bias);

    // Overflowed lanes are forced to all-ones.
    return _mm_or_si128(dest, overflow);
}
2024-09-21 21:47:34 +03:00
// Logical right shift of the whole 128-bit value `a` by 0-7 bits. The count
// is the low three bits of the low 64-bit half of `b`.
inline __m128i _mm_vsr(__m128i a, __m128i b)
{
    // Keep only the low three bits of each 64-bit half of b.
    const __m128i count = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);

    // Shifting the two 64-bit halves independently gives correct 32-bit
    // lanes 0, 2 and 3, but lane 1 is missing the bits that should arrive
    // from lane 2.
    const __m128 halves = _mm_castsi128_ps(_mm_srl_epi64(a, count));

    // Realign so lanes 1 and 2 form the low 64-bit half, shift again, and
    // take the low lane of that result as the corrected lane 1.
    const __m128 middle = _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), count));

    // 0x10: copy source lane 0 into destination lane 1, zero nothing.
    return _mm_castps_si128(_mm_insert_ps(halves, middle, 0x10));
}
2024-09-30 13:39:47 +06:00
#endif