#ifndef PPC_CONTEXT_H_INCLUDED #define PPC_CONTEXT_H_INCLUDED #ifndef PPC_CONFIG_H_INCLUDED #error "ppc_config.h must be included before ppc_context.h" #endif #include #include #include #include #include #include #include #define PPC_JOIN(x, y) x##y #define PPC_XSTRINGIFY(x) #x #define PPC_STRINGIFY(x) PPC_XSTRINGIFY(x) #define PPC_FUNC(x) void x(PPCContext& __restrict ctx, uint8_t* base) #define PPC_FUNC_IMPL(x) extern "C" PPC_FUNC(x) #define PPC_EXTERN_FUNC(x) extern PPC_FUNC(x) #define PPC_WEAK_FUNC(x) __attribute__((weak,noinline)) PPC_FUNC(x) #define PPC_FUNC_PROLOGUE() __builtin_assume(((size_t)base & 0xFFFFFFFF) == 0) #ifndef PPC_LOAD_U8 #define PPC_LOAD_U8(x) *(volatile uint8_t*)(base + (x)) #endif #ifndef PPC_LOAD_U16 #define PPC_LOAD_U16(x) __builtin_bswap16(*(volatile uint16_t*)(base + (x))) #endif #ifndef PPC_LOAD_U32 #define PPC_LOAD_U32(x) __builtin_bswap32(*(volatile uint32_t*)(base + (x))) #endif #ifndef PPC_LOAD_U64 #define PPC_LOAD_U64(x) __builtin_bswap64(*(volatile uint64_t*)(base + (x))) #endif // TODO: Implement. // These are currently unused. However, MMIO loads could possibly be handled statically with some profiling and a fallback. // The fallback would be a runtime exception handler which will intercept reads from MMIO regions // and log the PC for compiling to static code later. #ifndef PPC_MM_LOAD_U8 #define PPC_MM_LOAD_U8(x) PPC_LOAD_U8 (x) #endif #ifndef PPC_MM_LOAD_U16 #define PPC_MM_LOAD_U16(x) PPC_LOAD_U16(x) #endif #ifndef PPC_MM_LOAD_U32 #define PPC_MM_LOAD_U32(x) PPC_LOAD_U32(x) #endif #ifndef PPC_MM_LOAD_U64 #define PPC_MM_LOAD_U64(x) PPC_LOAD_U64(x) #endif #ifndef PPC_STORE_U8 #define PPC_STORE_U8(x, y) *(volatile uint8_t*)(base + (x)) = (y) #endif #ifndef PPC_STORE_U16 #define PPC_STORE_U16(x, y) *(volatile uint16_t*)(base + (x)) = __builtin_bswap16(y) #endif #ifndef PPC_STORE_U32 #define PPC_STORE_U32(x, y) *(volatile uint32_t*)(base + (x)) = __builtin_bswap32(y) #endif #ifndef PPC_STORE_U64 #define PPC_STORE_U64(x, y) *(volatile uint64_t*)(base + (x)) = __builtin_bswap64(y) #endif // MMIO Store handling is completely reliant on being preeceded by eieio. // TODO: Verify if that's always the case. #ifndef PPC_MM_STORE_U8 #define PPC_MM_STORE_U8(x, y) PPC_STORE_U8 (x, y) #endif #ifndef PPC_MM_STORE_U16 #define PPC_MM_STORE_U16(x, y) PPC_STORE_U16(x, y) #endif #ifndef PPC_MM_STORE_U32 #define PPC_MM_STORE_U32(x, y) PPC_STORE_U32(x, y) #endif #ifndef PPC_MM_STORE_U64 #define PPC_MM_STORE_U64(x, y) PPC_STORE_U64(x, y) #endif #ifndef PPC_CALL_FUNC #define PPC_CALL_FUNC(x) x(ctx, base) #endif #ifndef PPC_CALL_INDIRECT_FUNC #define PPC_CALL_INDIRECT_FUNC(x) (*(PPCFunc**)(ctx.fn + uint64_t(x) * 2))(ctx, base) #endif typedef void PPCFunc(struct PPCContext& __restrict__ ctx, uint8_t* base); struct PPCFuncMapping { size_t guest; PPCFunc* host; }; extern PPCFuncMapping PPCFuncMappings[]; struct PPCRegister { union { int8_t s8; uint8_t u8; int16_t s16; uint16_t u16; int32_t s32; uint32_t u32; int64_t s64; uint64_t u64; float f32; double f64; }; }; struct PPCXERRegister { uint8_t so; uint8_t ov; uint8_t ca; }; struct PPCCRRegister { uint8_t lt; uint8_t gt; uint8_t eq; union { uint8_t so; uint8_t un; }; template inline void compare(T left, T right, const PPCXERRegister& xer) noexcept { lt = left < right; gt = left > right; eq = left == right; so = xer.so; } inline void compare(double left, double right) noexcept { un = __builtin_isnan(left) || __builtin_isnan(right); lt = !un && (left < right); gt = !un && (left > right); eq = !un && (left == right); } inline void setFromMask(__m128 mask, int imm) noexcept { int m = _mm_movemask_ps(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal so = 0; } inline void setFromMask(__m128i mask, int imm) noexcept { int m = _mm_movemask_epi8(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal so = 0; } }; struct alignas(0x10) PPCVRegister { union { int8_t s8[16]; uint8_t u8[16]; int16_t s16[8]; uint16_t u16[8]; int32_t s32[4]; uint32_t u32[4]; int64_t s64[2]; uint64_t u64[2]; float f32[4]; double f64[2]; }; }; #define PPC_ROUND_NEAREST 0x00 #define PPC_ROUND_TOWARD_ZERO 0x01 #define PPC_ROUND_UP 0x02 #define PPC_ROUND_DOWN 0x03 #define PPC_ROUND_MASK 0x03 struct PPCFPSCRRegister { uint32_t csr; static constexpr size_t GuestToHost[] = { _MM_ROUND_NEAREST, _MM_ROUND_TOWARD_ZERO, _MM_ROUND_UP, _MM_ROUND_DOWN }; static constexpr size_t HostToGuest[] = { PPC_ROUND_NEAREST, PPC_ROUND_DOWN, PPC_ROUND_UP, PPC_ROUND_TOWARD_ZERO }; inline uint32_t loadFromHost() noexcept { csr = _mm_getcsr(); return HostToGuest[(csr & _MM_ROUND_MASK) >> 13]; } inline void storeFromGuest(uint32_t value) noexcept { csr &= ~_MM_ROUND_MASK; csr |= GuestToHost[value & PPC_ROUND_MASK]; _mm_setcsr(csr); } static constexpr size_t FlushMask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; inline void enableFlushModeUnconditional() noexcept { csr |= FlushMask; _mm_setcsr(csr); } inline void disableFlushModeUnconditional() noexcept { csr &= ~FlushMask; _mm_setcsr(csr); } inline void enableFlushMode() noexcept { if ((csr & FlushMask) != FlushMask) [[unlikely]] { csr |= FlushMask; _mm_setcsr(csr); } } inline void disableFlushMode() noexcept { if ((csr & FlushMask) != 0) [[unlikely]] { csr &= ~FlushMask; _mm_setcsr(csr); } } }; struct PPCContext { uint8_t* fn; #ifndef PPC_CONFIG_SKIP_LR uint64_t lr; #endif #ifndef PPC_CONFIG_CTR_AS_LOCAL PPCRegister ctr; #endif #ifndef PPC_CONFIG_XER_AS_LOCAL PPCXERRegister xer; #endif #ifndef PPC_CONFIG_RESERVED_AS_LOCAL PPCRegister reserved; #endif #ifndef PPC_CONFIG_SKIP_MSR uint32_t msr = 0x200A000; #endif #ifndef PPC_CONFIG_CR_AS_LOCAL PPCCRRegister cr0; PPCCRRegister cr1; PPCCRRegister cr2; PPCCRRegister cr3; PPCCRRegister cr4; PPCCRRegister cr5; PPCCRRegister cr6; PPCCRRegister cr7; #endif #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL PPCRegister r0; #endif PPCRegister r1; #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL PPCRegister r2; #endif PPCRegister r3; PPCRegister r4; PPCRegister r5; PPCRegister r6; PPCRegister r7; PPCRegister r8; PPCRegister r9; PPCRegister r10; #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL PPCRegister r11; PPCRegister r12; #endif PPCRegister r13; #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL PPCRegister r14; PPCRegister r15; PPCRegister r16; PPCRegister r17; PPCRegister r18; PPCRegister r19; PPCRegister r20; PPCRegister r21; PPCRegister r22; PPCRegister r23; PPCRegister r24; PPCRegister r25; PPCRegister r26; PPCRegister r27; PPCRegister r28; PPCRegister r29; PPCRegister r30; PPCRegister r31; #endif PPCFPSCRRegister fpscr; #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL PPCRegister f0; #endif PPCRegister f1; PPCRegister f2; PPCRegister f3; PPCRegister f4; PPCRegister f5; PPCRegister f6; PPCRegister f7; PPCRegister f8; PPCRegister f9; PPCRegister f10; PPCRegister f11; PPCRegister f12; PPCRegister f13; #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL PPCRegister f14; PPCRegister f15; PPCRegister f16; PPCRegister f17; PPCRegister f18; PPCRegister f19; PPCRegister f20; PPCRegister f21; PPCRegister f22; PPCRegister f23; PPCRegister f24; PPCRegister f25; PPCRegister f26; PPCRegister f27; PPCRegister f28; PPCRegister f29; PPCRegister f30; PPCRegister f31; #endif PPCVRegister v0; PPCVRegister v1; PPCVRegister v2; PPCVRegister v3; PPCVRegister v4; PPCVRegister v5; PPCVRegister v6; PPCVRegister v7; PPCVRegister v8; PPCVRegister v9; PPCVRegister v10; PPCVRegister v11; PPCVRegister v12; PPCVRegister v13; #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL PPCVRegister v14; PPCVRegister v15; PPCVRegister v16; PPCVRegister v17; PPCVRegister v18; PPCVRegister v19; PPCVRegister v20; PPCVRegister v21; PPCVRegister v22; PPCVRegister v23; PPCVRegister v24; PPCVRegister v25; PPCVRegister v26; PPCVRegister v27; PPCVRegister v28; PPCVRegister v29; PPCVRegister v30; PPCVRegister v31; #endif #ifndef PPC_CONFIG_NON_ARGUMENT_AS_LOCAL PPCVRegister v32; PPCVRegister v33; PPCVRegister v34; PPCVRegister v35; PPCVRegister v36; PPCVRegister v37; PPCVRegister v38; PPCVRegister v39; PPCVRegister v40; PPCVRegister v41; PPCVRegister v42; PPCVRegister v43; PPCVRegister v44; PPCVRegister v45; PPCVRegister v46; PPCVRegister v47; PPCVRegister v48; PPCVRegister v49; PPCVRegister v50; PPCVRegister v51; PPCVRegister v52; PPCVRegister v53; PPCVRegister v54; PPCVRegister v55; PPCVRegister v56; PPCVRegister v57; PPCVRegister v58; PPCVRegister v59; PPCVRegister v60; PPCVRegister v61; PPCVRegister v62; PPCVRegister v63; #endif #ifndef PPC_CONFIG_NON_VOLATILE_AS_LOCAL PPCVRegister v64; PPCVRegister v65; PPCVRegister v66; PPCVRegister v67; PPCVRegister v68; PPCVRegister v69; PPCVRegister v70; PPCVRegister v71; PPCVRegister v72; PPCVRegister v73; PPCVRegister v74; PPCVRegister v75; PPCVRegister v76; PPCVRegister v77; PPCVRegister v78; PPCVRegister v79; PPCVRegister v80; PPCVRegister v81; PPCVRegister v82; PPCVRegister v83; PPCVRegister v84; PPCVRegister v85; PPCVRegister v86; PPCVRegister v87; PPCVRegister v88; PPCVRegister v89; PPCVRegister v90; PPCVRegister v91; PPCVRegister v92; PPCVRegister v93; PPCVRegister v94; PPCVRegister v95; PPCVRegister v96; PPCVRegister v97; PPCVRegister v98; PPCVRegister v99; PPCVRegister v100; PPCVRegister v101; PPCVRegister v102; PPCVRegister v103; PPCVRegister v104; PPCVRegister v105; PPCVRegister v106; PPCVRegister v107; PPCVRegister v108; PPCVRegister v109; PPCVRegister v110; PPCVRegister v111; PPCVRegister v112; PPCVRegister v113; PPCVRegister v114; PPCVRegister v115; PPCVRegister v116; PPCVRegister v117; PPCVRegister v118; PPCVRegister v119; PPCVRegister v120; PPCVRegister v121; PPCVRegister v122; PPCVRegister v123; PPCVRegister v124; PPCVRegister v125; PPCVRegister v126; PPCVRegister v127; #endif }; inline uint8_t VectorMaskL[] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, }; inline uint8_t VectorMaskR[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, }; inline uint8_t VectorShiftTableL[] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, }; inline uint8_t VectorShiftTableR[] = { 0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, }; inline __m128i _mm_adds_epu32(__m128i a, __m128i b) { return _mm_add_epi32(a, _mm_min_epu32(_mm_xor_si128(a, _mm_cmpeq_epi32(a, a)), b)); } inline __m128i _mm_avg_epi8(__m128i a, __m128i b) { __m128i c = _mm_set1_epi8(char(128)); return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); } inline __m128i _mm_avg_epi16(__m128i a, __m128i b) { __m128i c = _mm_set1_epi16(short(32768)); return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); } inline __m128 _mm_cvtepu32_ps_(__m128i src1) { __m128i xmm1 = _mm_add_epi32(src1, _mm_set1_epi32(127)); __m128i xmm0 = _mm_slli_epi32(src1, 31 - 8); xmm0 = _mm_srli_epi32(xmm0, 31); xmm0 = _mm_add_epi32(xmm0, xmm1); xmm0 = _mm_srai_epi32(xmm0, 8); xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(0x4F800000)); __m128 xmm2 = _mm_cvtepi32_ps(src1); return _mm_blendv_ps(xmm2, _mm_castsi128_ps(xmm0), _mm_castsi128_ps(src1)); } inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c) { __m128i d = _mm_set1_epi8(0xF); __m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d)); return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3)); } inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b) { __m128i c = _mm_set1_epi8(char(128)); return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); } inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b) { __m128i c = _mm_set1_epi16(short(32768)); return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); } inline __m128i _mm_vctsxs(__m128 src1) { __m128 xmm2 = _mm_cmpunord_ps(src1, src1); __m128i xmm0 = _mm_cvttps_epi32(src1); __m128i xmm1 = _mm_cmpeq_epi32(xmm0, _mm_set1_epi32(INT_MIN)); xmm1 = _mm_andnot_si128(_mm_castps_si128(src1), xmm1); __m128 dest = _mm_blendv_ps(_mm_castsi128_ps(xmm0), _mm_castsi128_ps(_mm_set1_epi32(INT_MAX)), _mm_castsi128_ps(xmm1)); return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest)); } inline __m128i _mm_vctuxs(__m128 src1) { __m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0)); __m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000)); __m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000)); xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1); __m128i dest = _mm_cvttps_epi32(xmm0); xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN)); xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN)); dest = _mm_add_epi32(dest, xmm1); return _mm_or_si128(dest, xmm0); } inline __m128i _mm_vsr(__m128i a, __m128i b) { b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10)); } #endif