#pragma once #include #include #include #include #ifdef __clang__ #include #define __restrict __restrict__ #define _byteswap_ushort __builtin_bswap16 #define _byteswap_ulong __builtin_bswap32 #define _byteswap_uint64 __builtin_bswap64 #define isnan __builtin_isnan #define __assume __builtin_assume #define __unreachable() __builtin_unreachable() #else #include #define __unreachable() __assume(0) #endif #define PPC_FUNC(x) extern "C" void x(PPCContext& __restrict ctx, uint8_t* base) noexcept #define PPC_FUNC_PROLOGUE() \ __assume((reinterpret_cast(base) & 0xFFFFFFFF) == 0); \ PPCRegister temp; \ PPCVRegister vtemp; \ uint32_t ea #define PPC_LOAD_U8(x) *(uint8_t*)(base + (x)) #define PPC_LOAD_U16(x) _byteswap_ushort(*(uint16_t*)(base + (x))) #define PPC_LOAD_U32(x) _byteswap_ulong(*(uint32_t*)(base + (x))) #define PPC_LOAD_U64(x) _byteswap_uint64(*(uint64_t*)(base + (x))) #define PPC_STORE_U8(x, y) *(uint8_t*)(base + (x)) = (y) #define PPC_STORE_U16(x, y) *(uint16_t*)(base + (x)) = _byteswap_ushort(y) #define PPC_STORE_U32(x, y) *(uint32_t*)(base + (x)) = _byteswap_ulong(y) #define PPC_STORE_U64(x, y) *(uint64_t*)(base + (x)) = _byteswap_uint64(y) typedef void PPCFunc(struct PPCContext& __restrict ctx, uint8_t* base); struct PPCFuncMapping { size_t guest; PPCFunc* host; }; extern "C" PPCFuncMapping PPCFuncMappings[]; struct PPCRegister { union { int8_t s8; uint8_t u8; int16_t s16; uint16_t u16; int32_t s32; uint32_t u32; int64_t s64; uint64_t u64; float f32; double f64; }; }; struct PPCXERRegister { uint8_t so; uint8_t ov; uint8_t ca; }; struct PPCCRRegister { uint8_t lt; uint8_t gt; uint8_t eq; union { uint8_t so; uint8_t un; }; template void compare(T left, T right, const PPCXERRegister& xer) { lt = left < right; gt = left > right; eq = left == right; so = xer.so; } void compare(double left, double right) { lt = left < right; gt = left > right; eq = left == right; un = isnan(left) || isnan(right); } void setFromMask(__m128 mask, int imm) { int m = _mm_movemask_ps(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal so = 0; } void setFromMask(__m128i mask, int imm) { int m = _mm_movemask_epi8(mask); lt = m == imm; // all equal gt = 0; eq = m == 0; // none equal so = 0; } }; struct alignas(0x10) PPCVRegister { union { int8_t s8[16]; uint8_t u8[16]; int16_t s16[8]; uint16_t u16[8]; int32_t s32[4]; uint32_t u32[4]; int64_t s64[2]; uint64_t u64[2]; float f32[4]; double f64[2]; }; }; struct PPCFPSCRRegister { uint32_t csr; uint32_t loadFromHost() { csr = _mm_getcsr(); return (0x6C >> ((csr & _MM_ROUND_MASK) >> 12)) & 3; } void storeFromGuest(uint32_t value) { csr &= ~_MM_ROUND_MASK; csr |= ((0x6C >> (2 * (value & 3))) & 3) << 13; _mm_setcsr(csr); } void setFlushMode(bool enable) { constexpr uint32_t mask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; uint32_t value = enable ? (csr | mask) : (csr & ~mask); if (csr != value) { _mm_setcsr(value); csr = value; } } }; struct PPCContext { PPCFunc** fn; uint64_t lr; PPCRegister ctr; PPCXERRegister xer; PPCRegister reserved; uint32_t msr; PPCFPSCRRegister fpscr; union { struct { PPCCRRegister cr0; PPCCRRegister cr1; PPCCRRegister cr2; PPCCRRegister cr3; PPCCRRegister cr4; PPCCRRegister cr5; PPCCRRegister cr6; PPCCRRegister cr7; }; PPCCRRegister cr[8]; }; union { struct { PPCRegister r0; PPCRegister r1; PPCRegister r2; PPCRegister r3; PPCRegister r4; PPCRegister r5; PPCRegister r6; PPCRegister r7; PPCRegister r8; PPCRegister r9; PPCRegister r10; PPCRegister r11; PPCRegister r12; PPCRegister r13; PPCRegister r14; PPCRegister r15; PPCRegister r16; PPCRegister r17; PPCRegister r18; PPCRegister r19; PPCRegister r20; PPCRegister r21; PPCRegister r22; PPCRegister r23; PPCRegister r24; PPCRegister r25; PPCRegister r26; PPCRegister r27; PPCRegister r28; PPCRegister r29; PPCRegister r30; PPCRegister r31; }; PPCRegister r[32]; }; union { struct { PPCRegister f0; PPCRegister f1; PPCRegister f2; PPCRegister f3; PPCRegister f4; PPCRegister f5; PPCRegister f6; PPCRegister f7; PPCRegister f8; PPCRegister f9; PPCRegister f10; PPCRegister f11; PPCRegister f12; PPCRegister f13; PPCRegister f14; PPCRegister f15; PPCRegister f16; PPCRegister f17; PPCRegister f18; PPCRegister f19; PPCRegister f20; PPCRegister f21; PPCRegister f22; PPCRegister f23; PPCRegister f24; PPCRegister f25; PPCRegister f26; PPCRegister f27; PPCRegister f28; PPCRegister f29; PPCRegister f30; PPCRegister f31; }; PPCRegister f[32]; }; union { struct { PPCVRegister v0; PPCVRegister v1; PPCVRegister v2; PPCVRegister v3; PPCVRegister v4; PPCVRegister v5; PPCVRegister v6; PPCVRegister v7; PPCVRegister v8; PPCVRegister v9; PPCVRegister v10; PPCVRegister v11; PPCVRegister v12; PPCVRegister v13; PPCVRegister v14; PPCVRegister v15; PPCVRegister v16; PPCVRegister v17; PPCVRegister v18; PPCVRegister v19; PPCVRegister v20; PPCVRegister v21; PPCVRegister v22; PPCVRegister v23; PPCVRegister v24; PPCVRegister v25; PPCVRegister v26; PPCVRegister v27; PPCVRegister v28; PPCVRegister v29; PPCVRegister v30; PPCVRegister v31; PPCVRegister v32; PPCVRegister v33; PPCVRegister v34; PPCVRegister v35; PPCVRegister v36; PPCVRegister v37; PPCVRegister v38; PPCVRegister v39; PPCVRegister v40; PPCVRegister v41; PPCVRegister v42; PPCVRegister v43; PPCVRegister v44; PPCVRegister v45; PPCVRegister v46; PPCVRegister v47; PPCVRegister v48; PPCVRegister v49; PPCVRegister v50; PPCVRegister v51; PPCVRegister v52; PPCVRegister v53; PPCVRegister v54; PPCVRegister v55; PPCVRegister v56; PPCVRegister v57; PPCVRegister v58; PPCVRegister v59; PPCVRegister v60; PPCVRegister v61; PPCVRegister v62; PPCVRegister v63; PPCVRegister v64; PPCVRegister v65; PPCVRegister v66; PPCVRegister v67; PPCVRegister v68; PPCVRegister v69; PPCVRegister v70; PPCVRegister v71; PPCVRegister v72; PPCVRegister v73; PPCVRegister v74; PPCVRegister v75; PPCVRegister v76; PPCVRegister v77; PPCVRegister v78; PPCVRegister v79; PPCVRegister v80; PPCVRegister v81; PPCVRegister v82; PPCVRegister v83; PPCVRegister v84; PPCVRegister v85; PPCVRegister v86; PPCVRegister v87; PPCVRegister v88; PPCVRegister v89; PPCVRegister v90; PPCVRegister v91; PPCVRegister v92; PPCVRegister v93; PPCVRegister v94; PPCVRegister v95; PPCVRegister v96; PPCVRegister v97; PPCVRegister v98; PPCVRegister v99; PPCVRegister v100; PPCVRegister v101; PPCVRegister v102; PPCVRegister v103; PPCVRegister v104; PPCVRegister v105; PPCVRegister v106; PPCVRegister v107; PPCVRegister v108; PPCVRegister v109; PPCVRegister v110; PPCVRegister v111; PPCVRegister v112; PPCVRegister v113; PPCVRegister v114; PPCVRegister v115; PPCVRegister v116; PPCVRegister v117; PPCVRegister v118; PPCVRegister v119; PPCVRegister v120; PPCVRegister v121; PPCVRegister v122; PPCVRegister v123; PPCVRegister v124; PPCVRegister v125; PPCVRegister v126; PPCVRegister v127; }; PPCVRegister v[128]; }; }; inline uint8_t VectorMaskL[] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0x0C, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0x0D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, 0x0E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0F, }; inline uint8_t VectorMaskR[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0xFF, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, 0xFF, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0xFF, }; inline uint8_t VectorShiftTableL[] = { 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, }; inline uint8_t VectorShiftTableR[] = { 0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x1C, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x1B, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x1A, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x13, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x12, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x11, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x10, 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, }; inline __m128i _mm_adds_epu32(__m128i a, __m128i b) { return _mm_add_epi32(_mm_min_epu32(a, _mm_xor_si128(b, _mm_cmpeq_epi32(b, b))), b); } inline __m128i _mm_avg_epi8(__m128i a, __m128i b) { __m128i c = _mm_set1_epi8(char(128)); return _mm_xor_si128(c, _mm_avg_epu8(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); } inline __m128i _mm_avg_epi16(__m128i a, __m128i b) { __m128i c = _mm_set1_epi16(short(32768)); return _mm_xor_si128(c, _mm_avg_epu16(_mm_xor_si128(c, a), _mm_xor_si128(c, b))); } inline __m128 _mm_cvtepu32_ps_(__m128i v) { __m128i v2 = _mm_srli_epi32(v, 1); __m128i v1 = _mm_sub_epi32(v, v2); __m128 v2f = _mm_cvtepi32_ps(v2); __m128 v1f = _mm_cvtepi32_ps(v1); return _mm_add_ps(v2f, v1f); } inline __m128i _mm_perm_epi8_(__m128i a, __m128i b, __m128i c) { __m128i d = _mm_set1_epi8(0xF); __m128i e = _mm_sub_epi8(d, _mm_and_si128(c, d)); return _mm_blendv_epi8(_mm_shuffle_epi8(a, e), _mm_shuffle_epi8(b, e), _mm_slli_epi32(c, 3)); } inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b) { __m128i c = _mm_set1_epi8(char(128)); return _mm_cmpgt_epi8(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); } inline __m128i _mm_cmpgt_epu16(__m128i a, __m128i b) { __m128i c = _mm_set1_epi16(short(32768)); return _mm_cmpgt_epi16(_mm_xor_si128(a, c), _mm_xor_si128(b, c)); } inline __m128i _mm_vctsxs(__m128 a) { __m128i result = _mm_cvttps_epi32(a); __m128 max_val = _mm_set1_ps(2147483648.0f); __m128 cmp_mask = _mm_cmpgt_ps(a, max_val); result = _mm_xor_si128(result, _mm_castps_si128(cmp_mask)); __m128 ord_mask = _mm_cmpord_ps(a, a); result = _mm_and_si128(result, _mm_castps_si128(ord_mask)); return result; } inline __m128i _mm_vsr(__m128i a, __m128i b) { b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10)); }