From fc13c684bad4cfddaf1b4ba8f87da4debb1c10d5 Mon Sep 17 00:00:00 2001 From: Skyth <19259897+blueskythlikesclouds@users.noreply.github.com> Date: Fri, 20 Sep 2024 18:05:13 +0300 Subject: [PATCH] Handle zero flushing. --- PowerRecomp/main.cpp | 57 ++++++++++++++++++++++++++++++++++++-- PowerSample/CMakeLists.txt | 2 +- PowerUtils/ppc_context.h | 27 ++++++++++++++++-- 3 files changed, 80 insertions(+), 6 deletions(-) diff --git a/PowerRecomp/main.cpp b/PowerRecomp/main.cpp index 8b0442e..f4d59c9 100644 --- a/PowerRecomp/main.cpp +++ b/PowerRecomp/main.cpp @@ -297,7 +297,7 @@ int main(int argc, char* argv[]) println("#include \n"); for (auto& symbol : image.symbols) - println("PPC_FUNC void {}(PPCContext& __restrict ctx, uint8_t* base);", symbol.name); + println("PPC_FUNC({});", symbol.name); saveFile(std::format("{}/ppc_recomp_shared.h", argv[3])); } @@ -336,11 +336,11 @@ int main(int argc, char* argv[]) auto symbol = image.symbols.find(fn.base); if (symbol != image.symbols.end()) { - println("PPC_FUNC void {}(PPCContext& __restrict ctx, uint8_t* base) {{", symbol->name); + println("PPC_FUNC({}) {{", symbol->name); } else { - println("PPC_FUNC void sub_{:X}(PPCContext& __restrict ctx, uint8_t* base) {{", fn.base); + println("PPC_FUNC(sub_{:X}) {{", fn.base); } println("\t__assume((reinterpret_cast(base) & 0xFFFFFFFF) == 0);"); @@ -739,104 +739,129 @@ int main(int argc, char* argv[]) // TODO: fpu operations require denormal flushing checks case PPC_INST_FABS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = fabs(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); break; case PPC_INST_FADD: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64 + ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FADDS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); break; case 
PPC_INST_FCFID: // TODO: rounding mode? + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.s64;", insn.operands[0], insn.operands[1]); break; case PPC_INST_FCMPU: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.cr{}.compare(ctx.f{}.f64, ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FCTID: // TODO: rounding mode? + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.s64 = ctx.f{}.f64;", insn.operands[0], insn.operands[1]); break; case PPC_INST_FCTIDZ: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.s64 = trunc(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); break; case PPC_INST_FCTIWZ: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.s32 = trunc(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); break; case PPC_INST_FDIV: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64 / ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FDIVS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64 / ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FMADD: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_FMADDS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_FMR: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64;", insn.operands[0], insn.operands[1]); break; case PPC_INST_FMSUB: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], 
insn.operands[3]); break; case PPC_INST_FMSUBS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_FMUL: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FMULS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FNABS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = -fabs(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); break; case PPC_INST_FNEG: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = -ctx.f{}.f64;", insn.operands[0], insn.operands[1]); break; case PPC_INST_FNMADDS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = -float(ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_FNMSUB: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = -(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_FNMSUBS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = -float(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_FRES: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = 1.0f / float(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); break; case PPC_INST_FRSP: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); break; case PPC_INST_FSEL: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = 
ctx.f{}.f64 >= 0.0 ? ctx.f{}.f64 : ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; @@ -845,14 +870,17 @@ int main(int argc, char* argv[]) break; case PPC_INST_FSQRTS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(sqrt(ctx.f{}.f64));", insn.operands[0], insn.operands[1]); break; case PPC_INST_FSUB: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = ctx.f{}.f64 - ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_FSUBS: + println("\tctx.csr.setFlushMode(false);"); println("\tctx.f{}.f64 = float(ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); break; @@ -1546,6 +1574,7 @@ int main(int argc, char* argv[]) // TODO: vector instructions require denormal flushing checks case PPC_INST_VADDFP: case PPC_INST_VADDFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_add_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); break; @@ -1596,16 +1625,19 @@ int main(int argc, char* argv[]) case PPC_INST_VCTSXS: case PPC_INST_VCFPSXWS128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_si128((__m128i*)ctx.v{}.s32, _mm_vctsxs(_mm_mul_ps(_mm_load_ps(ctx.v{}.f32), _mm_set1_ps(exp2f({})))));", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_VCFSX: case PPC_INST_VCSXWFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*)ctx.v{}.u32)), _mm_set1_ps(ldexpf(1.0f, {}))));", insn.operands[0], insn.operands[1], -int32_t(insn.operands[2])); break; case PPC_INST_VCFUX: case PPC_INST_VCUXWFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*)ctx.v{}.u32)), _mm_set1_ps(ldexpf(1.0f, {}))));", insn.operands[0], 
insn.operands[1], -int32_t(insn.operands[2])); break; @@ -1615,6 +1647,7 @@ int main(int argc, char* argv[]) case PPC_INST_VCMPEQFP: case PPC_INST_VCMPEQFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpeq_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); break; @@ -1633,6 +1666,7 @@ int main(int argc, char* argv[]) case PPC_INST_VCMPGEFP: case PPC_INST_VCMPGEFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpge_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); if (strchr(insn.opcode->name, '.')) println("\tctx.cr6.setFromMask(_mm_load_ps(ctx.v{}.f32), 0xF);", insn.operands[0]); @@ -1640,6 +1674,7 @@ int main(int argc, char* argv[]) case PPC_INST_VCMPGTFP: case PPC_INST_VCMPGTFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpgt_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); if (strchr(insn.opcode->name, '.')) println("\tctx.cr6.setFromMask(_mm_load_ps(ctx.v{}.f32), 0xF);", insn.operands[0]); @@ -1655,12 +1690,14 @@ int main(int argc, char* argv[]) case PPC_INST_VEXPTEFP128: // TODO: vectorize + println("\tctx.csr.setFlushMode(true);"); for (size_t i = 0; i < 4; i++) println("\tctx.v{}.f32[{}] = exp2f(ctx.v{}.f32[{}]);", insn.operands[0], i, insn.operands[1], i); break; case PPC_INST_VLOGEFP128: // TODO: vectorize + println("\tctx.csr.setFlushMode(true);"); for (size_t i = 0; i < 4; i++) println("\tctx.v{}.f32[{}] = log2f(ctx.v{}.f32[{}]);", insn.operands[0], i, insn.operands[1], i); break; @@ -1668,11 +1705,13 @@ int main(int argc, char* argv[]) case PPC_INST_VMADDCFP128: case PPC_INST_VMADDFP: case PPC_INST_VMADDFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_fmadd_ps(_mm_load_ps(ctx.v{}.f32), 
_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; case PPC_INST_VMAXFP: case PPC_INST_VMAXFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_max_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); break; @@ -1682,6 +1721,7 @@ int main(int argc, char* argv[]) case PPC_INST_VMINFP: case PPC_INST_VMINFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_min_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); break; @@ -1713,19 +1753,23 @@ int main(int argc, char* argv[]) case PPC_INST_VMSUM3FP128: // NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_dp_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), 0xEF));", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_VMSUM4FP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_dp_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), 0xFF));", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_VMULFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); break; case PPC_INST_VNMSUBFP: case PPC_INST_VNMSUBFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_fnmadd_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); break; @@ -1754,6 +1798,7 @@ int main(int argc, char* argv[]) case PPC_INST_VPKD3D128: // TODO: vectorize somehow? 
// NOTE: handling vector reversal here too + println("\tctx.csr.setFlushMode(true);"); switch (insn.operands[2]) { case 0: // D3D color @@ -1780,19 +1825,23 @@ int main(int argc, char* argv[]) case PPC_INST_VREFP: case PPC_INST_VREFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_rcp_ps(_mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1]); break; case PPC_INST_VRFIM128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]); break; case PPC_INST_VRFIN: case PPC_INST_VRFIN128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]); break; case PPC_INST_VRFIZ128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]); break; @@ -1805,6 +1854,7 @@ int main(int argc, char* argv[]) case PPC_INST_VRSQRTEFP: case PPC_INST_VRSQRTEFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_rsqrt_ps(_mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1]); break; @@ -1890,6 +1940,7 @@ int main(int argc, char* argv[]) case PPC_INST_VSUBFP: case PPC_INST_VSUBFP128: + println("\tctx.csr.setFlushMode(true);"); println("\t_mm_store_ps(ctx.v{}.f32, _mm_sub_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); break; diff --git a/PowerSample/CMakeLists.txt b/PowerSample/CMakeLists.txt index 3e9f567..89434f3 100644 --- a/PowerSample/CMakeLists.txt +++ b/PowerSample/CMakeLists.txt @@ -3,7 +3,7 @@ project("PowerSample") set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_COMPILER "clang-cl") 
set(CMAKE_C_COMPILER "clang-cl") -add_compile_options("-march=x86-64-v3") +add_compile_options("/clang:-march=x86-64-v3") file(GLOB RecompiledFiles *.cpp) add_library(PowerSample ${RecompiledFiles}) diff --git a/PowerUtils/ppc_context.h b/PowerUtils/ppc_context.h index 3d8825a..752ef69 100644 --- a/PowerUtils/ppc_context.h +++ b/PowerUtils/ppc_context.h @@ -13,13 +13,13 @@ #define isnan __builtin_isnan #define __assume __builtin_assume #define __unreachable() __builtin_unreachable() -#define PPC_FUNC extern "C" __attribute__((noinline)) #else #include -#define PPC_FUNC extern "C" __declspec(noinline) #define __unreachable() __assume(0) #endif +#define PPC_FUNC(x) extern "C" void x(PPCContext& __restrict ctx, uint8_t* base) noexcept + #define PPC_LOAD_U8(x) *(uint8_t*)(base + (x)) #define PPC_LOAD_U16(x) _byteswap_ushort(*(uint16_t*)(base + (x))) #define PPC_LOAD_U32(x) _byteswap_ulong(*(uint32_t*)(base + (x))) @@ -128,6 +128,28 @@ struct alignas(0x10) PPCVRegister }; }; +struct CSRRegister /* Cached copy of MXCSR, used to skip redundant _mm_setcsr calls. */ +{ + uint32_t value = _mm_getcsr(); /* Seeded from the live MXCSR so setFlushMode never reads an indeterminate cache. */ + + void storeValue() /* Re-reads the current MXCSR into the cache. */ + { + value = _mm_getcsr(); + } + + void setFlushMode(bool enable) /* Sets (true) or clears (false) the flush-to-zero and denormals-are-zero bits; writes MXCSR only when the cached value changes. */ + { + uint32_t mask = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK; + uint32_t newValue = enable ? (value | mask) : (value & ~mask); + + if (value != newValue) + { + _mm_setcsr(newValue); + value = newValue; + } + } +}; + struct PPCContext { PPCFunc** fn; @@ -137,6 +159,7 @@ struct PPCContext PPCRegister reserved; uint32_t msr; uint32_t fpscr; + CSRRegister csr; union {