Handle zero flushing.

This commit is contained in:
Skyth 2024-09-20 18:05:13 +03:00
parent fb97a569f9
commit fc13c684ba
3 changed files with 80 additions and 6 deletions

View File

@ -297,7 +297,7 @@ int main(int argc, char* argv[])
println("#include <ppc_context.h>\n"); println("#include <ppc_context.h>\n");
for (auto& symbol : image.symbols) for (auto& symbol : image.symbols)
println("PPC_FUNC void {}(PPCContext& __restrict ctx, uint8_t* base);", symbol.name); println("PPC_FUNC({});", symbol.name);
saveFile(std::format("{}/ppc_recomp_shared.h", argv[3])); saveFile(std::format("{}/ppc_recomp_shared.h", argv[3]));
} }
@ -336,11 +336,11 @@ int main(int argc, char* argv[])
auto symbol = image.symbols.find(fn.base); auto symbol = image.symbols.find(fn.base);
if (symbol != image.symbols.end()) if (symbol != image.symbols.end())
{ {
println("PPC_FUNC void {}(PPCContext& __restrict ctx, uint8_t* base) {{", symbol->name); println("PPC_FUNC({}) {{", symbol->name);
} }
else else
{ {
println("PPC_FUNC void sub_{:X}(PPCContext& __restrict ctx, uint8_t* base) {{", fn.base); println("PPC_FUNC(sub_{}) {{", fn.base);
} }
println("\t__assume((reinterpret_cast<size_t>(base) & 0xFFFFFFFF) == 0);"); println("\t__assume((reinterpret_cast<size_t>(base) & 0xFFFFFFFF) == 0);");
@ -739,104 +739,129 @@ int main(int argc, char* argv[])
// TODO: fpu operations require denormal flushing checks // TODO: fpu operations require denormal flushing checks
case PPC_INST_FABS: case PPC_INST_FABS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = fabs(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = fabs(ctx.f{}.f64);", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FADD: case PPC_INST_FADD:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 + ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = ctx.f{}.f64 + ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FADDS: case PPC_INST_FADDS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = float(ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FCFID: case PPC_INST_FCFID:
// TODO: rounding mode? // TODO: rounding mode?
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.s64;", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = ctx.f{}.s64;", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FCMPU: case PPC_INST_FCMPU:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.cr{}.compare(ctx.f{}.f64, ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.cr{}.compare(ctx.f{}.f64, ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FCTID: case PPC_INST_FCTID:
// TODO: rounding mode? // TODO: rounding mode?
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.s64 = ctx.f{}.f64;", insn.operands[0], insn.operands[1]); println("\tctx.f{}.s64 = ctx.f{}.f64;", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FCTIDZ: case PPC_INST_FCTIDZ:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.s64 = trunc(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); println("\tctx.f{}.s64 = trunc(ctx.f{}.f64);", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FCTIWZ: case PPC_INST_FCTIWZ:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.s32 = trunc(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); println("\tctx.f{}.s32 = trunc(ctx.f{}.f64);", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FDIV: case PPC_INST_FDIV:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 / ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = ctx.f{}.f64 / ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FDIVS: case PPC_INST_FDIVS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64 / ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = float(ctx.f{}.f64 / ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FMADD: case PPC_INST_FMADD:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FMADDS: case PPC_INST_FMADDS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FMR: case PPC_INST_FMR:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64;", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = ctx.f{}.f64;", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FMSUB: case PPC_INST_FMSUB:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FMSUBS: case PPC_INST_FMSUBS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FMUL: case PPC_INST_FMUL:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = ctx.f{}.f64 * ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FMULS: case PPC_INST_FMULS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = float(ctx.f{}.f64 * ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FNABS: case PPC_INST_FNABS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = -fabs(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = -fabs(ctx.f{}.f64);", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FNEG: case PPC_INST_FNEG:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = -ctx.f{}.f64;", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = -ctx.f{}.f64;", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FNMADDS: case PPC_INST_FNMADDS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = -float(ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = -float(ctx.f{}.f64 * ctx.f{}.f64 + ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FNMSUB: case PPC_INST_FNMSUB:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = -(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = -(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FNMSUBS: case PPC_INST_FNMSUBS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = -float(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = -float(ctx.f{}.f64 * ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_FRES: case PPC_INST_FRES:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = 1.0f / float(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = 1.0f / float(ctx.f{}.f64);", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FRSP: case PPC_INST_FRSP:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64);", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = float(ctx.f{}.f64);", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FSEL: case PPC_INST_FSEL:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 >= 0.0 ? ctx.f{}.f64 : ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\tctx.f{}.f64 = ctx.f{}.f64 >= 0.0 ? ctx.f{}.f64 : ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
@ -845,14 +870,17 @@ int main(int argc, char* argv[])
break; break;
case PPC_INST_FSQRTS: case PPC_INST_FSQRTS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(sqrt(ctx.f{}.f64));", insn.operands[0], insn.operands[1]); println("\tctx.f{}.f64 = float(sqrt(ctx.f{}.f64));", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_FSUB: case PPC_INST_FSUB:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = ctx.f{}.f64 - ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = ctx.f{}.f64 - ctx.f{}.f64;", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_FSUBS: case PPC_INST_FSUBS:
println("\tctx.csr.setFlushMode(false);");
println("\tctx.f{}.f64 = float(ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]); println("\tctx.f{}.f64 = float(ctx.f{}.f64 - ctx.f{}.f64);", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
@ -1546,6 +1574,7 @@ int main(int argc, char* argv[])
// TODO: vector instructions require denormal flushing checks // TODO: vector instructions require denormal flushing checks
case PPC_INST_VADDFP: case PPC_INST_VADDFP:
case PPC_INST_VADDFP128: case PPC_INST_VADDFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_add_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_add_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
@ -1596,16 +1625,19 @@ int main(int argc, char* argv[])
case PPC_INST_VCTSXS: case PPC_INST_VCTSXS:
case PPC_INST_VCFPSXWS128: case PPC_INST_VCFPSXWS128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_si128((__m128i*)ctx.v{}.s32, _mm_vctsxs(_mm_mul_ps(_mm_load_ps(ctx.v{}.f32), _mm_set1_ps(exp2f({})))));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_si128((__m128i*)ctx.v{}.s32, _mm_vctsxs(_mm_mul_ps(_mm_load_ps(ctx.v{}.f32), _mm_set1_ps(exp2f({})))));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_VCFSX: case PPC_INST_VCFSX:
case PPC_INST_VCSXWFP128: case PPC_INST_VCSXWFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*)ctx.v{}.u32)), _mm_set1_ps(ldexpf(1.0f, {}))));", insn.operands[0], insn.operands[1], -int32_t(insn.operands[2])); println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_cvtepi32_ps(_mm_load_si128((__m128i*)ctx.v{}.u32)), _mm_set1_ps(ldexpf(1.0f, {}))));", insn.operands[0], insn.operands[1], -int32_t(insn.operands[2]));
break; break;
case PPC_INST_VCFUX: case PPC_INST_VCFUX:
case PPC_INST_VCUXWFP128: case PPC_INST_VCUXWFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*)ctx.v{}.u32)), _mm_set1_ps(ldexpf(1.0f, {}))));", insn.operands[0], insn.operands[1], -int32_t(insn.operands[2])); println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_cvtepu32_ps_(_mm_load_si128((__m128i*)ctx.v{}.u32)), _mm_set1_ps(ldexpf(1.0f, {}))));", insn.operands[0], insn.operands[1], -int32_t(insn.operands[2]));
break; break;
@ -1615,6 +1647,7 @@ int main(int argc, char* argv[])
case PPC_INST_VCMPEQFP: case PPC_INST_VCMPEQFP:
case PPC_INST_VCMPEQFP128: case PPC_INST_VCMPEQFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpeq_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpeq_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
@ -1633,6 +1666,7 @@ int main(int argc, char* argv[])
case PPC_INST_VCMPGEFP: case PPC_INST_VCMPGEFP:
case PPC_INST_VCMPGEFP128: case PPC_INST_VCMPGEFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpge_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpge_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\tctx.cr6.setFromMask(_mm_load_ps(ctx.v{}.f32), 0xF);", insn.operands[0]); println("\tctx.cr6.setFromMask(_mm_load_ps(ctx.v{}.f32), 0xF);", insn.operands[0]);
@ -1640,6 +1674,7 @@ int main(int argc, char* argv[])
case PPC_INST_VCMPGTFP: case PPC_INST_VCMPGTFP:
case PPC_INST_VCMPGTFP128: case PPC_INST_VCMPGTFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpgt_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_cmpgt_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
if (strchr(insn.opcode->name, '.')) if (strchr(insn.opcode->name, '.'))
println("\tctx.cr6.setFromMask(_mm_load_ps(ctx.v{}.f32), 0xF);", insn.operands[0]); println("\tctx.cr6.setFromMask(_mm_load_ps(ctx.v{}.f32), 0xF);", insn.operands[0]);
@ -1655,12 +1690,14 @@ int main(int argc, char* argv[])
case PPC_INST_VEXPTEFP128: case PPC_INST_VEXPTEFP128:
// TODO: vectorize // TODO: vectorize
println("\tctx.csr.setFlushMode(true);");
for (size_t i = 0; i < 4; i++) for (size_t i = 0; i < 4; i++)
println("\tctx.v{}.f32[{}] = exp2f(ctx.v{}.f32[{}]);", insn.operands[0], i, insn.operands[1], i); println("\tctx.v{}.f32[{}] = exp2f(ctx.v{}.f32[{}]);", insn.operands[0], i, insn.operands[1], i);
break; break;
case PPC_INST_VLOGEFP128: case PPC_INST_VLOGEFP128:
// TODO: vectorize // TODO: vectorize
println("\tctx.csr.setFlushMode(true);");
for (size_t i = 0; i < 4; i++) for (size_t i = 0; i < 4; i++)
println("\tctx.v{}.f32[{}] = log2f(ctx.v{}.f32[{}]);", insn.operands[0], i, insn.operands[1], i); println("\tctx.v{}.f32[{}] = log2f(ctx.v{}.f32[{}]);", insn.operands[0], i, insn.operands[1], i);
break; break;
@ -1668,11 +1705,13 @@ int main(int argc, char* argv[])
case PPC_INST_VMADDCFP128: case PPC_INST_VMADDCFP128:
case PPC_INST_VMADDFP: case PPC_INST_VMADDFP:
case PPC_INST_VMADDFP128: case PPC_INST_VMADDFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_fmadd_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_fmadd_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
case PPC_INST_VMAXFP: case PPC_INST_VMAXFP:
case PPC_INST_VMAXFP128: case PPC_INST_VMAXFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_max_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_max_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
@ -1682,6 +1721,7 @@ int main(int argc, char* argv[])
case PPC_INST_VMINFP: case PPC_INST_VMINFP:
case PPC_INST_VMINFP128: case PPC_INST_VMINFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_min_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_min_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
@ -1713,19 +1753,23 @@ int main(int argc, char* argv[])
case PPC_INST_VMSUM3FP128: case PPC_INST_VMSUM3FP128:
// NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz // NOTE: accounting for full vector reversal here. should dot product yzw instead of xyz
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_dp_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), 0xEF));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_dp_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), 0xEF));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_VMSUM4FP128: case PPC_INST_VMSUM4FP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_dp_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), 0xFF));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_dp_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), 0xFF));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_VMULFP128: case PPC_INST_VMULFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_mul_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;
case PPC_INST_VNMSUBFP: case PPC_INST_VNMSUBFP:
case PPC_INST_VNMSUBFP128: case PPC_INST_VNMSUBFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_fnmadd_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_fnmadd_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2], insn.operands[3]);
break; break;
@ -1754,6 +1798,7 @@ int main(int argc, char* argv[])
case PPC_INST_VPKD3D128: case PPC_INST_VPKD3D128:
// TODO: vectorize somehow? // TODO: vectorize somehow?
// NOTE: handling vector reversal here too // NOTE: handling vector reversal here too
println("\tctx.csr.setFlushMode(true);");
switch (insn.operands[2]) switch (insn.operands[2])
{ {
case 0: // D3D color case 0: // D3D color
@ -1780,19 +1825,23 @@ int main(int argc, char* argv[])
case PPC_INST_VREFP: case PPC_INST_VREFP:
case PPC_INST_VREFP128: case PPC_INST_VREFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_rcp_ps(_mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_rcp_ps(_mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_VRFIM128: case PPC_INST_VRFIM128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_VRFIN: case PPC_INST_VRFIN:
case PPC_INST_VRFIN128: case PPC_INST_VRFIN128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]);
break; break;
case PPC_INST_VRFIZ128: case PPC_INST_VRFIZ128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_round_ps(_mm_load_ps(ctx.v{}.f32), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));", insn.operands[0], insn.operands[1]);
break; break;
@ -1805,6 +1854,7 @@ int main(int argc, char* argv[])
case PPC_INST_VRSQRTEFP: case PPC_INST_VRSQRTEFP:
case PPC_INST_VRSQRTEFP128: case PPC_INST_VRSQRTEFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_rsqrt_ps(_mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_rsqrt_ps(_mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1]);
break; break;
@ -1890,6 +1940,7 @@ int main(int argc, char* argv[])
case PPC_INST_VSUBFP: case PPC_INST_VSUBFP:
case PPC_INST_VSUBFP128: case PPC_INST_VSUBFP128:
println("\tctx.csr.setFlushMode(true);");
println("\t_mm_store_ps(ctx.v{}.f32, _mm_sub_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]); println("\t_mm_store_ps(ctx.v{}.f32, _mm_sub_ps(_mm_load_ps(ctx.v{}.f32), _mm_load_ps(ctx.v{}.f32)));", insn.operands[0], insn.operands[1], insn.operands[2]);
break; break;

View File

@ -3,7 +3,7 @@ project("PowerSample")
set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_COMPILER "clang-cl") set(CMAKE_CXX_COMPILER "clang-cl")
set(CMAKE_C_COMPILER "clang-cl") set(CMAKE_C_COMPILER "clang-cl")
add_compile_options("-march=x86-64-v3") add_compile_options("/clang:-march=x86-64-v3")
file(GLOB RecompiledFiles *.cpp) file(GLOB RecompiledFiles *.cpp)
add_library(PowerSample ${RecompiledFiles}) add_library(PowerSample ${RecompiledFiles})

View File

@ -13,13 +13,13 @@
#define isnan __builtin_isnan #define isnan __builtin_isnan
#define __assume __builtin_assume #define __assume __builtin_assume
#define __unreachable() __builtin_unreachable() #define __unreachable() __builtin_unreachable()
#define PPC_FUNC extern "C" __attribute__((noinline))
#else #else
#include <intrin.h> #include <intrin.h>
#define PPC_FUNC extern "C" __declspec(noinline)
#define __unreachable() __assume(0) #define __unreachable() __assume(0)
#endif #endif
#define PPC_FUNC(x) extern "C" void x(PPCContext& __restrict ctx, uint8_t* base) noexcept
#define PPC_LOAD_U8(x) *(uint8_t*)(base + (x)) #define PPC_LOAD_U8(x) *(uint8_t*)(base + (x))
#define PPC_LOAD_U16(x) _byteswap_ushort(*(uint16_t*)(base + (x))) #define PPC_LOAD_U16(x) _byteswap_ushort(*(uint16_t*)(base + (x)))
#define PPC_LOAD_U32(x) _byteswap_ulong(*(uint32_t*)(base + (x))) #define PPC_LOAD_U32(x) _byteswap_ulong(*(uint32_t*)(base + (x)))
@ -128,6 +128,28 @@ struct alignas(0x10) PPCVRegister
}; };
}; };
// Cached copy of the host SSE control/status register (MXCSR).
//
// `value` mirrors what was last read from or written to the register through
// this struct, so repeated requests for the same flush mode do not touch the
// hardware register again. `value` has no initializer here, so it must be
// populated (e.g. via storeValue()) before setFlushMode() is used.
struct CSRRegister
{
    uint32_t value;

    // Reads the current MXCSR contents into the cache.
    void storeValue()
    {
        value = _mm_getcsr();
    }

    // Sets (enable == true) or clears (enable == false) both the
    // flush-to-zero and denormals-are-zero bits, writing MXCSR only when the
    // cached value would actually change.
    void setFlushMode(bool enable)
    {
        const uint32_t flushBits = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
        const uint32_t desired = enable ? (value | flushBits) : (value & ~flushBits);

        // Already in the requested mode according to the cache: nothing to do.
        if (desired == value)
            return;

        _mm_setcsr(desired);
        value = desired;
    }
};
struct PPCContext struct PPCContext
{ {
PPCFunc** fn; PPCFunc** fn;
@ -137,6 +159,7 @@ struct PPCContext
PPCRegister reserved; PPCRegister reserved;
uint32_t msr; uint32_t msr;
uint32_t fpscr; uint32_t fpscr;
CSRRegister csr;
union union
{ {