From f0e4f11a2d369af554628a3995cbb81eaa0ab516 Mon Sep 17 00:00:00 2001 From: Dennis Stanistan Date: Mon, 3 Mar 2025 09:40:58 +0200 Subject: [PATCH 1/5] Add build instructions for Linux --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1621434..d6dba6a 100644 --- a/README.md +++ b/README.md @@ -252,7 +252,12 @@ The project requires CMake 3.20 or later and Clang 18 or later to build. Since t Compilers other than Clang have not been tested and are not recommended, including for recompilation output. The project relies on compiler-specific intrinsics and techniques that may not function correctly on other compilers, and many optimization methods depend on Clang's code generation. On Windows, you can use the clang-cl toolset and open the project in Visual Studio's CMake integration. +On Linux, you can build via the following commands after creating and entering the `build` directory: +``` +cmake -S .. -B . -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_EXE_LINKER_FLAGS="-lstdc++" +cmake --build . --config Release +``` ## Special Thanks -This project would not have been possible without the [Xenia](https://github.com/xenia-project/xenia) emulator, as many parts of the CPU code conversion process have been implemented by heavily referencing its PPC code translator. The project also uses code from [Xenia Canary](https://github.com/xenia-canary/xenia-canary) to patch XEX binaries. \ No newline at end of file +This project would not have been possible without the [Xenia](https://github.com/xenia-project/xenia) emulator, as many parts of the CPU code conversion process have been implemented by heavily referencing its PPC code translator. The project also uses code from [Xenia Canary](https://github.com/xenia-canary/xenia-canary) to patch XEX binaries. From 3c58953d3ec33ae669c3fa7b8f9c23495b5a4f1a Mon Sep 17 00:00:00 2001 From: Dennis Stanistan Date: Mon, 3 Mar 2025 09:42:07 +0200 Subject: [PATCH 2/5] Add another newline --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d6dba6a..abf7f6e 100644 --- a/README.md +++ b/README.md @@ -252,6 +252,7 @@ The project requires CMake 3.20 or later and Clang 18 or later to build. Since t Compilers other than Clang have not been tested and are not recommended, including for recompilation output. The project relies on compiler-specific intrinsics and techniques that may not function correctly on other compilers, and many optimization methods depend on Clang's code generation. On Windows, you can use the clang-cl toolset and open the project in Visual Studio's CMake integration. + On Linux, you can build via the following commands after creating and entering the `build` directory: ``` cmake -S .. -B .
-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_EXE_LINKER_FLAGS="-lstdc++" From c7d9e1e1b21ef39234706b66157fbb835bbcbe6e Mon Sep 17 00:00:00 2001 From: Dennis Stanistan Date: Tue, 4 Mar 2025 02:47:49 +0200 Subject: [PATCH 3/5] Swap between instructions in absoluteSwitch for tag 2 pattern --- XenonAnalyse/main.cpp | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/XenonAnalyse/main.cpp b/XenonAnalyse/main.cpp index d08371e..ed3540f 100644 --- a/XenonAnalyse/main.cpp +++ b/XenonAnalyse/main.cpp @@ -26,11 +26,11 @@ void ReadTable(Image& image, SwitchTable& table) uint32_t pOffset; ppc_insn insn; auto* code = (uint32_t*)image.Find(table.base); - ppc::Disassemble(code, table.base, insn); - pOffset = insn.operands[1] << 16; + ppc::Disassemble(code, table.base, insn); // lis + pOffset = insn.operands[1] << 16; // Upper 16 bits - ppc::Disassemble(code + 1, table.base + 4, insn); - pOffset += insn.operands[2]; + ppc::Disassemble(code + 2, table.base + 8, insn); // addi (skip rlwinm at +4) + pOffset += insn.operands[2]; // Lower 16 bits if (table.type == SWITCH_ABSOLUTE) { @@ -250,15 +250,16 @@ int main(int argc, char** argv) } }; - uint32_t absoluteSwitch[] = - { - PPC_INST_LIS, - PPC_INST_ADDI, - PPC_INST_RLWINM, - PPC_INST_LWZX, - PPC_INST_MTCTR, - PPC_INST_BCTR, - }; + // adjusted for tag 2 + uint32_t absoluteSwitch[] = + { + PPC_INST_LIS, + PPC_INST_RLWINM, // (slwi alias) + PPC_INST_ADDI, + PPC_INST_LWZX, + PPC_INST_MTCTR, + PPC_INST_BCTR + }; uint32_t computedSwitch[] = { From 914b61d9861dadc050305588767c2d743bf76be8 Mon Sep 17 00:00:00 2001 From: Dennis Stanistan Date: Tue, 4 Mar 2025 03:38:40 +0200 Subject: [PATCH 4/5] Prettify main.cpp thanks nano --- XenonAnalyse/main.cpp | 94 +++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/XenonAnalyse/main.cpp b/XenonAnalyse/main.cpp index ed3540f..99e5dcd 100644 --- a/XenonAnalyse/main.cpp +++ b/XenonAnalyse/main.cpp @@ -26,11 +26,11 @@ void ReadTable(Image& image, SwitchTable& table) uint32_t pOffset; ppc_insn insn; auto* code = (uint32_t*)image.Find(table.base); - ppc::Disassemble(code, table.base, insn); // lis - pOffset = insn.operands[1] << 16; // Upper 16 bits + ppc::Disassemble(code, table.base, insn); // lis + pOffset = insn.operands[1] << 16; // Upper 16 bits - ppc::Disassemble(code + 2, table.base + 8, insn); // addi (skip rlwinm at +4) - pOffset += insn.operands[2]; // Lower 16 bits + ppc::Disassemble(code + 2, table.base + 8, insn); // addi (skip rlwinm at +4) + pOffset += insn.operands[2]; // Lower 16 bits if (table.type == SWITCH_ABSOLUTE) { @@ -213,53 +213,53 @@ int main(int argc, char** argv) println("# Generated by XenonAnalyse"); auto scanPattern = [&](uint32_t* pattern, size_t count, size_t type) + { + for (const auto& section : image.sections) { - for (const auto& section : image.sections) + if (!(section.flags & SectionFlags_Code)) { - if (!(section.flags & SectionFlags_Code)) - { - continue; - } - - size_t base = section.base; - uint8_t* data = section.data; - uint8_t* dataStart = section.data; - uint8_t* dataEnd = section.data + section.size; - while (data < dataEnd && data != nullptr) - { - data = (uint8_t*)SearchMask(data, pattern, count, dataEnd - data); - - if (data != nullptr) - { - SwitchTable table{}; - table.type = type; - ScanTable((uint32_t*)data, base + (data - dataStart), table); - - // fmt::println("{:X} ; jmptable - {}", base + (data - dataStart), table.labels.size()); - if (table.base != 0) - { - 
ReadTable(image, table); - printTable(table); - switches.emplace_back(std::move(table)); - } - - data += 4; - } - continue; - } + continue; } - }; - // adjusted for tag 2 - uint32_t absoluteSwitch[] = - { - PPC_INST_LIS, - PPC_INST_RLWINM, // (slwi alias) - PPC_INST_ADDI, - PPC_INST_LWZX, - PPC_INST_MTCTR, - PPC_INST_BCTR - }; + size_t base = section.base; + uint8_t* data = section.data; + uint8_t* dataStart = section.data; + uint8_t* dataEnd = section.data + section.size; + while (data < dataEnd && data != nullptr) + { + data = (uint8_t*)SearchMask(data, pattern, count, dataEnd - data); + + if (data != nullptr) + { + SwitchTable table{}; + table.type = type; + ScanTable((uint32_t*)data, base + (data - dataStart), table); + + // fmt::println("{:X} ; jmptable - {}", base + (data - dataStart), table.labels.size()); + if (table.base != 0) + { + ReadTable(image, table); + printTable(table); + switches.emplace_back(std::move(table)); + } + + data += 4; + } + continue; + } + } + }; + + // adjusted for tag 2 + uint32_t absoluteSwitch[] = + { + PPC_INST_LIS, + PPC_INST_RLWINM, // (slwi alias) + PPC_INST_ADDI, + PPC_INST_LWZX, + PPC_INST_MTCTR, + PPC_INST_BCTR + }; uint32_t computedSwitch[] = { From 3f02123e7e773418113650bc31f8c9cc12f59b75 Mon Sep 17 00:00:00 2001 From: dennis Date: Wed, 5 Mar 2025 09:10:24 +0200 Subject: [PATCH 5/5] Added more instructions & added changes --- XenonRecomp/recompiler.cpp | 497 +++++++++++++++++++++++++++--- XenonRecomp/recompiler_config.cpp | 9 + XenonUtils/byteswap.h | 32 +- XenonUtils/ppc_context.h | 81 +++++ XenonUtils/xex.cpp | 44 +-- XenonUtils/xex.h | 6 +- 6 files changed, 582 insertions(+), 87 deletions(-) diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp index e412551..0da9570 100644 --- a/XenonRecomp/recompiler.cpp +++ b/XenonRecomp/recompiler.cpp @@ -99,35 +99,53 @@ void Recompiler::Analyse() { if (i < 32) { - auto& restgpr = functions.emplace_back(); - restgpr.base = config.restGpr14Address + (i - 14) * 4; - restgpr.size = (32 - i) * 4 + 12; - image.symbols.emplace(Symbol{ fmt::format("__restgprlr_{}", i), restgpr.base, restgpr.size, Symbol_Function }); + if (config.restGpr14Address != 0) + { + auto& restgpr = functions.emplace_back(); + restgpr.base = config.restGpr14Address + (i - 14) * 4; + restgpr.size = (32 - i) * 4 + 12; + image.symbols.emplace(Symbol{ fmt::format("__restgprlr_{}", i), restgpr.base, restgpr.size, Symbol_Function }); + } - auto& savegpr = functions.emplace_back(); - savegpr.base = config.saveGpr14Address + (i - 14) * 4; - savegpr.size = (32 - i) * 4 + 8; - image.symbols.emplace(fmt::format("__savegprlr_{}", i), savegpr.base, savegpr.size, Symbol_Function); + if (config.saveGpr14Address != 0) + { + auto& savegpr = functions.emplace_back(); + savegpr.base = config.saveGpr14Address + (i - 14) * 4; + savegpr.size = (32 - i) * 4 + 8; + image.symbols.emplace(fmt::format("__savegprlr_{}", i), savegpr.base, savegpr.size, Symbol_Function); + } - auto& restfpr = functions.emplace_back(); - restfpr.base = config.restFpr14Address + (i - 14) * 4; - restfpr.size = (32 - i) * 4 + 4; - image.symbols.emplace(fmt::format("__restfpr_{}", i), restfpr.base, restfpr.size, Symbol_Function); + if (config.restFpr14Address != 0) + { + auto& restfpr = functions.emplace_back(); + restfpr.base = config.restFpr14Address + (i - 14) * 4; + restfpr.size = (32 - i) * 4 + 4; + image.symbols.emplace(fmt::format("__restfpr_{}", i), restfpr.base, restfpr.size, Symbol_Function); + } - auto& savefpr = functions.emplace_back(); - savefpr.base = 
config.saveFpr14Address + (i - 14) * 4; - savefpr.size = (32 - i) * 4 + 4; - image.symbols.emplace(fmt::format("__savefpr_{}", i), savefpr.base, savefpr.size, Symbol_Function); + if (config.saveFpr14Address != 0) + { + auto& savefpr = functions.emplace_back(); + savefpr.base = config.saveFpr14Address + (i - 14) * 4; + savefpr.size = (32 - i) * 4 + 4; + image.symbols.emplace(fmt::format("__savefpr_{}", i), savefpr.base, savefpr.size, Symbol_Function); + } - auto& restvmx = functions.emplace_back(); - restvmx.base = config.restVmx14Address + (i - 14) * 8; - restvmx.size = (32 - i) * 8 + 4; - image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); + if (config.restVmx14Address != 0) + { + auto& restvmx = functions.emplace_back(); + restvmx.base = config.restVmx14Address + (i - 14) * 8; + restvmx.size = (32 - i) * 8 + 4; + image.symbols.emplace(fmt::format("__restvmx_{}", i), restvmx.base, restvmx.size, Symbol_Function); + } - auto& savevmx = functions.emplace_back(); - savevmx.base = config.saveVmx14Address + (i - 14) * 8; - savevmx.size = (32 - i) * 8 + 4; - image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); + if (config.saveVmx14Address != 0) + { + auto& savevmx = functions.emplace_back(); + savevmx.base = config.saveVmx14Address + (i - 14) * 8; + savevmx.size = (32 - i) * 8 + 4; + image.symbols.emplace(fmt::format("__savevmx_{}", i), savevmx.base, savevmx.size, Symbol_Function); + } } if (i >= 64) @@ -506,6 +524,13 @@ bool Recompiler::Recompile( println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; + case PPC_INST_ADDC: + println("\t{}.ca = {}.u32 > ~{}.u32;", xer(), r(insn.operands[2]), r(insn.operands[1])); + println("\t{}.u64 = {}.u64 + {}.u64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_ADDE: println("\t{}.u8 = ({}.u32 + {}.u32 < {}.u32) | ({}.u32 + {}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[2]), xer(), xer()); println("\t{}.u64 = {}.u64 + {}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), xer()); @@ -514,6 +539,14 @@ bool Recompiler::Recompile( println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; + case PPC_INST_ADDME: + println("\t{}.u8 = ({}.u32 - 1 < {}.u32) | ({}.u32 - 1 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[1]), xer(), xer()); + println("\t{}.u64 = {}.u64 - 1 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), xer()); + println("\t{}.ca = {}.u8;", xer(), temp()); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_ADDI: print("\t{}.s64 = ", r(insn.operands[0])); if (insn.operands[1] != 0) @@ -627,6 +660,14 @@ bool Recompiler::Recompile( println("\tif ({}.u32 == 0) goto loc_{:X};", ctr(), insn.operands[0]); break; + case PPC_INST_BDZF: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t--{}.u64;", ctr()); + println("\tif ({}.u32 == 0 && !{}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]); + break; + } + case PPC_INST_BDZLR: println("\t--{}.u64;", ctr()); println("\tif ({}.u32 == 0) return;", ctr(), insn.operands[0]); @@ -638,10 
+679,20 @@ bool Recompiler::Recompile( break; case PPC_INST_BDNZF: - // NOTE: assuming eq here as a shortcut because all the instructions in the game do that + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; println("\t--{}.u64;", ctr()); - println("\tif ({}.u32 != 0 && !{}.eq) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), insn.operands[1]); + println("\tif ({}.u32 != 0 && !{}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]); break; + } + + case PPC_INST_BDNZT: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t--{}.u64;", ctr()); + println("\tif ({}.u32 != 0 && {}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]); + break; + } case PPC_INST_BEQ: printConditionalBranch(false, "eq"); @@ -687,7 +738,7 @@ bool Recompiler::Recompile( break; case PPC_INST_BLRL: - println("__builtin_debugtrap();"); + println("__debugbreak();"); break; case PPC_INST_BLT: @@ -764,13 +815,27 @@ bool Recompiler::Recompile( break; case PPC_INST_CNTLZD: - println("\t{0}.u64 = {1}.u64 == 0 ? 64 : __builtin_clzll({1}.u64);", r(insn.operands[0]), r(insn.operands[1])); + println("\t{}.u64 = __lzcnt64({}.u64);", r(insn.operands[0]), r(insn.operands[1])); break; case PPC_INST_CNTLZW: - println("\t{0}.u64 = {1}.u32 == 0 ? 32 : __builtin_clz({1}.u32);", r(insn.operands[0]), r(insn.operands[1])); + println("\t{}.u64 = __lzcnt({}.u32);", r(insn.operands[0]), r(insn.operands[1])); break; + case PPC_INST_CROR: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t{}.{} = {}.{} | {}.{};", cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], cr(insn.operands[1] / 4), fields[insn.operands[1] % 4], cr(insn.operands[2] / 4), fields[insn.operands[2] % 4]); + break; + } + + case PPC_INST_CRORC: + { + constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" }; + println("\t{}.{} = {}.{} | (~{}.{} & 1);", cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], cr(insn.operands[1] / 4), fields[insn.operands[1] % 4], cr(insn.operands[2] / 4), fields[insn.operands[2] % 4]); + break; + } + case PPC_INST_DB16CYC: // no op break; @@ -783,6 +848,10 @@ bool Recompiler::Recompile( // no op break; + case PPC_INST_DCBST: + // no op + break; + case PPC_INST_DCBTST: // no op break; @@ -827,6 +896,12 @@ bool Recompiler::Recompile( // no op break; + case PPC_INST_EQV: + println("\t{}.u64 = ~({}.u64 ^ {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_EXTSB: println("\t{}.s64 = {}.s8;", r(insn.operands[0]), r(insn.operands[1])); if (strchr(insn.opcode->name, '.')) @@ -1010,6 +1085,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LBZUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U8({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LD: print("\t{}.u64 = PPC_LOAD_U64(", r(insn.operands[0])); if (insn.operands[2] != 0) @@ -1038,6 +1119,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LDUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea()); + 
println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LFD: printSetFlushMode(false); print("\t{}.u64 = PPC_LOAD_U64(", f(insn.operands[0])); @@ -1046,6 +1133,13 @@ bool Recompiler::Recompile( println("{});", int32_t(insn.operands[1])); break; + case PPC_INST_LFDU: + printSetFlushMode(false); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_LFDX: printSetFlushMode(false); print("\t{}.u64 = PPC_LOAD_U64(", f(insn.operands[0])); @@ -1054,6 +1148,13 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LFDUX: + printSetFlushMode(false); + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LFS: printSetFlushMode(false); print("\t{}.u32 = PPC_LOAD_U32(", temp()); @@ -1063,6 +1164,14 @@ bool Recompiler::Recompile( println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); break; + case PPC_INST_LFSU: + printSetFlushMode(false); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u32 = PPC_LOAD_U32({});", temp(), ea()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); + break; + case PPC_INST_LFSX: printSetFlushMode(false); print("\t{}.u32 = PPC_LOAD_U32(", temp()); @@ -1072,6 +1181,14 @@ bool Recompiler::Recompile( println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); break; + case PPC_INST_LFSUX: + printSetFlushMode(false); + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u32 = PPC_LOAD_U32({});", temp(), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp()); + break; + case PPC_INST_LHA: print("\t{}.s64 = int16_t(PPC_LOAD_U16(", r(insn.operands[0])); if (insn.operands[2] != 0) @@ -1079,6 +1196,12 @@ bool Recompiler::Recompile( println("{}));", int32_t(insn.operands[1])); break; + case PPC_INST_LHAU: + print("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + print("\t{}.s64 = int16_t(PPC_LOAD_U16({}));", r(insn.operands[0]), ea()); + print("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_LHAX: print("\t{}.s64 = int16_t(PPC_LOAD_U16(", r(insn.operands[0])); if (insn.operands[1] != 0) @@ -1093,6 +1216,12 @@ bool Recompiler::Recompile( println("{});", int32_t(insn.operands[1])); break; + case PPC_INST_LHZU: + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U16({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_LHZX: print("\t{}.u64 = PPC_LOAD_U16(", r(insn.operands[0])); if (insn.operands[1] != 0) @@ -1100,6 +1229,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LHZUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U16({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_LI: println("\t{}.s64 = {};", r(insn.operands[0]), 
int32_t(insn.operands[1])); break; @@ -1112,6 +1247,7 @@ bool Recompiler::Recompile( case PPC_INST_LVEWX128: case PPC_INST_LVX: case PPC_INST_LVX128: + case PPC_INST_LVEHX: // NOTE: for endian swapping, we reverse the whole vector instead of individual elements. // this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz) print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0])); @@ -1207,6 +1343,12 @@ bool Recompiler::Recompile( println("{}.u32);", r(insn.operands[2])); break; + case PPC_INST_LWZUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = PPC_LOAD_U32({});", r(insn.operands[0]), ea()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_MFCR: for (size_t i = 0; i < 32; i++) { @@ -1343,43 +1485,43 @@ bool Recompiler::Recompile( break; case PPC_INST_RLDICL: - println("\t{}.u64 = __builtin_rotateleft64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3], 63)); + println("\t{}.u64 = _rotl64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3], 63)); break; case PPC_INST_RLDICR: - println("\t{}.u64 = __builtin_rotateleft64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(0, insn.operands[3])); + println("\t{}.u64 = _rotl64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(0, insn.operands[3])); break; case PPC_INST_RLDIMI: { const uint64_t mask = ComputeMask(insn.operands[3], ~insn.operands[2]); - println("\t{}.u64 = (__builtin_rotateleft64({}.u64, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask); + println("\t{}.u64 = (_rotl64({}.u64, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask); break; } case PPC_INST_RLWIMI: { const uint64_t mask = ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32); - println("\t{}.u64 = (__builtin_rotateleft32({}.u32, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask); + println("\t{}.u64 = (_rotl({}.u32, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask); break; } case PPC_INST_RLWINM: - println("\t{}.u64 = __builtin_rotateleft64({}.u32 | ({}.u64 << 32), {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32)); + println("\t{}.u64 = _rotl64({}.u32 | ({}.u64 << 32), {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32)); if (strchr(insn.opcode->name, '.')) println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; case PPC_INST_ROTLDI: - println("\t{}.u64 = __builtin_rotateleft64({}.u64, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]); + println("\t{}.u64 = _rotl64({}.u64, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]); break; case PPC_INST_ROTLW: - println("\t{}.u64 = __builtin_rotateleft32({}.u32, {}.u8 & 0x1F);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}.u64 = 
_rotl({}.u32, {}.u8 & 0x1F);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); break; case PPC_INST_ROTLWI: - println("\t{}.u64 = __builtin_rotateleft32({}.u32, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]); + println("\t{}.u64 = _rotl({}.u32, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]); if (strchr(insn.opcode->name, '.')) println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; @@ -1457,7 +1599,7 @@ bool Recompiler::Recompile( case PPC_INST_STBU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U8({}, {}.u8);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u8);", mmioStore() ? "PPC_MM_STORE_U8(" : "PPC_STORE_U8(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; @@ -1468,6 +1610,12 @@ bool Recompiler::Recompile( println("{}.u32, {}.u8);", r(insn.operands[2]), r(insn.operands[0])); break; + case PPC_INST_STBUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u8);", mmioStore() ? "PPC_MM_STORE_U8(" : "PPC_STORE_U8(", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STD: print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64("); if (insn.operands[2] != 0) @@ -1478,16 +1626,16 @@ bool Recompiler::Recompile( case PPC_INST_STDCX: println("\t{}.lt = 0;", cr(0)); println("\t{}.gt = 0;", cr(0)); - print("\t{}.eq = __sync_bool_compare_and_swap(reinterpret_cast(base + ", cr(0)); + print("\t{}.eq = PPC_InterlockedCompareExchange64(reinterpret_cast(base + ", cr(0)); if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); - println("{}.u32), {}.s64, __builtin_bswap64({}.s64));", r(insn.operands[2]), reserved(), r(insn.operands[0])); + println("{}.u32), __builtin_bswap64({}.s64), {}.s64) == {}.s64;", r(insn.operands[2]), r(insn.operands[0]), reserved(), reserved()); println("\t{}.so = {}.so;", cr(0), xer()); break; case PPC_INST_STDU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; @@ -1498,6 +1646,12 @@ bool Recompiler::Recompile( println("{}.u32, {}.u64);", r(insn.operands[2]), r(insn.operands[0])); break; + case PPC_INST_STDUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STFD: printSetFlushMode(false); print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64("); @@ -1506,6 +1660,13 @@ bool Recompiler::Recompile( println("{}, {}.u64);", int32_t(insn.operands[1]), f(insn.operands[0])); break; + case PPC_INST_STFDU: + printSetFlushMode(false); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_STFDX: printSetFlushMode(false); print("{}", mmioStore() ? 
"\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64("); @@ -1531,6 +1692,14 @@ bool Recompiler::Recompile( println("{}, {}.u32);", int32_t(insn.operands[1]), temp()); break; + case PPC_INST_STFSU: + printSetFlushMode(false); + println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), temp()); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + case PPC_INST_STFSX: printSetFlushMode(false); println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); @@ -1540,6 +1709,14 @@ bool Recompiler::Recompile( println("{}.u32, {}.u32);", r(insn.operands[2]), temp()); break; + case PPC_INST_STFSUX: + printSetFlushMode(false); + println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), temp()); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STH: print("{}", mmioStore() ? "\tPPC_MM_STORE_U16(" : "\tPPC_STORE_U16("); if (insn.operands[2] != 0) @@ -1547,6 +1724,18 @@ bool Recompiler::Recompile( println("{}, {}.u16);", int32_t(insn.operands[1]), r(insn.operands[0])); break; + case PPC_INST_STHU: + println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u16);", mmioStore() ? "PPC_MM_STORE_U16(" : "PPC_STORE_U16(", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[2]), ea()); + break; + + case PPC_INST_STHUX: + println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); + println("\t{}{}, {}.u16);", mmioStore() ? "PPC_MM_STORE_U16(" : "PPC_STORE_U16(", ea(), r(insn.operands[0])); + println("\t{}.u32 = {};", r(insn.operands[1]), ea()); + break; + case PPC_INST_STHBRX: print("{}", mmioStore() ? "\tPPC_MM_STORE_U16(" : "\tPPC_STORE_U16("); if (insn.operands[1] != 0) @@ -1633,22 +1822,22 @@ bool Recompiler::Recompile( case PPC_INST_STWCX: println("\t{}.lt = 0;", cr(0)); println("\t{}.gt = 0;", cr(0)); - print("\t{}.eq = __sync_bool_compare_and_swap(reinterpret_cast(base + ", cr(0)); + print("\t{}.eq = PPC_InterlockedCompareExchange(reinterpret_cast(base + ", cr(0)); if (insn.operands[1] != 0) print("{}.u32 + ", r(insn.operands[1])); - println("{}.u32), {}.s32, __builtin_bswap32({}.s32));", r(insn.operands[2]), reserved(), r(insn.operands[0])); + println("{}.u32), __builtin_bswap32({}.s32), {}.s32) == {}.s32;", r(insn.operands[2]), r(insn.operands[0]), reserved(), reserved()); println("\t{}.so = {}.so;", cr(0), xer()); break; case PPC_INST_STWU: println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U32({}, {}.u32);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[2]), ea()); break; case PPC_INST_STWUX: println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); - println("\tPPC_STORE_U32({}, {}.u32);", ea(), r(insn.operands[0])); + println("\t{}{}, {}.u32);", mmioStore() ? 
"PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), r(insn.operands[0])); println("\t{}.u32 = {};", r(insn.operands[1]), ea()); break; @@ -1680,6 +1869,14 @@ bool Recompiler::Recompile( println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); break; + case PPC_INST_SUBFZE: + println("\t{}.u8 = (~{}.u32 < ~{}.u32) | (~{}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[1]), xer(), xer()); + println("\t{}.u64 = ~{}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), xer()); + println("\t{}.ca = {}.u8;", xer(), temp()); + if (strchr(insn.opcode->name, '.')) + println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer()); + break; + case PPC_INST_SUBFIC: println("\t{}.ca = {}.u32 <= {};", xer(), r(insn.operands[1]), insn.operands[2]); println("\t{}.s64 = {} - {}.s64;", r(insn.operands[0]), int32_t(insn.operands[2]), r(insn.operands[1])); @@ -1715,10 +1912,23 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VADDSBS: + println("\t_mm_store_si128((__m128i*){}.s8, _mm_adds_epi8(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VADDSHS: println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VADDSWS: + // TODO: vectorize + for (size_t i = 0; i < 4; i++) + { + println("\t{}.s64 = int64_t({}.s32[{}]) + int64_t({}.s32[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i); + println("\t{}.s32[{}] = {}.s64 > INT_MAX ? INT_MAX : {}.s64 < INT_MIN ? 
INT_MIN : {}.s64;", v(insn.operands[0]), i, temp(), temp(), temp()); + } + break; + case PPC_INST_VADDUBM: println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; @@ -1760,6 +1970,10 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VAVGUH: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VCTSXS: case PPC_INST_VCFPSXWS128: printSetFlushMode(true); @@ -1770,6 +1984,16 @@ bool Recompiler::Recompile( println("_mm_load_ps({}.f32)));", v(insn.operands[1])); break; + case PPC_INST_VCTUXS: + case PPC_INST_VCFPUXWS128: + printSetFlushMode(true); + print("\t_mm_store_si128((__m128i*){}.u32, _mm_vctuxs(", v(insn.operands[0])); + if (insn.operands[2] != 0) + println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]); + else + println("_mm_load_ps({}.f32)));", v(insn.operands[1])); + break; + case PPC_INST_VCFSX: case PPC_INST_VCSXWFP128: { @@ -1806,7 +2030,7 @@ bool Recompiler::Recompile( case PPC_INST_VCMPBFP: case PPC_INST_VCMPBFP128: - println("\t__builtin_debugtrap();"); + println("\t__debugbreak();"); break; case PPC_INST_VCMPEQFP: @@ -1823,6 +2047,12 @@ bool Recompiler::Recompile( println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; + case PPC_INST_VCMPEQUH: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); + break; + case PPC_INST_VCMPEQUW: case PPC_INST_VCMPEQUW128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); @@ -1848,10 +2078,26 @@ bool Recompiler::Recompile( case PPC_INST_VCMPGTUB: println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VCMPGTUH: println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0])); + break; + + case PPC_INST_VCMPGTSH: + println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s16), 0xFFFF);", cr(6), v(insn.operands[0])); + break; + + case 
PPC_INST_VCMPGTSW: + println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) + println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s32), 0xFFFF);", cr(6), v(insn.operands[0])); break; case PPC_INST_VEXPTEFP: @@ -1883,10 +2129,18 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VMAXSH: + println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VMAXSW: println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VMINSH: + println("\t_mm_store_si128((__m128i*){}.u16, _mm_min_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VMINFP: case PPC_INST_VMINFP128: printSetFlushMode(true); @@ -1990,16 +2244,41 @@ bool Recompiler::Recompile( break; default: - println("\t__builtin_debugtrap();"); + println("\t__debugbreak();"); break; } break; + case PPC_INST_VPKSHSS: + case PPC_INST_VPKSHSS128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + + case PPC_INST_VPKSWSS: + case PPC_INST_VPKSWSS128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + case PPC_INST_VPKSHUS: case PPC_INST_VPKSHUS128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); break; + case PPC_INST_VPKSWUS: + case PPC_INST_VPKSWUS128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + + case PPC_INST_VPKUHUS: + case PPC_INST_VPKUHUS128: + for (size_t i = 0; i < 8; i++) + { + println("\t{0}.u8[{1}] = {2}.u16[{1}] > UCHAR_MAX ? UCHAR_MAX : {2}.u16[{1}];", vTemp(), i, v(insn.operands[2])); + println("\t{0}.u8[{1}] = {2}.u16[{3}] > UCHAR_MAX ? 
UCHAR_MAX : {2}.u16[{3}];", vTemp(), i + 8, v(insn.operands[1]), i); + } + println("{} = {};", v(insn.operands[0]), vTemp()); + break; + case PPC_INST_VREFP: case PPC_INST_VREFP128: // TODO: see if we can use rcp safely @@ -2032,6 +2311,14 @@ bool Recompiler::Recompile( break; } + case PPC_INST_VRLH: + for (size_t i = 0; i < 8; i++) + { + println("\t{0}.u16[{1}] = ({2}.u16[{1}] << ({3}.u16[{1}] & 0xF)) | ({2}.u16[{1}] >> (16 - ({3}.u16[{1}] & 0xF)));", vTemp(), i, v(insn.operands[1]), v(insn.operands[2])); + } + println("{} = {};", v(insn.operands[0]), vTemp()); + break; + case PPC_INST_VRSQRTEFP: case PPC_INST_VRSQRTEFP128: // TODO: see if we can use rsqrt safely @@ -2041,6 +2328,7 @@ bool Recompiler::Recompile( break; case PPC_INST_VSEL: + case PPC_INST_VSEL128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2])); break; @@ -2050,6 +2338,12 @@ bool Recompiler::Recompile( println("\t{}.u8[{}] = {}.u8[{}] << ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); break; + case PPC_INST_VSLH: + // TODO: vectorize + for (size_t i = 0; i < 8; i++) + println("\t{}.u16[{}] = {}.u16[{}] << ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2); + break; + case PPC_INST_VSLDOI: case PPC_INST_VSLDOI128: println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]); @@ -2083,6 +2377,10 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]); break; + case PPC_INST_VSPLTISH: + println("\t_mm_store_si128((__m128i*){}.u16, _mm_set1_epi16(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); + break; + case PPC_INST_VSPLTISW: case PPC_INST_VSPLTISW128: println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]); @@ -2102,6 +2400,18 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VSRAB: + // TODO: vectorize, ensure endianness is correct + for (size_t i = 0; i < 16; i++) + println("\t{}.s8[{}] = {}.s8[{}] >> ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); + break; + + case PPC_INST_VSRAH: + // TODO: vectorize, ensure endianness is correct + for (size_t i = 0; i < 8; i++) + println("\t{}.s16[{}] = {}.s16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2); + break; + case PPC_INST_VSRAW: case PPC_INST_VSRAW128: // TODO: vectorize, ensure endianness is correct @@ -2109,6 +2419,12 @@ bool Recompiler::Recompile( println("\t{}.s32[{}] = {}.s32[{}] >> ({}.u8[{}] & 0x1F);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); break; + case PPC_INST_VSRH: + // TODO: vectorize, ensure endianness is correct + for (size_t i = 0; i < 8; i++) + println("\t{}.u16[{}] = {}.u16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, 
v(insn.operands[2]), i * 2); + break; + case PPC_INST_VSRW: case PPC_INST_VSRW128: // TODO: vectorize, ensure endianness is correct @@ -2122,6 +2438,15 @@ bool Recompiler::Recompile( println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VSUBSHS: + // TODO: vectorize + for (size_t i = 0; i < 8; i++) + { + println("\t{}.s64 = int64_t({}.s16[{}]) - int64_t({}.s16[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i); + println("\t{}.s16[{}] = {}.s64 > SHRT_MAX ? SHRT_MAX : {}.s64 < SHRT_MIN ? SHRT_MIN : {}.s64;", v(insn.operands[0]), i, temp(), temp(), temp()); + } + break; + case PPC_INST_VSUBSWS: // TODO: vectorize for (size_t i = 0; i < 4; i++) @@ -2135,8 +2460,12 @@ bool Recompiler::Recompile( println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; + case PPC_INST_VSUBUBM: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + case PPC_INST_VSUBUHM: - println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); break; case PPC_INST_VUPKD3D128: @@ -2166,7 +2495,7 @@ bool Recompiler::Recompile( break; default: - println("\t__builtin_debugtrap();"); + println("\t__debugbreak();"); break; } break; @@ -2216,6 +2545,74 @@ bool Recompiler::Recompile( println("\t{}.u64 = {}.u64 ^ {};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2] << 16); break; + case PPC_INST_MULHD: + println("\t{}.s64 = (int64_t({}.s32) * int64_t({}.s32)) >> 32;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + break; + + case PPC_INST_MULHDU: + println("\t{}.u64 = (uint64_t({}.u32) * uint64_t({}.u32)) >> 32;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2])); + break; + + case PPC_INST_VCMPGTUW: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + if (strchr(insn.opcode->name, '.')) // For vcmpgtuw. 
+ println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0])); + break; + + case PPC_INST_VANDC: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + + case PPC_INST_VNOR: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_xor_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi32(0xFFFFFFFF)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + + case PPC_INST_VNOR128: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_xor_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi32(0xFFFFFFFF)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + + case PPC_INST_VSL: + // Assuming byte-wise shift for generality (could be word/halfword depending on context) + for (size_t i = 0; i < 16; i++) + println("\t{}.u8[{}] = {}.u8[{}] << ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i); + break; + + case PPC_INST_VMAXUB: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_max_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + + case PPC_INST_VMINUB: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_min_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + + case PPC_INST_VPKUHUM: + println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); + break; + + case PPC_INST_VSUBUWS: + println("\t_mm_store_si128((__m128i*){}.u32, _mm_subs_epu32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + + case PPC_INST_VRLW128: + for (size_t i = 0; i < 4; i++) + println("\t{}.u32[{}] = ({}.u32[{}] << ({}.u8[{}] & 0x1F)) | ({}.u32[{}] >> (32 - ({}.u8[{}] & 0x1F)));", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4, v(insn.operands[1]), i, v(insn.operands[2]), i * 4); + break; + + case PPC_INST_MACLHWU: + println("\t{}.u64 = (({}.u32 & 0xFFFF) * ({}.u32 & 0xFFFF) + {}.u32) & 0xFFFFFFFF;", + r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[0])); + break; + + case PPC_INST_VSUBUWM: + println("\t_mm_store_si128((__m128i*){}.u32, _mm_sub_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", + v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2])); + break; + + case PPC_INST_MACCHWU: + println("\t{}.u64 = ((({}.u32 >> 16) * ({}.u32 & 0xFFFF)) + {}.u32) & 0xFFFFFFFF;", + r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[0])); + break; + default: return false; } @@ -2610,4 +3007,4 @@ void Recompiler::SaveCurrentOutData(const std::string_view& name) out.clear(); } -} +} \ No newline at end of file diff --git a/XenonRecomp/recompiler_config.cpp b/XenonRecomp/recompiler_config.cpp index d746b68..81330a4 100644 --- a/XenonRecomp/recompiler_config.cpp +++ b/XenonRecomp/recompiler_config.cpp @@ -38,6 +38,15 @@ void RecompilerConfig::Load(const std::string_view& configFilePath) longJmpAddress = main["longjmp_address"].value_or(0u); setJmpAddress 
= main["setjmp_address"].value_or(0u); + if (restGpr14Address == 0) fmt::println("ERROR: __restgprlr_14 address is unspecified"); + if (saveGpr14Address == 0) fmt::println("ERROR: __savegprlr_14 address is unspecified"); + if (restFpr14Address == 0) fmt::println("ERROR: __restfpr_14 address is unspecified"); + if (saveFpr14Address == 0) fmt::println("ERROR: __savefpr_14 address is unspecified"); + if (restVmx14Address == 0) fmt::println("ERROR: __restvmx_14 address is unspecified"); + if (saveVmx14Address == 0) fmt::println("ERROR: __savevmx_14 address is unspecified"); + if (restVmx64Address == 0) fmt::println("ERROR: __restvmx_64 address is unspecified"); + if (saveVmx64Address == 0) fmt::println("ERROR: __savevmx_64 address is unspecified"); + if (auto functionsArray = main["functions"].as_array()) { for (auto& func : *functionsArray) diff --git a/XenonUtils/byteswap.h b/XenonUtils/byteswap.h index 33e959f..4bb6dfe 100644 --- a/XenonUtils/byteswap.h +++ b/XenonUtils/byteswap.h @@ -1,25 +1,33 @@ -#pragma once +#pragma once #include -template -inline T ByteSwap(T value) +// https://github.com/hedge-dev/XenonRecomp/pull/35 +#ifdef __clang__ +#define _byte_swap16(value) __builtin_bswap16(static_cast(value)) +#define _byte_swap32(value) __builtin_bswap32(static_cast(value)) +#define _byte_swap64(value) __builtin_bswap64(static_cast(value)) +#elif defined(_MSC_VER) +#define _byte_swap16(value) _byteswap_ushort(static_cast(value)) +#define _byte_swap32(value) _byteswap_ulong(static_cast(value)) +#define _byte_swap64(value) _byteswap_uint64(static_cast(value)) +#endif + +template T ByteSwap(T value) { if constexpr (sizeof(T) == 1) return value; - else if constexpr (sizeof(T) == 2) - return static_cast(__builtin_bswap16(static_cast(value))); - else if constexpr (sizeof(T) == 4) - return static_cast(__builtin_bswap32(static_cast(value))); - else if constexpr (sizeof(T) == 8) - return static_cast(__builtin_bswap64(static_cast(value))); + if constexpr (sizeof(T) == 2) + return static_cast(_byte_swap16(value)); + if constexpr (sizeof(T) == 4) + return static_cast(_byte_swap32(value)); + if constexpr (sizeof(T) == 8) + return static_cast(_byte_swap64(value)); assert(false && "Unexpected byte size."); - return value; } -template -inline void ByteSwapInplace(T& value) +template void ByteSwapInplace(T& value) { value = ByteSwap(value); } diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h index bc427c9..f641359 100644 --- a/XenonUtils/ppc_context.h +++ b/XenonUtils/ppc_context.h @@ -651,10 +651,91 @@ inline __m128i _mm_vctsxs(__m128 src1) return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest)); } +inline __m128i _mm_vctuxs(__m128 src1) +{ + __m128 xmm0 = _mm_max_ps(src1, _mm_set1_epi32(0)); + __m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000)); + __m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000)); + xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1); + __m128i dest = _mm_cvttps_epi32(xmm0); + xmm0 = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN)); + xmm1 = _mm_and_si128(xmm1, _mm_set1_epi32(INT_MIN)); + dest = _mm_add_epi32(dest, xmm1); + return _mm_or_si128(dest, xmm0); +} + inline __m128i _mm_vsr(__m128i a, __m128i b) { b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61); return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10)); } +inline uint64_t _rotl64(uint64_t value, int shift) { + shift &= 63; // Normalize shift to 0-63 + return (value << shift) | (value >> (64 - shift)); +} + 
+inline uint32_t __lzcnt(uint32_t value) { + if (value == 0) return 32; + uint32_t count = 0; + while ((value & 0x80000000) == 0) { + count++; + value <<= 1; + } + return count; +} + +#ifdef _WIN32 + #pragma intrinsic(_InterlockedCompareExchange) + #define PPC_InterlockedCompareExchange _InterlockedCompareExchange +#else + // Fallback for GCC/Clang + inline long PPC_InterlockedCompareExchange(long volatile* Destination, long Exchange, long Comparand) { + int32_t expected = Comparand; + bool success = __atomic_compare_exchange_n( + reinterpret_cast(Destination), // Preserve volatile + &expected, + Exchange, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + ); + return success ? Comparand : expected; + } +#endif + +#ifdef _WIN32 + #pragma intrinsic(_InterlockedCompareExchange64) + #define PPC_InterlockedCompareExchange64 _InterlockedCompareExchange64 +#else + // Fallback for GCC/Clang + inline int64_t PPC_InterlockedCompareExchange64(int64_t volatile* Destination, int64_t Exchange, int64_t Comparand) { + int64_t expected = Comparand; + bool success = __atomic_compare_exchange_n( + reinterpret_cast(Destination), // Preserve volatile + &expected, + Exchange, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + ); + return success ? Comparand : expected; + } +#endif + + +#ifndef __debugbreak +#ifdef _WIN32 +#pragma intrinsic(__debugbreak) +#define __debugbreak __debugbreak +#else +// GCC/Clang/Linux fallback +#ifdef __x86_64__ +#define __debugbreak() asm volatile("int $0x3") +#else +#define __debugbreak() raise(SIGTRAP) +#endif +#endif +#endif + #endif diff --git a/XenonUtils/xex.cpp b/XenonUtils/xex.cpp index d1972c0..50a357e 100644 --- a/XenonUtils/xex.cpp +++ b/XenonUtils/xex.cpp @@ -11,26 +11,26 @@ #ifndef _WIN32 -typedef struct _IMAGE_DOS_HEADER { - uint16_t e_magic; - uint16_t e_cblp; - uint16_t e_cp; - uint16_t e_crlc; - uint16_t e_cparhdr; - uint16_t e_minalloc; - uint16_t e_maxalloc; - uint16_t e_ss; - uint16_t e_sp; - uint16_t e_csum; - uint16_t e_ip; - uint16_t e_cs; - uint16_t e_lfarlc; - uint16_t e_ovno; - uint16_t e_res[4]; - uint16_t e_oemid; - uint16_t e_oeminfo; - uint16_t e_res2[10]; - uint32_t e_lfanew; +typedef struct _IMAGE_DOS_HEADER { + uint16_t e_magic; + uint16_t e_cblp; + uint16_t e_cp; + uint16_t e_crlc; + uint16_t e_cparhdr; + uint16_t e_minalloc; + uint16_t e_maxalloc; + uint16_t e_ss; + uint16_t e_sp; + uint16_t e_csum; + uint16_t e_ip; + uint16_t e_cs; + uint16_t e_lfarlc; + uint16_t e_ovno; + uint16_t e_res[4]; + uint16_t e_oemid; + uint16_t e_oeminfo; + uint16_t e_res2[10]; + uint32_t e_lfanew; } IMAGE_DOS_HEADER, * PIMAGE_DOS_HEADER; typedef struct _IMAGE_FILE_HEADER { @@ -112,7 +112,7 @@ typedef struct _IMAGE_SECTION_HEADER { #endif -std::unordered_map XamExports = +std::unordered_map XamExports = { #include "xbox/xam_table.inc" }; @@ -217,7 +217,7 @@ Image Xex2LoadImage(const uint8_t* data, size_t dataSize) flags |= SectionFlags_Code; } - image.Map(reinterpret_cast(section.Name), section.VirtualAddress, + image.Map(reinterpret_cast(section.Name), section.VirtualAddress, section.Misc.VirtualSize, flags, image.data.get() + section.VirtualAddress); } diff --git a/XenonUtils/xex.h b/XenonUtils/xex.h index 9ab831e..5ad1951 100644 --- a/XenonUtils/xex.h +++ b/XenonUtils/xex.h @@ -193,7 +193,7 @@ struct Xex2ImportHeader be numImports; }; -struct Xex2ImportLibrary +struct Xex2ImportLibrary { be size; char nextImportDigest[0x14]; @@ -204,12 +204,12 @@ struct Xex2ImportLibrary be numberOfImports; }; -struct Xex2ImportDescriptor +struct Xex2ImportDescriptor { be 
firstThunk; // VA XEX_THUNK_DATA }; -struct Xex2ThunkData +struct Xex2ThunkData { union {