Merge 775ad31136b8be0690bf9895a26862e1117a2bef into 865319a39cec873370500d26ce775959a4c5e784

2025-06-23 16:05:30 +00:00 · 2025-04-17 13:58:29 +02:00 · 2025-04-17 13:58:29 +02:00 · 843ee91fcc
commit 843ee91fcc
parent 865319a39c 775ad31136
7 changed files with 180 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -89,7 +89,7 @@ Additionally, mid-asm hooks can be inserted directly into the translated C++ cod
 XenonAnalyse, when used as a command-line application, allows an XEX file to be passed as an input argument to output a TOML file containing all the detected jump tables in the executable:

 ```
-XenonAnalyse [input XEX file path] [output jump table TOML file path]
+XenonAnalyse [input XEX file path] [output TOML file path]
 ```

 However, as explained in the earlier sections, due to variations between games, additional support may be needed to handle different patterns.
@ -100,6 +100,7 @@ However, as explained in the earlier sections, due to variations between games,

 XenonRecomp accepts a TOML file with recompiler configurations and the path to the `ppc_context.h` file located in the XenonUtils directory:

+
 ```
 XenonRecomp [input TOML file path] [input PPC context header file path]
 ```
@ -110,7 +111,7 @@ XenonRecomp [input TOML file path] [input PPC context header file path]

 ```toml
 [main]
-file_path = "../private/default.xex"
+file_path = "./private/default.xex"
 patch_file_path = "../private/default.xexp"
 patched_file_path = "../private/default_patched.xex"
 out_directory_path = "../ppc"
@ -249,11 +250,15 @@ Once the files are generated, refresh XenonTests' CMake cache to make them appea

 ## Building

-The project requires CMake 3.20 or later and Clang 18 or later to build. Since the repository includes submodules, ensure you clone it recursively.

-Compilers other than Clang have not been tested and are not recommended, including for recompilation output. The project relies on compiler-specific intrinsics and techniques that may not function correctly on other compilers, and many optimization methods depend on Clang's code generation.
+### Windows (MSYS2)
+- Install [MSYS2](https://www.msys2.org/) and use the "MSYS2 CLANG64" environment to build the project.

-On Windows, you can use the clang-cl toolset and open the project in Visual Studio's CMake integration.
+- First, install the necessary packages (`mingw-w64-clang-x86_64-cmake`, `mingw-w64-clang-x86_64-libc++`, `mingw-w64-clang-x86_64-clang` and `mingw-w64-clang-x86_64-ninja`) with `pacman -S <package_name>`.
+
+- Then, change into the cloned repository's directory (your C drive is available as the `/c` folder inside MSYS2) and run `cmake -DCMAKE_BUILD_TYPE=Debug .`, which generates a `build.ninja` file for the project.
+
+- Finally, run the `ninja` command to build the executables. Launching them will report a missing `libc++.dll` file, which you can copy into your current folder with the `cp /clang64/bin/libc++.dll .` command.

 ## Special Thanks

--- a/XenonRecomp/CMakeLists.txt
+++ b/XenonRecomp/CMakeLists.txt
@ -2,6 +2,11 @@ cmake_minimum_required (VERSION 3.8)

 project("XenonRecomp")

+# Find required packages
+find_package(fmt REQUIRED)
+find_package(tomlplusplus REQUIRED)
+find_package(xxHash REQUIRED)
+
 add_executable(XenonRecomp 
    "main.cpp" 
    "recompiler.cpp"
--- a/XenonRecomp/recompiler.cpp
+++ b/XenonRecomp/recompiler.cpp
@ -800,6 +800,13 @@ bool Recompiler::Recompile(
        // no op
        break;

+
+    case PPC_INST_EQV:
+        println("\t{}.u64 = ~({}.u64 ^ {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare<int64_t>({}.s64, 0, {});", cr(0), r(insn.operands[0]), xer()); // Check if CR0 comparison uses s64
+        break;
+
    case PPC_INST_DCBF:
        // no op
        break;
@ -808,6 +815,10 @@ bool Recompiler::Recompile(
        // no op
        break;

+    case PPC_INST_DCBST:
+        // no op
+        break;    
+
    case PPC_INST_DCBTST:
        // no op
        break;
@ -990,6 +1001,19 @@ bool Recompiler::Recompile(
        println("\t{}.f64 = double(float({}.f64));", f(insn.operands[0]), f(insn.operands[1]));
        break;

+    case PPC_INST_FRSQRTE:
+        printSetFlushMode(false);
+        // Emits a scoped block that estimates 1/sqrt(source) with the single-precision SSE intrinsic _mm_rsqrt_ss.
+        println("\t{{");
+        println("\t\t__m128d val_pd = _mm_load_sd(&{}.f64);", f(insn.operands[1])); // _mm_load_sd yields __m128d, not __m128
+        println("\t\t__m128 val_ss = _mm_cvtpd_ps(val_pd);"); // narrow double -> single
+        println("\t\t__m128 rsqrt_est_ss = _mm_rsqrt_ss(val_ss);"); // estimate in lane 0 (single precision)
+        println("\t\t__m128d result_pd = _mm_cvtps_pd(rsqrt_est_ss);"); // widen back; _mm_cvtps_pd yields __m128d
+        println("\t\t_mm_store_sd(&{}.f64, result_pd);", f(insn.operands[0])); // write lane 0 to the destination register
+        println("\t}}");
+        // NOTE(review): FPSCR side effects are not modelled here; confirm against the PowerPC ISA if they matter.
+        break;
+
    case PPC_INST_FSEL:
        printSetFlushMode(false);
        println("\t{}.f64 = {}.f64 >= 0.0 ? {}.f64 : {}.f64;", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[2]), f(insn.operands[3]));
@ -1315,10 +1339,28 @@ bool Recompiler::Recompile(
        println("\t{}.s64 = {}.s64 * {}.s64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
        break;

+    case PPC_INST_MULHD:
+        println("\t{}.s64 = ((__int128_t){}.s64 * (__int128_t){}.s64) >> 64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare<int64_t>({}.s64, 0, {});", cr(0), r(insn.operands[0]), xer()); // Check if CR0 comparison uses s64
+        break;
+        
+    case PPC_INST_MULHDU:
+        println("\t{}.u64 = ((__uint128_t){}.u64 * (__uint128_t){}.u64) >> 64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare<int64_t>({}.s64, 0, {});", cr(0), r(insn.operands[0]), xer()); // Check if CR0 comparison uses s64 or u64
+        break;   
+
    case PPC_INST_MULLI:
        println("\t{}.s64 = {}.s64 * {};", r(insn.operands[0]), r(insn.operands[1]), int32_t(insn.operands[2]));
        break;

+    case PPC_INST_MULLHWU: // Verify this ID exists
+        println("\t{}.u64 = (uint32_t)(({}.u64 & 0xFFFF) * ({}.u64 & 0xFFFF));", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
    case PPC_INST_MULLW:
        println("\t{}.s64 = int64_t({}.s32) * int64_t({}.s32);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
        if (strchr(insn.opcode->name, '.'))
@ -1339,6 +1381,19 @@ bool Recompiler::Recompile(
        // no op
        break;

+    case PPC_INST_VNOR128:
+        printSetFlushMode(true);
+        println("\t{{");
+        println("\t\t__m128i vra = _mm_load_si128((__m128i*){}.u8);", v(insn.operands[1])); // Load VRA
+        println("\t\t__m128i vrb = _mm_load_si128((__m128i*){}.u8);", v(insn.operands[2])); // Load VRB
+        println("\t\t__m128i or_result = _mm_or_si128(vra, vrb);"); // VRA | VRB
+        // Invert bits using XOR with all ones (~(A|B))
+        println("\t\t__m128i all_ones = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());");
+        println("\t\t__m128i nor_result = _mm_xor_si128(or_result, all_ones);");
+        println("\t\t_mm_store_si128((__m128i*){}.u8, nor_result);", v(insn.operands[0])); // Store VRT
+        println("\t}}");
+        break;    
+
    case PPC_INST_NOR:
        println("\t{}.u64 = ~({}.u64 | {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
        break;
@ -1539,6 +1594,14 @@ bool Recompiler::Recompile(
        println("{}.u32, {}.u64);", r(insn.operands[2]), f(insn.operands[0]));
        break;

+    case PPC_INST_STFSU:
+        printSetFlushMode(false);
+        println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0])); // Convert FRS (double) to float in temp
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); // Calculate EA = RA + D
+        println("\tPPC_STORE_U32({}, {}.u32);", ea(), temp()); // Store float bits
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea()); // Update RA with EA
+        break;
+
    case PPC_INST_STFIWX:
        printSetFlushMode(false);
        print("{}", mmioStore() ? "\tPPC_MM_STORE_U32(" : "\tPPC_STORE_U32(");
@ -1769,6 +1832,12 @@ bool Recompiler::Recompile(
        println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
        break;

+    case PPC_INST_VANDC:
+        printSetFlushMode(true);
+        // Computes VRA & ~VRB using _mm_andnot_si128(VRB, VRA)
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); // VRT, VRB, VRA
+        break;
+
    case PPC_INST_VANDC128:
        println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
        break;
@ -1795,6 +1864,22 @@ bool Recompiler::Recompile(
            println("_mm_load_ps({}.f32)));", v(insn.operands[1]));
        break;

+    case PPC_INST_VCFPUXWS128: // Or PPC_INST_VCTUXS if that's the ID used
+        printSetFlushMode(true);
+        println("\t{{");
+        println("\t\t__m128 vrbf = _mm_load_ps({}.f32);", v(insn.operands[1])); // Load VRB floats
+        if (insn.operands[2] != 0) { // Check UIMM (operand 2)
+            // Scale VRB by 2^UIMM before converting
+            println("\t\tfloat scale = ldexpf(1.0f, {});", (int32_t)insn.operands[2]); // Calculate 2^UIMM
+            println("\t\t__m128 scale_ps = _mm_set1_ps(scale);");
+            println("\t\tvrbf = _mm_mul_ps(vrbf, scale_ps);");
+        }
+        // Use the helper function from ppc_context.h which handles conversion and saturation
+        println("\t\t__m128i result = _mm_vctuxs(vrbf);");
+        println("\t\t_mm_store_si128((__m128i*){}.u32, result);", v(insn.operands[0])); // Store VRT
+        println("\t}}");
+        break;  
+
    case PPC_INST_VCFSX:
    case PPC_INST_VCSXWFP128:
    {
@ -1918,6 +2003,28 @@ bool Recompiler::Recompile(
        println("\t_mm_store_ps({}.f32, _mm_min_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
        break;

+    case PPC_INST_VPKSWSS:
+    case PPC_INST_VPKSWSS128: // Or PPC_INST_VPKSWSS
+        printSetFlushMode(true);
+        println("\t_mm_store_si128((__m128i*){}.s16, _mm_packs_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1])); // VRT, VRA, VRB
+        break;
+
+    case PPC_INST_VPKUWUS128:
+        printSetFlushMode(true);
+        println("\t{{");
+        println("\t\t__m128i max_val = _mm_set1_epi32(0xFFFF);"); // largest value representable in an unsigned 16-bit lane
+        println("\t\t__m128i vra = _mm_load_si128((__m128i*){}.u32);", v(insn.operands[2])); // first pack source (operand 2)
+        println("\t\t__m128i vrb = _mm_load_si128((__m128i*){}.u32);", v(insn.operands[1])); // second pack source (operand 1)
+        // Unsigned clamp of every 32-bit word to [0, 65535].
+        println("\t\tvra = _mm_min_epu32(vra, max_val);");
+        // Same clamp for the second source.
+        println("\t\tvrb = _mm_min_epu32(vrb, max_val);");
+        // Pack with UNSIGNED saturation. _mm_packs_epi32 saturates to int16 and would turn 0x8000..0xFFFF into 0x7FFF.
+        println("\t\t__m128i result = _mm_packus_epi32(vra, vrb);");
+        println("\t\t_mm_store_si128((__m128i*){}.u16, result);", v(insn.operands[0])); // store to the destination (operand 0)
+        println("\t}}");
+        break;
+
    case PPC_INST_VMRGHB:
        println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpackhi_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
        break;
@ -2047,6 +2154,14 @@ bool Recompiler::Recompile(
        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
        break;

+    case PPC_INST_VPKUHUS:
+    case PPC_INST_VPKUHUS128: // Or PPC_INST_VPKUHUS
+        printSetFlushMode(true);
+        // _mm_packus_epi16 reads its inputs as SIGNED 16-bit, so unsigned halfwords >= 0x8000 would pack to 0 instead of 0xFF.
+        // Clamp each halfword to 0xFF with an unsigned min first; the pack is then exact for every unsigned input.
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_min_epu16(_mm_load_si128((__m128i*){}.s16), _mm_set1_epi16(0xFF)), _mm_min_epu16(_mm_load_si128((__m128i*){}.s16), _mm_set1_epi16(0xFF))));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
    case PPC_INST_VREFP:
    case PPC_INST_VREFP128:
        // TODO: see if we can use rcp safely
@ -2088,7 +2203,11 @@ bool Recompiler::Recompile(
        break;

    case PPC_INST_VSEL:
-        println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
+    case PPC_INST_VSEL128: // Or PPC_INST_VSEL
+        printSetFlushMode(true);
+        // vsel is a BITWISE select: VRT = (VRA & ~VRC) | (VRB & VRC), with operands 1, 2, 3 = VRA, VRB, VRC.
+        // _mm_blendv_epi8 only tests the top bit of each mask byte, so it is wrong for bit-granular masks; use and/andnot/or.
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
        break;

    case PPC_INST_VSLB:
--- a/thirdparty/disasm/CMakeFiles/disasm.dir/disasm.c.obj
+++ b/thirdparty/disasm/CMakeFiles/disasm.dir/disasm.c.obj
--- a/thirdparty/disasm/CMakeFiles/disasm.dir/ppc-dis.c.obj
+++ b/thirdparty/disasm/CMakeFiles/disasm.dir/ppc-dis.c.obj
--- a/thirdparty/disasm/cmake_install.cmake
+++ b/thirdparty/disasm/cmake_install.cmake
@ -0,0 +1,45 @@
+# Install script for directory: E:/recomp/XenonRecomp/thirdparty/disasm
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "C:/Program Files (x86)/disasm")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "Debug")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set path to fallback-tool for dependency-resolution.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "C:/msys64/clang64/bin/llvm-objdump.exe")
+endif()
+
+string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
+       "${CMAKE_INSTALL_MANIFEST_FILES}")
+if(CMAKE_INSTALL_LOCAL_ONLY)
+  file(WRITE "E:/recomp/XenonRecomp/thirdparty/disasm/install_local_manifest.txt"
+     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
+endif()
--- a/thirdparty/disasm/libdisasm.a
+++ b/thirdparty/disasm/libdisasm.a