diff --git a/README.md b/README.md
index 0aa90e5..04834f3 100644
--- a/README.md
+++ b/README.md
@@ -255,6 +255,12 @@ Compilers other than Clang have not been tested and are not recommended, includi
 
 On Windows, you can use the clang-cl toolset and open the project in Visual Studio's CMake integration.
 
+On Linux, you can build via the following commands after creating and entering the `build` directory:
+```
+cmake -S .. -B . -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
+cmake --build . --config Release
+```
+
 ## Special Thanks
 
 This project could not have been possible without the [Xenia](https://github.com/xenia-project/xenia) emulator, as many parts of the CPU code conversion process has been implemented by heavily referencing its PPC code translator. The project also uses code from [Xenia Canary](https://github.com/xenia-canary/xenia-canary) to patch XEX binaries.
diff --git a/XenonAnalyse/main.cpp b/XenonAnalyse/main.cpp
index d08371e..99e5dcd 100644
--- a/XenonAnalyse/main.cpp
+++ b/XenonAnalyse/main.cpp
@@ -26,11 +26,11 @@ void ReadTable(Image& image, SwitchTable& table)
     uint32_t pOffset;
     ppc_insn insn;
     auto* code = (uint32_t*)image.Find(table.base);
-    ppc::Disassemble(code, table.base, insn);
-    pOffset = insn.operands[1] << 16;
+    ppc::Disassemble(code, table.base, insn);           // lis
+    pOffset = insn.operands[1] << 16;                   // Upper 16 bits
 
-    ppc::Disassemble(code + 1, table.base + 4, insn);
-    pOffset += insn.operands[2];
+    ppc::Disassemble(code + 2, table.base + 8, insn);   // addi (skip rlwinm at +4)
+    pOffset += insn.operands[2];                        // Lower 16 bits
 
     if (table.type == SWITCH_ABSOLUTE)
     {
@@ -213,51 +213,52 @@ int main(int argc, char** argv)
     println("# Generated by XenonAnalyse");
 
     auto scanPattern = [&](uint32_t* pattern, size_t count, size_t type)
+    {
+        for (const auto& section : image.sections)
         {
-            for (const auto& section : image.sections)
+            if (!(section.flags & SectionFlags_Code))
             {
-                if (!(section.flags & SectionFlags_Code))
-                {
-                    continue;
-                }
-
-                size_t base = section.base;
-                uint8_t* data = section.data;
-                uint8_t* dataStart = section.data;
-                uint8_t* dataEnd = section.data + section.size;
-                while (data < dataEnd && data != nullptr)
-                {
-                    data = (uint8_t*)SearchMask(data, pattern, count, dataEnd - data);
-
-                    if (data != nullptr)
-                    {
-                        SwitchTable table{};
-                        table.type = type;
-                        ScanTable((uint32_t*)data, base + (data - dataStart), table);
-
-                        // fmt::println("{:X} ; jmptable - {}", base + (data - dataStart), table.labels.size());
-                        if (table.base != 0)
-                        {
-                            ReadTable(image, table);
-                            printTable(table);
-                            switches.emplace_back(std::move(table));
-                        }
-
-                        data += 4;
-                    }
-                    continue;
-                }
+                continue;
             }
-        };
 
+            size_t base = section.base;
+            uint8_t* data = section.data;
+            uint8_t* dataStart = section.data;
+            uint8_t* dataEnd = section.data + section.size;
+            while (data < dataEnd && data != nullptr)
+            {
+                data = (uint8_t*)SearchMask(data, pattern, count, dataEnd - data);
+
+                if (data != nullptr)
+                {
+                    SwitchTable table{};
+                    table.type = type;
+                    ScanTable((uint32_t*)data, base + (data - dataStart), table);
+
+                    // fmt::println("{:X} ; jmptable - {}", base + (data - dataStart), table.labels.size());
+                    if (table.base != 0)
+                    {
+                        ReadTable(image, table);
+                        printTable(table);
+                        switches.emplace_back(std::move(table));
+                    }
+
+                    data += 4;
+                }
+                continue;
+            }
+        }
+    };
+
+    // adjusted for tag 2
     uint32_t absoluteSwitch[] =
     {
         PPC_INST_LIS,
+        PPC_INST_RLWINM,   // (slwi alias)
         PPC_INST_ADDI,
-        PPC_INST_RLWINM,
         PPC_INST_LWZX,
         PPC_INST_MTCTR,
-        PPC_INST_BCTR,
+        PPC_INST_BCTR
     };
 
     uint32_t computedSwitch[] =
diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp
index f860817..6266c9d 100644
--- a/XenonRecomp/recompiler.cpp
+++ b/XenonRecomp/recompiler.cpp
@@ -531,6 +531,13 @@ bool Recompiler::Recompile(
             println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
+    case PPC_INST_ADDC: // add carrying: emits rD = rA + rB and sets XER[CA]
+        println("\t{}.ca = {}.u32 > ~{}.u32;", xer(), r(insn.operands[2]), r(insn.operands[1])); // b > ~a  <=>  a + b wraps past 32 bits; emitted before the sum so the sources are still intact if rD aliases one of them
+        println("\t{}.u64 = {}.u64 + {}.u64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.')) // a '.' in the opcode name selects the record form, which also updates cr0
+            println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_ADDE:
         println("\t{}.u8 = ({}.u32 + {}.u32 < {}.u32) | ({}.u32 + {}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[2]), xer(), xer());
         println("\t{}.u64 = {}.u64 + {}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), xer());
@@ -539,6 +546,14 @@ bool Recompiler::Recompile(
             println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
+    case PPC_INST_ADDME: // add to minus one extended: emits rD = rA - 1 + CA and the new XER[CA]
+        println("\t{}.u8 = ({}.u32 - 1 < {}.u32) | ({}.u32 - 1 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[1]), xer(), xer()); // carry = (rA - 1 did not wrap, i.e. rA != 0) OR (adding CA wrapped); held in temp so the old CA is still readable below
+        println("\t{}.u64 = {}.u64 - 1 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), xer());
+        println("\t{}.ca = {}.u8;", xer(), temp()); // commit the carry only after the sum has consumed the previous CA
+        if (strchr(insn.opcode->name, '.')) // a '.' in the opcode name selects the record form, which also updates cr0
+            println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_ADDI:
         print("\t{}.s64 = ", r(insn.operands[0]));
         if (insn.operands[1] != 0)
@@ -652,6 +667,14 @@ bool Recompiler::Recompile(
         println("\tif ({}.u32 == 0) goto loc_{:X};", ctr(), insn.operands[0]);
         break;
 
+    case PPC_INST_BDZF:
+    {
+        constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" };
+        println("\t--{}.u64;", ctr());
+        println("\tif ({}.u32 == 0 && !{}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]);
+        break;
+    }
+
     case PPC_INST_BDZLR:
         println("\t--{}.u64;", ctr());
         println("\tif ({}.u32 == 0) return;", ctr(), insn.operands[0]);
@@ -663,10 +686,20 @@ bool Recompiler::Recompile(
         break;
 
     case PPC_INST_BDNZF:
-        // NOTE: assuming eq here as a shortcut because all the instructions in the game do that
+    {
+        constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" };
         println("\t--{}.u64;", ctr());
-        println("\tif ({}.u32 != 0 && !{}.eq) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), insn.operands[1]);
+        println("\tif ({}.u32 != 0 && !{}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]);
         break;
+    }
+
+    case PPC_INST_BDNZT:
+    {
+        constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" };
+        println("\t--{}.u64;", ctr());
+        println("\tif ({}.u32 != 0 && {}.{}) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], insn.operands[1]);
+        break;
+    }
 
     case PPC_INST_BEQ:
         printConditionalBranch(false, "eq");
@@ -712,7 +745,7 @@ bool Recompiler::Recompile(
         break;
 
     case PPC_INST_BLRL:
-        println("__builtin_debugtrap();");
+        println("__debugbreak();");
         break;
 
     case PPC_INST_BLT:
@@ -789,13 +822,27 @@ bool Recompiler::Recompile(
         break;
 
     case PPC_INST_CNTLZD:
-        println("\t{0}.u64 = {1}.u64 == 0 ? 64 : __builtin_clzll({1}.u64);", r(insn.operands[0]), r(insn.operands[1]));
+        println("\t{}.u64 = __lzcnt64({}.u64);", r(insn.operands[0]), r(insn.operands[1]));
         break;
 
     case PPC_INST_CNTLZW:
-        println("\t{0}.u64 = {1}.u32 == 0 ? 32 : __builtin_clz({1}.u32);", r(insn.operands[0]), r(insn.operands[1]));
+        println("\t{}.u64 = __lzcnt({}.u32);", r(insn.operands[0]), r(insn.operands[1]));
         break;
 
+    case PPC_INST_CROR:
+    {
+        constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" };
+        println("\t{}.{} = {}.{} | {}.{};", cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], cr(insn.operands[1] / 4), fields[insn.operands[1] % 4], cr(insn.operands[2] / 4), fields[insn.operands[2] % 4]);
+        break;
+    }
+
+    case PPC_INST_CRORC:
+    {
+        constexpr std::string_view fields[] = { "lt", "gt", "eq", "so" };
+        println("\t{}.{} = {}.{} | (~{}.{} & 1);", cr(insn.operands[0] / 4), fields[insn.operands[0] % 4], cr(insn.operands[1] / 4), fields[insn.operands[1] % 4], cr(insn.operands[2] / 4), fields[insn.operands[2] % 4]);
+        break;
+    }
+
     case PPC_INST_DB16CYC:
         // no op
         break;
@@ -808,6 +855,10 @@ bool Recompiler::Recompile(
         // no op
         break;
 
+    case PPC_INST_DCBST:
+        // no op
+        break;
+
     case PPC_INST_DCBTST:
         // no op
         break;
@@ -852,6 +903,12 @@ bool Recompiler::Recompile(
         // no op
         break;
 
+    case PPC_INST_EQV:
+        println("\t{}.u64 = ~({}.u64 ^ {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_EXTSB:
         println("\t{}.s64 = {}.s8;", r(insn.operands[0]), r(insn.operands[1]));
         if (strchr(insn.opcode->name, '.'))
@@ -1035,6 +1092,12 @@ bool Recompiler::Recompile(
         println("{}.u32);", r(insn.operands[2]));
         break;
 
+    case PPC_INST_LBZUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u64 = PPC_LOAD_U8({});", r(insn.operands[0]), ea());
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_LD:
         print("\t{}.u64 = PPC_LOAD_U64(", r(insn.operands[0]));
         if (insn.operands[2] != 0)
@@ -1063,6 +1126,12 @@ bool Recompiler::Recompile(
         println("{}.u32);", r(insn.operands[2]));
         break;
 
+    case PPC_INST_LDUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u64 = PPC_LOAD_U64({});", r(insn.operands[0]), ea());
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_LFD:
         printSetFlushMode(false);
         print("\t{}.u64 = PPC_LOAD_U64(", f(insn.operands[0]));
@@ -1071,6 +1140,13 @@ bool Recompiler::Recompile(
         println("{});", int32_t(insn.operands[1]));
         break;
 
+    case PPC_INST_LFDU: // load floating-point double with update (displacement form)
+        printSetFlushMode(false);
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); // EA = displacement + base GPR
+        println("\t{}.u64 = PPC_LOAD_U64({});", f(insn.operands[0]), ea()); // destination is an FPR (f), matching the LFD case; r() here would clobber a GPR instead
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea()); // update form: write the EA back to the base register
+        break;
+
     case PPC_INST_LFDX:
         printSetFlushMode(false);
         print("\t{}.u64 = PPC_LOAD_U64(", f(insn.operands[0]));
@@ -1079,6 +1155,13 @@ bool Recompiler::Recompile(
         println("{}.u32);", r(insn.operands[2]));
         break;
 
+    case PPC_INST_LFDUX: // load floating-point double with update (indexed form)
+        printSetFlushMode(false);
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2])); // EA = base GPR + index GPR
+        println("\t{}.u64 = PPC_LOAD_U64({});", f(insn.operands[0]), ea()); // destination is an FPR (f), matching the LFDX case; r() here would clobber a GPR instead
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea()); // update form: write the EA back to the base register
+        break;
+
     case PPC_INST_LFS:
         printSetFlushMode(false);
         print("\t{}.u32 = PPC_LOAD_U32(", temp());
@@ -1088,6 +1171,14 @@ bool Recompiler::Recompile(
         println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp());
         break;
 
+    case PPC_INST_LFSU:
+        printSetFlushMode(false);
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u32 = PPC_LOAD_U32({});", temp(), ea());
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea());
+        println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp());
+        break;
+
     case PPC_INST_LFSX:
         printSetFlushMode(false);
         print("\t{}.u32 = PPC_LOAD_U32(", temp());
@@ -1097,6 +1188,14 @@ bool Recompiler::Recompile(
         println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp());
         break;
 
+    case PPC_INST_LFSUX:
+        printSetFlushMode(false);
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u32 = PPC_LOAD_U32({});", temp(), ea());
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        println("\t{}.f64 = double({}.f32);", f(insn.operands[0]), temp());
+        break;
+
     case PPC_INST_LHA:
         print("\t{}.s64 = int16_t(PPC_LOAD_U16(", r(insn.operands[0]));
         if (insn.operands[2] != 0)
@@ -1104,6 +1203,12 @@ bool Recompiler::Recompile(
         println("{}));", int32_t(insn.operands[1]));
         break;
 
+    case PPC_INST_LHAU: // load halfword algebraic (sign-extending) with update
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); // println, like the other update-form loads, so each emitted statement ends its own line
+        println("\t{}.s64 = int16_t(PPC_LOAD_U16({}));", r(insn.operands[0]), ea()); // int16_t cast sign-extends the loaded halfword into the 64-bit register
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea()); // update form: write the EA back to the base register
+        break;
+
     case PPC_INST_LHAX:
         print("\t{}.s64 = int16_t(PPC_LOAD_U16(", r(insn.operands[0]));
         if (insn.operands[1] != 0)
@@ -1118,6 +1223,12 @@ bool Recompiler::Recompile(
         println("{});", int32_t(insn.operands[1]));
         break;
 
+    case PPC_INST_LHZU:
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u64 = PPC_LOAD_U16({});", r(insn.operands[0]), ea());
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea());
+        break;
+
     case PPC_INST_LHZX:
         print("\t{}.u64 = PPC_LOAD_U16(", r(insn.operands[0]));
         if (insn.operands[1] != 0)
@@ -1125,6 +1236,12 @@ bool Recompiler::Recompile(
         println("{}.u32);", r(insn.operands[2]));
         break;
 
+    case PPC_INST_LHZUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u64 = PPC_LOAD_U16({});", r(insn.operands[0]), ea());
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_LI:
         println("\t{}.s64 = {};", r(insn.operands[0]), int32_t(insn.operands[1]));
         break;
@@ -1137,6 +1254,7 @@ bool Recompiler::Recompile(
     case PPC_INST_LVEWX128:
     case PPC_INST_LVX:
     case PPC_INST_LVX128:
+    case PPC_INST_LVEHX:
         // NOTE: for endian swapping, we reverse the whole vector instead of individual elements.
         // this is accounted for in every instruction (eg. dp3 sums yzw instead of xyz)
         print("\t_mm_store_si128((__m128i*){}.u8, _mm_shuffle_epi8(_mm_load_si128((__m128i*)(base + ((", v(insn.operands[0]));
@@ -1232,6 +1350,12 @@ bool Recompiler::Recompile(
         println("{}.u32);", r(insn.operands[2]));
         break;
 
+    case PPC_INST_LWZUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u64 = PPC_LOAD_U32({});", r(insn.operands[0]), ea());
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_MFCR:
         for (size_t i = 0; i < 32; i++)
         {
@@ -1368,43 +1492,43 @@ bool Recompiler::Recompile(
         break;
 
     case PPC_INST_RLDICL:
-        println("\t{}.u64 = __builtin_rotateleft64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3], 63));
+        println("\t{}.u64 = _rotl64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3], 63));
         break;
 
     case PPC_INST_RLDICR:
-        println("\t{}.u64 = __builtin_rotateleft64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(0, insn.operands[3]));
+        println("\t{}.u64 = _rotl64({}.u64, {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], ComputeMask(0, insn.operands[3]));
         break;
 
     case PPC_INST_RLDIMI:
     {
         const uint64_t mask = ComputeMask(insn.operands[3], ~insn.operands[2]);
-        println("\t{}.u64 = (__builtin_rotateleft64({}.u64, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask);
+        println("\t{}.u64 = (_rotl64({}.u64, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask);
         break;
     }
 
     case PPC_INST_RLWIMI:
     {
         const uint64_t mask = ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32);
-        println("\t{}.u64 = (__builtin_rotateleft32({}.u32, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask);
+        println("\t{}.u64 = (_rotl({}.u32, {}) & 0x{:X}) | ({}.u64 & 0x{:X});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2], mask, r(insn.operands[0]), ~mask);
         break;
     }
 
     case PPC_INST_RLWINM:
-        println("\t{}.u64 = __builtin_rotateleft64({}.u32 | ({}.u64 << 32), {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32));
+        println("\t{}.u64 = _rotl64({}.u32 | ({}.u64 << 32), {}) & 0x{:X};", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[1]), insn.operands[2], ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32));
         if (strchr(insn.opcode->name, '.'))
             println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
     case PPC_INST_ROTLDI:
-        println("\t{}.u64 = __builtin_rotateleft64({}.u64, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]);
+        println("\t{}.u64 = _rotl64({}.u64, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]);
         break;
 
     case PPC_INST_ROTLW:
-        println("\t{}.u64 = __builtin_rotateleft32({}.u32, {}.u8 & 0x1F);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}.u64 = _rotl({}.u32, {}.u8 & 0x1F);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
         break;
 
     case PPC_INST_ROTLWI:
-        println("\t{}.u64 = __builtin_rotateleft32({}.u32, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]);
+        println("\t{}.u64 = _rotl({}.u32, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]);
         if (strchr(insn.opcode->name, '.'))
             println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
@@ -1482,7 +1606,7 @@ bool Recompiler::Recompile(
 
     case PPC_INST_STBU:
         println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
-        println("\tPPC_STORE_U8({}, {}.u8);", ea(), r(insn.operands[0]));
+        println("\t{}{}, {}.u8);", mmioStore() ? "PPC_MM_STORE_U8(" : "PPC_STORE_U8(", ea(), r(insn.operands[0]));
         println("\t{}.u32 = {};", r(insn.operands[2]), ea());
         break;
 
@@ -1493,6 +1617,12 @@ bool Recompiler::Recompile(
         println("{}.u32, {}.u8);", r(insn.operands[2]), r(insn.operands[0]));
         break;
 
+    case PPC_INST_STBUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}{}, {}.u8);", mmioStore() ? "PPC_MM_STORE_U8(" : "PPC_STORE_U8(", ea(), r(insn.operands[0]));
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_STD:
         print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64(");
         if (insn.operands[2] != 0)
@@ -1503,16 +1633,16 @@ bool Recompiler::Recompile(
     case PPC_INST_STDCX:
         println("\t{}.lt = 0;", cr(0));
         println("\t{}.gt = 0;", cr(0));
-        print("\t{}.eq = __sync_bool_compare_and_swap(reinterpret_cast<uint64_t*>(base + ", cr(0));
+        print("\t{}.eq = PPC_InterlockedCompareExchange64(reinterpret_cast<int64_t*>(base + ", cr(0));
         if (insn.operands[1] != 0)
             print("{}.u32 + ", r(insn.operands[1]));
-        println("{}.u32), {}.s64, __builtin_bswap64({}.s64));", r(insn.operands[2]), reserved(), r(insn.operands[0]));
+        println("{}.u32), __builtin_bswap64({}.s64), {}.s64) == {}.s64;", r(insn.operands[2]), r(insn.operands[0]), reserved(), reserved());
         println("\t{}.so = {}.so;", cr(0), xer());
         break;
 
     case PPC_INST_STDU:
         println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
-        println("\tPPC_STORE_U64({}, {}.u64);", ea(), r(insn.operands[0]));
+        println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0]));
         println("\t{}.u32 = {};", r(insn.operands[2]), ea());
         break;
 
@@ -1523,6 +1653,12 @@ bool Recompiler::Recompile(
         println("{}.u32, {}.u64);", r(insn.operands[2]), r(insn.operands[0]));
         break;
 
+    case PPC_INST_STDUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), r(insn.operands[0]));
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_STFD:
         printSetFlushMode(false);
         print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64(");
@@ -1531,6 +1667,13 @@ bool Recompiler::Recompile(
         println("{}, {}.u64);", int32_t(insn.operands[1]), f(insn.operands[0]));
         break;
 
+    case PPC_INST_STFDU: // store floating-point double with update (displacement form)
+        printSetFlushMode(false);
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2])); // EA = displacement + base GPR
+        println("\t{}{}, {}.u64);", mmioStore() ? "PPC_MM_STORE_U64(" : "PPC_STORE_U64(", ea(), f(insn.operands[0])); // source is an FPR (f), matching the STFD case; r() here would store a GPR instead
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea()); // update form: write the EA back to the base register
+        break;
+
     case PPC_INST_STFDX:
         printSetFlushMode(false);
         print("{}", mmioStore() ? "\tPPC_MM_STORE_U64(" : "\tPPC_STORE_U64(");
@@ -1556,6 +1699,14 @@ bool Recompiler::Recompile(
         println("{}, {}.u32);", int32_t(insn.operands[1]), temp());
         break;
 
+    case PPC_INST_STFSU:
+        printSetFlushMode(false);
+        println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0]));
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), temp());
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea());
+        break;
+
     case PPC_INST_STFSX:
         printSetFlushMode(false);
         println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0]));
@@ -1565,6 +1716,14 @@ bool Recompiler::Recompile(
         println("{}.u32, {}.u32);", r(insn.operands[2]), temp());
         break;
 
+    case PPC_INST_STFSUX:
+        printSetFlushMode(false);
+        println("\t{}.f32 = float({}.f64);", temp(), f(insn.operands[0]));
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), temp());
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_STH:
         print("{}", mmioStore() ? "\tPPC_MM_STORE_U16(" : "\tPPC_STORE_U16(");
         if (insn.operands[2] != 0)
@@ -1572,6 +1731,18 @@ bool Recompiler::Recompile(
         println("{}, {}.u16);", int32_t(insn.operands[1]), r(insn.operands[0]));
         break;
 
+    case PPC_INST_STHU:
+        println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}{}, {}.u16);", mmioStore() ? "PPC_MM_STORE_U16(" : "PPC_STORE_U16(", ea(), r(insn.operands[0]));
+        println("\t{}.u32 = {};", r(insn.operands[2]), ea());
+        break;
+
+    case PPC_INST_STHUX:
+        println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
+        println("\t{}{}, {}.u16);", mmioStore() ? "PPC_MM_STORE_U16(" : "PPC_STORE_U16(", ea(), r(insn.operands[0]));
+        println("\t{}.u32 = {};", r(insn.operands[1]), ea());
+        break;
+
     case PPC_INST_STHBRX:
         print("{}", mmioStore() ? "\tPPC_MM_STORE_U16(" : "\tPPC_STORE_U16(");
         if (insn.operands[1] != 0)
@@ -1658,22 +1829,22 @@ bool Recompiler::Recompile(
     case PPC_INST_STWCX:
         println("\t{}.lt = 0;", cr(0));
         println("\t{}.gt = 0;", cr(0));
-        print("\t{}.eq = __sync_bool_compare_and_swap(reinterpret_cast<uint32_t*>(base + ", cr(0));
+        print("\t{}.eq = PPC_InterlockedCompareExchange(reinterpret_cast<long*>(base + ", cr(0));
         if (insn.operands[1] != 0)
             print("{}.u32 + ", r(insn.operands[1]));
-        println("{}.u32), {}.s32, __builtin_bswap32({}.s32));", r(insn.operands[2]), reserved(), r(insn.operands[0]));
+        println("{}.u32), __builtin_bswap32({}.s32), {}.s32) == {}.s32;", r(insn.operands[2]), r(insn.operands[0]), reserved(), reserved());
         println("\t{}.so = {}.so;", cr(0), xer());
         break;
 
     case PPC_INST_STWU:
         println("\t{} = {} + {}.u32;", ea(), int32_t(insn.operands[1]), r(insn.operands[2]));
-        println("\tPPC_STORE_U32({}, {}.u32);", ea(), r(insn.operands[0]));
+        println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), r(insn.operands[0]));
         println("\t{}.u32 = {};", r(insn.operands[2]), ea());
         break;
 
     case PPC_INST_STWUX:
         println("\t{} = {}.u32 + {}.u32;", ea(), r(insn.operands[1]), r(insn.operands[2]));
-        println("\tPPC_STORE_U32({}, {}.u32);", ea(), r(insn.operands[0]));
+        println("\t{}{}, {}.u32);", mmioStore() ? "PPC_MM_STORE_U32(" : "PPC_STORE_U32(", ea(), r(insn.operands[0]));
         println("\t{}.u32 = {};", r(insn.operands[1]), ea());
         break;
 
@@ -1705,6 +1876,14 @@ bool Recompiler::Recompile(
             println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
+    case PPC_INST_SUBFZE: // subtract from zero extended: emits rD = ~rA + CA and the new XER[CA]
+        println("\t{}.u8 = (~{}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), xer(), xer()); // the only possible carry is ~rA + CA wrapping; the former (~rA < ~rA) term was a self-comparison that is always false
+        println("\t{}.u64 = ~{}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), xer());
+        println("\t{}.ca = {}.u8;", xer(), temp()); // commit the carry only after the sum has consumed the previous CA
+        if (strchr(insn.opcode->name, '.')) // a '.' in the opcode name selects the record form, which also updates cr0
+            println("\t{}.compare<int32_t>({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_SUBFIC:
         println("\t{}.ca = {}.u32 <= {};", xer(), r(insn.operands[1]), insn.operands[2]);
         println("\t{}.s64 = {} - {}.s64;", r(insn.operands[0]), int32_t(insn.operands[2]), r(insn.operands[1]));
@@ -1740,10 +1919,23 @@ bool Recompiler::Recompile(
         println("\t_mm_store_ps({}.f32, _mm_add_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VADDSBS:
+        println("\t_mm_store_si128((__m128i*){}.s8, _mm_adds_epi8(_mm_load_si128((__m128i*){}.s8), _mm_load_si128((__m128i*){}.s8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VADDSHS:
         println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VADDSWS:
+        // TODO: vectorize
+        for (size_t i = 0; i < 4; i++)
+        {
+            println("\t{}.s64 = int64_t({}.s32[{}]) + int64_t({}.s32[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i);
+            println("\t{}.s32[{}] = {}.s64 > INT_MAX ? INT_MAX : {}.s64 < INT_MIN ? INT_MIN : {}.s64;", v(insn.operands[0]), i, temp(), temp(), temp());
+        }
+        break;
+
     case PPC_INST_VADDUBM:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
@@ -1785,6 +1977,10 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VAVGUH:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VCTSXS:
     case PPC_INST_VCFPSXWS128:
         printSetFlushMode(true);
@@ -1795,6 +1991,16 @@ bool Recompiler::Recompile(
             println("_mm_load_ps({}.f32)));", v(insn.operands[1]));
         break;
 
+    case PPC_INST_VCTUXS:
+    case PPC_INST_VCFPUXWS128:
+        printSetFlushMode(true);
+        print("\t_mm_store_si128((__m128i*){}.u32, _mm_vctuxs(", v(insn.operands[0]));
+        if (insn.operands[2] != 0)
+            println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]);
+        else
+            println("_mm_load_ps({}.f32)));", v(insn.operands[1]));
+        break;
+
     case PPC_INST_VCFSX:
     case PPC_INST_VCSXWFP128:
     {
@@ -1831,7 +2037,7 @@ bool Recompiler::Recompile(
 
     case PPC_INST_VCMPBFP:
     case PPC_INST_VCMPBFP128:
-        println("\t__builtin_debugtrap();");
+        println("\t__debugbreak();");
         break;
 
     case PPC_INST_VCMPEQFP:
@@ -1848,6 +2054,12 @@ bool Recompiler::Recompile(
             println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0]));
         break;
 
+    case PPC_INST_VCMPEQUH:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0]));
+        break;
+
     case PPC_INST_VCMPEQUW:
     case PPC_INST_VCMPEQUW128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpeq_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
@@ -1873,10 +2085,26 @@ bool Recompiler::Recompile(
 
     case PPC_INST_VCMPGTUB:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u8), 0xFFFF);", cr(6), v(insn.operands[0]));
         break;
 
     case PPC_INST_VCMPGTUH:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_cmpgt_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.u16), 0xFFFF);", cr(6), v(insn.operands[0]));
+        break;
+
+    case PPC_INST_VCMPGTSH:
+        println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s16), 0xFFFF);", cr(6), v(insn.operands[0]));
+        break;
+
+    case PPC_INST_VCMPGTSW:
+        println("\t_mm_store_si128((__m128i*){}.s8, _mm_cmpgt_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.setFromMask(_mm_load_si128((__m128i*){}.s32), 0xFFFF);", cr(6), v(insn.operands[0]));
         break;
 
     case PPC_INST_VEXPTEFP:
@@ -1908,10 +2136,18 @@ bool Recompiler::Recompile(
         println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VMAXSH:
+        println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VMAXSW:
         println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VMINSH:
+        println("\t_mm_store_si128((__m128i*){}.u16, _mm_min_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VMINFP:
     case PPC_INST_VMINFP128:
         printSetFlushMode(true);
@@ -2037,16 +2273,41 @@ bool Recompiler::Recompile(
             break;
 
         default:
-            println("\t__builtin_debugtrap();");
+            println("\t__debugbreak();");
             break;
         }
         break;
 
+    case PPC_INST_VPKSHSS:
+    case PPC_INST_VPKSHSS128:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VPKSWSS:
+    case PPC_INST_VPKSWSS128:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packs_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
     case PPC_INST_VPKSHUS:
     case PPC_INST_VPKSHUS128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
         break;
 
+    case PPC_INST_VPKSWUS:
+    case PPC_INST_VPKSWUS128:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VPKUHUS:
+    case PPC_INST_VPKUHUS128:
+        for (size_t i = 0; i < 8; i++)
+        {
+            println("\t{0}.u8[{1}] = {2}.u16[{1}] > UCHAR_MAX ? UCHAR_MAX : {2}.u16[{1}];", vTemp(), i, v(insn.operands[2]));
+            println("\t{0}.u8[{1}] = {2}.u16[{3}] > UCHAR_MAX ? UCHAR_MAX : {2}.u16[{3}];", vTemp(), i + 8, v(insn.operands[1]), i);
+        }
+        println("{} = {};", v(insn.operands[0]), vTemp());
+        break;
+
     case PPC_INST_VREFP:
     case PPC_INST_VREFP128:
         // TODO: see if we can use rcp safely
@@ -2079,6 +2340,14 @@ bool Recompiler::Recompile(
         break;
     }
 
+    case PPC_INST_VRLH:
+        for (size_t i = 0; i < 8; i++)
+        {
+            println("\t{0}.u16[{1}] = ({2}.u16[{1}] << ({3}.u16[{1}] & 0xF)) | ({2}.u16[{1}] >> (16 - ({3}.u16[{1}] & 0xF)));", vTemp(), i, v(insn.operands[1]), v(insn.operands[2]));
+        }
+        println("{} = {};", v(insn.operands[0]), vTemp());
+        break;
+
     case PPC_INST_VRSQRTEFP:
     case PPC_INST_VRSQRTEFP128:
         // TODO: see if we can use rsqrt safely
@@ -2088,6 +2357,7 @@ bool Recompiler::Recompile(
         break;
 
     case PPC_INST_VSEL:
+    case PPC_INST_VSEL128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
         break;
 
@@ -2097,6 +2367,12 @@ bool Recompiler::Recompile(
             println("\t{}.u8[{}] = {}.u8[{}] << ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i);
         break;
 
+    case PPC_INST_VSLH:
+        // TODO: vectorize
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.u16[{}] = {}.u16[{}] << ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2);
+        break;
+
     case PPC_INST_VSLDOI:
     case PPC_INST_VSLDOI128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]);
@@ -2130,6 +2406,10 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
         break;
 
+    case PPC_INST_VSPLTISH:
+        println("\t_mm_store_si128((__m128i*){}.u16, _mm_set1_epi16(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
+        break;
+
     case PPC_INST_VSPLTISW:
     case PPC_INST_VSPLTISW128:
         println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
@@ -2149,6 +2429,18 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsr(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VSRAB:
+        // TODO: vectorize, ensure endianness is correct
+        for (size_t i = 0; i < 16; i++)
+            println("\t{}.s8[{}] = {}.s8[{}] >> ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i);
+        break;
+
+    case PPC_INST_VSRAH:
+        // TODO: vectorize, ensure endianness is correct
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.s16[{}] = {}.s16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2);
+        break;
+
     case PPC_INST_VSRAW:
     case PPC_INST_VSRAW128:
         // TODO: vectorize, ensure endianness is correct
@@ -2156,6 +2448,12 @@ bool Recompiler::Recompile(
             println("\t{}.s32[{}] = {}.s32[{}] >> ({}.u8[{}] & 0x1F);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4);
         break;
 
+    case PPC_INST_VSRH:
+        // TODO: vectorize, ensure endianness is correct
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.u16[{}] = {}.u16[{}] >> ({}.u8[{}] & 0xF);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 2);
+        break;
+
     case PPC_INST_VSRW:
     case PPC_INST_VSRW128:
         // TODO: vectorize, ensure endianness is correct
@@ -2169,6 +2467,15 @@ bool Recompiler::Recompile(
         println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VSUBSHS:
+        // TODO: vectorize
+        for (size_t i = 0; i < 8; i++)
+        {
+            println("\t{}.s64 = int64_t({}.s16[{}]) - int64_t({}.s16[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i);
+            println("\t{}.s16[{}] = {}.s64 > SHRT_MAX ? SHRT_MAX : {}.s64 < SHRT_MIN ? SHRT_MIN : {}.s64;", v(insn.operands[0]), i, temp(), temp(), temp());
+        }
+        break;
+
     case PPC_INST_VSUBSWS:
         // TODO: vectorize
         for (size_t i = 0; i < 4; i++)
@@ -2182,8 +2489,12 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_subs_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VSUBUBM:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VSUBUHM:
-        println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_sub_epi16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
     case PPC_INST_VUPKD3D128:
@@ -2213,7 +2524,7 @@ bool Recompiler::Recompile(
             break;
 
         default:
-            println("\t__builtin_debugtrap();");
+            println("\t__debugbreak();");
             break;
         }
         break;
@@ -2263,6 +2574,74 @@ bool Recompiler::Recompile(
         println("\t{}.u64 = {}.u64 ^ {};", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2] << 16);
         break;
 
+    case PPC_INST_MULHD: // mulhd: high 64 bits of the signed 64x64-bit product (the 32-bit form is mulhw)
+        println("\t{}.s64 = int64_t((static_cast<__int128>({}.s64) * static_cast<__int128>({}.s64)) >> 64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        break;
+
+    case PPC_INST_MULHDU: // mulhdu: high 64 bits of the unsigned 64x64-bit product (the 32-bit form is mulhwu)
+        println("\t{}.u64 = uint64_t((static_cast<unsigned __int128>({}.u64) * static_cast<unsigned __int128>({}.u64)) >> 64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        break;
+
+    case PPC_INST_VCMPGTUW: // SSE has no unsigned 32-bit greater-than, so flip the sign bit of both inputs and compare signed
+        println("\t_mm_store_si128((__m128i*){0}.u8, _mm_cmpgt_epi32(_mm_xor_si128(_mm_load_si128((__m128i*){1}.u32), _mm_set1_epi32(int(0x80000000))), _mm_xor_si128(_mm_load_si128((__m128i*){2}.u32), _mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))  // Record form (vcmpgtuw.) also updates CR6 from the result mask
+            println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
+        break;
+
+    case PPC_INST_VANDC:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VNOR:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_xor_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi32(0xFFFFFFFF)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VNOR128:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_xor_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi32(0xFFFFFFFF)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VSL:
+        // vsl shifts the whole 128-bit register left by 0-7 bits, carrying bits across bytes (the per-byte form is vslb).
+        // NOTE(review): the count is read from u8[0], presumably vB's least-significant byte in the byte-reversed host layout that vsldoi/_mm_vsr rely on - confirm.
+        println("\t_mm_store_si128((__m128i*){0}.u8, _mm_or_si128(_mm_sll_epi64(_mm_load_si128((__m128i*){1}.u8), _mm_cvtsi32_si128({2}.u8[0] & 0x7)), _mm_srl_epi64(_mm_slli_si128(_mm_load_si128((__m128i*){1}.u8), 8), _mm_cvtsi32_si128(64 - ({2}.u8[0] & 0x7)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VMAXUB:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_max_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VMINUB:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_min_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VPKUHUM: // modulo pack: mask each halfword to its low byte first so _mm_packus_epi16 truncates instead of saturating
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_and_si128(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(0xFF)), _mm_and_si128(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(0xFF))));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VSUBUWS: // unsigned saturating subtract as a - min(a, b); SSE provides no 32-bit _mm_subs_epu32
+        println("\t_mm_store_si128((__m128i*){0}.u32, _mm_sub_epi32(_mm_load_si128((__m128i*){1}.u32), _mm_min_epu32(_mm_load_si128((__m128i*){1}.u32), _mm_load_si128((__m128i*){2}.u32))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VRLW128: // per-word rotate left; the count is the low 5 bits of the byte at u8[i * 4] of the second source
+        for (size_t i = 0; i < 4; i++) // the right-shift count is masked so a rotate by 0 never shifts a 32-bit value by 32 (undefined behaviour)
+            println("\t{}.u32[{}] = ({}.u32[{}] << ({}.u8[{}] & 0x1F)) | ({}.u32[{}] >> ((32 - ({}.u8[{}] & 0x1F)) & 0x1F));", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i * 4, v(insn.operands[1]), i, v(insn.operands[2]), i * 4);
+        break;
+
+    case PPC_INST_MACLHWU:
+        println("\t{}.u64 = (({}.u32 & 0xFFFF) * ({}.u32 & 0xFFFF) + {}.u32) & 0xFFFFFFFF;",
+                r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[0]));
+        break;
+
+    case PPC_INST_VSUBUWM:
+        println("\t_mm_store_si128((__m128i*){}.u32, _mm_sub_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));",
+                v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_MACCHWU: // cross halfword: low half of rA times high half of rB, added to rD and kept modulo 2^32
+        println("\t{}.u64 = ((({}.u32 & 0xFFFF) * ({}.u32 >> 16)) + {}.u32) & 0xFFFFFFFF;",
+                r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[0]));
+        break;
+
     default:
         return false;
     }
@@ -2666,4 +3045,4 @@ void Recompiler::SaveCurrentOutData(const std::string_view& name)
 
         out.clear();
     }
-}
+}
\ No newline at end of file
diff --git a/XenonUtils/byteswap.h b/XenonUtils/byteswap.h
index 33e959f..4bb6dfe 100644
--- a/XenonUtils/byteswap.h
+++ b/XenonUtils/byteswap.h
@@ -1,25 +1,35 @@
-#pragma once 
+#pragma once
 
 #include <cassert>
 
-template<typename T>
-inline T ByteSwap(T value)
+// https://github.com/hedge-dev/XenonRecomp/pull/35
+#if defined(__clang__) || defined(__GNUC__)
+#define _byte_swap16(value) __builtin_bswap16(static_cast<uint16_t>(value))
+#define _byte_swap32(value) __builtin_bswap32(static_cast<uint32_t>(value))
+#define _byte_swap64(value) __builtin_bswap64(static_cast<uint64_t>(value))
+#elif defined(_MSC_VER)
+#define _byte_swap16(value) _byteswap_ushort(static_cast<uint16_t>(value))
+#define _byte_swap32(value) _byteswap_ulong(static_cast<uint32_t>(value))
+#define _byte_swap64(value) _byteswap_uint64(static_cast<uint64_t>(value))
+#endif
+
+template<typename T> T ByteSwap(T value)
 {
     if constexpr (sizeof(T) == 1)
         return value;
-    else if constexpr (sizeof(T) == 2)
-        return static_cast<T>(__builtin_bswap16(static_cast<uint16_t>(value)));
-    else if constexpr (sizeof(T) == 4)
-        return static_cast<T>(__builtin_bswap32(static_cast<uint32_t>(value)));
-    else if constexpr (sizeof(T) == 8)
-        return static_cast<T>(__builtin_bswap64(static_cast<uint64_t>(value)));
+    if constexpr (sizeof(T) == 2)
+        return static_cast<T>(_byte_swap16(value));
+    if constexpr (sizeof(T) == 4)
+        return static_cast<T>(_byte_swap32(value));
+    if constexpr (sizeof(T) == 8)
+        return static_cast<T>(_byte_swap64(value));
 
     assert(false && "Unexpected byte size.");
+    // Fallback return: with NDEBUG the assert vanishes, and running off the end of a non-void function is undefined behaviour.
     return value;
 }
 
-template<typename T>
-inline void ByteSwapInplace(T& value)
+template<typename T> void ByteSwapInplace(T& value)
 {
     value = ByteSwap(value);
 }
diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h
index c1091d1..f8b85df 100644
--- a/XenonUtils/ppc_context.h
+++ b/XenonUtils/ppc_context.h
@@ -645,10 +645,91 @@ inline __m128i _mm_vctsxs(__m128 src1)
     return _mm_andnot_si128(_mm_castps_si128(xmm2), _mm_castps_si128(dest));
 }
 
+inline __m128i _mm_vctuxs(__m128 src1) // float -> uint32 with saturation: negatives and NaN clamp to 0, values >= 2^32 clamp to UINT_MAX
+{
+    __m128 xmm0 = _mm_max_ps(src1, _mm_setzero_ps()); // clamp below at 0 (max returns the second operand for NaN)
+    __m128 xmm1 = _mm_cmpge_ps(xmm0, _mm_set1_ps((float)0x80000000)); // lanes >= 2^31 do not fit a signed conversion
+    __m128 xmm2 = _mm_sub_ps(xmm0, _mm_set1_ps((float)0x80000000));
+    xmm0 = _mm_blendv_ps(xmm0, xmm2, xmm1); // bias those lanes down by 2^31 before converting
+    __m128i dest = _mm_cvttps_epi32(xmm0);
+    __m128i overflow = _mm_cmpeq_epi32(dest, _mm_set1_epi32(INT_MIN)); // cvttps yields INT_MIN when the biased value still overflows
+    __m128i highBit = _mm_and_si128(_mm_castps_si128(xmm1), _mm_set1_epi32(INT_MIN)); // explicit cast instead of an implicit __m128 -> __m128i conversion
+    dest = _mm_add_epi32(dest, highBit); // add the 2^31 bias back
+    return _mm_or_si128(dest, overflow); // saturate overflowing lanes to all ones
+}
+
 inline __m128i _mm_vsr(__m128i a, __m128i b)
 {
     b = _mm_srli_epi64(_mm_slli_epi64(b, 61), 61);
     return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
 }
 
+inline uint64_t _rotl64(uint64_t value, int shift) { // rotate value left by shift bits, modulo 64
+    shift &= 63; // Normalize shift to 0-63
+    return (value << shift) | (value >> ((64 - shift) & 63)); // masked so shift == 0 does not shift by 64 (undefined behaviour)
+}
+
+inline uint32_t __lzcnt(uint32_t value) {
+    if (value == 0) return 32;
+    uint32_t count = 0;
+    while ((value & 0x80000000) == 0) {
+        count++;
+        value <<= 1;
+    }
+    return count;
+}
+
+#ifdef _WIN32
+    #pragma intrinsic(_InterlockedCompareExchange)
+    #define PPC_InterlockedCompareExchange _InterlockedCompareExchange
+#else
+    // Fallback for GCC/Clang
+    inline long PPC_InterlockedCompareExchange(long volatile* Destination, long Exchange, long Comparand) { // NOTE(review): only 32 bits at Destination are accessed although long may be wider on this platform - confirm callers pass 32-bit storage
+        int32_t expected = Comparand; // narrowed to 32 bits; receives the value found at Destination when the exchange fails
+        bool success = __atomic_compare_exchange_n(
+            reinterpret_cast<volatile int32_t*>(Destination),  // Preserve volatile
+            &expected,
+            Exchange,
+            false, // weak = false, i.e. a strong compare-exchange
+            __ATOMIC_SEQ_CST, // memory order on success
+            __ATOMIC_SEQ_CST // memory order on failure
+        );
+        return success ? Comparand : expected; // on success returns Comparand, otherwise the value observed at Destination
+    }
+#endif
+
+#ifdef _WIN32
+    #pragma intrinsic(_InterlockedCompareExchange64)
+    #define PPC_InterlockedCompareExchange64 _InterlockedCompareExchange64
+#else
+    // Fallback for GCC/Clang
+    inline int64_t PPC_InterlockedCompareExchange64(int64_t volatile* Destination, int64_t Exchange, int64_t Comparand) {
+        int64_t expected = Comparand;
+        bool success = __atomic_compare_exchange_n(
+            reinterpret_cast<volatile int64_t*>(Destination),  // Preserve volatile
+            &expected,
+            Exchange,
+            false,
+            __ATOMIC_SEQ_CST,
+            __ATOMIC_SEQ_CST
+        );
+        return success ? Comparand : expected;
+    }
+#endif
+
+
+#ifndef __debugbreak
+#ifdef _WIN32
+#pragma intrinsic(__debugbreak)
+#define __debugbreak __debugbreak
+#else
+// GCC/Clang/Linux fallback
+#ifdef __x86_64__
+#define __debugbreak() asm volatile("int $0x3")
+#else
+#define __debugbreak() raise(SIGTRAP)
+#endif
+#endif
+#endif
+
 #endif
diff --git a/XenonUtils/xex.cpp b/XenonUtils/xex.cpp
index 4e79042..ed5d53e 100644
--- a/XenonUtils/xex.cpp
+++ b/XenonUtils/xex.cpp
@@ -13,26 +13,26 @@
 
 #ifndef _WIN32
 
-typedef struct _IMAGE_DOS_HEADER { 
-    uint16_t   e_magic;                
-    uint16_t   e_cblp;                 
-    uint16_t   e_cp;                   
-    uint16_t   e_crlc;                 
-    uint16_t   e_cparhdr;              
-    uint16_t   e_minalloc;             
-    uint16_t   e_maxalloc;             
-    uint16_t   e_ss;                   
-    uint16_t   e_sp;                   
-    uint16_t   e_csum;                 
-    uint16_t   e_ip;                   
-    uint16_t   e_cs;                   
-    uint16_t   e_lfarlc;               
-    uint16_t   e_ovno;                 
-    uint16_t   e_res[4];               
-    uint16_t   e_oemid;                
-    uint16_t   e_oeminfo;              
-    uint16_t   e_res2[10];             
-    uint32_t   e_lfanew;               
+typedef struct _IMAGE_DOS_HEADER {
+    uint16_t   e_magic;
+    uint16_t   e_cblp;
+    uint16_t   e_cp;
+    uint16_t   e_crlc;
+    uint16_t   e_cparhdr;
+    uint16_t   e_minalloc;
+    uint16_t   e_maxalloc;
+    uint16_t   e_ss;
+    uint16_t   e_sp;
+    uint16_t   e_csum;
+    uint16_t   e_ip;
+    uint16_t   e_cs;
+    uint16_t   e_lfarlc;
+    uint16_t   e_ovno;
+    uint16_t   e_res[4];
+    uint16_t   e_oemid;
+    uint16_t   e_oeminfo;
+    uint16_t   e_res2[10];
+    uint32_t   e_lfanew;
 } IMAGE_DOS_HEADER, * PIMAGE_DOS_HEADER;
 
 typedef struct _IMAGE_FILE_HEADER {
@@ -114,7 +114,7 @@ typedef struct _IMAGE_SECTION_HEADER {
 
 #endif
 
-std::unordered_map<size_t, const char*> XamExports = 
+std::unordered_map<size_t, const char*> XamExports =
 {
     #include "xbox/xam_table.inc"
 };
@@ -289,7 +289,7 @@ Image Xex2LoadImage(const uint8_t* data, size_t dataSize)
             flags |= SectionFlags_Code;
         }
 
-        image.Map(reinterpret_cast<const char*>(section.Name), section.VirtualAddress, 
+        image.Map(reinterpret_cast<const char*>(section.Name), section.VirtualAddress,
             section.Misc.VirtualSize, flags, image.data.get() + section.VirtualAddress);
     }
 
diff --git a/XenonUtils/xex.h b/XenonUtils/xex.h
index 363efff..169e569 100644
--- a/XenonUtils/xex.h
+++ b/XenonUtils/xex.h
@@ -193,7 +193,7 @@ struct Xex2ImportHeader
     be<uint32_t> numImports;
 };
 
-struct Xex2ImportLibrary 
+struct Xex2ImportLibrary
 {
     be<uint32_t> size;
     char nextImportDigest[0x14];
@@ -204,12 +204,12 @@ struct Xex2ImportLibrary
     be<uint16_t> numberOfImports;
 };
 
-struct Xex2ImportDescriptor 
+struct Xex2ImportDescriptor
 {
     be<uint32_t> firstThunk; // VA XEX_THUNK_DATA
 };
 
-struct Xex2ThunkData 
+struct Xex2ThunkData
 {
     union
     {