diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index bc375c3..afc9c2c 100644
Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp
index 6174ec1..f247f45 100644
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@@ -780,4 +780,66 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2)));
     
         return d_out;
     }
+
+    struct byte4 // One 32-bit word that can also be accessed as four separate bytes.
+    {
+        union
+        {
+            uint32_t u32;    // the value viewed as a single 32-bit word
+            uint8_t u8x4[4]; // the same storage viewed as four bytes
+        };
+    } __attribute__((aligned(4))); // 4-byte alignment requested explicitly
+
+    struct byte8 // Two 32-bit words that can also be accessed as eight separate bytes.
+    {
+        union
+        {
+            uint32_t u32x2[2]; // the storage viewed as a pair of 32-bit words
+            uint8_t u8x8[8];   // the same storage viewed as eight bytes
+        };
+    } __attribute__((aligned(8))); // 8-byte alignment requested explicitly
+
+    // PTX `prmt.b32` (default mode). Each of the four low nibbles of `s` builds
+    // one result byte: bits 0-2 index a byte of {y, x} (0-3 from x, 4-7 from y)
+    // and bit 3 asks for that byte's sign bit replicated over all eight bits.
+    // Bits of `s` above bit 15 are ignored.
+    // Returns the four bytes packed into one word, nibble 0 -> lowest byte.
+    uint32_t FUNC(prmt_b32)(uint32_t x, uint32_t y, uint32_t s)
+    {
+        byte4 v_perm_selector;
+        v_perm_selector.u32 = 0;
+
+        byte8 input;
+        input.u32x2[0] = x;
+        input.u32x2[1] = y;
+
+        for (size_t i = 0; i < 4; i++)
+        {
+            uint8_t sel = static_cast<uint8_t>(s >> (i * 4));
+            uint8_t addr = sel & 0x7;
+            // Odd source bytes are sign-replicated by the perm itself through
+            // selector 0x8 + addr / 2; every other case is a plain byte copy.
+            bool hw_sign = (sel & 0x8) && (addr % 2 == 1);
+            v_perm_selector.u8x4[i] = hw_sign ? 0x8 + addr / 2 : addr;
+        }
+
+        byte4 output;
+        output.u32 = __builtin_amdgcn_perm(input.u32x2[1], input.u32x2[0], v_perm_selector.u32);
+
+        // Even source bytes were only copied above, so their sign replication
+        // is finished here in software.
+        for (size_t i = 0; i < 4; i++)
+        {
+            uint8_t sel = static_cast<uint8_t>(s >> (i * 4));
+            uint8_t addr = sel & 0x7;
+            if ((sel & 0x8) && (addr % 2 == 0))
+            {
+                // `(byte & 0x80) * 0xff` is 0x7f80 and truncates to 0x80, not
+                // 0xff, so pick the replicated value explicitly.
+                output.u8x4[i] = (output.u8x4[i] & 0x80) ? 0xff : 0x00;
+            }
+        }
+
+        return output.u32;
+    }
 }
diff --git a/ptx/src/pass/insert_post_saturation.rs b/ptx/src/pass/insert_post_saturation.rs
index 525ae15..620e46b 100644
--- a/ptx/src/pass/insert_post_saturation.rs
+++ b/ptx/src/pass/insert_post_saturation.rs
@@ -164,7 +164,6 @@ fn run_instruction<'input>(
         | ast::Instruction::Or { .. }
         | ast::Instruction::Popc { .. }
         | ast::Instruction::Prmt { .. }
-        | ast::Instruction::PrmtSlow { .. }
         | ast::Instruction::Rcp { .. }
         | ast::Instruction::Rem { .. }
         | ast::Instruction::Ret { .. }
diff --git a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
index d365e29..12851a6 100644
--- a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
+++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs
@@ -1818,7 +1818,6 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
         | ast::Instruction::Mov { .. }
         | ast::Instruction::Ld { .. }
         | ast::Instruction::St { .. }
-        | ast::Instruction::PrmtSlow { .. }
         | ast::Instruction::Prmt { .. }
         | ast::Instruction::Activemask { .. }
         | ast::Instruction::Membar { .. }
diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs
index 76717e1..0a68f8b 100644
--- a/ptx/src/pass/llvm/emit.rs
+++ b/ptx/src/pass/llvm/emit.rs
@@ -510,10 +510,6 @@ impl<'a> MethodEmitContext<'a> {
             ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments),
             ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments),
             ast::Instruction::BarWarp { .. } => self.emit_bar_warp(),
-            ast::Instruction::PrmtSlow { .. } => {
-                Err(error_todo_msg("PrmtSlow is not implemented yet"))
-            }
-            ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments),
             ast::Instruction::Membar { data } => self.emit_membar(data),
             ast::Instruction::Trap {} => self.emit_trap(),
             ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments),
@@ -533,7 +529,8 @@ impl<'a> MethodEmitContext<'a> {
             | ast::Instruction::Nanosleep { .. }
             | ast::Instruction::ReduxSync { .. }
             | ast::Instruction::LdMatrix { .. }
-            | ast::Instruction::Mma { .. } => return Err(error_unreachable()),
+            | ast::Instruction::Mma { .. }
+            | ast::Instruction::Prmt { .. } => return Err(error_unreachable()),
         }
     }
 
@@ -2447,48 +2444,6 @@ impl<'a> MethodEmitContext<'a> {
         Ok(())
     }
 
-    fn emit_prmt(
-        &mut self,
-        control: u16,
-        arguments: ptx_parser::PrmtArgs<SpirvWord>,
-    ) -> Result<(), TranslateError> {
-        let components = [
-            (control >> 0) & 0b1111,
-            (control >> 4) & 0b1111,
-            (control >> 8) & 0b1111,
-            (control >> 12) & 0b1111,
-        ];
-        if components.iter().any(|&c| c > 7) {
-            return Err(error_todo());
-        }
-        let u32_type = get_scalar_type(self.context, ast::ScalarType::U32);
-        let v4u8_type = get_type(self.context, &ast::Type::Vector(4, ast::ScalarType::U8))?;
-        let mut components = [
-            unsafe { LLVMConstInt(u32_type, components[0] as _, 0) },
-            unsafe { LLVMConstInt(u32_type, components[1] as _, 0) },
-            unsafe { LLVMConstInt(u32_type, components[2] as _, 0) },
-            unsafe { LLVMConstInt(u32_type, components[3] as _, 0) },
-        ];
-        let components_indices =
-            unsafe { LLVMConstVector(components.as_mut_ptr(), components.len() as u32) };
-        let src1 = self.resolver.value(arguments.src1)?;
-        let src1_vector =
-            unsafe { LLVMBuildBitCast(self.builder, src1, v4u8_type, LLVM_UNNAMED.as_ptr()) };
-        let src2 = self.resolver.value(arguments.src2)?;
-        let src2_vector =
-            unsafe { LLVMBuildBitCast(self.builder, src2, v4u8_type, LLVM_UNNAMED.as_ptr()) };
-        self.resolver.with_result(arguments.dst, |dst| unsafe {
-            LLVMBuildShuffleVector(
-                self.builder,
-                src1_vector,
-                src2_vector,
-                components_indices,
-                dst,
-            )
-        });
-        Ok(())
-    }
-
     fn emit_abs(
         &mut self,
         data: ast::TypeFtz,
diff --git a/ptx/src/pass/replace_instructions_with_functions.rs b/ptx/src/pass/replace_instructions_with_functions.rs
index a68008f..2951657 100644
--- a/ptx/src/pass/replace_instructions_with_functions.rs
+++ b/ptx/src/pass/replace_instructions_with_functions.rs
@@ -519,6 +519,9 @@ fn run_instruction<'input>(
                 i,
             )?
         }
+        i @ ptx_parser::Instruction::Prmt { .. } => {
+            to_call(resolver, fn_declarations, "prmt_b32".into(), i)?
+        }
         i => i,
     })
 }
diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll
index 7753f5c..933229d 100644
--- a/ptx/src/test/ll/prmt.ll
+++ b/ptx/src/test/ll/prmt.ll
@@ -1,38 +1,39 @@
-define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
-  %"38" = alloca i64, align 8, addrspace(5)
+declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0
+
+define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 {
   %"39" = alloca i64, align 8, addrspace(5)
-  %"40" = alloca i32, align 4, addrspace(5)
+  %"40" = alloca i64, align 8, addrspace(5)
   %"41" = alloca i32, align 4, addrspace(5)
+  %"42" = alloca i32, align 4, addrspace(5)
   br label %1
 
 1:                                                ; preds = %0
-  br label %"35"
+  br label %"36"
 
-"35":                                             ; preds = %1
-  %"42" = load i64, ptr addrspace(4) %"36", align 8
-  store i64 %"42", ptr addrspace(5) %"38", align 8
+"36":                                             ; preds = %1
   %"43" = load i64, ptr addrspace(4) %"37", align 8
   store i64 %"43", ptr addrspace(5) %"39", align 8
-  %"45" = load i64, ptr addrspace(5) %"38", align 8
-  %"53" = inttoptr i64 %"45" to ptr
-  %"44" = load i32, ptr %"53", align 4
-  store i32 %"44", ptr addrspace(5) %"40", align 4
-  %"46" = load i64, ptr addrspace(5) %"38", align 8
+  %"44" = load i64, ptr addrspace(4) %"38", align 8
+  store i64 %"44", ptr addrspace(5) %"40", align 8
+  %"46" = load i64, ptr addrspace(5) %"39", align 8
   %"54" = inttoptr i64 %"46" to ptr
-  %"34" = getelementptr inbounds i8, ptr %"54", i64 4
-  %"47" = load i32, ptr %"34", align 4
-  store i32 %"47", ptr addrspace(5) %"41", align 4
-  %"49" = load i32, ptr addrspace(5) %"40", align 4
+  %"45" = load i32, ptr %"54", align 4
+  store i32 %"45", ptr addrspace(5) %"41", align 4
+  %"47" = load i64, ptr addrspace(5) %"39", align 8
+  %"55" = inttoptr i64 %"47" to ptr
+  %"34" = getelementptr inbounds i8, ptr %"55", i64 4
+  %"48" = load i32, ptr %"34", align 4
+  store i32 %"48", ptr addrspace(5) %"42", align 4
   %"50" = load i32, ptr addrspace(5) %"41", align 4
-  %2 = bitcast i32 %"49" to <4 x i8>
-  %3 = bitcast i32 %"50" to <4 x i8>
-  %"55" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
-  store <4 x i8> %"55", ptr addrspace(5) %"41", align 4
-  %"51" = load i64, ptr addrspace(5) %"39", align 8
-  %"52" = load i32, ptr addrspace(5) %"41", align 4
-  %"58" = inttoptr i64 %"51" to ptr
-  store i32 %"52", ptr %"58", align 4
+  %"51" = load i32, ptr addrspace(5) %"42", align 4
+  %"56" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"50", i32 %"51", i32 30212)
+  store i32 %"56", ptr addrspace(5) %"42", align 4
+  %"52" = load i64, ptr addrspace(5) %"40", align 8
+  %"53" = load i32, ptr addrspace(5) %"42", align 4
+  %"59" = inttoptr i64 %"52" to ptr
+  store i32 %"53", ptr %"59", align 4
   ret void
 }
 
-attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
\ No newline at end of file
+attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
\ No newline at end of file
diff --git a/ptx/src/test/ll/prmt_slow.ll b/ptx/src/test/ll/prmt_slow.ll
new file mode 100644
index 0000000..f178332
--- /dev/null
+++ b/ptx/src/test/ll/prmt_slow.ll
@@ -0,0 +1,46 @@
+declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0
+
+define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
+  %"41" = alloca i64, align 8, addrspace(5)
+  %"42" = alloca i64, align 8, addrspace(5)
+  %"43" = alloca i32, align 4, addrspace(5)
+  %"44" = alloca i32, align 4, addrspace(5)
+  %"45" = alloca i32, align 4, addrspace(5)
+  br label %1
+
+1:                                                ; preds = %0
+  br label %"38"
+
+"38":                                             ; preds = %1
+  %"46" = load i64, ptr addrspace(4) %"39", align 8
+  store i64 %"46", ptr addrspace(5) %"41", align 8
+  %"47" = load i64, ptr addrspace(4) %"40", align 8
+  store i64 %"47", ptr addrspace(5) %"42", align 8
+  %"49" = load i64, ptr addrspace(5) %"41", align 8
+  %"60" = inttoptr i64 %"49" to ptr
+  %"48" = load i32, ptr %"60", align 4
+  store i32 %"48", ptr addrspace(5) %"43", align 4
+  %"50" = load i64, ptr addrspace(5) %"41", align 8
+  %"61" = inttoptr i64 %"50" to ptr
+  %"35" = getelementptr inbounds i8, ptr %"61", i64 4
+  %"51" = load i32, ptr %"35", align 4
+  store i32 %"51", ptr addrspace(5) %"44", align 4
+  %"52" = load i64, ptr addrspace(5) %"41", align 8
+  %"62" = inttoptr i64 %"52" to ptr
+  %"37" = getelementptr inbounds i8, ptr %"62", i64 8
+  %"53" = load i32, ptr %"37", align 4
+  store i32 %"53", ptr addrspace(5) %"45", align 4
+  %"55" = load i32, ptr addrspace(5) %"43", align 4
+  %"56" = load i32, ptr addrspace(5) %"44", align 4
+  %"57" = load i32, ptr addrspace(5) %"45", align 4
+  %"63" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57")
+  store i32 %"63", ptr addrspace(5) %"44", align 4
+  %"58" = load i64, ptr addrspace(5) %"42", align 8
+  %"59" = load i32, ptr addrspace(5) %"44", align 4
+  %"67" = inttoptr i64 %"58" to ptr
+  store i32 %"59", ptr %"67", align 4
+  ret void
+}
+
+attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
\ No newline at end of file
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index bd5d900..c24ca1a 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -275,6 +275,11 @@ test_ptx!(const_ident, [0u16], [0u64, 0u64]);
 test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
 test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
 test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]);
+test_ptx!(
+    prmt_slow,
+    [0x70c507d6u32, 0x6fbd4b5cu32, 30212],
+    [0x6fbdd65cu32]
+);
 test_ptx!(activemask, [0u32], [1u32]);
 test_ptx!(membar, [152731u32], [152731u32]);
 test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]);
diff --git a/ptx/src/test/spirv_run/prmt_slow.ptx b/ptx/src/test/spirv_run/prmt_slow.ptx
new file mode 100644
index 0000000..08668ae
--- /dev/null
+++ b/ptx/src/test/spirv_run/prmt_slow.ptx
@@ -0,0 +1,25 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry prmt_slow( // test kernel: prmt.b32 with its selector held in a register
+	.param .u64 input, // address of three consecutive u32 values read below
+	.param .u64 output // address that receives the single u32 result
+)
+{
+	.reg .u64 	    in_addr; // holds the `input` parameter
+    .reg .u64 	    out_addr; // holds the `output` parameter
+    .reg .u32 	    temp1; // first source word
+    .reg .u32 	    temp2; // second source word, later overwritten with the result
+    .reg .u32 	    temp3; // selector word
+
+	ld.param.u64 	in_addr, [input];
+    ld.param.u64 	out_addr, [output];
+
+    ld.u32          temp1, [in_addr]; // input word 0
+    ld.u32          temp2, [in_addr+4]; // input word 1
+    ld.u32          temp3, [in_addr+8]; // input word 2, used as the permute selector
+	prmt.b32        temp2, temp1, temp2, temp3; // selector is a register operand, not an immediate
+    st.u32          [out_addr], temp2; // write the permuted word to the output buffer
+	ret;
+}
diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs
index 84d5f57..b1cf959 100644
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@@ -432,15 +432,6 @@ ptx_parser_macros::generate_instruction_type!(
         },
         Prmt {
             type: Type::Scalar(ScalarType::B32),
-            data: u16,
-            arguments<T>: {
-                dst: T,
-                src1: T,
-                src2: T
-            }
-        },
-        PrmtSlow {
-            type: Type::Scalar(ScalarType::U32),
             arguments<T>: {
                 dst: T,
                 src1: T,
diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs
index 2c9003b..26ae5e9 100644
--- a/ptx_parser/src/lib.rs
+++ b/ptx_parser/src/lib.rs
@@ -3671,17 +3671,9 @@ derive_parser!(
     // prmt.b32{.mode}  d, a, b, c;
     // .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };
     prmt.b32  d, a, b, c => {
-        match c {
-            ast::ParsedOperand::Imm(ImmediateValue::S64(control)) => ast::Instruction::Prmt {
-                data: control as u16,
-                arguments: PrmtArgs {
-                    dst: d, src1: a, src2: b
-                }
-            },
-            _ => ast::Instruction::PrmtSlow {
-                arguments: PrmtSlowArgs {
-                    dst: d, src1: a, src2: b, src3: c
-                }
+        ast::Instruction::Prmt {
+            arguments: PrmtArgs {
+                dst: d, src1: a, src2: b, src3: c
             }
         }
     }