diff --git a/ptx/src/pass/insert_post_saturation.rs b/ptx/src/pass/insert_post_saturation.rs index 525ae15..620e46b 100644 --- a/ptx/src/pass/insert_post_saturation.rs +++ b/ptx/src/pass/insert_post_saturation.rs @@ -164,7 +164,6 @@ fn run_instruction<'input>( | ast::Instruction::Or { .. } | ast::Instruction::Popc { .. } | ast::Instruction::Prmt { .. } - | ast::Instruction::PrmtSlow { .. } | ast::Instruction::Rcp { .. } | ast::Instruction::Rem { .. } | ast::Instruction::Ret { .. } diff --git a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs index d365e29..12851a6 100644 --- a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs +++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs @@ -1818,7 +1818,6 @@ fn get_modes(inst: &ast::Instruction) -> InstructionModes { | ast::Instruction::Mov { .. } | ast::Instruction::Ld { .. } | ast::Instruction::St { .. } - | ast::Instruction::PrmtSlow { .. } | ast::Instruction::Prmt { .. } | ast::Instruction::Activemask { .. } | ast::Instruction::Membar { .. } diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index d9ec666..e449b01 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -511,7 +511,6 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments), ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments), ast::Instruction::BarWarp { .. } => self.emit_bar_warp(), - ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments), ast::Instruction::Membar { data } => self.emit_membar(data), ast::Instruction::Trap {} => self.emit_trap(), ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments), @@ -532,7 +531,7 @@ impl<'a> MethodEmitContext<'a> { | ast::Instruction::ReduxSync { .. } | ast::Instruction::LdMatrix { .. } | ast::Instruction::Mma { .. } - | ast::Instruction::PrmtSlow { .. } => return Err(error_unreachable()), + | ast::Instruction::Prmt { .. } => return Err(error_unreachable()), } } @@ -2446,48 +2445,6 @@ impl<'a> MethodEmitContext<'a> { Ok(()) } - fn emit_prmt( - &mut self, - control: u16, - arguments: ptx_parser::PrmtArgs, - ) -> Result<(), TranslateError> { - let components = [ - (control >> 0) & 0b1111, - (control >> 4) & 0b1111, - (control >> 8) & 0b1111, - (control >> 12) & 0b1111, - ]; - if components.iter().any(|&c| c > 7) { - return Err(error_todo()); - } - let u32_type = get_scalar_type(self.context, ast::ScalarType::U32); - let v4u8_type = get_type(self.context, &ast::Type::Vector(4, ast::ScalarType::U8))?; - let mut components = [ - unsafe { LLVMConstInt(u32_type, components[0] as _, 0) }, - unsafe { LLVMConstInt(u32_type, components[1] as _, 0) }, - unsafe { LLVMConstInt(u32_type, components[2] as _, 0) }, - unsafe { LLVMConstInt(u32_type, components[3] as _, 0) }, - ]; - let components_indices = - unsafe { LLVMConstVector(components.as_mut_ptr(), components.len() as u32) }; - let src1 = self.resolver.value(arguments.src1)?; - let src1_vector = - unsafe { LLVMBuildBitCast(self.builder, src1, v4u8_type, LLVM_UNNAMED.as_ptr()) }; - let src2 = self.resolver.value(arguments.src2)?; - let src2_vector = - unsafe { LLVMBuildBitCast(self.builder, src2, v4u8_type, LLVM_UNNAMED.as_ptr()) }; - self.resolver.with_result(arguments.dst, |dst| unsafe { - LLVMBuildShuffleVector( - self.builder, - src1_vector, - src2_vector, - components_indices, - dst, - ) - }); - Ok(()) - } - fn emit_abs( &mut self, data: ast::TypeFtz, diff --git a/ptx/src/pass/replace_instructions_with_functions.rs b/ptx/src/pass/replace_instructions_with_functions.rs index 2304538..a0198d1 100644 --- a/ptx/src/pass/replace_instructions_with_functions.rs +++ b/ptx/src/pass/replace_instructions_with_functions.rs @@ -519,7 +519,7 @@ fn run_instruction<'input>( i, )? } - i @ ptx_parser::Instruction::PrmtSlow { .. } => { + i @ ptx_parser::Instruction::Prmt { .. } => { to_call(resolver, fn_declarations, "prmt_b32".into(), i)? } i => i, diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll index 7753f5c..933229d 100644 --- a/ptx/src/test/ll/prmt.ll +++ b/ptx/src/test/ll/prmt.ll @@ -1,38 +1,39 @@ -define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0 + +define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - br label %"35" + br label %"36" -"35": ; preds = %1 - %"42" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"42", ptr addrspace(5) %"38", align 8 +"36": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 8 store i64 %"43", ptr addrspace(5) %"39", align 8 - %"45" = load i64, ptr addrspace(5) %"38", align 8 - %"53" = inttoptr i64 %"45" to ptr - %"44" = load i32, ptr %"53", align 4 - store i32 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 8 + %"44" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"44", ptr addrspace(5) %"40", align 8 + %"46" = load i64, ptr addrspace(5) %"39", align 8 %"54" = inttoptr i64 %"46" to ptr - %"34" = getelementptr inbounds i8, ptr %"54", i64 4 - %"47" = load i32, ptr %"34", align 4 - store i32 %"47", ptr addrspace(5) %"41", align 4 - %"49" = load i32, ptr addrspace(5) %"40", align 4 + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 8 + %"55" = inttoptr i64 %"47" to ptr + %"34" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"34", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 %"50" = load i32, ptr addrspace(5) %"41", align 4 - %2 = bitcast i32 %"49" to <4 x i8> - %3 = bitcast i32 %"50" to <4 x i8> - %"55" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> - store <4 x i8> %"55", ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"39", align 8 - %"52" = load i32, ptr addrspace(5) %"41", align 4 - %"58" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"58", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"56" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"50", i32 %"51", i32 30212) + store i32 %"56", ptr addrspace(5) %"42", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 8 + %"53" = load i32, ptr addrspace(5) %"42", align 4 + %"59" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"59", align 4 ret void } -attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/prmt_slow.ll b/ptx/src/test/ll/prmt_slow.ll index 3943afa..f178332 100644 --- a/ptx/src/test/ll/prmt_slow.ll +++ b/ptx/src/test/ll/prmt_slow.ll @@ -33,14 +33,14 @@ define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addr %"55" = load i32, ptr addrspace(5) %"43", align 4 %"56" = load i32, ptr addrspace(5) %"44", align 4 %"57" = load i32, ptr addrspace(5) %"45", align 4 - %"54" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57") - store i32 %"54", ptr addrspace(5) %"44", align 4 + %"63" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57") + store i32 %"63", ptr addrspace(5) %"44", align 4 %"58" = load i64, ptr addrspace(5) %"42", align 8 %"59" = load i32, ptr addrspace(5) %"44", align 4 - %"63" = inttoptr i64 %"58" to ptr - store i32 %"59", ptr %"63", align 4 + %"67" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"67", align 4 ret void } attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs index 1bc622c..9ab1037 100644 --- a/ptx_parser/src/ast.rs +++ b/ptx_parser/src/ast.rs @@ -432,15 +432,6 @@ ptx_parser_macros::generate_instruction_type!( }, Prmt { type: Type::Scalar(ScalarType::B32), - data: u16, - arguments: { - dst: T, - src1: T, - src2: T - } - }, - PrmtSlow { - type: Type::Scalar(ScalarType::U32), arguments: { dst: T, src1: T, diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index a4f9080..ceffc65 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -3659,17 +3659,9 @@ derive_parser!( // prmt.b32{.mode} d, a, b, c; // .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 }; prmt.b32 d, a, b, c => { - match c { - ast::ParsedOperand::Imm(ImmediateValue::S64(control)) => ast::Instruction::Prmt { - data: control as u16, - arguments: PrmtArgs { - dst: d, src1: a, src2: b - } - }, - _ => ast::Instruction::PrmtSlow { - arguments: PrmtSlowArgs { - dst: d, src1: a, src2: b, src3: c - } + ast::Instruction::Prmt { + arguments: PrmtArgs { + dst: d, src1: a, src2: b, src3: c } } }