Remove PrmtSlow

This commit is contained in:
Violet 2025-09-22 22:23:52 +00:00
commit 12a5f14837
8 changed files with 36 additions and 97 deletions

View file

@ -164,7 +164,6 @@ fn run_instruction<'input>(
| ast::Instruction::Or { .. } | ast::Instruction::Or { .. }
| ast::Instruction::Popc { .. } | ast::Instruction::Popc { .. }
| ast::Instruction::Prmt { .. } | ast::Instruction::Prmt { .. }
| ast::Instruction::PrmtSlow { .. }
| ast::Instruction::Rcp { .. } | ast::Instruction::Rcp { .. }
| ast::Instruction::Rem { .. } | ast::Instruction::Rem { .. }
| ast::Instruction::Ret { .. } | ast::Instruction::Ret { .. }

View file

@ -1818,7 +1818,6 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
| ast::Instruction::Mov { .. } | ast::Instruction::Mov { .. }
| ast::Instruction::Ld { .. } | ast::Instruction::Ld { .. }
| ast::Instruction::St { .. } | ast::Instruction::St { .. }
| ast::Instruction::PrmtSlow { .. }
| ast::Instruction::Prmt { .. } | ast::Instruction::Prmt { .. }
| ast::Instruction::Activemask { .. } | ast::Instruction::Activemask { .. }
| ast::Instruction::Membar { .. } | ast::Instruction::Membar { .. }

View file

@ -511,7 +511,6 @@ impl<'a> MethodEmitContext<'a> {
ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments), ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments),
ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments), ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments),
ast::Instruction::BarWarp { .. } => self.emit_bar_warp(), ast::Instruction::BarWarp { .. } => self.emit_bar_warp(),
ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments),
ast::Instruction::Membar { data } => self.emit_membar(data), ast::Instruction::Membar { data } => self.emit_membar(data),
ast::Instruction::Trap {} => self.emit_trap(), ast::Instruction::Trap {} => self.emit_trap(),
ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments), ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments),
@ -532,7 +531,7 @@ impl<'a> MethodEmitContext<'a> {
| ast::Instruction::ReduxSync { .. } | ast::Instruction::ReduxSync { .. }
| ast::Instruction::LdMatrix { .. } | ast::Instruction::LdMatrix { .. }
| ast::Instruction::Mma { .. } | ast::Instruction::Mma { .. }
| ast::Instruction::PrmtSlow { .. } => return Err(error_unreachable()), | ast::Instruction::Prmt { .. } => return Err(error_unreachable()),
} }
} }
@ -2446,48 +2445,6 @@ impl<'a> MethodEmitContext<'a> {
Ok(()) Ok(())
} }
/// Emits a `prmt` with a compile-time-constant selector as an LLVM
/// `shufflevector` over the two source operands viewed as 4-byte vectors.
///
/// `control` is read as four 4-bit selector fields, lowest nibble first; each
/// field becomes one lane index of the shuffle mask.
///
/// # Errors
///
/// Returns `error_todo()` if any selector field is greater than 7, and
/// propagates any error from `get_type` or from resolving `src1`/`src2`.
fn emit_prmt(
&mut self,
control: u16,
arguments: ptx_parser::PrmtArgs<SpirvWord>,
) -> Result<(), TranslateError> {
// Split the 16-bit control word into its four nibbles (bits 0-3, 4-7,
// 8-11, 12-15).
let components = [
(control >> 0) & 0b1111,
(control >> 4) & 0b1111,
(control >> 8) & 0b1111,
(control >> 12) & 0b1111,
];
// Only selector values 0..=7 are handled here.
// NOTE(review): presumably values with bit 3 set request the PTX
// sign-replication behaviour, which a plain shuffle cannot express --
// confirm against the PTX ISA description of `prmt`.
if components.iter().any(|&c| c > 7) {
return Err(error_todo());
}
let u32_type = get_scalar_type(self.context, ast::ScalarType::U32);
let v4u8_type = get_type(self.context, &ast::Type::Vector(4, ast::ScalarType::U8))?;
// Shadow the nibble array with the matching LLVM integer constants; these
// form the shuffle mask.
let mut components = [
unsafe { LLVMConstInt(u32_type, components[0] as _, 0) },
unsafe { LLVMConstInt(u32_type, components[1] as _, 0) },
unsafe { LLVMConstInt(u32_type, components[2] as _, 0) },
unsafe { LLVMConstInt(u32_type, components[3] as _, 0) },
];
let components_indices =
unsafe { LLVMConstVector(components.as_mut_ptr(), components.len() as u32) };
// Reinterpret each source operand as the 4 x u8 vector type so that
// individual bytes can be selected by the shuffle.
let src1 = self.resolver.value(arguments.src1)?;
let src1_vector =
unsafe { LLVMBuildBitCast(self.builder, src1, v4u8_type, LLVM_UNNAMED.as_ptr()) };
let src2 = self.resolver.value(arguments.src2)?;
let src2_vector =
unsafe { LLVMBuildBitCast(self.builder, src2, v4u8_type, LLVM_UNNAMED.as_ptr()) };
// Build the shuffle with `src1` as the first vector and `src2` as the
// second. The result is the 4 x u8 vector produced by the shuffle; it is
// not bitcast back to a scalar in this function.
// NOTE(review): `with_result` presumably registers the built value under
// `arguments.dst` -- verify against the resolver implementation.
self.resolver.with_result(arguments.dst, |dst| unsafe {
LLVMBuildShuffleVector(
self.builder,
src1_vector,
src2_vector,
components_indices,
dst,
)
});
Ok(())
}
fn emit_abs( fn emit_abs(
&mut self, &mut self,
data: ast::TypeFtz, data: ast::TypeFtz,

View file

@ -519,7 +519,7 @@ fn run_instruction<'input>(
i, i,
)? )?
} }
i @ ptx_parser::Instruction::PrmtSlow { .. } => { i @ ptx_parser::Instruction::Prmt { .. } => {
to_call(resolver, fn_declarations, "prmt_b32".into(), i)? to_call(resolver, fn_declarations, "prmt_b32".into(), i)?
} }
i => i, i => i,

View file

@ -1,38 +1,39 @@
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0
%"38" = alloca i64, align 8, addrspace(5)
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 {
%"39" = alloca i64, align 8, addrspace(5) %"39" = alloca i64, align 8, addrspace(5)
%"40" = alloca i32, align 4, addrspace(5) %"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i32, align 4, addrspace(5) %"41" = alloca i32, align 4, addrspace(5)
%"42" = alloca i32, align 4, addrspace(5)
br label %1 br label %1
1: ; preds = %0 1: ; preds = %0
br label %"35" br label %"36"
"35": ; preds = %1 "36": ; preds = %1
%"42" = load i64, ptr addrspace(4) %"36", align 8
store i64 %"42", ptr addrspace(5) %"38", align 8
%"43" = load i64, ptr addrspace(4) %"37", align 8 %"43" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"43", ptr addrspace(5) %"39", align 8 store i64 %"43", ptr addrspace(5) %"39", align 8
%"45" = load i64, ptr addrspace(5) %"38", align 8 %"44" = load i64, ptr addrspace(4) %"38", align 8
%"53" = inttoptr i64 %"45" to ptr store i64 %"44", ptr addrspace(5) %"40", align 8
%"44" = load i32, ptr %"53", align 4 %"46" = load i64, ptr addrspace(5) %"39", align 8
store i32 %"44", ptr addrspace(5) %"40", align 4
%"46" = load i64, ptr addrspace(5) %"38", align 8
%"54" = inttoptr i64 %"46" to ptr %"54" = inttoptr i64 %"46" to ptr
%"34" = getelementptr inbounds i8, ptr %"54", i64 4 %"45" = load i32, ptr %"54", align 4
%"47" = load i32, ptr %"34", align 4 store i32 %"45", ptr addrspace(5) %"41", align 4
store i32 %"47", ptr addrspace(5) %"41", align 4 %"47" = load i64, ptr addrspace(5) %"39", align 8
%"49" = load i32, ptr addrspace(5) %"40", align 4 %"55" = inttoptr i64 %"47" to ptr
%"34" = getelementptr inbounds i8, ptr %"55", i64 4
%"48" = load i32, ptr %"34", align 4
store i32 %"48", ptr addrspace(5) %"42", align 4
%"50" = load i32, ptr addrspace(5) %"41", align 4 %"50" = load i32, ptr addrspace(5) %"41", align 4
%2 = bitcast i32 %"49" to <4 x i8> %"51" = load i32, ptr addrspace(5) %"42", align 4
%3 = bitcast i32 %"50" to <4 x i8> %"56" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"50", i32 %"51", i32 30212)
%"55" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7> store i32 %"56", ptr addrspace(5) %"42", align 4
store <4 x i8> %"55", ptr addrspace(5) %"41", align 4 %"52" = load i64, ptr addrspace(5) %"40", align 8
%"51" = load i64, ptr addrspace(5) %"39", align 8 %"53" = load i32, ptr addrspace(5) %"42", align 4
%"52" = load i32, ptr addrspace(5) %"41", align 4 %"59" = inttoptr i64 %"52" to ptr
%"58" = inttoptr i64 %"51" to ptr store i32 %"53", ptr %"59", align 4
store i32 %"52", ptr %"58", align 4
ret void ret void
} }
attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }

View file

@ -33,12 +33,12 @@ define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addr
%"55" = load i32, ptr addrspace(5) %"43", align 4 %"55" = load i32, ptr addrspace(5) %"43", align 4
%"56" = load i32, ptr addrspace(5) %"44", align 4 %"56" = load i32, ptr addrspace(5) %"44", align 4
%"57" = load i32, ptr addrspace(5) %"45", align 4 %"57" = load i32, ptr addrspace(5) %"45", align 4
%"54" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57") %"63" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57")
store i32 %"54", ptr addrspace(5) %"44", align 4 store i32 %"63", ptr addrspace(5) %"44", align 4
%"58" = load i64, ptr addrspace(5) %"42", align 8 %"58" = load i64, ptr addrspace(5) %"42", align 8
%"59" = load i32, ptr addrspace(5) %"44", align 4 %"59" = load i32, ptr addrspace(5) %"44", align 4
%"63" = inttoptr i64 %"58" to ptr %"67" = inttoptr i64 %"58" to ptr
store i32 %"59", ptr %"63", align 4 store i32 %"59", ptr %"67", align 4
ret void ret void
} }

View file

@ -432,15 +432,6 @@ ptx_parser_macros::generate_instruction_type!(
}, },
Prmt { Prmt {
type: Type::Scalar(ScalarType::B32), type: Type::Scalar(ScalarType::B32),
data: u16,
arguments<T>: {
dst: T,
src1: T,
src2: T
}
},
PrmtSlow {
type: Type::Scalar(ScalarType::U32),
arguments<T>: { arguments<T>: {
dst: T, dst: T,
src1: T, src1: T,

View file

@ -3659,20 +3659,12 @@ derive_parser!(
// prmt.b32{.mode} d, a, b, c; // prmt.b32{.mode} d, a, b, c;
// .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 }; // .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };
prmt.b32 d, a, b, c => { prmt.b32 d, a, b, c => {
match c { ast::Instruction::Prmt {
ast::ParsedOperand::Imm(ImmediateValue::S64(control)) => ast::Instruction::Prmt {
data: control as u16,
arguments: PrmtArgs { arguments: PrmtArgs {
dst: d, src1: a, src2: b
}
},
_ => ast::Instruction::PrmtSlow {
arguments: PrmtSlowArgs {
dst: d, src1: a, src2: b, src3: c dst: d, src1: a, src2: b, src3: c
} }
} }
} }
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
activemask.b32 d => { activemask.b32 d => {