mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-09-27 03:39:10 +00:00
Handle PrmtSlow
Uses the HIP implementation of `__byte_perm`.
This commit is contained in:
parent
62ec652e7c
commit
bf560bd8ec
7 changed files with 86 additions and 4 deletions
Binary file not shown.
|
@ -780,4 +780,9 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2)));
|
|||
|
||||
return d_out;
|
||||
}
|
||||
|
||||
// Runtime-selector implementation of PTX `prmt.b32` in its default (generic) mode.
//
// x, y: the two source words. Together they form an 8-byte pool in which
//       bytes 0-3 come from `x` and bytes 4-7 come from `y`.
// s:    the selector. Only the low 16 bits are consulted, one nibble per
//       result byte (nibble 0 controls result byte 0, and so on).
// Returns the permuted 32-bit word.
uint32_t FUNC(prmt_b32)(uint32_t x, uint32_t y, uint32_t s)
{
    // Bits 0-2 of every selector nibble choose a source byte. `__byte_perm`
    // is documented to consume exactly those bits and nothing else.
    uint32_t picked = __byte_perm(x, y, s);

    // Bit 3 of a selector nibble asks PTX to replicate the sign (msb) of the
    // chosen byte across the whole result byte instead of copying it verbatim.
    // `__byte_perm` does not look at that bit, so apply it here.
    uint32_t result = picked;
    for (uint32_t i = 0; i < 4; i++)
    {
        if ((s >> (4 * i + 3)) & 1u)
        {
            uint32_t lane = 0xffu << (8 * i);
            uint32_t sign = (picked >> (8 * i + 7)) & 1u;
            result = sign ? (result | lane) : (result & ~lane);
        }
    }
    return result;
}
|
||||
}
|
||||
|
|
|
@ -511,9 +511,6 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments),
|
||||
ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments),
|
||||
ast::Instruction::BarWarp { .. } => self.emit_bar_warp(),
|
||||
ast::Instruction::PrmtSlow { .. } => {
|
||||
Err(error_todo_msg("PrmtSlow is not implemented yet"))
|
||||
}
|
||||
ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments),
|
||||
ast::Instruction::Membar { data } => self.emit_membar(data),
|
||||
ast::Instruction::Trap {} => self.emit_trap(),
|
||||
|
@ -534,7 +531,8 @@ impl<'a> MethodEmitContext<'a> {
|
|||
| ast::Instruction::Nanosleep { .. }
|
||||
| ast::Instruction::ReduxSync { .. }
|
||||
| ast::Instruction::LdMatrix { .. }
|
||||
| ast::Instruction::Mma { .. } => return Err(error_unreachable()),
|
||||
| ast::Instruction::Mma { .. }
|
||||
| ast::Instruction::PrmtSlow { .. } => return Err(error_unreachable()),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -519,6 +519,9 @@ fn run_instruction<'input>(
|
|||
i,
|
||||
)?
|
||||
}
|
||||
i @ ptx_parser::Instruction::PrmtSlow { .. } => {
|
||||
to_call(resolver, fn_declarations, "prmt_b32".into(), i)?
|
||||
}
|
||||
i => i,
|
||||
})
|
||||
}
|
||||
|
|
46
ptx/src/test/ll/prmt_slow.ll
Normal file
46
ptx/src/test/ll/prmt_slow.ll
Normal file
|
@ -0,0 +1,46 @@
|
|||
declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0
|
||||
|
||||
define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
%"45" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"38"
|
||||
|
||||
"38": ; preds = %1
|
||||
%"46" = load i64, ptr addrspace(4) %"39", align 8
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 8
|
||||
%"47" = load i64, ptr addrspace(4) %"40", align 8
|
||||
store i64 %"47", ptr addrspace(5) %"42", align 8
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 8
|
||||
%"60" = inttoptr i64 %"49" to ptr
|
||||
%"48" = load i32, ptr %"60", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"43", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 8
|
||||
%"61" = inttoptr i64 %"50" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||
%"51" = load i32, ptr %"35", align 4
|
||||
store i32 %"51", ptr addrspace(5) %"44", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"41", align 8
|
||||
%"62" = inttoptr i64 %"52" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"62", i64 8
|
||||
%"53" = load i32, ptr %"37", align 4
|
||||
store i32 %"53", ptr addrspace(5) %"45", align 4
|
||||
%"55" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"56" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"57" = load i32, ptr addrspace(5) %"45", align 4
|
||||
%"54" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57")
|
||||
store i32 %"54", ptr addrspace(5) %"44", align 4
|
||||
%"58" = load i64, ptr addrspace(5) %"42", align 8
|
||||
%"59" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"63" = inttoptr i64 %"58" to ptr
|
||||
store i32 %"59", ptr %"63", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -274,6 +274,11 @@ test_ptx!(const_ident, [0u16], [0u64, 0u64]);
|
|||
test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
|
||||
test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
|
||||
test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]);
|
||||
// The third input word is the selector that the kernel loads at run time.
// 0x7604 (decimal 30212) reads, low nibble first, as byte indices 4, 0, 6, 7,
// with no sign-replication bits set.
test_ptx!(
    prmt_slow,
    [0x70c507d6u32, 0x6fbd4b5cu32, 0x7604u32],
    [0x6fbdd65cu32]
);
|
||||
test_ptx!(activemask, [0u32], [1u32]);
|
||||
test_ptx!(membar, [152731u32], [152731u32]);
|
||||
test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]);
|
||||
|
|
25
ptx/src/test/spirv_run/prmt_slow.ptx
Normal file
25
ptx/src/test/spirv_run/prmt_slow.ptx
Normal file
|
@ -0,0 +1,25 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
// Test kernel: reads three consecutive u32 words from `input`, runs `prmt.b32`
// with the third word as a register (non-immediate) selector, and writes the
// single u32 result to `output`.
.visible .entry prmt_slow(
|
||||
.param .u64 input,
|
||||
.param .u64 output
|
||||
)
|
||||
{
|
||||
.reg .u64 in_addr;
|
||||
.reg .u64 out_addr;
|
||||
.reg .u32 temp1;
|
||||
.reg .u32 temp2;
|
||||
.reg .u32 temp3;
|
||||
|
||||
ld.param.u64 in_addr, [input];
|
||||
ld.param.u64 out_addr, [output];
|
||||
|
||||
// temp1 and temp2 are the two source words; temp3 is the selector.
ld.u32 temp1, [in_addr];
|
||||
ld.u32 temp2, [in_addr+4];
|
||||
ld.u32 temp3, [in_addr+8];
|
||||
// The selector comes from memory, so it is not known at translation time.
// The result overwrites temp2.
prmt.b32 temp2, temp1, temp2, temp3;
|
||||
st.u32 [out_addr], temp2;
|
||||
ret;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue