From 4363545d0ee0358b39e3ec0067fe25b0621048b8 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Tue, 27 Feb 2024 19:57:17 +0100 Subject: [PATCH] Implement isspacep --- ptx/src/ast.rs | 1 + ptx/src/emit.rs | 103 +++++++++++++++++++++++----- ptx/src/ptx.lalrpop | 16 +++++ ptx/src/test/spirv_run/isspacep.ll | 57 +++++++++++++++ ptx/src/test/spirv_run/isspacep.ptx | 28 ++++++++ ptx/src/test/spirv_run/mod.rs | 13 +++- ptx/src/translate.rs | 9 +++ 7 files changed, 208 insertions(+), 19 deletions(-) create mode 100644 ptx/src/test/spirv_run/isspacep.ll create mode 100644 ptx/src/test/spirv_run/isspacep.ptx diff --git a/ptx/src/ast.rs b/ptx/src/ast.rs index 0281961..225fc1d 100644 --- a/ptx/src/ast.rs +++ b/ptx/src/ast.rs @@ -475,6 +475,7 @@ pub enum Instruction { MatchAny(Arg3

), Red(AtomDetails, Arg2St

), Nanosleep(Arg1

), + Isspacep(StateSpace, Arg2

), } #[derive(Copy, Clone)] diff --git a/ptx/src/emit.rs b/ptx/src/emit.rs index 94cc973..af72f89 100644 --- a/ptx/src/emit.rs +++ b/ptx/src/emit.rs @@ -1137,6 +1137,7 @@ fn emit_instruction( ast::Instruction::Vshr(arg) => emit_inst_vshr(ctx, arg)?, ast::Instruction::Set(details, arg) => emit_inst_set(ctx, details, arg)?, ast::Instruction::Red(details, arg) => emit_inst_red(ctx, details, arg)?, + ast::Instruction::Isspacep(space, arg) => emit_inst_isspacep(ctx, *space, arg)?, // replaced by function calls or Statement variants ast::Instruction::Activemask { .. } | ast::Instruction::Bar(..) @@ -1161,6 +1162,70 @@ fn emit_instruction( }) } +fn emit_inst_isspacep( + ctx: &mut EmitContext, + space: ast::StateSpace, + arg: &ast::Arg2, +) -> Result<(), TranslateError> { + match space { + ast::StateSpace::Local => { + emit_inst_isspacep_impl(ctx, Some(arg.dst), arg.src, b"llvm.amdgcn.is.private\0")?; + Ok(()) + } + ast::StateSpace::Shared => { + emit_inst_isspacep_impl(ctx, Some(arg.dst), arg.src, b"llvm.amdgcn.is.shared\0")?; + Ok(()) + } + ast::StateSpace::Global => { + let builder = ctx.builder.get(); + let is_private = + emit_inst_isspacep_impl(ctx, None, arg.src, b"llvm.amdgcn.is.private\0")?; + let is_shared = + emit_inst_isspacep_impl(ctx, None, arg.src, b"llvm.amdgcn.is.shared\0")?; + let private_or_shared = + unsafe { LLVMBuildOr(builder, is_private, is_shared, LLVM_UNNAMED) }; + let i1_true = unsafe { + LLVMConstInt( + get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::Pred))?, + 1, + 0, + ) + }; + ctx.names.register_result(arg.dst, |dst| unsafe { + // I'd rathr user LLVMBuildNeg(...), but when using LLVMBuildNeg(...) in LLVM 15, + // LLVM emits this broken IR: + // %"14" = sub i1 false, %4 + LLVMBuildSub(builder, i1_true, private_or_shared, dst) + }); + Ok(()) + } + _ => Err(TranslateError::unreachable()), + } +} + +fn emit_inst_isspacep_impl( + ctx: &mut EmitContext, + dst: Option, + src: Id, + intrinsic: &[u8], +) -> Result { + let src = ctx.names.value(src)?; + let b8 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B8))?; + let b8_generic_ptr = unsafe { + LLVMPointerType( + b8, + get_llvm_address_space(&ctx.constants, ast::StateSpace::Generic)?, + ) + }; + let src = unsafe { LLVMBuildIntToPtr(ctx.builder.get(), src, b8_generic_ptr, LLVM_UNNAMED) }; + emit_intrinsic_arg2( + ctx, + (ast::ScalarType::Pred, dst), + (ast::ScalarType::B8, ast::StateSpace::Generic, src), + intrinsic, + ) +} + fn emit_inst_red( ctx: &mut EmitContext, details: &ast::AtomDetails, @@ -1397,7 +1462,7 @@ fn emit_inst_abs( emit_intrinsic_arg2( ctx, (details.typ, Some(args.dst)), - (details.typ, args.src), + (details.typ, ast::StateSpace::Reg, args.src), intrinsic_name.as_bytes(), )?; } else { @@ -1565,7 +1630,7 @@ fn emit_inst_rsqrt( let sqrt_result = emit_intrinsic_arg2( ctx, (details.typ, None), - (details.typ, args.src), + (details.typ, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; unsafe { LLVMZludaSetFastMathFlags(sqrt_result, FastMathFlags::ApproxFunc) }; @@ -1623,7 +1688,7 @@ fn emit_inst_sqrt( let sqrt_result = emit_intrinsic_arg2( ctx, (details.type_, Some(args.dst)), - (details.type_, args.src), + (details.type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; unsafe { LLVMZludaSetFastMathFlags(sqrt_result, fast_math) }; @@ -2468,7 +2533,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2482,7 +2547,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2496,7 +2561,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2510,7 +2575,7 @@ fn emit_inst_cvt( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), intrinsic_fn, )?; } @@ -2676,7 +2741,7 @@ fn emit_inst_cos( let cos_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(cos_value, FastMathFlags::ApproxFunc) }; @@ -2691,7 +2756,7 @@ fn emit_inst_sin( let cos_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(cos_value, FastMathFlags::ApproxFunc) }; @@ -2895,7 +2960,7 @@ fn emit_inst_brev( emit_intrinsic_arg2( ctx, (type_, Some(args.dst)), - (type_, args.src), + (type_, ast::StateSpace::Reg, args.src), function_name, )?; Ok(()) @@ -2913,8 +2978,12 @@ fn emit_inst_popc( _ => return Err(TranslateError::unreachable()), }; let popc_dst = if shorten { None } else { Some(args.dst) }; - let popc_result = - emit_intrinsic_arg2(ctx, (type_, popc_dst), (type_, args.src), function_name)?; + let popc_result = emit_intrinsic_arg2( + ctx, + (type_, popc_dst), + (type_, ast::StateSpace::Reg, args.src), + function_name, + )?; if shorten { let llvm_i32 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::U32))?; ctx.names.register_result(args.dst, |dst_name| unsafe { @@ -2932,7 +3001,7 @@ fn emit_inst_ex2( let llvm_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(llvm_value, FastMathFlags::ApproxFunc) }; @@ -2947,7 +3016,7 @@ fn emit_inst_lg2( let llvm_value = emit_intrinsic_arg2( ctx, (ast::ScalarType::F32, Some(args.dst)), - (ast::ScalarType::F32, args.src), + (ast::ScalarType::F32, ast::StateSpace::Reg, args.src), function_name, )?; unsafe { LLVMZludaSetFastMathFlags(llvm_value, FastMathFlags::ApproxFunc) }; @@ -2986,16 +3055,16 @@ fn emit_intrinsic_arg0( fn emit_intrinsic_arg2( ctx: &mut EmitContext, (dst_type, dst): (ast::ScalarType, Option), - (src_type, src): (ast::ScalarType, Id), + (src_type, src_space, src): (ast::ScalarType, ast::StateSpace, impl GetLLVMValue), intrinsic_name: &[u8], ) -> Result { let builder = ctx.builder.get(); - let mut llvm_src = ctx.names.value(src)?; + let mut llvm_src = src.get_llvm_value(&mut ctx.names)?; let dst_type = get_llvm_type(ctx, &ast::Type::Scalar(dst_type))?; let function_type = get_llvm_function_type( ctx, dst_type, - iter::once((&ast::Type::Scalar(src_type), ast::StateSpace::Reg)), + iter::once((&ast::Type::Scalar(src_type), src_space)), )?; let mut function_value = unsafe { LLVMGetNamedFunction(ctx.module.get(), intrinsic_name.as_ptr() as _) }; diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop index ae57575..547810f 100644 --- a/ptx/src/ptx.lalrpop +++ b/ptx/src/ptx.lalrpop @@ -201,6 +201,7 @@ match { "function_name", "generic", "inlined_at", + "isspacep", "ld", "ldmatrix", "lg2", @@ -282,6 +283,7 @@ ExtendedID : &'input str = { "function_name", "generic", "inlined_at", + "isspacep", "ld", "ldmatrix", "lg2", @@ -839,6 +841,7 @@ Instruction: ast::Instruction> = { InstMatch, InstRed, InstNanosleep, + InstIsspacep, }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld @@ -2372,6 +2375,19 @@ InstNanosleep: ast::Instruction> = { } } +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep +InstIsspacep: ast::Instruction> = { + "isspacep" ".local" => { + ast::Instruction::Isspacep(ast::StateSpace::Local, a) + }, + "isspacep" ".shared" => { + ast::Instruction::Isspacep(ast::StateSpace::Shared, a) + }, + "isspacep" ".global" => { + ast::Instruction::Isspacep(ast::StateSpace::Global, a) + } +} + NegTypeFtz: ast::ScalarType = { ".f16" => ast::ScalarType::F16, ".f16x2" => ast::ScalarType::F16x2, diff --git a/ptx/src/test/spirv_run/isspacep.ll b/ptx/src/test/spirv_run/isspacep.ll new file mode 100644 index 0000000..08371e3 --- /dev/null +++ b/ptx/src/test/spirv_run/isspacep.ll @@ -0,0 +1,57 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @isspacep(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { +"36": + %"10" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = alloca i1, align 1, addrspace(5) + store i1 false, ptr addrspace(5) %"11", align 1 + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i1, align 1, addrspace(5) + %"7" = alloca i1, align 1, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"12" = load i64, ptr addrspace(4) %"32", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"33", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %0 = inttoptr i64 %"15" to ptr + %1 = call i1 @llvm.amdgcn.is.private(ptr %0) + %2 = inttoptr i64 %"15" to ptr + %3 = call i1 @llvm.amdgcn.is.shared(ptr %2) + %4 = or i1 %1, %3 + %"14" = sub i1 true, %4 + store i1 %"14", ptr addrspace(5) %"6", align 1 + %"17" = load i1, ptr addrspace(5) %"6", align 1 + %"16" = select i1 %"17", i32 1, i32 0 + store i32 %"16", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %5 = inttoptr i64 %"19" to ptr + %"18" = call i1 @llvm.amdgcn.is.shared(ptr %5) + store i1 %"18", ptr addrspace(5) %"7", align 1 + %"21" = load i1, ptr addrspace(5) %"7", align 1 + %"20" = select i1 %"21", i32 1, i32 0 + store i32 %"20", ptr addrspace(5) %"9", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"34" = inttoptr i64 %"22" to ptr + store i32 %"23", ptr %"34", align 4 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = load i32, ptr addrspace(5) %"9", align 4 + %"35" = inttoptr i64 %"24" to ptr + %"38" = getelementptr inbounds i8, ptr %"35", i64 4 + store i32 %"25", ptr %"38", align 4 + ret void +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare i1 @llvm.amdgcn.is.private(ptr nocapture) #1 + +; Function Attrs: nounwind readnone speculatable willreturn +declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/isspacep.ptx b/ptx/src/test/spirv_run/isspacep.ptx new file mode 100644 index 0000000..55d39f5 --- /dev/null +++ b/ptx/src/test/spirv_run/isspacep.ptx @@ -0,0 +1,28 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry isspacep( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .pred is_global; + .reg .pred is_shared; + + .reg .u32 is_global_u32; + .reg .u32 is_shared_u32; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + isspacep.global is_global, in_addr; + selp.u32 is_global_u32, 1, 0, is_global; + isspacep.shared is_shared, in_addr; + selp.u32 is_shared_u32, 1, 0, is_shared; + st.u32 [out_addr], is_global_u32; + st.u32 [out_addr+4], is_shared_u32; + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index a65240c..36d82d2 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -271,7 +271,11 @@ test_ptx!(const, [0u16], [10u16, 20, 30, 40]); test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]); test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]); test_ptx!(cvt_f32_f16, [0xa1u16], [0x37210000u32]); -test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32, 0x6FFFD600]); +test_ptx!( + prmt, + [0x70c507d6u32, 0x6fbd4b5cu32], + [0x6fbdd65cu32, 0x6FFFD600] +); test_ptx!( prmt_non_immediate, [0x70c507d6u32, 0x6fbd4b5cu32], @@ -336,7 +340,11 @@ test_ptx!( [f16::from_f32(2.0), f16::from_f32(3.0)], [f16::from_f32(2.0), f16::from_f32(5.0)] ); -test_ptx!(set_f16x2, [0xc1690e6eu32, 0x13739444u32, 0x424834CC, 0x4248B4CC], [0xffffu32, 0x3C000000]); +test_ptx!( + set_f16x2, + [0xc1690e6eu32, 0x13739444u32, 0x424834CC, 0x4248B4CC], + [0xffffu32, 0x3C000000] +); test_ptx!( dp4a, [0xde3032f5u32, 0x2474fe15, 0xf51d8d6c], @@ -350,6 +358,7 @@ test_ptx!( [1923569713u64, 1923569712], [1923569713u64, 1923569712] ); +test_ptx!(isspacep, [0xDEADu32], [1u32, 0]); test_ptx_warp!( shfl, diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs index 041c690..f7fd281 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -6594,6 +6594,14 @@ impl ast::Instruction { ast::StateSpace::Reg, )), )?), + ast::Instruction::Isspacep(space, arg) => ast::Instruction::Isspacep( + space, + arg.map_different_types( + visitor, + &ast::Type::Scalar(ast::ScalarType::Pred), + &ast::Type::Scalar(ast::ScalarType::U64), + )?, + ), }) } } @@ -6915,6 +6923,7 @@ impl ast::Instruction { ast::Instruction::Vshr { .. } => None, ast::Instruction::Dp4a { .. } => None, ast::Instruction::MatchAny { .. } => None, + ast::Instruction::Isspacep { .. } => None, ast::Instruction::Sub(ast::ArithDetails::Signed(_), _) => None, ast::Instruction::Sub(ast::ArithDetails::Unsigned(_), _) => None, ast::Instruction::Add(ast::ArithDetails::Signed(_), _) => None,