From 4363545d0ee0358b39e3ec0067fe25b0621048b8 Mon Sep 17 00:00:00 2001
From: Andrzej Janik <vosen@vosen.pl>
Date: Tue, 27 Feb 2024 19:57:17 +0100
Subject: [PATCH] Implement isspacep

---
 ptx/src/ast.rs                      |   1 +
 ptx/src/emit.rs                     | 103 +++++++++++++++++++++++-----
 ptx/src/ptx.lalrpop                 |  16 +++++
 ptx/src/test/spirv_run/isspacep.ll  |  57 +++++++++++++++
 ptx/src/test/spirv_run/isspacep.ptx |  28 ++++++++
 ptx/src/test/spirv_run/mod.rs       |  13 +++-
 ptx/src/translate.rs                |   9 +++
 7 files changed, 208 insertions(+), 19 deletions(-)
 create mode 100644 ptx/src/test/spirv_run/isspacep.ll
 create mode 100644 ptx/src/test/spirv_run/isspacep.ptx
diff --git a/ptx/src/ast.rs b/ptx/src/ast.rs
index 0281961..225fc1d 100644
--- a/ptx/src/ast.rs
+++ b/ptx/src/ast.rs
@@ -475,6 +475,7 @@ pub enum Instruction<P: ArgParams> {
     MatchAny(Arg3<P>),
     Red(AtomDetails, Arg2St<P>),
     Nanosleep(Arg1<P>),
+    Isspacep(StateSpace, Arg2<P>),
 }
 
 #[derive(Copy, Clone)]
diff --git a/ptx/src/emit.rs b/ptx/src/emit.rs
index 94cc973..af72f89 100644
--- a/ptx/src/emit.rs
+++ b/ptx/src/emit.rs
@@ -1137,6 +1137,7 @@ fn emit_instruction(
         ast::Instruction::Vshr(arg) => emit_inst_vshr(ctx, arg)?,
         ast::Instruction::Set(details, arg) => emit_inst_set(ctx, details, arg)?,
         ast::Instruction::Red(details, arg) => emit_inst_red(ctx, details, arg)?,
+        ast::Instruction::Isspacep(space, arg) => emit_inst_isspacep(ctx, *space, arg)?,
         // replaced by function calls or Statement variants
         ast::Instruction::Activemask { .. }
         | ast::Instruction::Bar(..)
@@ -1161,6 +1162,70 @@ fn emit_instruction(
     })
 }
 
+fn emit_inst_isspacep(
+    ctx: &mut EmitContext,
+    space: ast::StateSpace,
+    arg: &ast::Arg2<ExpandedArgParams>,
+) -> Result<(), TranslateError> {
+    match space {
+        ast::StateSpace::Local => {
+            emit_inst_isspacep_impl(ctx, Some(arg.dst), arg.src, b"llvm.amdgcn.is.private\0")?;
+            Ok(())
+        }
+        ast::StateSpace::Shared => {
+            emit_inst_isspacep_impl(ctx, Some(arg.dst), arg.src, b"llvm.amdgcn.is.shared\0")?;
+            Ok(())
+        }
+        ast::StateSpace::Global => {
+            let builder = ctx.builder.get();
+            let is_private =
+                emit_inst_isspacep_impl(ctx, None, arg.src, b"llvm.amdgcn.is.private\0")?;
+            let is_shared =
+                emit_inst_isspacep_impl(ctx, None, arg.src, b"llvm.amdgcn.is.shared\0")?;
+            let private_or_shared =
+                unsafe { LLVMBuildOr(builder, is_private, is_shared, LLVM_UNNAMED) };
+            let i1_true = unsafe {
+                LLVMConstInt(
+                    get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::Pred))?,
+                    1,
+                    0,
+                )
+            };
+            ctx.names.register_result(arg.dst, |dst| unsafe {
+                // I'd rathr user LLVMBuildNeg(...), but when using LLVMBuildNeg(...) in LLVM 15,
+                // LLVM emits this broken IR:
+                //      %"14" = sub i1 false, %4
+                LLVMBuildSub(builder, i1_true, private_or_shared, dst)
+            });
+            Ok(())
+        }
+        _ => Err(TranslateError::unreachable()),
+    }
+}
+
+fn emit_inst_isspacep_impl(
+    ctx: &mut EmitContext,
+    dst: Option<Id>,
+    src: Id,
+    intrinsic: &[u8],
+) -> Result<LLVMValueRef, TranslateError> {
+    let src = ctx.names.value(src)?;
+    let b8 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::B8))?;
+    let b8_generic_ptr = unsafe {
+        LLVMPointerType(
+            b8,
+            get_llvm_address_space(&ctx.constants, ast::StateSpace::Generic)?,
+        )
+    };
+    let src = unsafe { LLVMBuildIntToPtr(ctx.builder.get(), src, b8_generic_ptr, LLVM_UNNAMED) };
+    emit_intrinsic_arg2(
+        ctx,
+        (ast::ScalarType::Pred, dst),
+        (ast::ScalarType::B8, ast::StateSpace::Generic, src),
+        intrinsic,
+    )
+}
+
 fn emit_inst_red(
     ctx: &mut EmitContext,
     details: &ast::AtomDetails,
@@ -1397,7 +1462,7 @@ fn emit_inst_abs(
         emit_intrinsic_arg2(
             ctx,
             (details.typ, Some(args.dst)),
-            (details.typ, args.src),
+            (details.typ, ast::StateSpace::Reg, args.src),
             intrinsic_name.as_bytes(),
         )?;
     } else {
@@ -1565,7 +1630,7 @@ fn emit_inst_rsqrt(
     let sqrt_result = emit_intrinsic_arg2(
         ctx,
         (details.typ, None),
-        (details.typ, args.src),
+        (details.typ, ast::StateSpace::Reg, args.src),
         intrinsic_fn,
     )?;
     unsafe { LLVMZludaSetFastMathFlags(sqrt_result, FastMathFlags::ApproxFunc) };
@@ -1623,7 +1688,7 @@ fn emit_inst_sqrt(
     let sqrt_result = emit_intrinsic_arg2(
         ctx,
         (details.type_, Some(args.dst)),
-        (details.type_, args.src),
+        (details.type_, ast::StateSpace::Reg, args.src),
         intrinsic_fn,
     )?;
     unsafe { LLVMZludaSetFastMathFlags(sqrt_result, fast_math) };
@@ -2468,7 +2533,7 @@ fn emit_inst_cvt(
                         emit_intrinsic_arg2(
                             ctx,
                             (type_, Some(args.dst)),
-                            (type_, args.src),
+                            (type_, ast::StateSpace::Reg, args.src),
                             intrinsic_fn,
                         )?;
                     }
@@ -2482,7 +2547,7 @@ fn emit_inst_cvt(
                         emit_intrinsic_arg2(
                             ctx,
                             (type_, Some(args.dst)),
-                            (type_, args.src),
+                            (type_, ast::StateSpace::Reg, args.src),
                             intrinsic_fn,
                         )?;
                     }
@@ -2496,7 +2561,7 @@ fn emit_inst_cvt(
                         emit_intrinsic_arg2(
                             ctx,
                             (type_, Some(args.dst)),
-                            (type_, args.src),
+                            (type_, ast::StateSpace::Reg, args.src),
                             intrinsic_fn,
                         )?;
                     }
@@ -2510,7 +2575,7 @@ fn emit_inst_cvt(
                         emit_intrinsic_arg2(
                             ctx,
                             (type_, Some(args.dst)),
-                            (type_, args.src),
+                            (type_, ast::StateSpace::Reg, args.src),
                             intrinsic_fn,
                         )?;
                     }
@@ -2676,7 +2741,7 @@ fn emit_inst_cos(
     let cos_value = emit_intrinsic_arg2(
         ctx,
         (ast::ScalarType::F32, Some(args.dst)),
-        (ast::ScalarType::F32, args.src),
+        (ast::ScalarType::F32, ast::StateSpace::Reg, args.src),
         function_name,
     )?;
     unsafe { LLVMZludaSetFastMathFlags(cos_value, FastMathFlags::ApproxFunc) };
@@ -2691,7 +2756,7 @@ fn emit_inst_sin(
     let cos_value = emit_intrinsic_arg2(
         ctx,
         (ast::ScalarType::F32, Some(args.dst)),
-        (ast::ScalarType::F32, args.src),
+        (ast::ScalarType::F32, ast::StateSpace::Reg, args.src),
         function_name,
     )?;
     unsafe { LLVMZludaSetFastMathFlags(cos_value, FastMathFlags::ApproxFunc) };
@@ -2895,7 +2960,7 @@ fn emit_inst_brev(
     emit_intrinsic_arg2(
         ctx,
         (type_, Some(args.dst)),
-        (type_, args.src),
+        (type_, ast::StateSpace::Reg, args.src),
         function_name,
     )?;
     Ok(())
@@ -2913,8 +2978,12 @@ fn emit_inst_popc(
         _ => return Err(TranslateError::unreachable()),
     };
     let popc_dst = if shorten { None } else { Some(args.dst) };
-    let popc_result =
-        emit_intrinsic_arg2(ctx, (type_, popc_dst), (type_, args.src), function_name)?;
+    let popc_result = emit_intrinsic_arg2(
+        ctx,
+        (type_, popc_dst),
+        (type_, ast::StateSpace::Reg, args.src),
+        function_name,
+    )?;
     if shorten {
         let llvm_i32 = get_llvm_type(ctx, &ast::Type::Scalar(ast::ScalarType::U32))?;
         ctx.names.register_result(args.dst, |dst_name| unsafe {
@@ -2932,7 +3001,7 @@ fn emit_inst_ex2(
     let llvm_value = emit_intrinsic_arg2(
         ctx,
         (ast::ScalarType::F32, Some(args.dst)),
-        (ast::ScalarType::F32, args.src),
+        (ast::ScalarType::F32, ast::StateSpace::Reg, args.src),
         function_name,
     )?;
     unsafe { LLVMZludaSetFastMathFlags(llvm_value, FastMathFlags::ApproxFunc) };
@@ -2947,7 +3016,7 @@ fn emit_inst_lg2(
     let llvm_value = emit_intrinsic_arg2(
         ctx,
         (ast::ScalarType::F32, Some(args.dst)),
-        (ast::ScalarType::F32, args.src),
+        (ast::ScalarType::F32, ast::StateSpace::Reg, args.src),
         function_name,
     )?;
     unsafe { LLVMZludaSetFastMathFlags(llvm_value, FastMathFlags::ApproxFunc) };
@@ -2986,16 +3055,16 @@ fn emit_intrinsic_arg0(
 fn emit_intrinsic_arg2(
     ctx: &mut EmitContext,
     (dst_type, dst): (ast::ScalarType, Option<Id>),
-    (src_type, src): (ast::ScalarType, Id),
+    (src_type, src_space, src): (ast::ScalarType, ast::StateSpace, impl GetLLVMValue),
     intrinsic_name: &[u8],
 ) -> Result<LLVMValueRef, TranslateError> {
     let builder = ctx.builder.get();
-    let mut llvm_src = ctx.names.value(src)?;
+    let mut llvm_src = src.get_llvm_value(&mut ctx.names)?;
     let dst_type = get_llvm_type(ctx, &ast::Type::Scalar(dst_type))?;
     let function_type = get_llvm_function_type(
         ctx,
         dst_type,
-        iter::once((&ast::Type::Scalar(src_type), ast::StateSpace::Reg)),
+        iter::once((&ast::Type::Scalar(src_type), src_space)),
     )?;
     let mut function_value =
         unsafe { LLVMGetNamedFunction(ctx.module.get(), intrinsic_name.as_ptr() as _) };
diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop
index ae57575..547810f 100644
--- a/ptx/src/ptx.lalrpop
+++ b/ptx/src/ptx.lalrpop
@@ -201,6 +201,7 @@ match {
     "function_name",
     "generic",
     "inlined_at",
+    "isspacep",
     "ld",
     "ldmatrix",
     "lg2",
@@ -282,6 +283,7 @@ ExtendedID : &'input str = {
     "function_name",
     "generic",
     "inlined_at",
+    "isspacep",
     "ld",
     "ldmatrix",
     "lg2",
@@ -839,6 +841,7 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
     InstMatch,
     InstRed,
     InstNanosleep,
+    InstIsspacep,
 };
 
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
@@ -2372,6 +2375,19 @@ InstNanosleep: ast::Instruction<ast::ParsedArgParams<'input>> = {
     }
 }
 
+// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-nanosleep
+InstIsspacep: ast::Instruction<ast::ParsedArgParams<'input>> = {
+    "isspacep" ".local" <a:Arg2> => {
+        ast::Instruction::Isspacep(ast::StateSpace::Local, a)
+    },
+    "isspacep" ".shared" <a:Arg2> => {
+        ast::Instruction::Isspacep(ast::StateSpace::Shared, a)
+    },
+    "isspacep" ".global" <a:Arg2> => {
+        ast::Instruction::Isspacep(ast::StateSpace::Global, a)
+    }
+}
+
 NegTypeFtz: ast::ScalarType = {
     ".f16" => ast::ScalarType::F16,
     ".f16x2" => ast::ScalarType::F16x2,
diff --git a/ptx/src/test/spirv_run/isspacep.ll b/ptx/src/test/spirv_run/isspacep.ll
new file mode 100644
index 0000000..08371e3
--- /dev/null
+++ b/ptx/src/test/spirv_run/isspacep.ll
@@ -0,0 +1,57 @@
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @isspacep(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
+"36":
+  %"10" = alloca i1, align 1, addrspace(5)
+  store i1 false, ptr addrspace(5) %"10", align 1
+  %"11" = alloca i1, align 1, addrspace(5)
+  store i1 false, ptr addrspace(5) %"11", align 1
+  %"4" = alloca i64, align 8, addrspace(5)
+  %"5" = alloca i64, align 8, addrspace(5)
+  %"6" = alloca i1, align 1, addrspace(5)
+  %"7" = alloca i1, align 1, addrspace(5)
+  %"8" = alloca i32, align 4, addrspace(5)
+  %"9" = alloca i32, align 4, addrspace(5)
+  %"12" = load i64, ptr addrspace(4) %"32", align 8
+  store i64 %"12", ptr addrspace(5) %"4", align 8
+  %"13" = load i64, ptr addrspace(4) %"33", align 8
+  store i64 %"13", ptr addrspace(5) %"5", align 8
+  %"15" = load i64, ptr addrspace(5) %"4", align 8
+  %0 = inttoptr i64 %"15" to ptr
+  %1 = call i1 @llvm.amdgcn.is.private(ptr %0)
+  %2 = inttoptr i64 %"15" to ptr
+  %3 = call i1 @llvm.amdgcn.is.shared(ptr %2)
+  %4 = or i1 %1, %3
+  %"14" = sub i1 true, %4
+  store i1 %"14", ptr addrspace(5) %"6", align 1
+  %"17" = load i1, ptr addrspace(5) %"6", align 1
+  %"16" = select i1 %"17", i32 1, i32 0
+  store i32 %"16", ptr addrspace(5) %"8", align 4
+  %"19" = load i64, ptr addrspace(5) %"4", align 8
+  %5 = inttoptr i64 %"19" to ptr
+  %"18" = call i1 @llvm.amdgcn.is.shared(ptr %5)
+  store i1 %"18", ptr addrspace(5) %"7", align 1
+  %"21" = load i1, ptr addrspace(5) %"7", align 1
+  %"20" = select i1 %"21", i32 1, i32 0
+  store i32 %"20", ptr addrspace(5) %"9", align 4
+  %"22" = load i64, ptr addrspace(5) %"5", align 8
+  %"23" = load i32, ptr addrspace(5) %"8", align 4
+  %"34" = inttoptr i64 %"22" to ptr
+  store i32 %"23", ptr %"34", align 4
+  %"24" = load i64, ptr addrspace(5) %"5", align 8
+  %"25" = load i32, ptr addrspace(5) %"9", align 4
+  %"35" = inttoptr i64 %"24" to ptr
+  %"38" = getelementptr inbounds i8, ptr %"35", i64 4
+  store i32 %"25", ptr %"38", align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i1 @llvm.amdgcn.is.private(ptr nocapture) #1
+
+; Function Attrs: nounwind readnone speculatable willreturn
+declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #1
+
+attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
+attributes #1 = { nounwind readnone speculatable willreturn }
diff --git a/ptx/src/test/spirv_run/isspacep.ptx b/ptx/src/test/spirv_run/isspacep.ptx
new file mode 100644
index 0000000..55d39f5
--- /dev/null
+++ b/ptx/src/test/spirv_run/isspacep.ptx
@@ -0,0 +1,28 @@
+.version 6.5
+.target sm_30
+.address_size 64
+
+.visible .entry isspacep(
+	.param .u64 input,
+	.param .u64 output
+)
+{
+	.reg .u64 	    in_addr;
+    .reg .u64 	    out_addr;
+    .reg .pred      is_global;
+    .reg .pred      is_shared;
+
+    .reg .u32       is_global_u32;
+    .reg .u32       is_shared_u32;
+
+	ld.param.u64 	in_addr, [input];
+    ld.param.u64 	out_addr, [output];
+
+	isspacep.global	is_global, in_addr;
+    selp.u32        is_global_u32, 1, 0, is_global;
+	isspacep.shared	is_shared, in_addr;
+    selp.u32        is_shared_u32, 1, 0, is_shared;
+    st.u32          [out_addr], is_global_u32;
+    st.u32          [out_addr+4], is_shared_u32;
+	ret;
+}
diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs
index a65240c..36d82d2 100644
--- a/ptx/src/test/spirv_run/mod.rs
+++ b/ptx/src/test/spirv_run/mod.rs
@@ -271,7 +271,11 @@ test_ptx!(const, [0u16], [10u16, 20, 30, 40]);
 test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
 test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
 test_ptx!(cvt_f32_f16, [0xa1u16], [0x37210000u32]);
-test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32, 0x6FFFD600]);
+test_ptx!(
+    prmt,
+    [0x70c507d6u32, 0x6fbd4b5cu32],
+    [0x6fbdd65cu32, 0x6FFFD600]
+);
 test_ptx!(
     prmt_non_immediate,
     [0x70c507d6u32, 0x6fbd4b5cu32],
@@ -336,7 +340,11 @@ test_ptx!(
     [f16::from_f32(2.0), f16::from_f32(3.0)],
     [f16::from_f32(2.0), f16::from_f32(5.0)]
 );
-test_ptx!(set_f16x2, [0xc1690e6eu32, 0x13739444u32, 0x424834CC, 0x4248B4CC], [0xffffu32, 0x3C000000]);
+test_ptx!(
+    set_f16x2,
+    [0xc1690e6eu32, 0x13739444u32, 0x424834CC, 0x4248B4CC],
+    [0xffffu32, 0x3C000000]
+);
 test_ptx!(
     dp4a,
     [0xde3032f5u32, 0x2474fe15, 0xf51d8d6c],
@@ -350,6 +358,7 @@ test_ptx!(
     [1923569713u64, 1923569712],
     [1923569713u64, 1923569712]
 );
+test_ptx!(isspacep, [0xDEADu32], [1u32, 0]);
 
 test_ptx_warp!(
     shfl,
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs
index 041c690..f7fd281 100644
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@@ -6594,6 +6594,14 @@ impl<T: ArgParamsEx> ast::Instruction<T> {
                     ast::StateSpace::Reg,
                 )),
             )?),
+            ast::Instruction::Isspacep(space, arg) => ast::Instruction::Isspacep(
+                space,
+                arg.map_different_types(
+                    visitor,
+                    &ast::Type::Scalar(ast::ScalarType::Pred),
+                    &ast::Type::Scalar(ast::ScalarType::U64),
+                )?,
+            ),
         })
     }
 }
@@ -6915,6 +6923,7 @@ impl<T: ast::ArgParams> ast::Instruction<T> {
             ast::Instruction::Vshr { .. } => None,
             ast::Instruction::Dp4a { .. } => None,
             ast::Instruction::MatchAny { .. } => None,
+            ast::Instruction::Isspacep { .. } => None,
             ast::Instruction::Sub(ast::ArithDetails::Signed(_), _) => None,
             ast::Instruction::Sub(ast::ArithDetails::Unsigned(_), _) => None,
             ast::Instruction::Add(ast::ArithDetails::Signed(_), _) => None,