Map remaining instructions

2025-07-30 21:08:38 +00:00 · 2024-08-21 16:57:33 +02:00 · 2024-08-21 16:57:33 +02:00 · 0760c3d58f
commit 0760c3d58f
parent 39faaa7214
2 changed files with 531 additions and 32 deletions
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@ -201,7 +201,7 @@ gen::generate_instruction_type!(
            }
        },
        Abs {
-            data: AbsDetails,
+            data: TypeFtz,
            type: { Type::Scalar(data.type_) },
            arguments<T>: {
                dst: T,
@ -276,7 +276,7 @@ gen::generate_instruction_type!(
        },
        Rsqrt {
            type: { Type::from(data.type_) },
-            data: RsqrtData,
+            data: TypeFtz,
            arguments<T>: {
                dst: T,
                src: T,
@ -328,6 +328,163 @@ gen::generate_instruction_type!(
                src3: T,
            }
        },
+        Div {
+            type: Type::Scalar(data.type_()),
+            data: DivDetails,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T,
+            }
+        },
+        Neg {
+            type: Type::Scalar(data.type_),
+            data: TypeFtz,
+            arguments<T>: {
+                dst: T,
+                src: T
+            }
+        },
+        Sin {
+            type: Type::Scalar(ScalarType::F32),
+            data: FlushToZero,
+            arguments<T>: {
+                dst: T,
+                src: T
+            }
+        },
+        Cos {
+            type: Type::Scalar(ScalarType::F32),
+            data: FlushToZero,
+            arguments<T>: {
+                dst: T,
+                src: T
+            }
+        },
+        Lg2 {
+            type: Type::Scalar(ScalarType::F32),
+            data: FlushToZero,
+            arguments<T>: {
+                dst: T,
+                src: T
+            }
+        },
+        Ex2 {
+            type: Type::Scalar(data.type_), // operand type follows the parsed suffix (.f32, .f16, .f16x2, .bf16, .bf16x2), as Neg does
+            data: TypeFtz,
+            arguments<T>: {
+                dst: T,
+                src: T
+            }
+        },
+        Clz {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::U32)
+                },
+                src: T
+            }
+        },
+        Brev {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: T,
+                src: T
+            }
+        },
+        Popc {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::U32)
+                },
+                src: T
+            }
+        },
+        Xor {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T
+            }
+        },
+        Rem {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T
+            }
+        },
+        Bfe {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::U32)
+                },
+                src3: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::U32)
+                },
+            }
+        },
+        Bfi {
+            type: Type::Scalar(data.clone()),
+            data: ScalarType,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T,
+                src3: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::U32)
+                },
+                src4: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::U32)
+                },
+            }
+        },
+        PrmtSlow {
+            type: Type::Scalar(ScalarType::U32),
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T,
+                src3: T
+            }
+        },
+        Prmt {
+            type: Type::Scalar(ScalarType::B32),
+            data: u16,
+            arguments<T>: {
+                dst: T,
+                src1: T,
+                src2: T
+            }
+        },
+        Activemask {
+            type: Type::Scalar(ScalarType::B32),
+            arguments<T>: {
+                dst: T
+            }
+        },
+        Membar {
+            data: MemScope
+        },
        Trap { }
    }
 );
@ -1121,8 +1278,8 @@ pub enum CvtaDirection {
    ExplicitToGeneric,
 }

-#[derive(Copy, Clone)]
-pub struct AbsDetails {
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub struct TypeFtz {
    pub flush_to_zero: Option<bool>,
    pub type_: ScalarType,
 }
@ -1187,13 +1344,6 @@ pub struct MinMaxFloat {
    pub type_: ScalarType,
 }

-#[derive(Copy, Clone, Eq, PartialEq)]
-pub enum DivFloatKind {
-    Approx,
-    Full,
-    Rounding(RoundingMode),
-}
-
 #[derive(Copy, Clone)]
 pub struct RcpData {
    pub kind: RcpKind,
@ -1204,13 +1354,7 @@ pub struct RcpData {
 #[derive(Copy, Clone, Eq, PartialEq)]
 pub enum RcpKind {
    Approx,
-    Full(RoundingMode),
-}
-
-#[derive(Copy, Clone)]
-pub struct RsqrtData {
-    pub flush_to_zero: Option<bool>,
-    pub type_: ScalarType,
+    Compliant(RoundingMode),
 }

 pub struct BarData {
@ -1270,3 +1414,39 @@ pub struct AtomCasDetails {
    pub scope: MemScope,
    pub space: StateSpace,
 }
+
+#[derive(Copy, Clone)]
+pub enum DivDetails { // data carried by the `Div` instruction, one variant per operand class
+    Unsigned(ScalarType), // built by the integer `div.type` rule when the type's kind is not Signed
+    Signed(ScalarType), // built by the integer `div.type` rule when `type_.kind() == ScalarKind::Signed`
+    Float(DivFloatDetails), // built by the `.f32` / `.f64` forms of `div`
+}
+
+impl DivDetails {
+    /// Scalar type the division operates on, whichever variant is stored.
+    pub fn type_(&self) -> ScalarType {
+        match *self {
+            DivDetails::Unsigned(ty) | DivDetails::Signed(ty) => ty,
+            DivDetails::Float(DivFloatDetails { type_, .. }) => type_,
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct DivFloatDetails { // modifiers of a floating-point `div`
+    pub type_: ScalarType, // the parser rules in this change store f32 or f64 here
+    pub flush_to_zero: Option<bool>, // Some(ftz) for the f32 forms, None for `div.rnd.f64`
+    pub kind: DivFloatKind,
+}
+
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub enum DivFloatKind {
+    Approx, // produced by the `div.approx{.ftz}.f32` rule
+    ApproxFull, // produced by the `div.full{.ftz}.f32` rule
+    Rounding(RoundingMode), // produced by the `div.rnd` rules, carrying the converted rounding mode
+}
+
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub struct FlushToZero { // data of Sin / Cos / Lg2, whose operand type is declared as fixed F32
+    pub flush_to_zero: bool // filled by the parser from the optional `.ftz` modifier
+}
--- a/ptx_parser/src/main.rs
+++ b/ptx_parser/src/main.rs
@ -1723,7 +1723,7 @@ derive_parser!(
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-abs
    abs.type        d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: None,
                type_
            },
@ -1734,7 +1734,7 @@ derive_parser!(
    }
    abs{.ftz}.f32   d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: Some(ftz),
                type_: f32
            },
@ -1745,7 +1745,7 @@ derive_parser!(
    }
    abs.f64         d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: None,
                type_: f64
            },
@ -1756,7 +1756,7 @@ derive_parser!(
    }
    abs{.ftz}.f16   d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: Some(ftz),
                type_: f16
            },
@ -1767,7 +1767,7 @@ derive_parser!(
    }
    abs{.ftz}.f16x2 d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: Some(ftz),
                type_: f16x2
            },
@ -1778,7 +1778,7 @@ derive_parser!(
    }
    abs.bf16        d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: None,
                type_: bf16
            },
@ -1789,7 +1789,7 @@ derive_parser!(
    }
    abs.bf16x2      d, a => {
        ast::Instruction::Abs {
-            data: ast::AbsDetails {
+            data: ast::TypeFtz {
                flush_to_zero: None,
                type_: bf16x2
            },
@ -2272,7 +2272,7 @@ derive_parser!(
    rcp.rnd{.ftz}.f32       d, a => {
        ast::Instruction::Rcp {
            data: ast::RcpData {
-                kind: ast::RcpKind::Full(rnd.into()),
+                kind: ast::RcpKind::Compliant(rnd.into()),
                flush_to_zero: Some(ftz),
                type_: f32
            },
@ -2282,7 +2282,7 @@ derive_parser!(
    rcp.rnd.f64             d, a => {
        ast::Instruction::Rcp {
            data: ast::RcpData {
-                kind: ast::RcpKind::Full(rnd.into()),
+                kind: ast::RcpKind::Compliant(rnd.into()),
                flush_to_zero: None,
                type_: f64
            },
@ -2307,7 +2307,7 @@ derive_parser!(
    sqrt.rnd{.ftz}.f32     d, a => {
        ast::Instruction::Sqrt {
            data: ast::RcpData {
-                kind: ast::RcpKind::Full(rnd.into()),
+                kind: ast::RcpKind::Compliant(rnd.into()),
                flush_to_zero: Some(ftz),
                type_: f32
            },
@ -2317,7 +2317,7 @@ derive_parser!(
    sqrt.rnd.f64           d, a => {
        ast::Instruction::Sqrt {
            data: ast::RcpData {
-                kind: ast::RcpKind::Full(rnd.into()),
+                kind: ast::RcpKind::Compliant(rnd.into()),
                flush_to_zero: None,
                type_: f64
            },
@ -2331,7 +2331,7 @@ derive_parser!(
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-rsqrt-approx-ftz-f64
    rsqrt.approx{.ftz}.f32  d, a => {
        ast::Instruction::Rsqrt {
-            data: ast::RsqrtData {
+            data: ast::TypeFtz {
                flush_to_zero: Some(ftz),
                type_: f32
            },
@ -2340,7 +2340,7 @@ derive_parser!(
    }
    rsqrt.approx.f64        d, a => {
        ast::Instruction::Rsqrt {
-            data: ast::RsqrtData {
+            data: ast::TypeFtz {
                flush_to_zero: None,
                type_: f64
            },
@ -2349,7 +2349,7 @@ derive_parser!(
    }
    rsqrt.approx.ftz.f64 d, a => {
        ast::Instruction::Rsqrt {
-            data: ast::RsqrtData {
+            data: ast::TypeFtz {
                flush_to_zero: None,
                type_: f64
            },
@ -2499,6 +2499,325 @@ derive_parser!(
    StateSpace =                    { .global };
    RawAtomicOp =                   { .exch };

+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-div
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-div
+    div.type  d, a, b => {
+        ast::Instruction::Div {
+            data: if type_.kind() == ast::ScalarKind::Signed {
+                ast::DivDetails::Signed(type_)
+            } else {
+                ast::DivDetails::Unsigned(type_)
+            },
+            arguments: DivArgs {
+                dst: d,
+                src1: a,
+                src2: b,
+            },
+        }
+    }
+    .type: ScalarType = { .u16, .u32, .u64,
+                          .s16, .s32, .s64 };
+
+    div.approx{.ftz}.f32  d, a, b => {
+        ast::Instruction::Div {
+            data: ast::DivDetails::Float(ast::DivFloatDetails{
+                type_: f32,
+                flush_to_zero: Some(ftz),
+                kind: ast::DivFloatKind::Approx
+            }),
+            arguments: DivArgs {
+                dst: d,
+                src1: a,
+                src2: b,
+            },
+        }
+    }
+    div.full{.ftz}.f32    d, a, b => {
+        ast::Instruction::Div {
+            data: ast::DivDetails::Float(ast::DivFloatDetails{
+                type_: f32,
+                flush_to_zero: Some(ftz),
+                kind: ast::DivFloatKind::ApproxFull
+            }),
+            arguments: DivArgs {
+                dst: d,
+                src1: a,
+                src2: b,
+            },
+        }
+    }
+    div.rnd{.ftz}.f32     d, a, b => {
+        ast::Instruction::Div {
+            data: ast::DivDetails::Float(ast::DivFloatDetails{
+                type_: f32,
+                flush_to_zero: Some(ftz),
+                kind: ast::DivFloatKind::Rounding(rnd.into())
+            }),
+            arguments: DivArgs {
+                dst: d,
+                src1: a,
+                src2: b,
+            },
+        }
+    }
+    div.rnd.f64           d, a, b => {
+        ast::Instruction::Div {
+            data: ast::DivDetails::Float(ast::DivFloatDetails{
+                type_: f64,
+                flush_to_zero: None,
+                kind: ast::DivFloatKind::Rounding(rnd.into())
+            }),
+            arguments: DivArgs {
+                dst: d,
+                src1: a,
+                src2: b,
+            },
+        }
+    }
+    .rnd: RawRoundingMode = { .rn, .rz, .rm, .rp };
+    ScalarType = { .f32, .f64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-neg
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-neg
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-neg
+    neg.type  d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_,
+                flush_to_zero: None // this form has no `.ftz` modifier
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    .type: ScalarType = { .s16, .s32, .s64 };
+
+    neg{.ftz}.f32  d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_: f32,
+                flush_to_zero: Some(ftz)
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    neg.f64        d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_: f64,
+                flush_to_zero: None
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    neg{.ftz}.f16    d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_: f16,
+                flush_to_zero: Some(ftz)
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    neg{.ftz}.f16x2  d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_: f16x2,
+                flush_to_zero: Some(ftz)
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    neg.bf16         d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_: bf16,
+                flush_to_zero: None
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    neg.bf16x2       d, a => {
+        ast::Instruction::Neg {
+            data: ast::TypeFtz {
+                type_: bf16x2,
+                flush_to_zero: None
+            },
+            arguments: NegArgs { dst: d, src: a, },
+        }
+    }
+    ScalarType = { .f32, .f64, .f16, .f16x2, .bf16, .bf16x2 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-sin
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-cos
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-lg2
+    sin.approx{.ftz}.f32  d, a => {
+        ast::Instruction::Sin {
+            data: ast::FlushToZero {
+                flush_to_zero: ftz
+            },
+            arguments: SinArgs { dst: d, src: a, },
+        }
+    }
+    cos.approx{.ftz}.f32  d, a => {
+        ast::Instruction::Cos {
+            data: ast::FlushToZero {
+                flush_to_zero: ftz
+            },
+            arguments: CosArgs { dst: d, src: a, },
+        }
+    }
+    lg2.approx{.ftz}.f32  d, a => {
+        ast::Instruction::Lg2 {
+            data: ast::FlushToZero {
+                flush_to_zero: ftz
+            },
+            arguments: Lg2Args { dst: d, src: a, },
+        }
+    }
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-ex2
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-ex2
+    ex2.approx{.ftz}.f32  d, a => {
+        ast::Instruction::Ex2 {
+            data: ast::TypeFtz {
+                type_: f32,
+                flush_to_zero: Some(ftz)
+            },
+            arguments: Ex2Args { dst: d, src: a, },
+        }
+    }
+    ex2.approx.atype     d, a => {
+        ast::Instruction::Ex2 {
+            data: ast::TypeFtz {
+                type_: atype,
+                flush_to_zero: None
+            },
+            arguments: Ex2Args { dst: d, src: a, },
+        }
+    }
+    ex2.approx.ftz.btype d, a => {
+        ast::Instruction::Ex2 {
+            data: ast::TypeFtz {
+                type_: btype,
+                flush_to_zero: Some(true)
+            },
+            arguments: Ex2Args { dst: d, src: a, },
+        }
+    }
+    .atype: ScalarType = { .f16,  .f16x2 };
+    .btype: ScalarType = { .bf16, .bf16x2 };
+    ScalarType = { .f32 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-clz
+    clz.type  d, a => {
+        ast::Instruction::Clz {
+            data: type_,
+            arguments: ClzArgs { dst: d, src: a, },
+        }
+    }
+    .type: ScalarType = { .b32, .b64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-brev
+    brev.type  d, a => {
+        ast::Instruction::Brev {
+            data: type_,
+            arguments: BrevArgs { dst: d, src: a, },
+        }
+    }
+    .type: ScalarType = { .b32, .b64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-popc
+    popc.type  d, a => {
+        ast::Instruction::Popc {
+            data: type_,
+            arguments: PopcArgs { dst: d, src: a, },
+        }
+    }
+    .type: ScalarType = { .b32, .b64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#logic-and-shift-instructions-xor
+    xor.type d, a, b => {
+        ast::Instruction::Xor {
+            data: type_,
+            arguments: XorArgs { dst: d, src1: a, src2: b, },
+        }
+    }
+    .type: ScalarType = { .pred, .b16, .b32, .b64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-rem
+    rem.type  d, a, b => {
+        ast::Instruction::Rem {
+            data: type_,
+            arguments: RemArgs { dst: d, src1: a, src2: b, },
+        }
+    }
+    .type: ScalarType = { .u16, .u32, .u64, .s16, .s32, .s64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfe
+    bfe.type  d, a, b, c => {
+        ast::Instruction::Bfe {
+            data: type_,
+            arguments: BfeArgs { dst: d, src1: a, src2: b, src3: c },
+        }
+    }
+    .type: ScalarType = { .u32, .u64, .s32, .s64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-bfi
+    bfi.type  f, a, b, c, d => {
+        ast::Instruction::Bfi {
+            data: type_,
+            arguments: BfiArgs { dst: f, src1: a, src2: b, src3: c, src4: d },
+        }
+    }
+    .type: ScalarType = { .b32, .b64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
+    // prmt.b32{.mode}  d, a, b, c;
+    // .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };
+    prmt.b32  d, a, b, c => {
+        match c { // pick the variant by the form of the third source operand
+            ast::ParsedOperand::Imm(ImmediateValue::U64(control)) => ast::Instruction::Prmt {
+                data: control as u16, // `as` keeps only the low 16 bits of the immediate
+                arguments: PrmtArgs {
+                    dst: d, src1: a, src2: b
+                }
+            },
+            _ => ast::Instruction::PrmtSlow { // every other operand form stays as a regular third source
+                arguments: PrmtSlowArgs {
+                    dst: d, src1: a, src2: b, src3: c
+                }
+            }
+        }
+    }
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
+    activemask.b32 d => {
+        ast::Instruction::Activemask {
+            arguments: ActivemaskArgs { dst: d }
+        }
+    }
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
+    // fence{.sem}.scope;
+    // fence.op_restrict.release.cluster;
+    // fence.proxy.proxykind;
+    // fence.proxy.to_proxykind::from_proxykind.release.scope;
+    // fence.proxy.to_proxykind::from_proxykind.acquire.scope  [addr], size;
+    //membar.proxy.proxykind;
+    //.sem       = { .sc, .acq_rel };
+    //.scope     = { .cta, .cluster, .gpu, .sys };
+    //.proxykind = { .alias, .async, .async.global, .async.shared::{cta, cluster} };
+    //.op_restrict = { .mbarrier_init };
+    //.to_proxykind::from_proxykind = {.tensormap::generic};
+
+    membar.level => {
+        ast::Instruction::Membar { data: level }
+    }
+    membar.gl => {
+        ast::Instruction::Membar { data: MemScope::Gpu }
+    }
+    .level: MemScope      = { .cta, .sys };
+
    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
    ret{.uni} => {
        Instruction::Ret { data: RetData { uniform: uni } }