Add atom and atom.cas

Andrzej Janik 2024-08-21 15:46:06 +02:00
parent c16bae32b5
commit 39faaa7214
2 changed files with 261 additions and 4 deletions

@@ -1,8 +1,8 @@
use std::cmp::Ordering;
use super::{
-    MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp, StateSpace,
-    VectorPrefix,
+    AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp,
+    StateSpace, VectorPrefix,
};
use crate::{PtxError, PtxParserState};
use bitflags::bitflags;
@@ -282,6 +282,52 @@ gen::generate_instruction_type!(
                src: T,
            }
        },
        Selp {
            type: { Type::Scalar(data.clone()) },
            data: ScalarType,
            arguments<T>: {
                dst: T,
                src1: T,
                src2: T,
                src3: {
                    repr: T,
                    type: Type::Scalar(ScalarType::Pred)
                },
            }
        },
        Bar {
            type: Type::Scalar(ScalarType::U32),
            data: BarData,
            arguments<T>: {
                src1: T,
                src2: Option<T>,
            }
        },
        Atom {
            type: &data.type_,
            data: AtomDetails,
            arguments<T>: {
                dst: T,
                src1: {
                    repr: T,
                    space: { data.space },
                },
                src2: T,
            }
        },
        AtomCas {
            type: Type::Scalar(data.type_),
            data: AtomCasDetails,
            arguments<T>: {
                dst: T,
                src1: {
                    repr: T,
                    space: { data.space },
                },
                src2: T,
                src3: T,
            }
        },
        Trap { }
    }
);
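
For orientation: the declarations above feed gen::generate_instruction_type!, which emits the Instruction enum plus one argument struct per variant. A rough sketch of the shape it plausibly generates for Atom; the field names match the AtomArgs constructor used by the parser rules below, but the exact expansion (including visitor plumbing) is an assumption, not the macro's literal output:

    // Hypothetical sketch of the macro output, not its literal expansion.
    pub enum Instruction<T> {
        // ... other variants ...
        Atom {
            data: AtomDetails,
            arguments: AtomArgs<T>,
        },
    }

    pub struct AtomArgs<T> {
        pub dst: T,  // receives the value read from memory before the update
        pub src1: T, // the memory operand; its state space comes from data.space
        pub src2: T, // the operand combined with the value in memory
    }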
@@ -408,8 +454,7 @@ pub enum Type {
impl Type {
    pub(crate) fn maybe_vector(vector: Option<VectorPrefix>, scalar: ScalarType) -> Self {
        match vector {
-            Some(VectorPrefix::V2) => Type::Vector(scalar, 2),
-            Some(VectorPrefix::V4) => Type::Vector(scalar, 4),
+            Some(prefix) => Type::Vector(scalar, prefix.len()),
            None => Type::Scalar(scalar),
        }
    }
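
This refactor replaces the two hard-coded arms with VectorPrefix::len (added in the second file of this commit), which also covers the new .v8 prefix. A minimal illustration, assuming Type, VectorPrefix, and ScalarType are in scope:

    // Any vector prefix now maps to its element count via len().
    let v8 = Type::maybe_vector(Some(VectorPrefix::V8), ScalarType::F16);
    assert!(matches!(v8, Type::Vector(ScalarType::F16, 8)));
    // Without a prefix, the scalar type passes through unchanged.
    let s = Type::maybe_vector(None, ScalarType::F32);
    assert!(matches!(s, Type::Scalar(ScalarType::F32)));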
@@ -1167,3 +1212,61 @@ pub struct RsqrtData {
    pub flush_to_zero: Option<bool>,
    pub type_: ScalarType,
}

pub struct BarData {
    pub aligned: bool,
}

pub struct AtomDetails {
    pub type_: Type,
    pub semantics: AtomSemantics,
    pub scope: MemScope,
    pub space: StateSpace,
    pub op: AtomicOp,
}

#[derive(Copy, Clone)]
pub enum AtomicOp {
    And,
    Or,
    Xor,
    Exchange,
    Add,
    IncrementWrap,
    DecrementWrap,
    SignedMin,
    UnsignedMin,
    SignedMax,
    UnsignedMax,
    FloatAdd,
    FloatMin,
    FloatMax,
}

impl AtomicOp {
    pub(crate) fn new(op: super::RawAtomicOp, kind: ScalarKind) -> Self {
        use super::RawAtomicOp;
        match (op, kind) {
            (RawAtomicOp::And, _) => Self::And,
            (RawAtomicOp::Or, _) => Self::Or,
            (RawAtomicOp::Xor, _) => Self::Xor,
            (RawAtomicOp::Exch, _) => Self::Exchange,
            (RawAtomicOp::Add, _) => Self::Add,
            (RawAtomicOp::Inc, _) => Self::IncrementWrap,
            (RawAtomicOp::Dec, _) => Self::DecrementWrap,
            (RawAtomicOp::Min, ScalarKind::Signed) => Self::SignedMin,
            (RawAtomicOp::Min, ScalarKind::Float) => Self::FloatMin,
            (RawAtomicOp::Min, _) => Self::UnsignedMin,
            (RawAtomicOp::Max, ScalarKind::Signed) => Self::SignedMax,
            (RawAtomicOp::Max, ScalarKind::Float) => Self::FloatMax,
            (RawAtomicOp::Max, _) => Self::UnsignedMax,
        }
    }
}

pub struct AtomCasDetails {
    pub type_: ScalarType,
    pub semantics: AtomSemantics,
    pub scope: MemScope,
    pub space: StateSpace,
}
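
Only .min and .max consult the ScalarKind; every other raw opcode maps one-to-one. For example (ScalarKind::Unsigned as a variant name is an assumption here; the real match only relies on the catch-all arm for non-signed, non-float kinds):

    // One raw opcode resolves to different ops depending on operand kind.
    let _signed = AtomicOp::new(RawAtomicOp::Min, ScalarKind::Signed);     // SignedMin
    let _float = AtomicOp::new(RawAtomicOp::Min, ScalarKind::Float);       // FloatMin
    let _unsigned = AtomicOp::new(RawAtomicOp::Min, ScalarKind::Unsigned); // UnsignedMin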

@@ -71,6 +71,16 @@ impl From<RawRoundingMode> for ast::RoundingMode {
    }
}

impl VectorPrefix {
    pub(crate) fn len(self) -> u8 {
        match self {
            VectorPrefix::V2 => 2,
            VectorPrefix::V4 => 4,
            VectorPrefix::V8 => 8,
        }
    }
}

struct PtxParserState<'input> {
    errors: Vec<PtxError>,
    function_declarations:
@@ -1135,6 +1145,9 @@ derive_parser!(
    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum SetpBoolPostOp { }

    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum AtomSemantics { }

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
    mov{.vec}.type d, a => {
        Instruction::Mov {
@@ -2345,6 +2358,147 @@ derive_parser!(
    }
    ScalarType = { .f32, .f64 };

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
    selp.type d, a, b, c => {
        ast::Instruction::Selp {
            data: type_,
            arguments: SelpArgs { dst: d, src1: a, src2: b, src3: c }
        }
    }
    .type: ScalarType = { .b16, .b32, .b64,
                          .u16, .u32, .u64,
                          .s16, .s32, .s64,
                          .f32, .f64 };
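    // selp semantics: dst = src3 ? src1 : src2; the Selp variant above fixes
    // src3's type to .pred independent of the instruction's data type.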

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar
    barrier{.cta}.sync{.aligned} a{, b} => {
        let _ = cta;
        ast::Instruction::Bar {
            data: ast::BarData { aligned },
            arguments: BarArgs { src1: a, src2: b }
        }
    }
    //barrier{.cta}.arrive{.aligned} a, b;
    //barrier{.cta}.red.popc{.aligned}.u32 d, a{, b}, {!}c;
    //barrier{.cta}.red.op{.aligned}.pred p, a{, b}, {!}c;
    bar{.cta}.sync a{, b} => {
        let _ = cta;
        ast::Instruction::Bar {
            data: ast::BarData { aligned: true },
            arguments: BarArgs { src1: a, src2: b }
        }
    }
    //bar{.cta}.arrive a, b;
    //bar{.cta}.red.popc.u32 d, a{, b}, {!}c;
    //bar{.cta}.red.op.pred p, a{, b}, {!}c;
    //.op = { .and, .or };
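    // Note: bar.sync is PTX's older spelling of barrier.sync.aligned, which is
    // why the bar rule above hard-codes aligned: true.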

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
    atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: space.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(op, type_.kind()),
                type_: type_.into()
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.space}.cas.cas_type d, [a], b, c => {
        ast::Instruction::AtomCas {
            data: AtomCasDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: space.unwrap_or(StateSpace::Generic),
                type_: cas_type
            },
            arguments: AtomCasArgs { dst: d, src1: a, src2: b, src3: c }
        }
    }
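    // atom.cas semantics: t = [a]; if (t == b) { [a] = c; } d = t; that is,
    // src2 is the compare value, src3 the swap value, and dst receives the
    // value originally in memory.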
    atom{.sem}{.scope}{.space}.exch{.level::cache_hint}.b128 d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: space.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(exch, b128.kind()),
                type_: b128.into()
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.global}.float_op{.level::cache_hint}.vec_32_bit.f32 d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: global.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(float_op, f32.kind()),
                type_: ast::Type::Vector(f32, vec_32_bit.len())
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.global}.float_op.noftz{.level::cache_hint}{.vec_16_bit}.half_word_type d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: global.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(float_op, half_word_type.kind()),
                type_: ast::Type::maybe_vector(vec_16_bit, half_word_type)
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.global}.float_op.noftz{.level::cache_hint}{.vec_32_bit}.packed_type d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: global.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(float_op, packed_type.kind()),
                type_: ast::Type::maybe_vector(vec_32_bit, packed_type)
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    .space: StateSpace = { .global, .shared{::cta, ::cluster} };
    .sem: AtomSemantics = { .relaxed, .acquire, .release, .acq_rel };
    .scope: MemScope = { .cta, .cluster, .gpu, .sys };
    .op: RawAtomicOp = { .and, .or, .xor,
                         .exch,
                         .add, .inc, .dec,
                         .min, .max };
    .level::cache_hint = { .L2::cache_hint };
    .type: ScalarType = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };
    .cas_type: ScalarType = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64, .b16, .b128 };
    .half_word_type: ScalarType = { .f16, .bf16 };
    .packed_type: ScalarType = { .f16x2, .bf16x2 };
    .vec_16_bit: VectorPrefix = { .v2, .v4, .v8 };
    .vec_32_bit: VectorPrefix = { .v2, .v4 };
    .float_op: RawAtomicOp = { .add, .min, .max };
    ScalarType = { .b16, .b128, .f32 };
    StateSpace = { .global };
    RawAtomicOp = { .exch };
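
    // Worked example of the modifier defaults: a bare
    //     atom.add.u32 d, [a], b;
    // carries no .sem/.scope/.space tokens, so the first atom rule above builds
    //     AtomDetails {
    //         type_: Type::Scalar(ScalarType::U32),
    //         semantics: AtomSemantics::Relaxed, // .sem absent
    //         scope: MemScope::Gpu,              // .scope absent
    //         space: StateSpace::Generic,        // .space absent
    //         op: AtomicOp::Add,
    //     }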

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
    ret{.uni} => {
        Instruction::Ret { data: RetData { uniform: uni } }