diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs
index 944341c..232fdfc 100644
--- a/ptx_parser/src/ast.rs
+++ b/ptx_parser/src/ast.rs
@@ -1,8 +1,8 @@
 use std::cmp::Ordering;
 
 use super::{
-    MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp, StateSpace,
-    VectorPrefix,
+    AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp,
+    StateSpace, VectorPrefix,
 };
 use crate::{PtxError, PtxParserState};
 use bitflags::bitflags;
@@ -282,6 +282,52 @@ gen::generate_instruction_type!(
                 src: T,
             }
         },
+        Selp {
+            type: { Type::Scalar(data.clone()) },
+            data: ScalarType,
+            arguments: {
+                dst: T,
+                src1: T,
+                src2: T,
+                src3: {
+                    repr: T,
+                    type: Type::Scalar(ScalarType::Pred)
+                },
+            }
+        },
+        Bar {
+            type: Type::Scalar(ScalarType::U32),
+            data: BarData,
+            arguments: {
+                src1: T,
+                src2: Option<T>,
+            }
+        },
+        Atom {
+            type: &data.type_,
+            data: AtomDetails,
+            arguments: {
+                dst: T,
+                src1: {
+                    repr: T,
+                    space: { data.space },
+                },
+                src2: T,
+            }
+        },
+        AtomCas {
+            type: Type::Scalar(data.type_),
+            data: AtomCasDetails,
+            arguments: {
+                dst: T,
+                src1: {
+                    repr: T,
+                    space: { data.space },
+                },
+                src2: T,
+                src3: T,
+            }
+        },
         Trap { }
     }
 );
@@ -408,8 +454,7 @@ pub enum Type {
 impl Type {
     pub(crate) fn maybe_vector(vector: Option<VectorPrefix>, scalar: ScalarType) -> Self {
         match vector {
-            Some(VectorPrefix::V2) => Type::Vector(scalar, 2),
-            Some(VectorPrefix::V4) => Type::Vector(scalar, 4),
+            Some(prefix) => Type::Vector(scalar, prefix.len()),
             None => Type::Scalar(scalar),
         }
     }
@@ -1167,3 +1212,61 @@ pub struct RsqrtData {
     pub flush_to_zero: Option<bool>,
     pub type_: ScalarType,
 }
+
+pub struct BarData {
+    pub aligned: bool,
+}
+
+pub struct AtomDetails {
+    pub type_: Type,
+    pub semantics: AtomSemantics,
+    pub scope: MemScope,
+    pub space: StateSpace,
+    pub op: AtomicOp,
+}
+
+#[derive(Copy, Clone)]
+pub enum AtomicOp {
+    And,
+    Or,
+    Xor,
+    Exchange,
+    Add,
+    IncrementWrap,
+    DecrementWrap,
+    SignedMin,
+    UnsignedMin,
+    SignedMax,
+    UnsignedMax,
+    FloatAdd,
+    FloatMin,
+    FloatMax,
+}
+
+impl AtomicOp {
+    pub(crate) fn new(op: super::RawAtomicOp, kind: ScalarKind) -> Self {
+        use super::RawAtomicOp;
+        match (op, kind) {
+            (RawAtomicOp::And, _) => Self::And,
+            (RawAtomicOp::Or, _) => Self::Or,
+            (RawAtomicOp::Xor, _) => Self::Xor,
+            (RawAtomicOp::Exch, _) => Self::Exchange,
+            (RawAtomicOp::Add, _) => Self::Add,
+            (RawAtomicOp::Inc, _) => Self::IncrementWrap,
+            (RawAtomicOp::Dec, _) => Self::DecrementWrap,
+            (RawAtomicOp::Min, ScalarKind::Signed) => Self::SignedMin,
+            (RawAtomicOp::Min, ScalarKind::Float) => Self::FloatMin,
+            (RawAtomicOp::Min, _) => Self::UnsignedMin,
+            (RawAtomicOp::Max, ScalarKind::Signed) => Self::SignedMax,
+            (RawAtomicOp::Max, ScalarKind::Float) => Self::FloatMax,
+            (RawAtomicOp::Max, _) => Self::UnsignedMax,
+        }
+    }
+}
+
+pub struct AtomCasDetails {
+    pub type_: ScalarType,
+    pub semantics: AtomSemantics,
+    pub scope: MemScope,
+    pub space: StateSpace,
+}
diff --git a/ptx_parser/src/main.rs b/ptx_parser/src/main.rs
index 159b918..060c767 100644
--- a/ptx_parser/src/main.rs
+++ b/ptx_parser/src/main.rs
@@ -71,6 +71,16 @@ impl From<RawRoundingMode> for ast::RoundingMode {
     }
 }
 
+impl VectorPrefix {
+    pub(crate) fn len(self) -> u8 {
+        match self {
+            VectorPrefix::V2 => 2,
+            VectorPrefix::V4 => 4,
+            VectorPrefix::V8 => 8,
+        }
+    }
+}
+
 struct PtxParserState<'input> {
     errors: Vec<PtxError>,
     function_declarations:
@@ -1135,6 +1145,9 @@ derive_parser!(
     #[derive(Copy, Clone, PartialEq, Eq, Hash)]
     pub enum SetpBoolPostOp { }
 
+    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
+    pub enum AtomSemantics { }
+
     // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
     mov{.vec}.type d, a => {
         Instruction::Mov {
@@ -2345,6 +2358,147 @@ derive_parser!(
     }
     ScalarType = { .f32, .f64 };
 
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
+    selp.type d, a, b, c => {
+        ast::Instruction::Selp {
+            data: type_,
+            arguments: SelpArgs { dst: d, src1: a, src2: b, src3: c }
+        }
+    }
+    .type: ScalarType = { .b16, .b32, .b64,
+                          .u16, .u32, .u64,
+                          .s16, .s32, .s64,
+                          .f32, .f64 };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar
+    barrier{.cta}.sync{.aligned} a{, b} => {
+        let _ = cta;
+        ast::Instruction::Bar {
+            data: ast::BarData { aligned },
+            arguments: BarArgs { src1: a, src2: b }
+        }
+    }
+    //barrier{.cta}.arrive{.aligned} a, b;
+    //barrier{.cta}.red.popc{.aligned}.u32 d, a{, b}, {!}c;
+    //barrier{.cta}.red.op{.aligned}.pred p, a{, b}, {!}c;
+    bar{.cta}.sync a{, b} => {
+        let _ = cta;
+        ast::Instruction::Bar {
+            data: ast::BarData { aligned: true },
+            arguments: BarArgs { src1: a, src2: b }
+        }
+    }
+    //bar{.cta}.arrive a, b;
+    //bar{.cta}.red.popc.u32 d, a{, b}, {!}c;
+    //bar{.cta}.red.op.pred p, a{, b}, {!}c;
+    //.op = { .and, .or };
+
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
+    atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache_policy} => {
+        if level_cache_hint || cache_policy.is_some() {
+            state.errors.push(PtxError::Todo);
+        }
+        ast::Instruction::Atom {
+            data: AtomDetails {
+                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
+                scope: scope.unwrap_or(MemScope::Gpu),
+                space: space.unwrap_or(StateSpace::Generic),
+                op: ast::AtomicOp::new(op, type_.kind()),
+                type_: type_.into()
+            },
+            arguments: AtomArgs { dst: d, src1: a, src2: b }
+        }
+    }
+    atom{.sem}{.scope}{.space}.cas.cas_type d, [a], b, c => {
+        ast::Instruction::AtomCas {
+            data: AtomCasDetails {
+                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
+                scope: scope.unwrap_or(MemScope::Gpu),
+                space: space.unwrap_or(StateSpace::Generic),
+                type_: cas_type
+            },
+            arguments: AtomCasArgs { dst: d, src1: a, src2: b, src3: c }
+        }
+    }
+    atom{.sem}{.scope}{.space}.exch{.level::cache_hint}.b128 d, [a], b{, cache_policy} => {
+        if level_cache_hint || cache_policy.is_some() {
+            state.errors.push(PtxError::Todo);
+        }
+        ast::Instruction::Atom {
+            data: AtomDetails {
+                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
+                scope: scope.unwrap_or(MemScope::Gpu),
+                space: space.unwrap_or(StateSpace::Generic),
+                op: ast::AtomicOp::new(exch, b128.kind()),
+                type_: b128.into()
+            },
+            arguments: AtomArgs { dst: d, src1: a, src2: b }
+        }
+    }
+    atom{.sem}{.scope}{.global}.float_op{.level::cache_hint}.vec_32_bit.f32 d, [a], b{, cache_policy} => {
+        if level_cache_hint || cache_policy.is_some() {
+            state.errors.push(PtxError::Todo);
+        }
+        ast::Instruction::Atom {
+            data: AtomDetails {
+                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
+                scope: scope.unwrap_or(MemScope::Gpu),
+                space: global.unwrap_or(StateSpace::Generic),
+                op: ast::AtomicOp::new(float_op, f32.kind()),
+                type_: ast::Type::Vector(f32, vec_32_bit.len())
+            },
+            arguments: AtomArgs { dst: d, src1: a, src2: b }
+        }
+    }
+    atom{.sem}{.scope}{.global}.float_op.noftz{.level::cache_hint}{.vec_16_bit}.half_word_type d, [a], b{, cache_policy} => {
+        if level_cache_hint || cache_policy.is_some() {
+            state.errors.push(PtxError::Todo);
+        }
+        ast::Instruction::Atom {
+            data: AtomDetails {
+                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
+                scope: scope.unwrap_or(MemScope::Gpu),
+                space: global.unwrap_or(StateSpace::Generic),
+                op: ast::AtomicOp::new(float_op, half_word_type.kind()),
+                type_: ast::Type::maybe_vector(vec_16_bit, half_word_type)
+            },
+            arguments: AtomArgs { dst: d, src1: a, src2: b }
+        }
+    }
+    atom{.sem}{.scope}{.global}.float_op.noftz{.level::cache_hint}{.vec_32_bit}.packed_type d, [a], b{, cache_policy} => {
+        if level_cache_hint || cache_policy.is_some() {
+            state.errors.push(PtxError::Todo);
+        }
+        ast::Instruction::Atom {
+            data: AtomDetails {
+                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
+                scope: scope.unwrap_or(MemScope::Gpu),
+                space: global.unwrap_or(StateSpace::Generic),
+                op: ast::AtomicOp::new(float_op, packed_type.kind()),
+                type_: ast::Type::maybe_vector(vec_32_bit, packed_type)
+            },
+            arguments: AtomArgs { dst: d, src1: a, src2: b }
+        }
+    }
+    .space: StateSpace = { .global, .shared{::cta, ::cluster} };
+    .sem: AtomSemantics = { .relaxed, .acquire, .release, .acq_rel };
+    .scope: MemScope = { .cta, .cluster, .gpu, .sys };
+    .op: RawAtomicOp = { .and, .or, .xor,
+                         .exch,
+                         .add, .inc, .dec,
+                         .min, .max };
+    .level::cache_hint = { .L2::cache_hint };
+    .type: ScalarType = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };
+    .cas_type: ScalarType = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64, .b16, .b128 };
+    .half_word_type: ScalarType = { .f16, .bf16 };
+    .packed_type: ScalarType = { .f16x2, .bf16x2 };
+    .vec_16_bit: VectorPrefix = { .v2, .v4, .v8 };
+    .vec_32_bit: VectorPrefix = { .v2, .v4 };
+    .float_op: RawAtomicOp = { .add, .min, .max };
+    ScalarType = { .b16, .b128, .f32 };
+    StateSpace = { .global };
+    RawAtomicOp = { .exch };
+
     // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
     ret{.uni} => {
         Instruction::Ret { data: RetData { uniform: uni } }
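As a rough usage sketch (not part of the diff; register and address operands below are invented), these are the kinds of PTX statements the new selp/bar/atom rules are written to accept, with the defaults in the comments mirroring the unwrap_or fallbacks in the rules:

    selp.f32                        %f3, %f1, %f2, %p1;     // Instruction::Selp, data = ScalarType::F32
    bar.sync                        0;                      // Instruction::Bar, aligned forced to true
    atom.global.add.f32             %f4, [%rd1], %f2;       // Instruction::Atom; .relaxed / .gpu by default
    atom.acquire.gpu.global.cas.b32 %r2, [%rd2], %r3, %r4;  // Instruction::AtomCas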