Add atom and atom.cas

Andrzej Janik 2024-08-21 15:46:06 +02:00
parent c16bae32b5
commit 39faaa7214
2 changed files with 261 additions and 4 deletions

@@ -1,8 +1,8 @@
use std::cmp::Ordering;
use super::{
-    MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp, StateSpace,
-    VectorPrefix,
+    AtomSemantics, MemScope, RawRoundingMode, RawSetpCompareOp, ScalarType, SetpBoolPostOp,
+    StateSpace, VectorPrefix,
};
use crate::{PtxError, PtxParserState};
use bitflags::bitflags;
@@ -282,6 +282,52 @@ gen::generate_instruction_type!(
                src: T,
            }
        },
        Selp {
            type: { Type::Scalar(data.clone()) },
            data: ScalarType,
            arguments<T>: {
                dst: T,
                src1: T,
                src2: T,
                src3: {
                    repr: T,
                    type: Type::Scalar(ScalarType::Pred)
                },
            }
        },
        Bar {
            type: Type::Scalar(ScalarType::U32),
            data: BarData,
            arguments<T>: {
                src1: T,
                src2: Option<T>,
            }
        },
        Atom {
            type: &data.type_,
            data: AtomDetails,
            arguments<T>: {
                dst: T,
                src1: {
                    repr: T,
                    space: { data.space },
                },
                src2: T,
            }
        },
        AtomCas {
            type: Type::Scalar(data.type_),
            data: AtomCasDetails,
            arguments<T>: {
                dst: T,
                src1: {
                    repr: T,
                    space: { data.space },
                },
                src2: T,
                src3: T,
            }
        },
        Trap { }
    }
);
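
For orientation: the declarations above feed gen::generate_instruction_type!, which emits the Instruction enum plus one argument struct per variant. A rough sketch of the shape it plausibly generates for Atom; the field names match the AtomArgs constructor used by the parser rules below, but the exact expansion (including visitor plumbing) is an assumption, not the macro's literal output:

    // Hypothetical sketch of the macro output, not its literal expansion.
    pub enum Instruction<T> {
        // ... other variants ...
        Atom {
            data: AtomDetails,
            arguments: AtomArgs<T>,
        },
    }

    pub struct AtomArgs<T> {
        pub dst: T,  // receives the value read from memory before the update
        pub src1: T, // the memory operand; its state space comes from data.space
        pub src2: T, // the operand combined with the value in memory
    }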
@@ -408,8 +454,7 @@ pub enum Type {
impl Type {
    pub(crate) fn maybe_vector(vector: Option<VectorPrefix>, scalar: ScalarType) -> Self {
        match vector {
-            Some(VectorPrefix::V2) => Type::Vector(scalar, 2),
-            Some(VectorPrefix::V4) => Type::Vector(scalar, 4),
+            Some(prefix) => Type::Vector(scalar, prefix.len()),
            None => Type::Scalar(scalar),
        }
    }
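
This refactor replaces the two hard-coded arms with VectorPrefix::len (added in the second file of this commit), which also covers the new .v8 prefix. A minimal illustration, assuming Type, VectorPrefix, and ScalarType are in scope:

    // Any vector prefix now maps to its element count via len().
    let v8 = Type::maybe_vector(Some(VectorPrefix::V8), ScalarType::F16);
    assert!(matches!(v8, Type::Vector(ScalarType::F16, 8)));
    // Without a prefix, the scalar type passes through unchanged.
    let s = Type::maybe_vector(None, ScalarType::F32);
    assert!(matches!(s, Type::Scalar(ScalarType::F32)));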
@@ -1167,3 +1212,61 @@ pub struct RsqrtData {
    pub flush_to_zero: Option<bool>,
    pub type_: ScalarType,
}

pub struct BarData {
    pub aligned: bool,
}

pub struct AtomDetails {
    pub type_: Type,
    pub semantics: AtomSemantics,
    pub scope: MemScope,
    pub space: StateSpace,
    pub op: AtomicOp,
}

#[derive(Copy, Clone)]
pub enum AtomicOp {
    And,
    Or,
    Xor,
    Exchange,
    Add,
    IncrementWrap,
    DecrementWrap,
    SignedMin,
    UnsignedMin,
    SignedMax,
    UnsignedMax,
    FloatAdd,
    FloatMin,
    FloatMax,
}

impl AtomicOp {
    pub(crate) fn new(op: super::RawAtomicOp, kind: ScalarKind) -> Self {
        use super::RawAtomicOp;
        match (op, kind) {
            (RawAtomicOp::And, _) => Self::And,
            (RawAtomicOp::Or, _) => Self::Or,
            (RawAtomicOp::Xor, _) => Self::Xor,
            (RawAtomicOp::Exch, _) => Self::Exchange,
            (RawAtomicOp::Add, _) => Self::Add,
            (RawAtomicOp::Inc, _) => Self::IncrementWrap,
            (RawAtomicOp::Dec, _) => Self::DecrementWrap,
            (RawAtomicOp::Min, ScalarKind::Signed) => Self::SignedMin,
            (RawAtomicOp::Min, ScalarKind::Float) => Self::FloatMin,
            (RawAtomicOp::Min, _) => Self::UnsignedMin,
            (RawAtomicOp::Max, ScalarKind::Signed) => Self::SignedMax,
            (RawAtomicOp::Max, ScalarKind::Float) => Self::FloatMax,
            (RawAtomicOp::Max, _) => Self::UnsignedMax,
        }
    }
}

pub struct AtomCasDetails {
    pub type_: ScalarType,
    pub semantics: AtomSemantics,
    pub scope: MemScope,
    pub space: StateSpace,
}
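
Only .min and .max consult the ScalarKind; every other raw opcode maps one-to-one. For example (ScalarKind::Unsigned as a variant name is an assumption here; the real match only relies on the catch-all arm for non-signed, non-float kinds):

    // One raw opcode resolves to different ops depending on operand kind.
    let _signed = AtomicOp::new(RawAtomicOp::Min, ScalarKind::Signed);     // SignedMin
    let _float = AtomicOp::new(RawAtomicOp::Min, ScalarKind::Float);       // FloatMin
    let _unsigned = AtomicOp::new(RawAtomicOp::Min, ScalarKind::Unsigned); // UnsignedMin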

@@ -71,6 +71,16 @@ impl From<RawRoundingMode> for ast::RoundingMode {
    }
}

impl VectorPrefix {
    pub(crate) fn len(self) -> u8 {
        match self {
            VectorPrefix::V2 => 2,
            VectorPrefix::V4 => 4,
            VectorPrefix::V8 => 8,
        }
    }
}

struct PtxParserState<'input> {
    errors: Vec<PtxError>,
    function_declarations:
@@ -1135,6 +1145,9 @@ derive_parser!(
    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum SetpBoolPostOp { }

    #[derive(Copy, Clone, PartialEq, Eq, Hash)]
    pub enum AtomSemantics { }

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov
    mov{.vec}.type d, a => {
        Instruction::Mov {
@@ -2345,6 +2358,147 @@ derive_parser!(
    }
    ScalarType = { .f32, .f64 };

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#comparison-and-selection-instructions-selp
    selp.type d, a, b, c => {
        ast::Instruction::Selp {
            data: type_,
            arguments: SelpArgs { dst: d, src1: a, src2: b, src3: c }
        }
    }
    .type: ScalarType = { .b16, .b32, .b64,
                          .u16, .u32, .u64,
                          .s16, .s32, .s64,
                          .f32, .f64 };
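    // selp semantics: dst = src3 ? src1 : src2; the Selp variant above fixes
    // src3's type to .pred independent of the instruction's data type.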

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar
    barrier{.cta}.sync{.aligned} a{, b} => {
        let _ = cta;
        ast::Instruction::Bar {
            data: ast::BarData { aligned },
            arguments: BarArgs { src1: a, src2: b }
        }
    }
    //barrier{.cta}.arrive{.aligned} a, b;
    //barrier{.cta}.red.popc{.aligned}.u32 d, a{, b}, {!}c;
    //barrier{.cta}.red.op{.aligned}.pred p, a{, b}, {!}c;
    bar{.cta}.sync a{, b} => {
        let _ = cta;
        ast::Instruction::Bar {
            data: ast::BarData { aligned: true },
            arguments: BarArgs { src1: a, src2: b }
        }
    }
    //bar{.cta}.arrive a, b;
    //bar{.cta}.red.popc.u32 d, a{, b}, {!}c;
    //bar{.cta}.red.op.pred p, a{, b}, {!}c;
    //.op = { .and, .or };
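    // Note: bar.sync is PTX's older spelling of barrier.sync.aligned, which is
    // why the bar rule above hard-codes aligned: true.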

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
    atom{.sem}{.scope}{.space}.op{.level::cache_hint}.type d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: space.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(op, type_.kind()),
                type_: type_.into()
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.space}.cas.cas_type d, [a], b, c => {
        ast::Instruction::AtomCas {
            data: AtomCasDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: space.unwrap_or(StateSpace::Generic),
                type_: cas_type
            },
            arguments: AtomCasArgs { dst: d, src1: a, src2: b, src3: c }
        }
    }
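    // atom.cas semantics: t = [a]; if (t == b) { [a] = c; } d = t; that is,
    // src2 is the compare value, src3 the swap value, and dst receives the
    // value originally in memory.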
    atom{.sem}{.scope}{.space}.exch{.level::cache_hint}.b128 d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: space.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(exch, b128.kind()),
                type_: b128.into()
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.global}.float_op{.level::cache_hint}.vec_32_bit.f32 d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: global.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(float_op, f32.kind()),
                type_: ast::Type::Vector(f32, vec_32_bit.len())
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.global}.float_op.noftz{.level::cache_hint}{.vec_16_bit}.half_word_type d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: global.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(float_op, half_word_type.kind()),
                type_: ast::Type::maybe_vector(vec_16_bit, half_word_type)
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    atom{.sem}{.scope}{.global}.float_op.noftz{.level::cache_hint}{.vec_32_bit}.packed_type d, [a], b{, cache_policy} => {
        if level_cache_hint || cache_policy.is_some() {
            state.errors.push(PtxError::Todo);
        }
        ast::Instruction::Atom {
            data: AtomDetails {
                semantics: sem.map(Into::into).unwrap_or(AtomSemantics::Relaxed),
                scope: scope.unwrap_or(MemScope::Gpu),
                space: global.unwrap_or(StateSpace::Generic),
                op: ast::AtomicOp::new(float_op, packed_type.kind()),
                type_: ast::Type::maybe_vector(vec_32_bit, packed_type)
            },
            arguments: AtomArgs { dst: d, src1: a, src2: b }
        }
    }
    .space: StateSpace = { .global, .shared{::cta, ::cluster} };
    .sem: AtomSemantics = { .relaxed, .acquire, .release, .acq_rel };
    .scope: MemScope = { .cta, .cluster, .gpu, .sys };
    .op: RawAtomicOp = { .and, .or, .xor,
                         .exch,
                         .add, .inc, .dec,
                         .min, .max };
    .level::cache_hint = { .L2::cache_hint };
    .type: ScalarType = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64 };
    .cas_type: ScalarType = { .b32, .b64, .u32, .u64, .s32, .s64, .f32, .f64, .b16, .b128 };
    .half_word_type: ScalarType = { .f16, .bf16 };
    .packed_type: ScalarType = { .f16x2, .bf16x2 };
    .vec_16_bit: VectorPrefix = { .v2, .v4, .v8 };
    .vec_32_bit: VectorPrefix = { .v2, .v4 };
    .float_op: RawAtomicOp = { .add, .min, .max };
    ScalarType = { .b16, .b128, .f32 };
    StateSpace = { .global };
    RawAtomicOp = { .exch };
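
    // Worked example of the modifier defaults: a bare
    //     atom.add.u32 d, [a], b;
    // carries no .sem/.scope/.space tokens, so the first atom rule above builds
    //     AtomDetails {
    //         type_: Type::Scalar(ScalarType::U32),
    //         semantics: AtomSemantics::Relaxed, // .sem absent
    //         scope: MemScope::Gpu,              // .scope absent
    //         space: StateSpace::Generic,        // .space absent
    //         op: AtomicOp::Add,
    //     }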

    // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#control-flow-instructions-ret
    ret{.uni} => {
        Instruction::Ret { data: RetData { uniform: uni } }