Implement atomic instructions

Andrzej Janik 2020-10-31 21:28:15 +01:00
parent 861116f223
commit a82eb20817
24 changed files with 1672 additions and 88 deletions


@ -7,4 +7,8 @@ edition = "2018"
[lib]
[dependencies]
level_zero-sys = { path = "../level_zero-sys" }
level_zero-sys = { path = "../level_zero-sys" }
[dependencies.ocl-core]
version = "0.11"
features = ["opencl_version_1_2", "opencl_version_2_0", "opencl_version_2_1"]


@ -238,7 +238,76 @@ impl Drop for CommandQueue {
pub struct Module(sys::ze_module_handle_t);
impl Module {
pub fn new_spirv(
// HACK ALERT
// We use OpenCL for now to do SPIR-V linking, because Level Zero
// does not allow linking. Don't let the presence of zeModuleDynamicLink fool
// you: it's not currently possible to create non-compiled modules.
// zeModuleCreate always compiles (builds and links).
pub fn build_link_spirv<'a>(
ctx: &mut Context,
d: &Device,
binaries: &[&'a [u8]],
) -> (Result<Self>, Option<BuildLog>) {
let ocl_program = match Self::build_link_spirv_impl(binaries) {
Err(_) => return (Err(sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN), None),
Ok(prog) => prog,
};
match ocl_core::get_program_info(&ocl_program, ocl_core::ProgramInfo::Binaries) {
Ok(ocl_core::ProgramInfoResult::Binaries(binaries)) => {
let (module, build_log) = Self::build_native(ctx, d, &binaries[0]);
(module, Some(build_log))
}
_ => return (Err(sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN), None),
}
}
fn build_link_spirv_impl<'a>(binaries: &[&'a [u8]]) -> ocl_core::Result<ocl_core::Program> {
let platforms = ocl_core::get_platform_ids()?;
let (platform, device) = platforms
.iter()
.find_map(|plat| {
let devices =
ocl_core::get_device_ids(plat, Some(ocl_core::DeviceType::GPU), None).ok()?;
for dev in devices {
let vendor =
ocl_core::get_device_info(dev, ocl_core::DeviceInfo::VendorId).ok()?;
if let ocl_core::DeviceInfoResult::VendorId(0x8086) = vendor {
let dev_type =
ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?;
if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) =
dev_type
{
return Some((plat.clone(), dev));
}
}
}
None
})
.ok_or("")?;
let ctx_props = ocl_core::ContextProperties::new().platform(platform);
let ocl_ctx = ocl_core::create_context_from_type::<ocl_core::DeviceId>(
Some(&ctx_props),
ocl_core::DeviceType::GPU,
None,
None,
)?;
let mut programs = Vec::with_capacity(binaries.len());
for binary in binaries {
programs.push(ocl_core::create_program_with_il(&ocl_ctx, binary, None)?);
}
let options = CString::default();
ocl_core::link_program::<ocl_core::DeviceId, _>(
&ocl_ctx,
Some(&[device]),
&options,
&programs.iter().collect::<Vec<_>>(),
None,
None,
None,
)
}
pub fn build_spirv(
ctx: &mut Context,
d: &Device,
bin: &[u8],
@ -247,7 +316,7 @@ impl Module {
Module::new(ctx, true, d, bin, opts)
}
pub fn new_native(ctx: &mut Context, d: &Device, bin: &[u8]) -> (Result<Self>, BuildLog) {
pub fn build_native(ctx: &mut Context, d: &Device, bin: &[u8]) -> (Result<Self>, BuildLog) {
Module::new(ctx, false, d, bin, None)
}
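For context, here is a minimal usage sketch of the two build paths introduced above; it mirrors the test-harness change further down in this commit. `needs_ptx_impl`, `ptx_impl_spirv` and `kernel_spirv` are hypothetical names assumed to be in scope; the functions are the ones defined above.

// Sketch only: take the OpenCL-based link path when the kernel calls into
// the precompiled notcuda_ptx_impl library, otherwise build directly.
let (module, maybe_log) = if needs_ptx_impl {
    Module::build_link_spirv(&mut ctx, &dev, &[ptx_impl_spirv, kernel_spirv])
} else {
    let (module, log) = Module::build_spirv(&mut ctx, &dev, kernel_spirv, None);
    (module, Some(log))
};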


@ -53,7 +53,7 @@ impl ModuleData {
Ok(_) if errors.len() > 0 => return Err(ModuleCompileError::Parse(errors, None)),
Ok(ast) => ast,
};
let (spirv, all_arg_lens) = ptx::to_spirv(ast)?;
let (_, spirv, all_arg_lens) = ptx::to_spirv(ast)?;
let byte_il = unsafe {
slice::from_raw_parts::<u8>(
spirv.as_ptr() as *const _,
@ -61,7 +61,7 @@ impl ModuleData {
)
};
let module = super::device::with_current_exclusive(|dev| {
l0::Module::new_spirv(&mut dev.l0_context, &dev.base, byte_il, None)
l0::Module::build_spirv(&mut dev.l0_context, &dev.base, byte_il, None)
});
match module {
Ok((Ok(module), _)) => Ok(Mutex::new(Self {

ptx/lib/notcuda_ptx_impl.cl Normal file

@ -0,0 +1,121 @@
// Every time this file changes it must be rebuilt:
// ocloc -file notcuda_ptx_impl.cl -64 -options "-cl-std=CL2.0" -out_dir . -device kbl -output_no_suffix -spv_only
// Additionally you should strip names:
// spirv-opt --strip-debug notcuda_ptx_impl.spv -o notcuda_ptx_impl.spv
#define FUNC(NAME) __notcuda_ptx_impl__ ## NAME
#define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
uint FUNC(NAME)(SPACE uint* ptr, uint threshold) { \
uint expected = *ptr; \
uint desired; \
do { \
desired = (expected >= threshold) ? 0 : expected + 1; \
} while (!atomic_compare_exchange_strong_explicit((volatile SPACE atomic_uint*)ptr, &expected, desired, SUCCESS, FAILURE, SCOPE)); \
return expected; \
}
#define atomic_dec(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
uint FUNC(NAME)(SPACE uint* ptr, uint threshold) { \
uint expected = *ptr; \
uint desired; \
do { \
desired = (expected == 0 || expected > threshold) ? threshold : expected - 1; \
} while (!atomic_compare_exchange_strong_explicit((volatile SPACE atomic_uint*)ptr, &expected, desired, SUCCESS, FAILURE, SCOPE)); \
return expected; \
}
// We generate all these variants instead of accepting memory_order and memory_scope parameters,
// because ocloc emits broken SPIR-V (it fails spirv-dis) when memory_order or memory_scope is passed as a parameter
// atom.inc
atomic_inc(atom_relaxed_cta_generic_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, );
atomic_inc(atom_acquire_cta_generic_inc, memory_order_acquire, memory_order_acquire, memory_scope_work_group, );
atomic_inc(atom_release_cta_generic_inc, memory_order_release, memory_order_acquire, memory_scope_work_group, );
atomic_inc(atom_acq_rel_cta_generic_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, );
atomic_inc(atom_relaxed_gpu_generic_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
atomic_inc(atom_acquire_gpu_generic_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, );
atomic_inc(atom_release_gpu_generic_inc, memory_order_release, memory_order_acquire, memory_scope_device, );
atomic_inc(atom_acq_rel_gpu_generic_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
atomic_inc(atom_relaxed_sys_generic_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
atomic_inc(atom_acquire_sys_generic_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, );
atomic_inc(atom_release_sys_generic_inc, memory_order_release, memory_order_acquire, memory_scope_device, );
atomic_inc(atom_acq_rel_sys_generic_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
atomic_inc(atom_relaxed_cta_global_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global);
atomic_inc(atom_acquire_cta_global_inc, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global);
atomic_inc(atom_release_cta_global_inc, memory_order_release, memory_order_acquire, memory_scope_work_group, __global);
atomic_inc(atom_acq_rel_cta_global_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global);
atomic_inc(atom_relaxed_gpu_global_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
atomic_inc(atom_acquire_gpu_global_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
atomic_inc(atom_release_gpu_global_inc, memory_order_release, memory_order_acquire, memory_scope_device, __global);
atomic_inc(atom_acq_rel_gpu_global_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
atomic_inc(atom_relaxed_sys_global_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
atomic_inc(atom_acquire_sys_global_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
atomic_inc(atom_release_sys_global_inc, memory_order_release, memory_order_acquire, memory_scope_device, __global);
atomic_inc(atom_acq_rel_sys_global_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
atomic_inc(atom_relaxed_cta_shared_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local);
atomic_inc(atom_acquire_cta_shared_inc, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local);
atomic_inc(atom_release_cta_shared_inc, memory_order_release, memory_order_acquire, memory_scope_work_group, __local);
atomic_inc(atom_acq_rel_cta_shared_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local);
atomic_inc(atom_relaxed_gpu_shared_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_inc(atom_acquire_gpu_shared_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_inc(atom_release_gpu_shared_inc, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_inc(atom_acq_rel_gpu_shared_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
atomic_inc(atom_relaxed_sys_shared_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_inc(atom_acquire_sys_shared_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_inc(atom_release_sys_shared_inc, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_inc(atom_acq_rel_sys_shared_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
// atom.dec
atomic_dec(atom_relaxed_cta_generic_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, );
atomic_dec(atom_acquire_cta_generic_dec, memory_order_acquire, memory_order_acquire, memory_scope_work_group, );
atomic_dec(atom_release_cta_generic_dec, memory_order_release, memory_order_acquire, memory_scope_work_group, );
atomic_dec(atom_acq_rel_cta_generic_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, );
atomic_dec(atom_relaxed_gpu_generic_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
atomic_dec(atom_acquire_gpu_generic_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, );
atomic_dec(atom_release_gpu_generic_dec, memory_order_release, memory_order_acquire, memory_scope_device, );
atomic_dec(atom_acq_rel_gpu_generic_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
atomic_dec(atom_relaxed_sys_generic_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
atomic_dec(atom_acquire_sys_generic_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, );
atomic_dec(atom_release_sys_generic_dec, memory_order_release, memory_order_acquire, memory_scope_device, );
atomic_dec(atom_acq_rel_sys_generic_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
atomic_dec(atom_relaxed_cta_global_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global);
atomic_dec(atom_acquire_cta_global_dec, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global);
atomic_dec(atom_release_cta_global_dec, memory_order_release, memory_order_acquire, memory_scope_work_group, __global);
atomic_dec(atom_acq_rel_cta_global_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global);
atomic_dec(atom_relaxed_gpu_global_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
atomic_dec(atom_acquire_gpu_global_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
atomic_dec(atom_release_gpu_global_dec, memory_order_release, memory_order_acquire, memory_scope_device, __global);
atomic_dec(atom_acq_rel_gpu_global_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
atomic_dec(atom_relaxed_sys_global_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
atomic_dec(atom_acquire_sys_global_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
atomic_dec(atom_release_sys_global_dec, memory_order_release, memory_order_acquire, memory_scope_device, __global);
atomic_dec(atom_acq_rel_sys_global_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
atomic_dec(atom_relaxed_cta_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local);
atomic_dec(atom_acquire_cta_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local);
atomic_dec(atom_release_cta_shared_dec, memory_order_release, memory_order_acquire, memory_scope_work_group, __local);
atomic_dec(atom_acq_rel_cta_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local);
atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_release_gpu_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_gpu_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_relaxed_sys_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
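To make the macro instantiations above easier to read, this is roughly what a single one expands to; mechanically expanding FUNC and atomic_inc for the relaxed/gpu/generic case yields the function whose symbol the SPIR-V test output further down imports:

uint __notcuda_ptx_impl__atom_relaxed_gpu_generic_inc(uint* ptr, uint threshold) {
    uint expected = *ptr;
    uint desired;
    do {
        // atom.inc semantics: increment, wrapping back to 0 once the threshold is reached
        desired = (expected >= threshold) ? 0 : expected + 1;
    } while (!atomic_compare_exchange_strong_explicit(
        (volatile atomic_uint*)ptr, &expected, desired,
        memory_order_relaxed, memory_order_relaxed, memory_scope_device));
    // like the PTX instruction, the old value is returned
    return expected;
}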

Binary file not shown.


@ -109,11 +109,12 @@ macro_rules! sub_type {
};
}
// Pointer is used when doing SLM conversion to SPIRV
sub_type! {
VariableRegType {
Scalar(ScalarType),
Vector(SizedScalarType, u8),
// The Pointer variant is used when passing an SLM pointer between
// function calls for dynamic SLM
Pointer(SizedScalarType, PointerStateSpace)
}
}
@ -215,6 +216,11 @@ sub_enum!(SelpType {
F64,
});
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum BarDetails {
SyncAligned,
}
pub trait UnwrapWithVec<E, To> {
fn unwrap_with(self, errs: &mut Vec<E>) -> To;
}
@ -301,6 +307,7 @@ impl From<FnArgumentType> for Type {
sub_enum!(
PointerStateSpace : LdStateSpace {
Generic,
Global,
Const,
Shared,
@ -372,6 +379,8 @@ sub_enum!(IntType {
S64
});
sub_enum!(BitType { B8, B16, B32, B64 });
sub_enum!(UIntType { U8, U16, U32, U64 });
sub_enum!(SIntType { S8, S16, S32, S64 });
@ -527,6 +536,9 @@ pub enum Instruction<P: ArgParams> {
Rcp(RcpDetails, Arg2<P>),
And(OrAndType, Arg3<P>),
Selp(SelpType, Arg4<P>),
Bar(BarDetails, Arg1Bar<P>),
Atom(AtomDetails, Arg3<P>),
AtomCas(AtomCasDetails, Arg4<P>),
}
#[derive(Copy, Clone)]
@ -577,6 +589,10 @@ pub struct Arg1<P: ArgParams> {
pub src: P::Id, // it is a jump destination, but in terms of operands it is a source operand
}
pub struct Arg1Bar<P: ArgParams> {
pub src: P::Operand,
}
pub struct Arg2<P: ArgParams> {
pub dst: P::Id,
pub src: P::Operand,
@ -712,12 +728,12 @@ impl From<LdStType> for PointerType {
pub enum LdStQualifier {
Weak,
Volatile,
Relaxed(LdScope),
Acquire(LdScope),
Relaxed(MemScope),
Acquire(MemScope),
}
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum LdScope {
pub enum MemScope {
Cta,
Gpu,
Sys,
@ -1051,6 +1067,74 @@ pub struct MinMaxFloat {
pub typ: FloatType,
}
#[derive(Copy, Clone)]
pub struct AtomDetails {
pub semantics: AtomSemantics,
pub scope: MemScope,
pub space: AtomSpace,
pub inner: AtomInnerDetails,
}
#[derive(Copy, Clone)]
pub enum AtomSemantics {
Relaxed,
Acquire,
Release,
AcquireRelease,
}
#[derive(Copy, Clone)]
pub enum AtomSpace {
Generic,
Global,
Shared,
}
#[derive(Copy, Clone)]
pub enum AtomInnerDetails {
Bit { op: AtomBitOp, typ: BitType },
Unsigned { op: AtomUIntOp, typ: UIntType },
Signed { op: AtomSIntOp, typ: SIntType },
Float { op: AtomFloatOp, typ: FloatType },
}
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum AtomBitOp {
And,
Or,
Xor,
Exchange,
}
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum AtomUIntOp {
Add,
Inc,
Dec,
Min,
Max,
}
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum AtomSIntOp {
Add,
Min,
Max,
}
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum AtomFloatOp {
Add,
}
#[derive(Copy, Clone)]
pub struct AtomCasDetails {
pub semantics: AtomSemantics,
pub scope: MemScope,
pub space: AtomSpace,
pub typ: BitType
}
pub enum NumsOrArrays<'a> {
Nums(Vec<(&'a str, u32)>),
Arrays(Vec<NumsOrArrays<'a>>),
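As a rough illustration of how these new AST types fit together, below is a hedged sketch of the AtomDetails value the parser would build for an instruction such as `atom.shared.add.u32 d, [a], b;`, using the defaults (relaxed semantics, gpu scope) that the grammar later in this commit applies when the qualifiers are omitted:

let details = ast::AtomDetails {
    semantics: ast::AtomSemantics::Relaxed, // no .sem qualifier -> relaxed
    scope: ast::MemScope::Gpu,              // no scope qualifier -> gpu
    space: ast::AtomSpace::Shared,          // from .shared
    inner: ast::AtomInnerDetails::Unsigned {
        op: ast::AtomUIntOp::Add,
        typ: ast::UIntType::U32,
    },
};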


@ -35,9 +35,12 @@ match {
"<", ">",
"|",
"=",
".acq_rel",
".acquire",
".add",
".address_size",
".align",
".aligned",
".and",
".approx",
".b16",
@ -45,14 +48,17 @@ match {
".b64",
".b8",
".ca",
".cas",
".cg",
".const",
".cs",
".cta",
".cv",
".dec",
".entry",
".eq",
".equ",
".exch",
".extern",
".f16",
".f16x2",
@ -69,6 +75,7 @@ match {
".gtu",
".hi",
".hs",
".inc",
".le",
".leu",
".lo",
@ -78,6 +85,8 @@ match {
".lt",
".ltu",
".lu",
".max",
".min",
".nan",
".NaN",
".ne",
@ -88,6 +97,7 @@ match {
".pred",
".reg",
".relaxed",
".release",
".rm",
".rmi",
".rn",
@ -103,6 +113,7 @@ match {
".sat",
".section",
".shared",
".sync",
".sys",
".target",
".to",
@ -126,6 +137,9 @@ match {
"abs",
"add",
"and",
"atom",
"bar",
"barrier",
"bra",
"call",
"cvt",
@ -162,6 +176,9 @@ ExtendedID : &'input str = {
"abs",
"add",
"and",
"atom",
"bar",
"barrier",
"bra",
"call",
"cvt",
@ -372,6 +389,7 @@ StateSpaceSpecifier: ast::StateSpace = {
".param" => ast::StateSpace::Param, // used to prepare function call
};
#[inline]
ScalarType: ast::ScalarType = {
".f16" => ast::ScalarType::F16,
".f16x2" => ast::ScalarType::F16x2,
@ -438,6 +456,7 @@ Variable: ast::Variable<ast::VariableType, &'input str> = {
let v_type = ast::VariableType::Param(v_type);
ast::Variable {align, v_type, name, array_init}
},
SharedVariable,
};
RegVariable: (Option<u32>, ast::VariableRegType, &'input str) = {
@ -478,6 +497,32 @@ LocalVariable: ast::Variable<ast::VariableType, &'input str> = {
}
}
SharedVariable: ast::Variable<ast::VariableType, &'input str> = {
".shared" <var:VariableScalar<SizedScalarType>> => {
let (align, t, name) = var;
let v_type = ast::VariableGlobalType::Scalar(t);
ast::Variable { align, v_type: ast::VariableType::Shared(v_type), name, array_init: Vec::new() }
},
".shared" <var:VariableVector<SizedScalarType>> => {
let (align, v_len, t, name) = var;
let v_type = ast::VariableGlobalType::Vector(t, v_len);
ast::Variable { align, v_type: ast::VariableType::Shared(v_type), name, array_init: Vec::new() }
},
".shared" <var:VariableArrayOrPointer<SizedScalarType>> =>? {
let (align, t, name, arr_or_ptr) = var;
let (v_type, array_init) = match arr_or_ptr {
ast::ArrayOrPointer::Array { dimensions, init } => {
(ast::VariableGlobalType::Array(t, dimensions), init)
}
ast::ArrayOrPointer::Pointer => {
return Err(ParseError::User { error: ast::PtxError::ZeroDimensionArray });
}
};
Ok(ast::Variable { align, v_type: ast::VariableType::Shared(v_type), name, array_init })
}
}
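For reference, the SharedVariable production above covers .shared declarations of the kind exercised by the test kernels added later in this commit, e.g.:

.shared .align 4 .b8 shared_mem[1024];
.shared .align 4 .b8 shared_mem1[128];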
ModuleVariable: ast::Variable<ast::VariableType, &'input str> = {
LinkingDirectives ".global" <def:GlobalVariableDefinitionNoArray> => {
let (align, v_type, name, array_init) = def;
@ -619,7 +664,10 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
InstMin,
InstMax,
InstRcp,
InstSelp
InstSelp,
InstBar,
InstAtom,
InstAtomCas
};
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
@ -655,14 +703,14 @@ LdStType: ast::LdStType = {
LdStQualifier: ast::LdStQualifier = {
".weak" => ast::LdStQualifier::Weak,
".volatile" => ast::LdStQualifier::Volatile,
".relaxed" <s:LdScope> => ast::LdStQualifier::Relaxed(s),
".acquire" <s:LdScope> => ast::LdStQualifier::Acquire(s),
".relaxed" <s:MemScope> => ast::LdStQualifier::Relaxed(s),
".acquire" <s:MemScope> => ast::LdStQualifier::Acquire(s),
};
LdScope: ast::LdScope = {
".cta" => ast::LdScope::Cta,
".gpu" => ast::LdScope::Gpu,
".sys" => ast::LdScope::Sys
MemScope: ast::MemScope = {
".cta" => ast::MemScope::Cta,
".gpu" => ast::MemScope::Gpu,
".sys" => ast::MemScope::Sys
};
LdStateSpace: ast::LdStateSpace = {
@ -798,6 +846,13 @@ SIntType: ast::SIntType = {
".s64" => ast::SIntType::S64,
};
FloatType: ast::FloatType = {
".f16" => ast::FloatType::F16,
".f16x2" => ast::FloatType::F16x2,
".f32" => ast::FloatType::F32,
".f64" => ast::FloatType::F64,
};
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add
@ -1296,6 +1351,140 @@ SelpType: ast::SelpType = {
".f64" => ast::SelpType::F64,
};
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar
InstBar: ast::Instruction<ast::ParsedArgParams<'input>> = {
"barrier" ".sync" ".aligned" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a),
"bar" ".sync" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a)
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
// The documentation does not mention all supported operations:
// * Operation .add requires .u32 or .s32 or .u64 or .f64 or .f16 or .f16x2 or .f32
// * Operation .inc requires the .u32 type for the instruction
// * Operation .dec requires the .u32 type for the instruction
// Otherwise as documented
InstAtom: ast::Instruction<ast::ParsedArgParams<'input>> = {
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op:AtomBitOp> <typ:AtomBitType> <a:Arg3Atom> => {
let details = ast::AtomDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
inner: ast::AtomInnerDetails::Bit { op, typ }
};
ast::Instruction::Atom(details,a)
},
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".inc" ".u32" <a:Arg3Atom> => {
let details = ast::AtomDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
inner: ast::AtomInnerDetails::Unsigned {
op: ast::AtomUIntOp::Inc,
typ: ast::UIntType::U32
}
};
ast::Instruction::Atom(details,a)
},
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".dec" ".u32" <a:Arg3Atom> => {
let details = ast::AtomDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
inner: ast::AtomInnerDetails::Unsigned {
op: ast::AtomUIntOp::Dec,
typ: ast::UIntType::U32
}
};
ast::Instruction::Atom(details,a)
},
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".add" <typ:FloatType> <a:Arg3Atom> => {
let op = ast::AtomFloatOp::Add;
let details = ast::AtomDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
inner: ast::AtomInnerDetails::Float { op, typ }
};
ast::Instruction::Atom(details,a)
},
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op: AtomUIntOp> <typ:AtomUIntType> <a:Arg3Atom> => {
let details = ast::AtomDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
inner: ast::AtomInnerDetails::Unsigned { op, typ }
};
ast::Instruction::Atom(details,a)
},
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op: AtomSIntOp> <typ:AtomSIntType> <a:Arg3Atom> => {
let details = ast::AtomDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
inner: ast::AtomInnerDetails::Signed { op, typ }
};
ast::Instruction::Atom(details,a)
}
}
InstAtomCas: ast::Instruction<ast::ParsedArgParams<'input>> = {
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".cas" <typ:AtomBitType> <a:Arg4Atom> => {
let details = ast::AtomCasDetails {
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
scope: scope.unwrap_or(ast::MemScope::Gpu),
space: space.unwrap_or(ast::AtomSpace::Generic),
typ,
};
ast::Instruction::AtomCas(details,a)
},
}
AtomSemantics: ast::AtomSemantics = {
".relaxed" => ast::AtomSemantics::Relaxed,
".acquire" => ast::AtomSemantics::Acquire,
".release" => ast::AtomSemantics::Release,
".acq_rel" => ast::AtomSemantics::AcquireRelease
}
AtomSpace: ast::AtomSpace = {
".global" => ast::AtomSpace::Global,
".shared" => ast::AtomSpace::Shared
}
AtomBitOp: ast::AtomBitOp = {
".and" => ast::AtomBitOp::And,
".or" => ast::AtomBitOp::Or,
".xor" => ast::AtomBitOp::Xor,
".exch" => ast::AtomBitOp::Exchange,
}
AtomUIntOp: ast::AtomUIntOp = {
".add" => ast::AtomUIntOp::Add,
".min" => ast::AtomUIntOp::Min,
".max" => ast::AtomUIntOp::Max,
}
AtomSIntOp: ast::AtomSIntOp = {
".add" => ast::AtomSIntOp::Add,
".min" => ast::AtomSIntOp::Min,
".max" => ast::AtomSIntOp::Max,
}
AtomBitType: ast::BitType = {
".b32" => ast::BitType::B32,
".b64" => ast::BitType::B64,
}
AtomUIntType: ast::UIntType = {
".u32" => ast::UIntType::U32,
".u64" => ast::UIntType::U64,
}
AtomSIntType: ast::SIntType = {
".s32" => ast::SIntType::S32,
".s64" => ast::SIntType::S64,
}
ArithDetails: ast::ArithDetails = {
<t:UIntType> => ast::ArithDetails::Unsigned(t),
<t:SIntType> => ast::ArithDetails::Signed(ast::ArithSInt {
@ -1414,6 +1603,10 @@ Arg1: ast::Arg1<ast::ParsedArgParams<'input>> = {
<src:ExtendedID> => ast::Arg1{<>}
};
Arg1Bar: ast::Arg1Bar<ast::ParsedArgParams<'input>> = {
<src:Operand> => ast::Arg1Bar{<>}
};
Arg2: ast::Arg2<ast::ParsedArgParams<'input>> = {
<dst:ExtendedID> "," <src:Operand> => ast::Arg2{<>}
};
@ -1448,10 +1641,18 @@ Arg3: ast::Arg3<ast::ParsedArgParams<'input>> = {
<dst:ExtendedID> "," <src1:Operand> "," <src2:Operand> => ast::Arg3{<>}
};
Arg3Atom: ast::Arg3<ast::ParsedArgParams<'input>> = {
<dst:ExtendedID> "," "[" <src1:Operand> "]" "," <src2:Operand> => ast::Arg3{<>}
};
Arg4: ast::Arg4<ast::ParsedArgParams<'input>> = {
<dst:ExtendedID> "," <src1:Operand> "," <src2:Operand> "," <src3:Operand> => ast::Arg4{<>}
};
Arg4Atom: ast::Arg4<ast::ParsedArgParams<'input>> = {
<dst:ExtendedID> "," "[" <src1:Operand> "]" "," <src2:Operand> "," <src3:Operand> => ast::Arg4{<>}
};
Arg4Setp: ast::Arg4Setp<ast::ParsedArgParams<'input>> = {
<dst1:ExtendedID> <dst2:OptionalDst?> "," <src1:Operand> "," <src2:Operand> => ast::Arg4Setp{<>}
};
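Taken together, the new productions accept the atom and bar forms used by the test kernels added later in this commit, for example:

bar.sync              temp_32;                        // InstBar, operand via Arg1Bar
atom.shared.add.u32   temp1, [shared_mem], temp2;     // InstAtom, unsigned add in .shared
atom.global.inc.u32   temp2, [in_addr], 101;          // InstAtom, .inc is .u32-only
atom.cas.b32          temp1, [in_addr+4], temp1, 100; // InstAtomCas, operands via Arg4Atom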


@ -0,0 +1,10 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry bar_sync()
{
.reg .u32 temp_32;
bar.sync temp_32;
ret;
}


@ -11,8 +11,8 @@ OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
OpCapability FunctionFloatControlINTEL
OpExtension "SPV_INTEL_float_controls2"
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%33 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "and"


@ -0,0 +1,28 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry atom_add(
.param .u64 input,
.param .u64 output
)
{
.shared .align 4 .b8 shared_mem[1024];
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u32 temp1;
.reg .u32 temp2;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.u32 temp1, [in_addr];
ld.u32 temp2, [in_addr+4];
st.shared.u32 [shared_mem], temp1;
atom.shared.add.u32 temp1, [shared_mem], temp2;
ld.shared.u32 temp2, [shared_mem];
st.u32 [out_addr], temp1;
st.u32 [out_addr+4], temp2;
ret;
}


@ -0,0 +1,84 @@
; SPIR-V
; Version: 1.3
; Generator: rspirv
; Bound: 55
OpCapability GenericPointer
OpCapability Linkage
OpCapability Addresses
OpCapability Kernel
OpCapability Int8
OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%40 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "atom_add" %4
OpDecorate %4 Alignment 4
%41 = OpTypeVoid
%42 = OpTypeInt 32 0
%43 = OpTypeInt 8 0
%44 = OpConstant %42 1024
%45 = OpTypeArray %43 %44
%46 = OpTypePointer Workgroup %45
%4 = OpVariable %46 Workgroup
%47 = OpTypeInt 64 0
%48 = OpTypeFunction %41 %47 %47
%49 = OpTypePointer Function %47
%50 = OpTypePointer Function %42
%51 = OpTypePointer Generic %42
%27 = OpConstant %47 4
%52 = OpTypePointer Workgroup %42
%53 = OpConstant %42 1
%54 = OpConstant %42 0
%29 = OpConstant %47 4
%1 = OpFunction %41 None %48
%9 = OpFunctionParameter %47
%10 = OpFunctionParameter %47
%38 = OpLabel
%2 = OpVariable %49 Function
%3 = OpVariable %49 Function
%5 = OpVariable %49 Function
%6 = OpVariable %49 Function
%7 = OpVariable %50 Function
%8 = OpVariable %50 Function
OpStore %2 %9
OpStore %3 %10
%12 = OpLoad %47 %2
%11 = OpCopyObject %47 %12
OpStore %5 %11
%14 = OpLoad %47 %3
%13 = OpCopyObject %47 %14
OpStore %6 %13
%16 = OpLoad %47 %5
%31 = OpConvertUToPtr %51 %16
%15 = OpLoad %42 %31
OpStore %7 %15
%18 = OpLoad %47 %5
%28 = OpIAdd %47 %18 %27
%32 = OpConvertUToPtr %51 %28
%17 = OpLoad %42 %32
OpStore %8 %17
%19 = OpLoad %42 %7
%33 = OpBitcast %52 %4
OpStore %33 %19
%21 = OpLoad %42 %8
%34 = OpBitcast %52 %4
%20 = OpAtomicIAdd %42 %34 %53 %54 %21
OpStore %7 %20
%35 = OpBitcast %52 %4
%22 = OpLoad %42 %35
OpStore %8 %22
%23 = OpLoad %47 %6
%24 = OpLoad %42 %7
%36 = OpConvertUToPtr %51 %23
OpStore %36 %24
%25 = OpLoad %47 %6
%26 = OpLoad %42 %8
%30 = OpIAdd %47 %25 %29
%37 = OpConvertUToPtr %51 %30
OpStore %37 %26
OpReturn
OpFunctionEnd


@ -0,0 +1,24 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry atom_cas(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u32 temp1;
.reg .u32 temp2;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.u32 temp1, [in_addr];
atom.cas.b32 temp1, [in_addr+4], temp1, 100;
ld.u32 temp2, [in_addr+4];
st.u32 [out_addr], temp1;
st.u32 [out_addr+4], temp2;
ret;
}


@ -0,0 +1,77 @@
; SPIR-V
; Version: 1.3
; Generator: rspirv
; Bound: 51
OpCapability GenericPointer
OpCapability Linkage
OpCapability Addresses
OpCapability Kernel
OpCapability Int8
OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%41 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "atom_cas"
%42 = OpTypeVoid
%43 = OpTypeInt 64 0
%44 = OpTypeFunction %42 %43 %43
%45 = OpTypePointer Function %43
%46 = OpTypeInt 32 0
%47 = OpTypePointer Function %46
%48 = OpTypePointer Generic %46
%25 = OpConstant %43 4
%27 = OpConstant %46 100
%49 = OpConstant %46 1
%50 = OpConstant %46 0
%28 = OpConstant %43 4
%30 = OpConstant %43 4
%1 = OpFunction %42 None %44
%8 = OpFunctionParameter %43
%9 = OpFunctionParameter %43
%39 = OpLabel
%2 = OpVariable %45 Function
%3 = OpVariable %45 Function
%4 = OpVariable %45 Function
%5 = OpVariable %45 Function
%6 = OpVariable %47 Function
%7 = OpVariable %47 Function
OpStore %2 %8
OpStore %3 %9
%11 = OpLoad %43 %2
%10 = OpCopyObject %43 %11
OpStore %4 %10
%13 = OpLoad %43 %3
%12 = OpCopyObject %43 %13
OpStore %5 %12
%15 = OpLoad %43 %4
%32 = OpConvertUToPtr %48 %15
%14 = OpLoad %46 %32
OpStore %6 %14
%17 = OpLoad %43 %4
%18 = OpLoad %46 %6
%26 = OpIAdd %43 %17 %25
%34 = OpConvertUToPtr %48 %26
%35 = OpCopyObject %46 %18
%33 = OpAtomicCompareExchange %46 %34 %49 %50 %50 %27 %35
%16 = OpCopyObject %46 %33
OpStore %6 %16
%20 = OpLoad %43 %4
%29 = OpIAdd %43 %20 %28
%36 = OpConvertUToPtr %48 %29
%19 = OpLoad %46 %36
OpStore %7 %19
%21 = OpLoad %43 %5
%22 = OpLoad %46 %6
%37 = OpConvertUToPtr %48 %21
OpStore %37 %22
%23 = OpLoad %43 %5
%24 = OpLoad %46 %7
%31 = OpIAdd %43 %23 %30
%38 = OpConvertUToPtr %48 %31
OpStore %38 %24
OpReturn
OpFunctionEnd


@ -0,0 +1,26 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry atom_inc(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u32 temp1;
.reg .u32 temp2;
.reg .u32 temp3;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
atom.inc.u32 temp1, [in_addr], 101;
atom.global.inc.u32 temp2, [in_addr], 101;
ld.u32 temp3, [in_addr];
st.u32 [out_addr], temp1;
st.u32 [out_addr+4], temp2;
st.u32 [out_addr+8], temp3;
ret;
}


@ -0,0 +1,89 @@
; SPIR-V
; Version: 1.3
; Generator: rspirv
; Bound: 60
OpCapability GenericPointer
OpCapability Linkage
OpCapability Addresses
OpCapability Kernel
OpCapability Int8
OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%49 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "atom_inc"
OpDecorate %40 LinkageAttributes "__notcuda_ptx_impl__atom_relaxed_gpu_generic_inc" Import
OpDecorate %44 LinkageAttributes "__notcuda_ptx_impl__atom_relaxed_gpu_global_inc" Import
%50 = OpTypeVoid
%51 = OpTypeInt 32 0
%52 = OpTypePointer Generic %51
%53 = OpTypeFunction %51 %52 %51
%54 = OpTypePointer CrossWorkgroup %51
%55 = OpTypeFunction %51 %54 %51
%56 = OpTypeInt 64 0
%57 = OpTypeFunction %50 %56 %56
%58 = OpTypePointer Function %56
%59 = OpTypePointer Function %51
%27 = OpConstant %51 101
%28 = OpConstant %51 101
%29 = OpConstant %56 4
%31 = OpConstant %56 8
%40 = OpFunction %51 None %53
%42 = OpFunctionParameter %52
%43 = OpFunctionParameter %51
OpFunctionEnd
%44 = OpFunction %51 None %55
%46 = OpFunctionParameter %54
%47 = OpFunctionParameter %51
OpFunctionEnd
%1 = OpFunction %50 None %57
%9 = OpFunctionParameter %56
%10 = OpFunctionParameter %56
%39 = OpLabel
%2 = OpVariable %58 Function
%3 = OpVariable %58 Function
%4 = OpVariable %58 Function
%5 = OpVariable %58 Function
%6 = OpVariable %59 Function
%7 = OpVariable %59 Function
%8 = OpVariable %59 Function
OpStore %2 %9
OpStore %3 %10
%12 = OpLoad %56 %2
%11 = OpCopyObject %56 %12
OpStore %4 %11
%14 = OpLoad %56 %3
%13 = OpCopyObject %56 %14
OpStore %5 %13
%16 = OpLoad %56 %4
%33 = OpConvertUToPtr %52 %16
%15 = OpFunctionCall %51 %40 %33 %27
OpStore %6 %15
%18 = OpLoad %56 %4
%34 = OpConvertUToPtr %54 %18
%17 = OpFunctionCall %51 %44 %34 %28
OpStore %7 %17
%20 = OpLoad %56 %4
%35 = OpConvertUToPtr %52 %20
%19 = OpLoad %51 %35
OpStore %8 %19
%21 = OpLoad %56 %5
%22 = OpLoad %51 %6
%36 = OpConvertUToPtr %52 %21
OpStore %36 %22
%23 = OpLoad %56 %5
%24 = OpLoad %51 %7
%30 = OpIAdd %56 %23 %29
%37 = OpConvertUToPtr %52 %30
OpStore %37 %24
%25 = OpLoad %56 %5
%26 = OpLoad %51 %8
%32 = OpIAdd %56 %25 %31
%38 = OpConvertUToPtr %52 %32
OpStore %38 %26
OpReturn
OpFunctionEnd


@ -11,12 +11,12 @@ OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
OpCapability FunctionFloatControlINTEL
OpExtension "SPV_INTEL_float_controls2"
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%24 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "constant_f32"
OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
; OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
%25 = OpTypeVoid
%26 = OpTypeInt 64 0
%27 = OpTypeFunction %25 %26 %26


@ -11,8 +11,8 @@ OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
OpCapability FunctionFloatControlINTEL
OpExtension "SPV_INTEL_float_controls2"
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%24 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "constant_negative"


@ -11,12 +11,12 @@ OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
OpCapability FunctionFloatControlINTEL
OpExtension "SPV_INTEL_float_controls2"
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%37 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "fma"
OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
; OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
%38 = OpTypeVoid
%39 = OpTypeInt 64 0
%40 = OpTypeFunction %38 %39 %39


@ -86,12 +86,20 @@ test_ptx!(rcp, [2f32], [0.5f32]);
// 0x3f000000 is 0.5
// TODO: mul_ftz fails because IGC does not yet handle SPV_INTEL_float_controls2
// test_ptx!(mul_ftz, [0b1_00000000_10000000000000000000000u32, 0x3f000000u32], [0u32]);
test_ptx!(mul_non_ftz, [0b1_00000000_10000000000000000000000u32, 0x3f000000u32], [0b1_00000000_01000000000000000000000u32]);
test_ptx!(
mul_non_ftz,
[0b1_00000000_10000000000000000000000u32, 0x3f000000u32],
[0b1_00000000_01000000000000000000000u32]
);
test_ptx!(constant_f32, [10f32], [5f32]);
test_ptx!(constant_negative, [-101i32], [101i32]);
test_ptx!(and, [6u32, 3u32], [2u32]);
test_ptx!(selp, [100u16, 200u16], [200u16]);
test_ptx!(fma, [2f32, 3f32, 5f32], [11f32]);
test_ptx!(fma, [2f32, 3f32, 5f32], [11f32]);
test_ptx!(shared_variable, [513u64], [513u64]);
test_ptx!(atom_cas, [91u32, 91u32], [91u32, 100u32]);
test_ptx!(atom_inc, [100u32], [100u32, 101u32, 0u32]);
test_ptx!(atom_add, [2u32, 4u32], [2u32, 6u32]);
struct DisplayError<T: Debug> {
err: T,
@ -124,7 +132,7 @@ fn test_ptx_assert<'a, T: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq>(
let name = CString::new(name)?;
let result = run_spirv(name.as_c_str(), notcuda_module, input, output)
.map_err(|err| DisplayError { err })?;
assert_eq!(output, result.as_slice());
assert_eq!(result.as_slice(), output);
Ok(())
}
@ -145,8 +153,8 @@ fn run_spirv<T: From<u8> + ze::SafeRepr + Copy + Debug>(
let use_shared_mem = module
.kernel_info
.get(name.to_str().unwrap())
.unwrap()
.uses_shared_mem;
.map(|info| info.uses_shared_mem)
.unwrap_or(false);
let mut result = vec![0u8.into(); output.len()];
{
let mut drivers = ze::Driver::get()?;
@ -155,11 +163,20 @@ fn run_spirv<T: From<u8> + ze::SafeRepr + Copy + Debug>(
let mut devices = drv.devices()?;
let dev = devices.drain(0..1).next().unwrap();
let queue = ze::CommandQueue::new(&mut ctx, &dev)?;
let (module, log) = ze::Module::new_spirv(&mut ctx, &dev, byte_il, None);
let (module, maybe_log) = match module.should_link_ptx_impl {
Some(ptx_impl) => ze::Module::build_link_spirv(&mut ctx, &dev, &[ptx_impl, byte_il]),
None => {
let (module, log) = ze::Module::build_spirv(&mut ctx, &dev, byte_il, None);
(module, Some(log))
}
};
let module = match module {
Ok(m) => m,
Err(err) => {
let raw_err_string = log.get_cstring()?;
let raw_err_string = maybe_log
.map(|log| log.get_cstring())
.transpose()?
.unwrap_or(CString::default());
let err_string = raw_err_string.to_string_lossy();
panic!("{:?}\n{}", err, err_string);
}
@ -215,7 +232,11 @@ fn test_spvtxt_assert<'a>(
ptr::null_mut(),
)
};
assert!(result == spv_result_t::SPV_SUCCESS);
if result != spv_result_t::SPV_SUCCESS {
panic!("{:?}\n{}", result, unsafe {
str::from_utf8_unchecked(spirv_txt)
});
}
let mut parsed_spirv = Vec::<u32>::new();
let result = unsafe {
spirv_tools::spvBinaryParse(


@ -11,8 +11,8 @@ OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
OpCapability FunctionFloatControlINTEL
OpExtension "SPV_INTEL_float_controls2"
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%30 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "mul_ftz"


@ -11,8 +11,8 @@ OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
OpCapability FunctionFloatControlINTEL
OpExtension "SPV_INTEL_float_controls2"
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%31 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "selp"


@ -0,0 +1,26 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry shared_variable(
.param .u64 input,
.param .u64 output
)
{
.shared .align 4 .b8 shared_mem1[128];
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u64 temp1;
.reg .u64 temp2;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
ld.global.u64 temp1, [in_addr];
st.shared.u64 [shared_mem1], temp1;
ld.shared.u64 temp2, [shared_mem1];
st.global.u64 [out_addr], temp2;
ret;
}


@ -0,0 +1,65 @@
; SPIR-V
; Version: 1.3
; Generator: rspirv
; Bound: 39
OpCapability GenericPointer
OpCapability Linkage
OpCapability Addresses
OpCapability Kernel
OpCapability Int8
OpCapability Int16
OpCapability Int64
OpCapability Float16
OpCapability Float64
; OpCapability FunctionFloatControlINTEL
; OpExtension "SPV_INTEL_float_controls2"
%27 = OpExtInstImport "OpenCL.std"
OpMemoryModel Physical64 OpenCL
OpEntryPoint Kernel %1 "shared_variable" %4
OpDecorate %4 Alignment 4
%28 = OpTypeVoid
%29 = OpTypeInt 32 0
%30 = OpTypeInt 8 0
%31 = OpConstant %29 128
%32 = OpTypeArray %30 %31
%33 = OpTypePointer Workgroup %32
%4 = OpVariable %33 Workgroup
%34 = OpTypeInt 64 0
%35 = OpTypeFunction %28 %34 %34
%36 = OpTypePointer Function %34
%37 = OpTypePointer CrossWorkgroup %34
%38 = OpTypePointer Workgroup %34
%1 = OpFunction %28 None %35
%9 = OpFunctionParameter %34
%10 = OpFunctionParameter %34
%25 = OpLabel
%2 = OpVariable %36 Function
%3 = OpVariable %36 Function
%5 = OpVariable %36 Function
%6 = OpVariable %36 Function
%7 = OpVariable %36 Function
%8 = OpVariable %36 Function
OpStore %2 %9
OpStore %3 %10
%12 = OpLoad %34 %2
%11 = OpCopyObject %34 %12
OpStore %5 %11
%14 = OpLoad %34 %3
%13 = OpCopyObject %34 %14
OpStore %6 %13
%16 = OpLoad %34 %5
%21 = OpConvertUToPtr %37 %16
%15 = OpLoad %34 %21
OpStore %7 %15
%17 = OpLoad %34 %7
%22 = OpBitcast %38 %4
OpStore %22 %17
%23 = OpBitcast %38 %4
%18 = OpLoad %34 %23
OpStore %8 %18
%19 = OpLoad %34 %6
%20 = OpLoad %34 %8
%24 = OpConvertUToPtr %37 %19
OpStore %24 %20
OpReturn
OpFunctionEnd

File diff suppressed because it is too large