mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-19 16:04:44 +00:00
Implement atomic instructions
This commit is contained in:
parent
861116f223
commit
a82eb20817
24 changed files with 1672 additions and 88 deletions
|
@ -7,4 +7,8 @@ edition = "2018"
|
|||
[lib]
|
||||
|
||||
[dependencies]
|
||||
level_zero-sys = { path = "../level_zero-sys" }
|
||||
level_zero-sys = { path = "../level_zero-sys" }
|
||||
|
||||
[dependencies.ocl-core]
|
||||
version = "0.11"
|
||||
features = ["opencl_version_1_2", "opencl_version_2_0", "opencl_version_2_1"]
|
|
@ -238,7 +238,76 @@ impl Drop for CommandQueue {
|
|||
pub struct Module(sys::ze_module_handle_t);
|
||||
|
||||
impl Module {
|
||||
pub fn new_spirv(
|
||||
// HACK ALERT
|
||||
// We use OpenCL for now to do SPIR-V linking, because Level0
|
||||
// does not allow linking. Don't let presence of zeModuleDynamicLink fool
|
||||
// you, it's not currently possible to create non-compiled modules.
|
||||
// zeModuleCreate always compiles (builds and links).
|
||||
pub fn build_link_spirv<'a>(
|
||||
ctx: &mut Context,
|
||||
d: &Device,
|
||||
binaries: &[&'a [u8]],
|
||||
) -> (Result<Self>, Option<BuildLog>) {
|
||||
let ocl_program = match Self::build_link_spirv_impl(binaries) {
|
||||
Err(_) => return (Err(sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN), None),
|
||||
Ok(prog) => prog,
|
||||
};
|
||||
match ocl_core::get_program_info(&ocl_program, ocl_core::ProgramInfo::Binaries) {
|
||||
Ok(ocl_core::ProgramInfoResult::Binaries(binaries)) => {
|
||||
let (module, build_log) = Self::build_native(ctx, d, &binaries[0]);
|
||||
(module, Some(build_log))
|
||||
}
|
||||
_ => return (Err(sys::ze_result_t::ZE_RESULT_ERROR_UNKNOWN), None),
|
||||
}
|
||||
}
|
||||
|
||||
fn build_link_spirv_impl<'a>(binaries: &[&'a [u8]]) -> ocl_core::Result<ocl_core::Program> {
|
||||
let platforms = ocl_core::get_platform_ids()?;
|
||||
let (platform, device) = platforms
|
||||
.iter()
|
||||
.find_map(|plat| {
|
||||
let devices =
|
||||
ocl_core::get_device_ids(plat, Some(ocl_core::DeviceType::GPU), None).ok()?;
|
||||
for dev in devices {
|
||||
let vendor =
|
||||
ocl_core::get_device_info(dev, ocl_core::DeviceInfo::VendorId).ok()?;
|
||||
if let ocl_core::DeviceInfoResult::VendorId(0x8086) = vendor {
|
||||
let dev_type =
|
||||
ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?;
|
||||
if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) =
|
||||
dev_type
|
||||
{
|
||||
return Some((plat.clone(), dev));
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.ok_or("")?;
|
||||
let ctx_props = ocl_core::ContextProperties::new().platform(platform);
|
||||
let ocl_ctx = ocl_core::create_context_from_type::<ocl_core::DeviceId>(
|
||||
Some(&ctx_props),
|
||||
ocl_core::DeviceType::GPU,
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
let mut programs = Vec::with_capacity(binaries.len());
|
||||
for binary in binaries {
|
||||
programs.push(ocl_core::create_program_with_il(&ocl_ctx, binary, None)?);
|
||||
}
|
||||
let options = CString::default();
|
||||
ocl_core::link_program::<ocl_core::DeviceId, _>(
|
||||
&ocl_ctx,
|
||||
Some(&[device]),
|
||||
&options,
|
||||
&programs.iter().collect::<Vec<_>>(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn build_spirv(
|
||||
ctx: &mut Context,
|
||||
d: &Device,
|
||||
bin: &[u8],
|
||||
|
@ -247,7 +316,7 @@ impl Module {
|
|||
Module::new(ctx, true, d, bin, opts)
|
||||
}
|
||||
|
||||
pub fn new_native(ctx: &mut Context, d: &Device, bin: &[u8]) -> (Result<Self>, BuildLog) {
|
||||
pub fn build_native(ctx: &mut Context, d: &Device, bin: &[u8]) -> (Result<Self>, BuildLog) {
|
||||
Module::new(ctx, false, d, bin, None)
|
||||
}
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ impl ModuleData {
|
|||
Ok(_) if errors.len() > 0 => return Err(ModuleCompileError::Parse(errors, None)),
|
||||
Ok(ast) => ast,
|
||||
};
|
||||
let (spirv, all_arg_lens) = ptx::to_spirv(ast)?;
|
||||
let (_, spirv, all_arg_lens) = ptx::to_spirv(ast)?;
|
||||
let byte_il = unsafe {
|
||||
slice::from_raw_parts::<u8>(
|
||||
spirv.as_ptr() as *const _,
|
||||
|
@ -61,7 +61,7 @@ impl ModuleData {
|
|||
)
|
||||
};
|
||||
let module = super::device::with_current_exclusive(|dev| {
|
||||
l0::Module::new_spirv(&mut dev.l0_context, &dev.base, byte_il, None)
|
||||
l0::Module::build_spirv(&mut dev.l0_context, &dev.base, byte_il, None)
|
||||
});
|
||||
match module {
|
||||
Ok((Ok(module), _)) => Ok(Mutex::new(Self {
|
||||
|
|
121
ptx/lib/notcuda_ptx_impl.cl
Normal file
121
ptx/lib/notcuda_ptx_impl.cl
Normal file
|
@ -0,0 +1,121 @@
|
|||
// Every time this file changes it must be rebuilt:
|
||||
// ocloc -file notcuda_ptx_impl.cl -64 -options "-cl-std=CL2.0" -out_dir . -device kbl -output_no_suffix -spv_only
|
||||
// Additionally you should strip names:
|
||||
// spirv-opt --strip-debug notcuda_ptx_impl.spv -o notcuda_ptx_impl.spv
|
||||
|
||||
#define FUNC(NAME) __notcuda_ptx_impl__ ## NAME
|
||||
|
||||
#define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
|
||||
uint FUNC(NAME)(SPACE uint* ptr, uint threshold) { \
|
||||
uint expected = *ptr; \
|
||||
uint desired; \
|
||||
do { \
|
||||
desired = (expected >= threshold) ? 0 : expected + 1; \
|
||||
} while (!atomic_compare_exchange_strong_explicit((volatile SPACE atomic_uint*)ptr, &expected, desired, SUCCESS, FAILURE, SCOPE)); \
|
||||
return expected; \
|
||||
}
|
||||
|
||||
#define atomic_dec(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
|
||||
uint FUNC(NAME)(SPACE uint* ptr, uint threshold) { \
|
||||
uint expected = *ptr; \
|
||||
uint desired; \
|
||||
do { \
|
||||
desired = (expected == 0 || expected > threshold) ? threshold : expected - 1; \
|
||||
} while (!atomic_compare_exchange_strong_explicit((volatile SPACE atomic_uint*)ptr, &expected, desired, SUCCESS, FAILURE, SCOPE)); \
|
||||
return expected; \
|
||||
}
|
||||
|
||||
// We are doing all this mess instead of accepting memory_order and memory_scope parameters
|
||||
// because ocloc emits broken (failing spirv-dis) SPIR-V when memory_order or memory_scope is a parameter
|
||||
|
||||
// atom.inc
|
||||
atomic_inc(atom_relaxed_cta_generic_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, );
|
||||
atomic_inc(atom_acquire_cta_generic_inc, memory_order_acquire, memory_order_acquire, memory_scope_work_group, );
|
||||
atomic_inc(atom_release_cta_generic_inc, memory_order_release, memory_order_acquire, memory_scope_work_group, );
|
||||
atomic_inc(atom_acq_rel_cta_generic_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, );
|
||||
|
||||
atomic_inc(atom_relaxed_gpu_generic_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
|
||||
atomic_inc(atom_acquire_gpu_generic_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, );
|
||||
atomic_inc(atom_release_gpu_generic_inc, memory_order_release, memory_order_acquire, memory_scope_device, );
|
||||
atomic_inc(atom_acq_rel_gpu_generic_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
|
||||
|
||||
atomic_inc(atom_relaxed_sys_generic_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
|
||||
atomic_inc(atom_acquire_sys_generic_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, );
|
||||
atomic_inc(atom_release_sys_generic_inc, memory_order_release, memory_order_acquire, memory_scope_device, );
|
||||
atomic_inc(atom_acq_rel_sys_generic_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
|
||||
|
||||
atomic_inc(atom_relaxed_cta_global_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global);
|
||||
atomic_inc(atom_acquire_cta_global_inc, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global);
|
||||
atomic_inc(atom_release_cta_global_inc, memory_order_release, memory_order_acquire, memory_scope_work_group, __global);
|
||||
atomic_inc(atom_acq_rel_cta_global_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global);
|
||||
|
||||
atomic_inc(atom_relaxed_gpu_global_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
|
||||
atomic_inc(atom_acquire_gpu_global_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_inc(atom_release_gpu_global_inc, memory_order_release, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_inc(atom_acq_rel_gpu_global_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
|
||||
|
||||
atomic_inc(atom_relaxed_sys_global_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
|
||||
atomic_inc(atom_acquire_sys_global_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_inc(atom_release_sys_global_inc, memory_order_release, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_inc(atom_acq_rel_sys_global_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
|
||||
|
||||
atomic_inc(atom_relaxed_cta_shared_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local);
|
||||
atomic_inc(atom_acquire_cta_shared_inc, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local);
|
||||
atomic_inc(atom_release_cta_shared_inc, memory_order_release, memory_order_acquire, memory_scope_work_group, __local);
|
||||
atomic_inc(atom_acq_rel_cta_shared_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local);
|
||||
|
||||
atomic_inc(atom_relaxed_gpu_shared_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
||||
atomic_inc(atom_acquire_gpu_shared_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_inc(atom_release_gpu_shared_inc, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_inc(atom_acq_rel_gpu_shared_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
||||
|
||||
atomic_inc(atom_relaxed_sys_shared_inc, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
||||
atomic_inc(atom_acquire_sys_shared_inc, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_inc(atom_release_sys_shared_inc, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_inc(atom_acq_rel_sys_shared_inc, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
||||
|
||||
// atom.dec
|
||||
atomic_dec(atom_relaxed_cta_generic_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, );
|
||||
atomic_dec(atom_acquire_cta_generic_dec, memory_order_acquire, memory_order_acquire, memory_scope_work_group, );
|
||||
atomic_dec(atom_release_cta_generic_dec, memory_order_release, memory_order_acquire, memory_scope_work_group, );
|
||||
atomic_dec(atom_acq_rel_cta_generic_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, );
|
||||
|
||||
atomic_dec(atom_relaxed_gpu_generic_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
|
||||
atomic_dec(atom_acquire_gpu_generic_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, );
|
||||
atomic_dec(atom_release_gpu_generic_dec, memory_order_release, memory_order_acquire, memory_scope_device, );
|
||||
atomic_dec(atom_acq_rel_gpu_generic_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
|
||||
|
||||
atomic_dec(atom_relaxed_sys_generic_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, );
|
||||
atomic_dec(atom_acquire_sys_generic_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, );
|
||||
atomic_dec(atom_release_sys_generic_dec, memory_order_release, memory_order_acquire, memory_scope_device, );
|
||||
atomic_dec(atom_acq_rel_sys_generic_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, );
|
||||
|
||||
atomic_dec(atom_relaxed_cta_global_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __global);
|
||||
atomic_dec(atom_acquire_cta_global_dec, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __global);
|
||||
atomic_dec(atom_release_cta_global_dec, memory_order_release, memory_order_acquire, memory_scope_work_group, __global);
|
||||
atomic_dec(atom_acq_rel_cta_global_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __global);
|
||||
|
||||
atomic_dec(atom_relaxed_gpu_global_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
|
||||
atomic_dec(atom_acquire_gpu_global_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_dec(atom_release_gpu_global_dec, memory_order_release, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_dec(atom_acq_rel_gpu_global_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
|
||||
|
||||
atomic_dec(atom_relaxed_sys_global_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __global);
|
||||
atomic_dec(atom_acquire_sys_global_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_dec(atom_release_sys_global_dec, memory_order_release, memory_order_acquire, memory_scope_device, __global);
|
||||
atomic_dec(atom_acq_rel_sys_global_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __global);
|
||||
|
||||
atomic_dec(atom_relaxed_cta_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_work_group, __local);
|
||||
atomic_dec(atom_acquire_cta_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_work_group, __local);
|
||||
atomic_dec(atom_release_cta_shared_dec, memory_order_release, memory_order_acquire, memory_scope_work_group, __local);
|
||||
atomic_dec(atom_acq_rel_cta_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_work_group, __local);
|
||||
|
||||
atomic_dec(atom_relaxed_gpu_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
||||
atomic_dec(atom_acquire_gpu_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_dec(atom_release_gpu_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_dec(atom_acq_rel_gpu_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
||||
|
||||
atomic_dec(atom_relaxed_sys_shared_dec, memory_order_relaxed, memory_order_relaxed, memory_scope_device, __local);
|
||||
atomic_dec(atom_acquire_sys_shared_dec, memory_order_acquire, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_dec(atom_release_sys_shared_dec, memory_order_release, memory_order_acquire, memory_scope_device, __local);
|
||||
atomic_dec(atom_acq_rel_sys_shared_dec, memory_order_acq_rel, memory_order_acquire, memory_scope_device, __local);
|
BIN
ptx/lib/notcuda_ptx_impl.spv
Normal file
BIN
ptx/lib/notcuda_ptx_impl.spv
Normal file
Binary file not shown.
|
@ -109,11 +109,12 @@ macro_rules! sub_type {
|
|||
};
|
||||
}
|
||||
|
||||
// Pointer is used when doing SLM converison to SPIRV
|
||||
sub_type! {
|
||||
VariableRegType {
|
||||
Scalar(ScalarType),
|
||||
Vector(SizedScalarType, u8),
|
||||
// Pointer variant is used when passing around SLM pointer between
|
||||
// function calls for dynamic SLM
|
||||
Pointer(SizedScalarType, PointerStateSpace)
|
||||
}
|
||||
}
|
||||
|
@ -215,6 +216,11 @@ sub_enum!(SelpType {
|
|||
F64,
|
||||
});
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum BarDetails {
|
||||
SyncAligned,
|
||||
}
|
||||
|
||||
pub trait UnwrapWithVec<E, To> {
|
||||
fn unwrap_with(self, errs: &mut Vec<E>) -> To;
|
||||
}
|
||||
|
@ -301,6 +307,7 @@ impl From<FnArgumentType> for Type {
|
|||
|
||||
sub_enum!(
|
||||
PointerStateSpace : LdStateSpace {
|
||||
Generic,
|
||||
Global,
|
||||
Const,
|
||||
Shared,
|
||||
|
@ -372,6 +379,8 @@ sub_enum!(IntType {
|
|||
S64
|
||||
});
|
||||
|
||||
sub_enum!(BitType { B8, B16, B32, B64 });
|
||||
|
||||
sub_enum!(UIntType { U8, U16, U32, U64 });
|
||||
|
||||
sub_enum!(SIntType { S8, S16, S32, S64 });
|
||||
|
@ -527,6 +536,9 @@ pub enum Instruction<P: ArgParams> {
|
|||
Rcp(RcpDetails, Arg2<P>),
|
||||
And(OrAndType, Arg3<P>),
|
||||
Selp(SelpType, Arg4<P>),
|
||||
Bar(BarDetails, Arg1Bar<P>),
|
||||
Atom(AtomDetails, Arg3<P>),
|
||||
AtomCas(AtomCasDetails, Arg4<P>),
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
|
@ -577,6 +589,10 @@ pub struct Arg1<P: ArgParams> {
|
|||
pub src: P::Id, // it is a jump destination, but in terms of operands it is a source operand
|
||||
}
|
||||
|
||||
pub struct Arg1Bar<P: ArgParams> {
|
||||
pub src: P::Operand,
|
||||
}
|
||||
|
||||
pub struct Arg2<P: ArgParams> {
|
||||
pub dst: P::Id,
|
||||
pub src: P::Operand,
|
||||
|
@ -712,12 +728,12 @@ impl From<LdStType> for PointerType {
|
|||
pub enum LdStQualifier {
|
||||
Weak,
|
||||
Volatile,
|
||||
Relaxed(LdScope),
|
||||
Acquire(LdScope),
|
||||
Relaxed(MemScope),
|
||||
Acquire(MemScope),
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq)]
|
||||
pub enum LdScope {
|
||||
pub enum MemScope {
|
||||
Cta,
|
||||
Gpu,
|
||||
Sys,
|
||||
|
@ -1051,6 +1067,74 @@ pub struct MinMaxFloat {
|
|||
pub typ: FloatType,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct AtomDetails {
|
||||
pub semantics: AtomSemantics,
|
||||
pub scope: MemScope,
|
||||
pub space: AtomSpace,
|
||||
pub inner: AtomInnerDetails,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum AtomSemantics {
|
||||
Relaxed,
|
||||
Acquire,
|
||||
Release,
|
||||
AcquireRelease,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum AtomSpace {
|
||||
Generic,
|
||||
Global,
|
||||
Shared,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum AtomInnerDetails {
|
||||
Bit { op: AtomBitOp, typ: BitType },
|
||||
Unsigned { op: AtomUIntOp, typ: UIntType },
|
||||
Signed { op: AtomSIntOp, typ: SIntType },
|
||||
Float { op: AtomFloatOp, typ: FloatType },
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum AtomBitOp {
|
||||
And,
|
||||
Or,
|
||||
Xor,
|
||||
Exchange,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum AtomUIntOp {
|
||||
Add,
|
||||
Inc,
|
||||
Dec,
|
||||
Min,
|
||||
Max,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum AtomSIntOp {
|
||||
Add,
|
||||
Min,
|
||||
Max,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq)]
|
||||
pub enum AtomFloatOp {
|
||||
Add,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct AtomCasDetails {
|
||||
pub semantics: AtomSemantics,
|
||||
pub scope: MemScope,
|
||||
pub space: AtomSpace,
|
||||
pub typ: BitType
|
||||
}
|
||||
|
||||
pub enum NumsOrArrays<'a> {
|
||||
Nums(Vec<(&'a str, u32)>),
|
||||
Arrays(Vec<NumsOrArrays<'a>>),
|
||||
|
|
|
@ -35,9 +35,12 @@ match {
|
|||
"<", ">",
|
||||
"|",
|
||||
"=",
|
||||
".acq_rel",
|
||||
".acquire",
|
||||
".add",
|
||||
".address_size",
|
||||
".align",
|
||||
".aligned",
|
||||
".and",
|
||||
".approx",
|
||||
".b16",
|
||||
|
@ -45,14 +48,17 @@ match {
|
|||
".b64",
|
||||
".b8",
|
||||
".ca",
|
||||
".cas",
|
||||
".cg",
|
||||
".const",
|
||||
".cs",
|
||||
".cta",
|
||||
".cv",
|
||||
".dec",
|
||||
".entry",
|
||||
".eq",
|
||||
".equ",
|
||||
".exch",
|
||||
".extern",
|
||||
".f16",
|
||||
".f16x2",
|
||||
|
@ -69,6 +75,7 @@ match {
|
|||
".gtu",
|
||||
".hi",
|
||||
".hs",
|
||||
".inc",
|
||||
".le",
|
||||
".leu",
|
||||
".lo",
|
||||
|
@ -78,6 +85,8 @@ match {
|
|||
".lt",
|
||||
".ltu",
|
||||
".lu",
|
||||
".max",
|
||||
".min",
|
||||
".nan",
|
||||
".NaN",
|
||||
".ne",
|
||||
|
@ -88,6 +97,7 @@ match {
|
|||
".pred",
|
||||
".reg",
|
||||
".relaxed",
|
||||
".release",
|
||||
".rm",
|
||||
".rmi",
|
||||
".rn",
|
||||
|
@ -103,6 +113,7 @@ match {
|
|||
".sat",
|
||||
".section",
|
||||
".shared",
|
||||
".sync",
|
||||
".sys",
|
||||
".target",
|
||||
".to",
|
||||
|
@ -126,6 +137,9 @@ match {
|
|||
"abs",
|
||||
"add",
|
||||
"and",
|
||||
"atom",
|
||||
"bar",
|
||||
"barrier",
|
||||
"bra",
|
||||
"call",
|
||||
"cvt",
|
||||
|
@ -162,6 +176,9 @@ ExtendedID : &'input str = {
|
|||
"abs",
|
||||
"add",
|
||||
"and",
|
||||
"atom",
|
||||
"bar",
|
||||
"barrier",
|
||||
"bra",
|
||||
"call",
|
||||
"cvt",
|
||||
|
@ -372,6 +389,7 @@ StateSpaceSpecifier: ast::StateSpace = {
|
|||
".param" => ast::StateSpace::Param, // used to prepare function call
|
||||
};
|
||||
|
||||
#[inline]
|
||||
ScalarType: ast::ScalarType = {
|
||||
".f16" => ast::ScalarType::F16,
|
||||
".f16x2" => ast::ScalarType::F16x2,
|
||||
|
@ -438,6 +456,7 @@ Variable: ast::Variable<ast::VariableType, &'input str> = {
|
|||
let v_type = ast::VariableType::Param(v_type);
|
||||
ast::Variable {align, v_type, name, array_init}
|
||||
},
|
||||
SharedVariable,
|
||||
};
|
||||
|
||||
RegVariable: (Option<u32>, ast::VariableRegType, &'input str) = {
|
||||
|
@ -478,6 +497,32 @@ LocalVariable: ast::Variable<ast::VariableType, &'input str> = {
|
|||
}
|
||||
}
|
||||
|
||||
SharedVariable: ast::Variable<ast::VariableType, &'input str> = {
|
||||
".shared" <var:VariableScalar<SizedScalarType>> => {
|
||||
let (align, t, name) = var;
|
||||
let v_type = ast::VariableGlobalType::Scalar(t);
|
||||
ast::Variable { align, v_type: ast::VariableType::Shared(v_type), name, array_init: Vec::new() }
|
||||
},
|
||||
".shared" <var:VariableVector<SizedScalarType>> => {
|
||||
let (align, v_len, t, name) = var;
|
||||
let v_type = ast::VariableGlobalType::Vector(t, v_len);
|
||||
ast::Variable { align, v_type: ast::VariableType::Shared(v_type), name, array_init: Vec::new() }
|
||||
},
|
||||
".shared" <var:VariableArrayOrPointer<SizedScalarType>> =>? {
|
||||
let (align, t, name, arr_or_ptr) = var;
|
||||
let (v_type, array_init) = match arr_or_ptr {
|
||||
ast::ArrayOrPointer::Array { dimensions, init } => {
|
||||
(ast::VariableGlobalType::Array(t, dimensions), init)
|
||||
}
|
||||
ast::ArrayOrPointer::Pointer => {
|
||||
return Err(ParseError::User { error: ast::PtxError::ZeroDimensionArray });
|
||||
}
|
||||
};
|
||||
Ok(ast::Variable { align, v_type: ast::VariableType::Shared(v_type), name, array_init })
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ModuleVariable: ast::Variable<ast::VariableType, &'input str> = {
|
||||
LinkingDirectives ".global" <def:GlobalVariableDefinitionNoArray> => {
|
||||
let (align, v_type, name, array_init) = def;
|
||||
|
@ -619,7 +664,10 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
|||
InstMin,
|
||||
InstMax,
|
||||
InstRcp,
|
||||
InstSelp
|
||||
InstSelp,
|
||||
InstBar,
|
||||
InstAtom,
|
||||
InstAtomCas
|
||||
};
|
||||
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
|
||||
|
@ -655,14 +703,14 @@ LdStType: ast::LdStType = {
|
|||
LdStQualifier: ast::LdStQualifier = {
|
||||
".weak" => ast::LdStQualifier::Weak,
|
||||
".volatile" => ast::LdStQualifier::Volatile,
|
||||
".relaxed" <s:LdScope> => ast::LdStQualifier::Relaxed(s),
|
||||
".acquire" <s:LdScope> => ast::LdStQualifier::Acquire(s),
|
||||
".relaxed" <s:MemScope> => ast::LdStQualifier::Relaxed(s),
|
||||
".acquire" <s:MemScope> => ast::LdStQualifier::Acquire(s),
|
||||
};
|
||||
|
||||
LdScope: ast::LdScope = {
|
||||
".cta" => ast::LdScope::Cta,
|
||||
".gpu" => ast::LdScope::Gpu,
|
||||
".sys" => ast::LdScope::Sys
|
||||
MemScope: ast::MemScope = {
|
||||
".cta" => ast::MemScope::Cta,
|
||||
".gpu" => ast::MemScope::Gpu,
|
||||
".sys" => ast::MemScope::Sys
|
||||
};
|
||||
|
||||
LdStateSpace: ast::LdStateSpace = {
|
||||
|
@ -798,6 +846,13 @@ SIntType: ast::SIntType = {
|
|||
".s64" => ast::SIntType::S64,
|
||||
};
|
||||
|
||||
FloatType: ast::FloatType = {
|
||||
".f16" => ast::FloatType::F16,
|
||||
".f16x2" => ast::FloatType::F16x2,
|
||||
".f32" => ast::FloatType::F32,
|
||||
".f64" => ast::FloatType::F64,
|
||||
};
|
||||
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-add
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#floating-point-instructions-add
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-add
|
||||
|
@ -1296,6 +1351,140 @@ SelpType: ast::SelpType = {
|
|||
".f64" => ast::SelpType::F64,
|
||||
};
|
||||
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar
|
||||
InstBar: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
||||
"barrier" ".sync" ".aligned" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a),
|
||||
"bar" ".sync" <a:Arg1Bar> => ast::Instruction::Bar(ast::BarDetails::SyncAligned, a)
|
||||
}
|
||||
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom
|
||||
// The documentation does not mention all supported operations:
|
||||
// * Operation .add requires .u32 or .s32 or .u64 or .f64 or .f16 or .f16x2 or .f32
|
||||
// * Operation .inc requires .u32 type for instruction
|
||||
// * Operation .dec requires .u32 type for instruction
|
||||
// Otherwise as documented
|
||||
InstAtom: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op:AtomBitOp> <typ:AtomBitType> <a:Arg3Atom> => {
|
||||
let details = ast::AtomDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
inner: ast::AtomInnerDetails::Bit { op, typ }
|
||||
};
|
||||
ast::Instruction::Atom(details,a)
|
||||
},
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".inc" ".u32" <a:Arg3Atom> => {
|
||||
let details = ast::AtomDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
inner: ast::AtomInnerDetails::Unsigned {
|
||||
op: ast::AtomUIntOp::Inc,
|
||||
typ: ast::UIntType::U32
|
||||
}
|
||||
};
|
||||
ast::Instruction::Atom(details,a)
|
||||
},
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".dec" ".u32" <a:Arg3Atom> => {
|
||||
let details = ast::AtomDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
inner: ast::AtomInnerDetails::Unsigned {
|
||||
op: ast::AtomUIntOp::Dec,
|
||||
typ: ast::UIntType::U32
|
||||
}
|
||||
};
|
||||
ast::Instruction::Atom(details,a)
|
||||
},
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".add" <typ:FloatType> <a:Arg3Atom> => {
|
||||
let op = ast::AtomFloatOp::Add;
|
||||
let details = ast::AtomDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
inner: ast::AtomInnerDetails::Float { op, typ }
|
||||
};
|
||||
ast::Instruction::Atom(details,a)
|
||||
},
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op: AtomUIntOp> <typ:AtomUIntType> <a:Arg3Atom> => {
|
||||
let details = ast::AtomDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
inner: ast::AtomInnerDetails::Unsigned { op, typ }
|
||||
};
|
||||
ast::Instruction::Atom(details,a)
|
||||
},
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> <op: AtomSIntOp> <typ:AtomSIntType> <a:Arg3Atom> => {
|
||||
let details = ast::AtomDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
inner: ast::AtomInnerDetails::Signed { op, typ }
|
||||
};
|
||||
ast::Instruction::Atom(details,a)
|
||||
}
|
||||
}
|
||||
|
||||
InstAtomCas: ast::Instruction<ast::ParsedArgParams<'input>> = {
|
||||
"atom" <sema:AtomSemantics?> <scope:MemScope?> <space:AtomSpace?> ".cas" <typ:AtomBitType> <a:Arg4Atom> => {
|
||||
let details = ast::AtomCasDetails {
|
||||
semantics: sema.unwrap_or(ast::AtomSemantics::Relaxed),
|
||||
scope: scope.unwrap_or(ast::MemScope::Gpu),
|
||||
space: space.unwrap_or(ast::AtomSpace::Generic),
|
||||
typ,
|
||||
};
|
||||
ast::Instruction::AtomCas(details,a)
|
||||
},
|
||||
}
|
||||
|
||||
AtomSemantics: ast::AtomSemantics = {
|
||||
".relaxed" => ast::AtomSemantics::Relaxed,
|
||||
".acquire" => ast::AtomSemantics::Acquire,
|
||||
".release" => ast::AtomSemantics::Release,
|
||||
".acq_rel" => ast::AtomSemantics::AcquireRelease
|
||||
}
|
||||
|
||||
AtomSpace: ast::AtomSpace = {
|
||||
".global" => ast::AtomSpace::Global,
|
||||
".shared" => ast::AtomSpace::Shared
|
||||
}
|
||||
|
||||
AtomBitOp: ast::AtomBitOp = {
|
||||
".and" => ast::AtomBitOp::And,
|
||||
".or" => ast::AtomBitOp::Or,
|
||||
".xor" => ast::AtomBitOp::Xor,
|
||||
".exch" => ast::AtomBitOp::Exchange,
|
||||
}
|
||||
|
||||
AtomUIntOp: ast::AtomUIntOp = {
|
||||
".add" => ast::AtomUIntOp::Add,
|
||||
".min" => ast::AtomUIntOp::Min,
|
||||
".max" => ast::AtomUIntOp::Max,
|
||||
}
|
||||
|
||||
AtomSIntOp: ast::AtomSIntOp = {
|
||||
".add" => ast::AtomSIntOp::Add,
|
||||
".min" => ast::AtomSIntOp::Min,
|
||||
".max" => ast::AtomSIntOp::Max,
|
||||
}
|
||||
|
||||
AtomBitType: ast::BitType = {
|
||||
".b32" => ast::BitType::B32,
|
||||
".b64" => ast::BitType::B64,
|
||||
}
|
||||
|
||||
AtomUIntType: ast::UIntType = {
|
||||
".u32" => ast::UIntType::U32,
|
||||
".u64" => ast::UIntType::U64,
|
||||
}
|
||||
|
||||
AtomSIntType: ast::SIntType = {
|
||||
".s32" => ast::SIntType::S32,
|
||||
".s64" => ast::SIntType::S64,
|
||||
}
|
||||
|
||||
ArithDetails: ast::ArithDetails = {
|
||||
<t:UIntType> => ast::ArithDetails::Unsigned(t),
|
||||
<t:SIntType> => ast::ArithDetails::Signed(ast::ArithSInt {
|
||||
|
@ -1414,6 +1603,10 @@ Arg1: ast::Arg1<ast::ParsedArgParams<'input>> = {
|
|||
<src:ExtendedID> => ast::Arg1{<>}
|
||||
};
|
||||
|
||||
Arg1Bar: ast::Arg1Bar<ast::ParsedArgParams<'input>> = {
|
||||
<src:Operand> => ast::Arg1Bar{<>}
|
||||
};
|
||||
|
||||
Arg2: ast::Arg2<ast::ParsedArgParams<'input>> = {
|
||||
<dst:ExtendedID> "," <src:Operand> => ast::Arg2{<>}
|
||||
};
|
||||
|
@ -1448,10 +1641,18 @@ Arg3: ast::Arg3<ast::ParsedArgParams<'input>> = {
|
|||
<dst:ExtendedID> "," <src1:Operand> "," <src2:Operand> => ast::Arg3{<>}
|
||||
};
|
||||
|
||||
Arg3Atom: ast::Arg3<ast::ParsedArgParams<'input>> = {
|
||||
<dst:ExtendedID> "," "[" <src1:Operand> "]" "," <src2:Operand> => ast::Arg3{<>}
|
||||
};
|
||||
|
||||
Arg4: ast::Arg4<ast::ParsedArgParams<'input>> = {
|
||||
<dst:ExtendedID> "," <src1:Operand> "," <src2:Operand> "," <src3:Operand> => ast::Arg4{<>}
|
||||
};
|
||||
|
||||
Arg4Atom: ast::Arg4<ast::ParsedArgParams<'input>> = {
|
||||
<dst:ExtendedID> "," "[" <src1:Operand> "]" "," <src2:Operand> "," <src3:Operand> => ast::Arg4{<>}
|
||||
};
|
||||
|
||||
Arg4Setp: ast::Arg4Setp<ast::ParsedArgParams<'input>> = {
|
||||
<dst1:ExtendedID> <dst2:OptionalDst?> "," <src1:Operand> "," <src2:Operand> => ast::Arg4Setp{<>}
|
||||
};
|
||||
|
|
10
ptx/src/test/spirv_build/bar_sync.ptx
Normal file
10
ptx/src/test/spirv_build/bar_sync.ptx
Normal file
|
@ -0,0 +1,10 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
.visible .entry bar_sync()
|
||||
{
|
||||
.reg .u32 temp_32;
|
||||
bar.sync temp_32;
|
||||
ret;
|
||||
}
|
|
@ -11,8 +11,8 @@ OpCapability Int16
|
|||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
OpCapability FunctionFloatControlINTEL
|
||||
OpExtension "SPV_INTEL_float_controls2"
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%33 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "and"
|
||||
|
|
28
ptx/src/test/spirv_run/atom_add.ptx
Normal file
28
ptx/src/test/spirv_run/atom_add.ptx
Normal file
|
@ -0,0 +1,28 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
.visible .entry atom_add(
|
||||
.param .u64 input,
|
||||
.param .u64 output
|
||||
)
|
||||
{
|
||||
.shared .align 4 .b8 shared_mem[1024];
|
||||
|
||||
.reg .u64 in_addr;
|
||||
.reg .u64 out_addr;
|
||||
.reg .u32 temp1;
|
||||
.reg .u32 temp2;
|
||||
|
||||
ld.param.u64 in_addr, [input];
|
||||
ld.param.u64 out_addr, [output];
|
||||
|
||||
ld.u32 temp1, [in_addr];
|
||||
ld.u32 temp2, [in_addr+4];
|
||||
st.shared.u32 [shared_mem], temp1;
|
||||
atom.shared.add.u32 temp1, [shared_mem], temp2;
|
||||
ld.shared.u32 temp2, [shared_mem];
|
||||
st.u32 [out_addr], temp1;
|
||||
st.u32 [out_addr+4], temp2;
|
||||
ret;
|
||||
}
|
84
ptx/src/test/spirv_run/atom_add.spvtxt
Normal file
84
ptx/src/test/spirv_run/atom_add.spvtxt
Normal file
|
@ -0,0 +1,84 @@
|
|||
; SPIR-V
|
||||
; Version: 1.3
|
||||
; Generator: rspirv
|
||||
; Bound: 55
|
||||
OpCapability GenericPointer
|
||||
OpCapability Linkage
|
||||
OpCapability Addresses
|
||||
OpCapability Kernel
|
||||
OpCapability Int8
|
||||
OpCapability Int16
|
||||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%40 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "atom_add" %4
|
||||
OpDecorate %4 Alignment 4
|
||||
%41 = OpTypeVoid
|
||||
%42 = OpTypeInt 32 0
|
||||
%43 = OpTypeInt 8 0
|
||||
%44 = OpConstant %42 1024
|
||||
%45 = OpTypeArray %43 %44
|
||||
%46 = OpTypePointer Workgroup %45
|
||||
%4 = OpVariable %46 Workgroup
|
||||
%47 = OpTypeInt 64 0
|
||||
%48 = OpTypeFunction %41 %47 %47
|
||||
%49 = OpTypePointer Function %47
|
||||
%50 = OpTypePointer Function %42
|
||||
%51 = OpTypePointer Generic %42
|
||||
%27 = OpConstant %47 4
|
||||
%52 = OpTypePointer Workgroup %42
|
||||
%53 = OpConstant %42 1
|
||||
%54 = OpConstant %42 0
|
||||
%29 = OpConstant %47 4
|
||||
%1 = OpFunction %41 None %48
|
||||
%9 = OpFunctionParameter %47
|
||||
%10 = OpFunctionParameter %47
|
||||
%38 = OpLabel
|
||||
%2 = OpVariable %49 Function
|
||||
%3 = OpVariable %49 Function
|
||||
%5 = OpVariable %49 Function
|
||||
%6 = OpVariable %49 Function
|
||||
%7 = OpVariable %50 Function
|
||||
%8 = OpVariable %50 Function
|
||||
OpStore %2 %9
|
||||
OpStore %3 %10
|
||||
%12 = OpLoad %47 %2
|
||||
%11 = OpCopyObject %47 %12
|
||||
OpStore %5 %11
|
||||
%14 = OpLoad %47 %3
|
||||
%13 = OpCopyObject %47 %14
|
||||
OpStore %6 %13
|
||||
%16 = OpLoad %47 %5
|
||||
%31 = OpConvertUToPtr %51 %16
|
||||
%15 = OpLoad %42 %31
|
||||
OpStore %7 %15
|
||||
%18 = OpLoad %47 %5
|
||||
%28 = OpIAdd %47 %18 %27
|
||||
%32 = OpConvertUToPtr %51 %28
|
||||
%17 = OpLoad %42 %32
|
||||
OpStore %8 %17
|
||||
%19 = OpLoad %42 %7
|
||||
%33 = OpBitcast %52 %4
|
||||
OpStore %33 %19
|
||||
%21 = OpLoad %42 %8
|
||||
%34 = OpBitcast %52 %4
|
||||
%20 = OpAtomicIAdd %42 %34 %53 %54 %21
|
||||
OpStore %7 %20
|
||||
%35 = OpBitcast %52 %4
|
||||
%22 = OpLoad %42 %35
|
||||
OpStore %8 %22
|
||||
%23 = OpLoad %47 %6
|
||||
%24 = OpLoad %42 %7
|
||||
%36 = OpConvertUToPtr %51 %23
|
||||
OpStore %36 %24
|
||||
%25 = OpLoad %47 %6
|
||||
%26 = OpLoad %42 %8
|
||||
%30 = OpIAdd %47 %25 %29
|
||||
%37 = OpConvertUToPtr %51 %30
|
||||
OpStore %37 %26
|
||||
OpReturn
|
||||
OpFunctionEnd
|
24
ptx/src/test/spirv_run/atom_cas.ptx
Normal file
24
ptx/src/test/spirv_run/atom_cas.ptx
Normal file
|
@ -0,0 +1,24 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
.visible .entry atom_cas(
|
||||
.param .u64 input,
|
||||
.param .u64 output
|
||||
)
|
||||
{
|
||||
.reg .u64 in_addr;
|
||||
.reg .u64 out_addr;
|
||||
.reg .u32 temp1;
|
||||
.reg .u32 temp2;
|
||||
|
||||
ld.param.u64 in_addr, [input];
|
||||
ld.param.u64 out_addr, [output];
|
||||
|
||||
ld.u32 temp1, [in_addr];
|
||||
atom.cas.b32 temp1, [in_addr+4], temp1, 100;
|
||||
ld.u32 temp2, [in_addr+4];
|
||||
st.u32 [out_addr], temp1;
|
||||
st.u32 [out_addr+4], temp2;
|
||||
ret;
|
||||
}
|
77
ptx/src/test/spirv_run/atom_cas.spvtxt
Normal file
77
ptx/src/test/spirv_run/atom_cas.spvtxt
Normal file
|
@ -0,0 +1,77 @@
|
|||
; SPIR-V
|
||||
; Version: 1.3
|
||||
; Generator: rspirv
|
||||
; Bound: 51
|
||||
OpCapability GenericPointer
|
||||
OpCapability Linkage
|
||||
OpCapability Addresses
|
||||
OpCapability Kernel
|
||||
OpCapability Int8
|
||||
OpCapability Int16
|
||||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%41 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "atom_cas"
|
||||
%42 = OpTypeVoid
|
||||
%43 = OpTypeInt 64 0
|
||||
%44 = OpTypeFunction %42 %43 %43
|
||||
%45 = OpTypePointer Function %43
|
||||
%46 = OpTypeInt 32 0
|
||||
%47 = OpTypePointer Function %46
|
||||
%48 = OpTypePointer Generic %46
|
||||
%25 = OpConstant %43 4
|
||||
%27 = OpConstant %46 100
|
||||
%49 = OpConstant %46 1
|
||||
%50 = OpConstant %46 0
|
||||
%28 = OpConstant %43 4
|
||||
%30 = OpConstant %43 4
|
||||
%1 = OpFunction %42 None %44
|
||||
%8 = OpFunctionParameter %43
|
||||
%9 = OpFunctionParameter %43
|
||||
%39 = OpLabel
|
||||
%2 = OpVariable %45 Function
|
||||
%3 = OpVariable %45 Function
|
||||
%4 = OpVariable %45 Function
|
||||
%5 = OpVariable %45 Function
|
||||
%6 = OpVariable %47 Function
|
||||
%7 = OpVariable %47 Function
|
||||
OpStore %2 %8
|
||||
OpStore %3 %9
|
||||
%11 = OpLoad %43 %2
|
||||
%10 = OpCopyObject %43 %11
|
||||
OpStore %4 %10
|
||||
%13 = OpLoad %43 %3
|
||||
%12 = OpCopyObject %43 %13
|
||||
OpStore %5 %12
|
||||
%15 = OpLoad %43 %4
|
||||
%32 = OpConvertUToPtr %48 %15
|
||||
%14 = OpLoad %46 %32
|
||||
OpStore %6 %14
|
||||
%17 = OpLoad %43 %4
|
||||
%18 = OpLoad %46 %6
|
||||
%26 = OpIAdd %43 %17 %25
|
||||
%34 = OpConvertUToPtr %48 %26
|
||||
%35 = OpCopyObject %46 %18
|
||||
%33 = OpAtomicCompareExchange %46 %34 %49 %50 %50 %27 %35
|
||||
%16 = OpCopyObject %46 %33
|
||||
OpStore %6 %16
|
||||
%20 = OpLoad %43 %4
|
||||
%29 = OpIAdd %43 %20 %28
|
||||
%36 = OpConvertUToPtr %48 %29
|
||||
%19 = OpLoad %46 %36
|
||||
OpStore %7 %19
|
||||
%21 = OpLoad %43 %5
|
||||
%22 = OpLoad %46 %6
|
||||
%37 = OpConvertUToPtr %48 %21
|
||||
OpStore %37 %22
|
||||
%23 = OpLoad %43 %5
|
||||
%24 = OpLoad %46 %7
|
||||
%31 = OpIAdd %43 %23 %30
|
||||
%38 = OpConvertUToPtr %48 %31
|
||||
OpStore %38 %24
|
||||
OpReturn
|
||||
OpFunctionEnd
|
26
ptx/src/test/spirv_run/atom_inc.ptx
Normal file
26
ptx/src/test/spirv_run/atom_inc.ptx
Normal file
|
@ -0,0 +1,26 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
.visible .entry atom_inc(
|
||||
.param .u64 input,
|
||||
.param .u64 output
|
||||
)
|
||||
{
|
||||
.reg .u64 in_addr;
|
||||
.reg .u64 out_addr;
|
||||
.reg .u32 temp1;
|
||||
.reg .u32 temp2;
|
||||
.reg .u32 temp3;
|
||||
|
||||
ld.param.u64 in_addr, [input];
|
||||
ld.param.u64 out_addr, [output];
|
||||
|
||||
atom.inc.u32 temp1, [in_addr], 101;
|
||||
atom.global.inc.u32 temp2, [in_addr], 101;
|
||||
ld.u32 temp3, [in_addr];
|
||||
st.u32 [out_addr], temp1;
|
||||
st.u32 [out_addr+4], temp2;
|
||||
st.u32 [out_addr+8], temp3;
|
||||
ret;
|
||||
}
|
89
ptx/src/test/spirv_run/atom_inc.spvtxt
Normal file
89
ptx/src/test/spirv_run/atom_inc.spvtxt
Normal file
|
@ -0,0 +1,89 @@
|
|||
; SPIR-V
|
||||
; Version: 1.3
|
||||
; Generator: rspirv
|
||||
; Bound: 60
|
||||
OpCapability GenericPointer
|
||||
OpCapability Linkage
|
||||
OpCapability Addresses
|
||||
OpCapability Kernel
|
||||
OpCapability Int8
|
||||
OpCapability Int16
|
||||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%49 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "atom_inc"
|
||||
OpDecorate %40 LinkageAttributes "__notcuda_ptx_impl__atom_relaxed_gpu_generic_inc" Import
|
||||
OpDecorate %44 LinkageAttributes "__notcuda_ptx_impl__atom_relaxed_gpu_global_inc" Import
|
||||
%50 = OpTypeVoid
|
||||
%51 = OpTypeInt 32 0
|
||||
%52 = OpTypePointer Generic %51
|
||||
%53 = OpTypeFunction %51 %52 %51
|
||||
%54 = OpTypePointer CrossWorkgroup %51
|
||||
%55 = OpTypeFunction %51 %54 %51
|
||||
%56 = OpTypeInt 64 0
|
||||
%57 = OpTypeFunction %50 %56 %56
|
||||
%58 = OpTypePointer Function %56
|
||||
%59 = OpTypePointer Function %51
|
||||
%27 = OpConstant %51 101
|
||||
%28 = OpConstant %51 101
|
||||
%29 = OpConstant %56 4
|
||||
%31 = OpConstant %56 8
|
||||
%40 = OpFunction %51 None %53
|
||||
%42 = OpFunctionParameter %52
|
||||
%43 = OpFunctionParameter %51
|
||||
OpFunctionEnd
|
||||
%44 = OpFunction %51 None %55
|
||||
%46 = OpFunctionParameter %54
|
||||
%47 = OpFunctionParameter %51
|
||||
OpFunctionEnd
|
||||
%1 = OpFunction %50 None %57
|
||||
%9 = OpFunctionParameter %56
|
||||
%10 = OpFunctionParameter %56
|
||||
%39 = OpLabel
|
||||
%2 = OpVariable %58 Function
|
||||
%3 = OpVariable %58 Function
|
||||
%4 = OpVariable %58 Function
|
||||
%5 = OpVariable %58 Function
|
||||
%6 = OpVariable %59 Function
|
||||
%7 = OpVariable %59 Function
|
||||
%8 = OpVariable %59 Function
|
||||
OpStore %2 %9
|
||||
OpStore %3 %10
|
||||
%12 = OpLoad %56 %2
|
||||
%11 = OpCopyObject %56 %12
|
||||
OpStore %4 %11
|
||||
%14 = OpLoad %56 %3
|
||||
%13 = OpCopyObject %56 %14
|
||||
OpStore %5 %13
|
||||
%16 = OpLoad %56 %4
|
||||
%33 = OpConvertUToPtr %52 %16
|
||||
%15 = OpFunctionCall %51 %40 %33 %27
|
||||
OpStore %6 %15
|
||||
%18 = OpLoad %56 %4
|
||||
%34 = OpConvertUToPtr %54 %18
|
||||
%17 = OpFunctionCall %51 %44 %34 %28
|
||||
OpStore %7 %17
|
||||
%20 = OpLoad %56 %4
|
||||
%35 = OpConvertUToPtr %52 %20
|
||||
%19 = OpLoad %51 %35
|
||||
OpStore %8 %19
|
||||
%21 = OpLoad %56 %5
|
||||
%22 = OpLoad %51 %6
|
||||
%36 = OpConvertUToPtr %52 %21
|
||||
OpStore %36 %22
|
||||
%23 = OpLoad %56 %5
|
||||
%24 = OpLoad %51 %7
|
||||
%30 = OpIAdd %56 %23 %29
|
||||
%37 = OpConvertUToPtr %52 %30
|
||||
OpStore %37 %24
|
||||
%25 = OpLoad %56 %5
|
||||
%26 = OpLoad %51 %8
|
||||
%32 = OpIAdd %56 %25 %31
|
||||
%38 = OpConvertUToPtr %52 %32
|
||||
OpStore %38 %26
|
||||
OpReturn
|
||||
OpFunctionEnd
|
|
@ -11,12 +11,12 @@ OpCapability Int16
|
|||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
OpCapability FunctionFloatControlINTEL
|
||||
OpExtension "SPV_INTEL_float_controls2"
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%24 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "constant_f32"
|
||||
OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
|
||||
; OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
|
||||
%25 = OpTypeVoid
|
||||
%26 = OpTypeInt 64 0
|
||||
%27 = OpTypeFunction %25 %26 %26
|
||||
|
|
|
@ -11,8 +11,8 @@ OpCapability Int16
|
|||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
OpCapability FunctionFloatControlINTEL
|
||||
OpExtension "SPV_INTEL_float_controls2"
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%24 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "constant_negative"
|
||||
|
|
|
@ -11,12 +11,12 @@ OpCapability Int16
|
|||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
OpCapability FunctionFloatControlINTEL
|
||||
OpExtension "SPV_INTEL_float_controls2"
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%37 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "fma"
|
||||
OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
|
||||
; OpDecorate %1 FunctionDenormModeINTEL 32 Preserve
|
||||
%38 = OpTypeVoid
|
||||
%39 = OpTypeInt 64 0
|
||||
%40 = OpTypeFunction %38 %39 %39
|
||||
|
|
|
@ -86,12 +86,20 @@ test_ptx!(rcp, [2f32], [0.5f32]);
|
|||
// 0x3f000000 is 0.5
|
||||
// TODO: mul_ftz fails because IGC does not yet handle SPV_INTEL_float_controls2
|
||||
// test_ptx!(mul_ftz, [0b1_00000000_10000000000000000000000u32, 0x3f000000u32], [0u32]);
|
||||
test_ptx!(mul_non_ftz, [0b1_00000000_10000000000000000000000u32, 0x3f000000u32], [0b1_00000000_01000000000000000000000u32]);
|
||||
test_ptx!(
|
||||
mul_non_ftz,
|
||||
[0b1_00000000_10000000000000000000000u32, 0x3f000000u32],
|
||||
[0b1_00000000_01000000000000000000000u32]
|
||||
);
|
||||
test_ptx!(constant_f32, [10f32], [5f32]);
|
||||
test_ptx!(constant_negative, [-101i32], [101i32]);
|
||||
test_ptx!(and, [6u32, 3u32], [2u32]);
|
||||
test_ptx!(selp, [100u16, 200u16], [200u16]);
|
||||
test_ptx!(fma, [2f32, 3f32, 5f32], [11f32]);
|
||||
test_ptx!(fma, [2f32, 3f32, 5f32], [11f32]);
|
||||
test_ptx!(shared_variable, [513u64], [513u64]);
|
||||
test_ptx!(atom_cas, [91u32, 91u32], [91u32, 100u32]);
|
||||
test_ptx!(atom_inc, [100u32], [100u32, 101u32, 0u32]);
|
||||
test_ptx!(atom_add, [2u32, 4u32], [2u32, 6u32]);
|
||||
|
||||
struct DisplayError<T: Debug> {
|
||||
err: T,
|
||||
|
@ -124,7 +132,7 @@ fn test_ptx_assert<'a, T: From<u8> + ze::SafeRepr + Debug + Copy + PartialEq>(
|
|||
let name = CString::new(name)?;
|
||||
let result = run_spirv(name.as_c_str(), notcuda_module, input, output)
|
||||
.map_err(|err| DisplayError { err })?;
|
||||
assert_eq!(output, result.as_slice());
|
||||
assert_eq!(result.as_slice(), output);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -145,8 +153,8 @@ fn run_spirv<T: From<u8> + ze::SafeRepr + Copy + Debug>(
|
|||
let use_shared_mem = module
|
||||
.kernel_info
|
||||
.get(name.to_str().unwrap())
|
||||
.unwrap()
|
||||
.uses_shared_mem;
|
||||
.map(|info| info.uses_shared_mem)
|
||||
.unwrap_or(false);
|
||||
let mut result = vec![0u8.into(); output.len()];
|
||||
{
|
||||
let mut drivers = ze::Driver::get()?;
|
||||
|
@ -155,11 +163,20 @@ fn run_spirv<T: From<u8> + ze::SafeRepr + Copy + Debug>(
|
|||
let mut devices = drv.devices()?;
|
||||
let dev = devices.drain(0..1).next().unwrap();
|
||||
let queue = ze::CommandQueue::new(&mut ctx, &dev)?;
|
||||
let (module, log) = ze::Module::new_spirv(&mut ctx, &dev, byte_il, None);
|
||||
let (module, maybe_log) = match module.should_link_ptx_impl {
|
||||
Some(ptx_impl) => ze::Module::build_link_spirv(&mut ctx, &dev, &[ptx_impl, byte_il]),
|
||||
None => {
|
||||
let (module, log) = ze::Module::build_spirv(&mut ctx, &dev, byte_il, None);
|
||||
(module, Some(log))
|
||||
}
|
||||
};
|
||||
let module = match module {
|
||||
Ok(m) => m,
|
||||
Err(err) => {
|
||||
let raw_err_string = log.get_cstring()?;
|
||||
let raw_err_string = maybe_log
|
||||
.map(|log| log.get_cstring())
|
||||
.transpose()?
|
||||
.unwrap_or(CString::default());
|
||||
let err_string = raw_err_string.to_string_lossy();
|
||||
panic!("{:?}\n{}", err, err_string);
|
||||
}
|
||||
|
@ -215,7 +232,11 @@ fn test_spvtxt_assert<'a>(
|
|||
ptr::null_mut(),
|
||||
)
|
||||
};
|
||||
assert!(result == spv_result_t::SPV_SUCCESS);
|
||||
if result != spv_result_t::SPV_SUCCESS {
|
||||
panic!("{:?}\n{}", result, unsafe {
|
||||
str::from_utf8_unchecked(spirv_txt)
|
||||
});
|
||||
}
|
||||
let mut parsed_spirv = Vec::<u32>::new();
|
||||
let result = unsafe {
|
||||
spirv_tools::spvBinaryParse(
|
||||
|
|
|
@ -11,8 +11,8 @@ OpCapability Int16
|
|||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
OpCapability FunctionFloatControlINTEL
|
||||
OpExtension "SPV_INTEL_float_controls2"
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%30 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "mul_ftz"
|
||||
|
|
|
@ -11,8 +11,8 @@ OpCapability Int16
|
|||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
OpCapability FunctionFloatControlINTEL
|
||||
OpExtension "SPV_INTEL_float_controls2"
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%31 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "selp"
|
||||
|
|
26
ptx/src/test/spirv_run/shared_variable.ptx
Normal file
26
ptx/src/test/spirv_run/shared_variable.ptx
Normal file
|
@ -0,0 +1,26 @@
|
|||
.version 6.5
|
||||
.target sm_30
|
||||
.address_size 64
|
||||
|
||||
|
||||
.visible .entry shared_variable(
|
||||
.param .u64 input,
|
||||
.param .u64 output
|
||||
)
|
||||
{
|
||||
.shared .align 4 .b8 shared_mem1[128];
|
||||
|
||||
.reg .u64 in_addr;
|
||||
.reg .u64 out_addr;
|
||||
.reg .u64 temp1;
|
||||
.reg .u64 temp2;
|
||||
|
||||
ld.param.u64 in_addr, [input];
|
||||
ld.param.u64 out_addr, [output];
|
||||
|
||||
ld.global.u64 temp1, [in_addr];
|
||||
st.shared.u64 [shared_mem1], temp1;
|
||||
ld.shared.u64 temp2, [shared_mem1];
|
||||
st.global.u64 [out_addr], temp2;
|
||||
ret;
|
||||
}
|
65
ptx/src/test/spirv_run/shared_variable.spvtxt
Normal file
65
ptx/src/test/spirv_run/shared_variable.spvtxt
Normal file
|
@ -0,0 +1,65 @@
|
|||
; SPIR-V
|
||||
; Version: 1.3
|
||||
; Generator: rspirv
|
||||
; Bound: 39
|
||||
OpCapability GenericPointer
|
||||
OpCapability Linkage
|
||||
OpCapability Addresses
|
||||
OpCapability Kernel
|
||||
OpCapability Int8
|
||||
OpCapability Int16
|
||||
OpCapability Int64
|
||||
OpCapability Float16
|
||||
OpCapability Float64
|
||||
; OpCapability FunctionFloatControlINTEL
|
||||
; OpExtension "SPV_INTEL_float_controls2"
|
||||
%27 = OpExtInstImport "OpenCL.std"
|
||||
OpMemoryModel Physical64 OpenCL
|
||||
OpEntryPoint Kernel %1 "shared_variable" %4
|
||||
OpDecorate %4 Alignment 4
|
||||
%28 = OpTypeVoid
|
||||
%29 = OpTypeInt 32 0
|
||||
%30 = OpTypeInt 8 0
|
||||
%31 = OpConstant %29 128
|
||||
%32 = OpTypeArray %30 %31
|
||||
%33 = OpTypePointer Workgroup %32
|
||||
%4 = OpVariable %33 Workgroup
|
||||
%34 = OpTypeInt 64 0
|
||||
%35 = OpTypeFunction %28 %34 %34
|
||||
%36 = OpTypePointer Function %34
|
||||
%37 = OpTypePointer CrossWorkgroup %34
|
||||
%38 = OpTypePointer Workgroup %34
|
||||
%1 = OpFunction %28 None %35
|
||||
%9 = OpFunctionParameter %34
|
||||
%10 = OpFunctionParameter %34
|
||||
%25 = OpLabel
|
||||
%2 = OpVariable %36 Function
|
||||
%3 = OpVariable %36 Function
|
||||
%5 = OpVariable %36 Function
|
||||
%6 = OpVariable %36 Function
|
||||
%7 = OpVariable %36 Function
|
||||
%8 = OpVariable %36 Function
|
||||
OpStore %2 %9
|
||||
OpStore %3 %10
|
||||
%12 = OpLoad %34 %2
|
||||
%11 = OpCopyObject %34 %12
|
||||
OpStore %5 %11
|
||||
%14 = OpLoad %34 %3
|
||||
%13 = OpCopyObject %34 %14
|
||||
OpStore %6 %13
|
||||
%16 = OpLoad %34 %5
|
||||
%21 = OpConvertUToPtr %37 %16
|
||||
%15 = OpLoad %34 %21
|
||||
OpStore %7 %15
|
||||
%17 = OpLoad %34 %7
|
||||
%22 = OpBitcast %38 %4
|
||||
OpStore %22 %17
|
||||
%23 = OpBitcast %38 %4
|
||||
%18 = OpLoad %34 %23
|
||||
OpStore %8 %18
|
||||
%19 = OpLoad %34 %6
|
||||
%20 = OpLoad %34 %8
|
||||
%24 = OpConvertUToPtr %37 %19
|
||||
OpStore %24 %20
|
||||
OpReturn
|
||||
OpFunctionEnd
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue