mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-09-29 12:49:13 +00:00
Handle PrmtSlow (#518)
This commit is contained in:
parent
f46b756fdc
commit
93820e3159
12 changed files with 172 additions and 94 deletions
Binary file not shown.
|
@ -780,4 +780,66 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2)));
|
||||||
|
|
||||||
return d_out;
|
return d_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Four-byte-aligned wrapper around an anonymous union that overlays a single
// 32-bit word (`u32`) with an array of its four individual bytes (`u8x4`).
// prmt_b32 uses it to build a permute selector and to patch result bytes one
// at a time.
// NOTE(review): which element of `u8x4` is the least significant byte of `u32`
// depends on target endianness; prmt_b32 appears to assume little-endian
// (u8x4[0] = bits 0..7) - confirm for the targets this is built for.
struct byte4
|
||||||
|
{
|
||||||
|
union
|
||||||
|
{
|
||||||
|
uint32_t u32;
|
||||||
|
uint8_t u8x4[4];
|
||||||
|
};
|
||||||
|
} __attribute__((aligned(4)));
|
||||||
|
|
||||||
|
// Eight-byte-aligned wrapper around an anonymous union that overlays two
// 32-bit words (`u32x2`) with an array of their eight individual bytes
// (`u8x8`). prmt_b32 stores its two source operands in `u32x2[0]` and
// `u32x2[1]`; the byte view is declared but not read by that function.
// NOTE(review): the mapping between `u8x8` indices and the bytes of `u32x2`
// depends on target endianness - confirm before relying on the byte view.
struct byte8
|
||||||
|
{
|
||||||
|
union
|
||||||
|
{
|
||||||
|
uint32_t u32x2[2];
|
||||||
|
|
||||||
|
uint8_t u8x8[8];
|
||||||
|
};
|
||||||
|
} __attribute__((aligned(8)));
|
||||||
|
|
||||||
|
// Implements PTX `prmt.b32 d, a, b, c` (generic mode, no `.mode` suffix) on
// top of the AMDGPU byte-permute builtin.
//
// Parameters:
//   x, y - the two 32-bit sources. Together they form an 8-byte pool in which
//          bytes 0..3 come from `x` and bytes 4..7 come from `y`.
//   s    - control word. Only its low 16 bits are read: one 4-bit selector per
//          result byte, selector `i` living in bits [4*i, 4*i+3]. Bits 0..2 of
//          a selector pick a pool byte; bit 3 asks for the sign bit of that
//          byte replicated across the whole result byte (0x00 or 0xFF)
//          instead of the byte value itself.
//
// Returns the permuted 32-bit value.
uint32_t FUNC(prmt_b32)(uint32_t x, uint32_t y, uint32_t s)
{
    byte4 v_perm_selector;
    v_perm_selector.u32 = 0;

    byte8 input;
    input.u32x2[0] = x;
    input.u32x2[1] = y;

    // Pass 1: translate every PTX selector nibble into a hardware selector byte.
    for (size_t i = 0; i < 4; i++)
    {
        uint8_t sel = static_cast<uint8_t>(s >> (i * 4));
        uint8_t addr = sel & 0x7;
        bool replicate_sign = (sel & 0x8) != 0;
        if (replicate_sign && addr % 2 == 1)
        {
            // Per the AMD ISA description of V_PERM_B32, selector values
            // 0x8..0xB yield the replicated sign of pool bytes 1, 3, 5 and 7,
            // so odd-indexed bytes are handled by the hardware directly.
            v_perm_selector.u8x4[i] = 0x8 + addr / 2;
        }
        else
        {
            // Plain byte copy. Even-indexed sign replication has no hardware
            // selector, so the byte is copied here and fixed up in pass 2.
            v_perm_selector.u8x4[i] = addr;
        }
    }

    byte4 output;
    output.u32 = __builtin_amdgcn_perm(input.u32x2[1], input.u32x2[0], v_perm_selector.u32);

    // Pass 2: finish sign replication for even-indexed source bytes.
    for (size_t i = 0; i < 4; i++)
    {
        uint8_t sel = static_cast<uint8_t>(s >> (i * 4));
        uint8_t addr = sel & 0x7;
        if ((sel & 0x8) && addr % 2 == 0)
        {
            // Expand the sign bit to a full byte. The previous expression
            // `(byte & 0x80) * 0xff` evaluated to 0x7F80 and truncated to
            // 0x80 in a uint8_t, so negative bytes produced 0x80, not 0xFF.
            output.u8x4[i] = (output.u8x4[i] & 0x80) ? 0xff : 0x00;
        }
    }

    return output.u32;
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -164,7 +164,6 @@ fn run_instruction<'input>(
|
||||||
| ast::Instruction::Or { .. }
|
| ast::Instruction::Or { .. }
|
||||||
| ast::Instruction::Popc { .. }
|
| ast::Instruction::Popc { .. }
|
||||||
| ast::Instruction::Prmt { .. }
|
| ast::Instruction::Prmt { .. }
|
||||||
| ast::Instruction::PrmtSlow { .. }
|
|
||||||
| ast::Instruction::Rcp { .. }
|
| ast::Instruction::Rcp { .. }
|
||||||
| ast::Instruction::Rem { .. }
|
| ast::Instruction::Rem { .. }
|
||||||
| ast::Instruction::Ret { .. }
|
| ast::Instruction::Ret { .. }
|
||||||
|
|
|
@ -1818,7 +1818,6 @@ fn get_modes<T: ast::Operand>(inst: &ast::Instruction<T>) -> InstructionModes {
|
||||||
| ast::Instruction::Mov { .. }
|
| ast::Instruction::Mov { .. }
|
||||||
| ast::Instruction::Ld { .. }
|
| ast::Instruction::Ld { .. }
|
||||||
| ast::Instruction::St { .. }
|
| ast::Instruction::St { .. }
|
||||||
| ast::Instruction::PrmtSlow { .. }
|
|
||||||
| ast::Instruction::Prmt { .. }
|
| ast::Instruction::Prmt { .. }
|
||||||
| ast::Instruction::Activemask { .. }
|
| ast::Instruction::Activemask { .. }
|
||||||
| ast::Instruction::Membar { .. }
|
| ast::Instruction::Membar { .. }
|
||||||
|
|
|
@ -510,10 +510,6 @@ impl<'a> MethodEmitContext<'a> {
|
||||||
ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments),
|
ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments),
|
||||||
ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments),
|
ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments),
|
||||||
ast::Instruction::BarWarp { .. } => self.emit_bar_warp(),
|
ast::Instruction::BarWarp { .. } => self.emit_bar_warp(),
|
||||||
ast::Instruction::PrmtSlow { .. } => {
|
|
||||||
Err(error_todo_msg("PrmtSlow is not implemented yet"))
|
|
||||||
}
|
|
||||||
ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments),
|
|
||||||
ast::Instruction::Membar { data } => self.emit_membar(data),
|
ast::Instruction::Membar { data } => self.emit_membar(data),
|
||||||
ast::Instruction::Trap {} => self.emit_trap(),
|
ast::Instruction::Trap {} => self.emit_trap(),
|
||||||
ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments),
|
ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments),
|
||||||
|
@ -533,7 +529,8 @@ impl<'a> MethodEmitContext<'a> {
|
||||||
| ast::Instruction::Nanosleep { .. }
|
| ast::Instruction::Nanosleep { .. }
|
||||||
| ast::Instruction::ReduxSync { .. }
|
| ast::Instruction::ReduxSync { .. }
|
||||||
| ast::Instruction::LdMatrix { .. }
|
| ast::Instruction::LdMatrix { .. }
|
||||||
| ast::Instruction::Mma { .. } => return Err(error_unreachable()),
|
| ast::Instruction::Mma { .. }
|
||||||
|
| ast::Instruction::Prmt { .. } => return Err(error_unreachable()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2447,48 +2444,6 @@ impl<'a> MethodEmitContext<'a> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn emit_prmt(
|
|
||||||
&mut self,
|
|
||||||
control: u16,
|
|
||||||
arguments: ptx_parser::PrmtArgs<SpirvWord>,
|
|
||||||
) -> Result<(), TranslateError> {
|
|
||||||
let components = [
|
|
||||||
(control >> 0) & 0b1111,
|
|
||||||
(control >> 4) & 0b1111,
|
|
||||||
(control >> 8) & 0b1111,
|
|
||||||
(control >> 12) & 0b1111,
|
|
||||||
];
|
|
||||||
if components.iter().any(|&c| c > 7) {
|
|
||||||
return Err(error_todo());
|
|
||||||
}
|
|
||||||
let u32_type = get_scalar_type(self.context, ast::ScalarType::U32);
|
|
||||||
let v4u8_type = get_type(self.context, &ast::Type::Vector(4, ast::ScalarType::U8))?;
|
|
||||||
let mut components = [
|
|
||||||
unsafe { LLVMConstInt(u32_type, components[0] as _, 0) },
|
|
||||||
unsafe { LLVMConstInt(u32_type, components[1] as _, 0) },
|
|
||||||
unsafe { LLVMConstInt(u32_type, components[2] as _, 0) },
|
|
||||||
unsafe { LLVMConstInt(u32_type, components[3] as _, 0) },
|
|
||||||
];
|
|
||||||
let components_indices =
|
|
||||||
unsafe { LLVMConstVector(components.as_mut_ptr(), components.len() as u32) };
|
|
||||||
let src1 = self.resolver.value(arguments.src1)?;
|
|
||||||
let src1_vector =
|
|
||||||
unsafe { LLVMBuildBitCast(self.builder, src1, v4u8_type, LLVM_UNNAMED.as_ptr()) };
|
|
||||||
let src2 = self.resolver.value(arguments.src2)?;
|
|
||||||
let src2_vector =
|
|
||||||
unsafe { LLVMBuildBitCast(self.builder, src2, v4u8_type, LLVM_UNNAMED.as_ptr()) };
|
|
||||||
self.resolver.with_result(arguments.dst, |dst| unsafe {
|
|
||||||
LLVMBuildShuffleVector(
|
|
||||||
self.builder,
|
|
||||||
src1_vector,
|
|
||||||
src2_vector,
|
|
||||||
components_indices,
|
|
||||||
dst,
|
|
||||||
)
|
|
||||||
});
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn emit_abs(
|
fn emit_abs(
|
||||||
&mut self,
|
&mut self,
|
||||||
data: ast::TypeFtz,
|
data: ast::TypeFtz,
|
||||||
|
|
|
@ -519,6 +519,9 @@ fn run_instruction<'input>(
|
||||||
i,
|
i,
|
||||||
)?
|
)?
|
||||||
}
|
}
|
||||||
|
i @ ptx_parser::Instruction::Prmt { .. } => {
|
||||||
|
to_call(resolver, fn_declarations, "prmt_b32".into(), i)?
|
||||||
|
}
|
||||||
i => i,
|
i => i,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,38 +1,39 @@
|
||||||
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0
|
||||||
%"38" = alloca i64, align 8, addrspace(5)
|
|
||||||
|
define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 {
|
||||||
%"39" = alloca i64, align 8, addrspace(5)
|
%"39" = alloca i64, align 8, addrspace(5)
|
||||||
%"40" = alloca i32, align 4, addrspace(5)
|
%"40" = alloca i64, align 8, addrspace(5)
|
||||||
%"41" = alloca i32, align 4, addrspace(5)
|
%"41" = alloca i32, align 4, addrspace(5)
|
||||||
|
%"42" = alloca i32, align 4, addrspace(5)
|
||||||
br label %1
|
br label %1
|
||||||
|
|
||||||
1: ; preds = %0
|
1: ; preds = %0
|
||||||
br label %"35"
|
br label %"36"
|
||||||
|
|
||||||
"35": ; preds = %1
|
"36": ; preds = %1
|
||||||
%"42" = load i64, ptr addrspace(4) %"36", align 8
|
|
||||||
store i64 %"42", ptr addrspace(5) %"38", align 8
|
|
||||||
%"43" = load i64, ptr addrspace(4) %"37", align 8
|
%"43" = load i64, ptr addrspace(4) %"37", align 8
|
||||||
store i64 %"43", ptr addrspace(5) %"39", align 8
|
store i64 %"43", ptr addrspace(5) %"39", align 8
|
||||||
%"45" = load i64, ptr addrspace(5) %"38", align 8
|
%"44" = load i64, ptr addrspace(4) %"38", align 8
|
||||||
%"53" = inttoptr i64 %"45" to ptr
|
store i64 %"44", ptr addrspace(5) %"40", align 8
|
||||||
%"44" = load i32, ptr %"53", align 4
|
%"46" = load i64, ptr addrspace(5) %"39", align 8
|
||||||
store i32 %"44", ptr addrspace(5) %"40", align 4
|
|
||||||
%"46" = load i64, ptr addrspace(5) %"38", align 8
|
|
||||||
%"54" = inttoptr i64 %"46" to ptr
|
%"54" = inttoptr i64 %"46" to ptr
|
||||||
%"34" = getelementptr inbounds i8, ptr %"54", i64 4
|
%"45" = load i32, ptr %"54", align 4
|
||||||
%"47" = load i32, ptr %"34", align 4
|
store i32 %"45", ptr addrspace(5) %"41", align 4
|
||||||
store i32 %"47", ptr addrspace(5) %"41", align 4
|
%"47" = load i64, ptr addrspace(5) %"39", align 8
|
||||||
%"49" = load i32, ptr addrspace(5) %"40", align 4
|
%"55" = inttoptr i64 %"47" to ptr
|
||||||
|
%"34" = getelementptr inbounds i8, ptr %"55", i64 4
|
||||||
|
%"48" = load i32, ptr %"34", align 4
|
||||||
|
store i32 %"48", ptr addrspace(5) %"42", align 4
|
||||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||||
%2 = bitcast i32 %"49" to <4 x i8>
|
%"51" = load i32, ptr addrspace(5) %"42", align 4
|
||||||
%3 = bitcast i32 %"50" to <4 x i8>
|
%"56" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"50", i32 %"51", i32 30212)
|
||||||
%"55" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
|
store i32 %"56", ptr addrspace(5) %"42", align 4
|
||||||
store <4 x i8> %"55", ptr addrspace(5) %"41", align 4
|
%"52" = load i64, ptr addrspace(5) %"40", align 8
|
||||||
%"51" = load i64, ptr addrspace(5) %"39", align 8
|
%"53" = load i32, ptr addrspace(5) %"42", align 4
|
||||||
%"52" = load i32, ptr addrspace(5) %"41", align 4
|
%"59" = inttoptr i64 %"52" to ptr
|
||||||
%"58" = inttoptr i64 %"51" to ptr
|
store i32 %"53", ptr %"59", align 4
|
||||||
store i32 %"52", ptr %"58", align 4
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||||
|
attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
46
ptx/src/test/ll/prmt_slow.ll
Normal file
46
ptx/src/test/ll/prmt_slow.ll
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0
|
||||||
|
|
||||||
|
define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
|
||||||
|
%"41" = alloca i64, align 8, addrspace(5)
|
||||||
|
%"42" = alloca i64, align 8, addrspace(5)
|
||||||
|
%"43" = alloca i32, align 4, addrspace(5)
|
||||||
|
%"44" = alloca i32, align 4, addrspace(5)
|
||||||
|
%"45" = alloca i32, align 4, addrspace(5)
|
||||||
|
br label %1
|
||||||
|
|
||||||
|
1: ; preds = %0
|
||||||
|
br label %"38"
|
||||||
|
|
||||||
|
"38": ; preds = %1
|
||||||
|
%"46" = load i64, ptr addrspace(4) %"39", align 8
|
||||||
|
store i64 %"46", ptr addrspace(5) %"41", align 8
|
||||||
|
%"47" = load i64, ptr addrspace(4) %"40", align 8
|
||||||
|
store i64 %"47", ptr addrspace(5) %"42", align 8
|
||||||
|
%"49" = load i64, ptr addrspace(5) %"41", align 8
|
||||||
|
%"60" = inttoptr i64 %"49" to ptr
|
||||||
|
%"48" = load i32, ptr %"60", align 4
|
||||||
|
store i32 %"48", ptr addrspace(5) %"43", align 4
|
||||||
|
%"50" = load i64, ptr addrspace(5) %"41", align 8
|
||||||
|
%"61" = inttoptr i64 %"50" to ptr
|
||||||
|
%"35" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||||
|
%"51" = load i32, ptr %"35", align 4
|
||||||
|
store i32 %"51", ptr addrspace(5) %"44", align 4
|
||||||
|
%"52" = load i64, ptr addrspace(5) %"41", align 8
|
||||||
|
%"62" = inttoptr i64 %"52" to ptr
|
||||||
|
%"37" = getelementptr inbounds i8, ptr %"62", i64 8
|
||||||
|
%"53" = load i32, ptr %"37", align 4
|
||||||
|
store i32 %"53", ptr addrspace(5) %"45", align 4
|
||||||
|
%"55" = load i32, ptr addrspace(5) %"43", align 4
|
||||||
|
%"56" = load i32, ptr addrspace(5) %"44", align 4
|
||||||
|
%"57" = load i32, ptr addrspace(5) %"45", align 4
|
||||||
|
%"63" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57")
|
||||||
|
store i32 %"63", ptr addrspace(5) %"44", align 4
|
||||||
|
%"58" = load i64, ptr addrspace(5) %"42", align 8
|
||||||
|
%"59" = load i32, ptr addrspace(5) %"44", align 4
|
||||||
|
%"67" = inttoptr i64 %"58" to ptr
|
||||||
|
store i32 %"59", ptr %"67", align 4
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||||
|
attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
|
@ -275,6 +275,11 @@ test_ptx!(const_ident, [0u16], [0u64, 0u64]);
|
||||||
test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
|
test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]);
|
||||||
test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
|
test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]);
|
||||||
test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]);
|
test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]);
|
||||||
|
test_ptx!(
|
||||||
|
prmt_slow,
|
||||||
|
[0x70c507d6u32, 0x6fbd4b5cu32, 30212],
|
||||||
|
[0x6fbdd65cu32]
|
||||||
|
);
|
||||||
test_ptx!(activemask, [0u32], [1u32]);
|
test_ptx!(activemask, [0u32], [1u32]);
|
||||||
test_ptx!(membar, [152731u32], [152731u32]);
|
test_ptx!(membar, [152731u32], [152731u32]);
|
||||||
test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]);
|
test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]);
|
||||||
|
|
25
ptx/src/test/spirv_run/prmt_slow.ptx
Normal file
25
ptx/src/test/spirv_run/prmt_slow.ptx
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
.version 6.5
|
||||||
|
.target sm_30
|
||||||
|
.address_size 64
|
||||||
|
|
||||||
|
.visible .entry prmt_slow(
|
||||||
|
.param .u64 input,
|
||||||
|
.param .u64 output
|
||||||
|
)
|
||||||
|
{
|
||||||
|
.reg .u64 in_addr;
|
||||||
|
.reg .u64 out_addr;
|
||||||
|
.reg .u32 temp1;
|
||||||
|
.reg .u32 temp2;
|
||||||
|
.reg .u32 temp3;
|
||||||
|
|
||||||
|
ld.param.u64 in_addr, [input];
|
||||||
|
ld.param.u64 out_addr, [output];
|
||||||
|
|
||||||
|
ld.u32 temp1, [in_addr];
|
||||||
|
ld.u32 temp2, [in_addr+4];
|
||||||
|
ld.u32 temp3, [in_addr+8];
|
||||||
|
prmt.b32 temp2, temp1, temp2, temp3;
|
||||||
|
st.u32 [out_addr], temp2;
|
||||||
|
ret;
|
||||||
|
}
|
|
@ -432,15 +432,6 @@ ptx_parser_macros::generate_instruction_type!(
|
||||||
},
|
},
|
||||||
Prmt {
|
Prmt {
|
||||||
type: Type::Scalar(ScalarType::B32),
|
type: Type::Scalar(ScalarType::B32),
|
||||||
data: u16,
|
|
||||||
arguments<T>: {
|
|
||||||
dst: T,
|
|
||||||
src1: T,
|
|
||||||
src2: T
|
|
||||||
}
|
|
||||||
},
|
|
||||||
PrmtSlow {
|
|
||||||
type: Type::Scalar(ScalarType::U32),
|
|
||||||
arguments<T>: {
|
arguments<T>: {
|
||||||
dst: T,
|
dst: T,
|
||||||
src1: T,
|
src1: T,
|
||||||
|
|
|
@ -3671,20 +3671,12 @@ derive_parser!(
|
||||||
// prmt.b32{.mode} d, a, b, c;
|
// prmt.b32{.mode} d, a, b, c;
|
||||||
// .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };
|
// .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 };
|
||||||
prmt.b32 d, a, b, c => {
|
prmt.b32 d, a, b, c => {
|
||||||
match c {
|
ast::Instruction::Prmt {
|
||||||
ast::ParsedOperand::Imm(ImmediateValue::S64(control)) => ast::Instruction::Prmt {
|
|
||||||
data: control as u16,
|
|
||||||
arguments: PrmtArgs {
|
arguments: PrmtArgs {
|
||||||
dst: d, src1: a, src2: b
|
|
||||||
}
|
|
||||||
},
|
|
||||||
_ => ast::Instruction::PrmtSlow {
|
|
||||||
arguments: PrmtSlowArgs {
|
|
||||||
dst: d, src1: a, src2: b, src3: c
|
dst: d, src1: a, src2: b, src3: c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
|
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
|
||||||
activemask.b32 d => {
|
activemask.b32 d => {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue