diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index 48ea22b..1d13514 100644
Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ
diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp
index 420ce65..0330ba3 100644
--- a/ptx/lib/zluda_ptx_impl.cpp
+++ b/ptx/lib/zluda_ptx_impl.cpp
@@ -155,6 +155,221 @@ static __device__ float4::Native_vec_ __pack_to_float4(const T &t)
     return result;
 }
 
+typedef uint32_t uint8 __attribute__((ext_vector_type(8)));
+typedef uint32_t zluda_uint3 __attribute__((ext_vector_type(3)));
+typedef uint8 CONSTANT_SPACE *surface_ptr;
+
+template <typename To, typename From>
+static __device__ To transmute(From f)
+{
+    union
+    {
+        To t;
+        From f;
+    } u = {To{0}};
+    u.f = f;
+    return u.t;
+}
+
+enum class ImageGeometry
+{
+    _1D,
+    _2D,
+    _3D,
+    A1D,
+    A2D
+};
+
+// clang-format off
+template <ImageGeometry geo> struct Coordinates;
+template <> struct Coordinates<ImageGeometry::_1D> { using type = uint1::Native_vec_; };
+template <> struct Coordinates<ImageGeometry::_2D> { using type = uint2::Native_vec_; };
+template <> struct Coordinates<ImageGeometry::_3D> { using type = uint4::Native_vec_; };
+template <> struct Coordinates<ImageGeometry::A1D>
+{
+    using type = uint2::Native_vec_; using arg_type = uint1::Native_vec_;
+    static __device__ type pack_layer(uint32_t layer, arg_type coord)
+    {
+        return type { coord.x, layer };
+    }
+};
+template <> struct Coordinates<ImageGeometry::A2D>
+{
+    using type = zluda_uint3; using arg_type = uint2::Native_vec_;
+    static __device__ type pack_layer(uint32_t layer, arg_type coord)
+    {
+        return type { coord.x, coord.y, layer };
+    }
+};
+// clang-format on
+
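+// Packed surface store: the value is written as 1, 2 or 4 dwords (dmask 0x1, 0x3
+// or 0xf) depending on sizeof(T); the geometry selects the image dimension.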
+template <typename T, ImageGeometry geo>
+static __device__ void image_store_pck(T value, typename Coordinates<geo>::type coord, surface_ptr surface)
+{
+    if constexpr (sizeof(T) <= sizeof(uint))
+    {
+        uint value_dword = transmute<uint>(value);
+        if constexpr (geo == ImageGeometry::_1D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D unorm" : : "v"(value_dword), "v"(coord.x), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::_2D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::_3D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:3D unorm" : : "v"(value_dword), "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::A1D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::A2D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory");
+        }
+        else
+        {
+            static_assert(!std::is_same<T, T>::value, "Invalid geometry");
+        }
+    }
+    else if constexpr (sizeof(T) == sizeof(uint2))
+    {
+        uint2::Native_vec_ value_dword2 = transmute<uint2::Native_vec_>(value);
+        if constexpr (geo == ImageGeometry::_1D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D unorm" : : "v"(value_dword2), "v"(coord.x), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::_2D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::_3D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:3D unorm" : : "v"(value_dword2), "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::A1D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::A2D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory");
+        }
+        else
+        {
+            static_assert(!std::is_same<T, T>::value, "Invalid geometry");
+        }
+    }
+    else if constexpr (sizeof(T) == sizeof(uint4))
+    {
+        uint4::Native_vec_ value_dword4 = transmute<uint4::Native_vec_>(value);
+        if constexpr (geo == ImageGeometry::_1D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D unorm" : : "v"(value_dword4), "v"(coord.x), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::_2D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::_3D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:3D unorm" : : "v"(value_dword4), "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::A1D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory");
+        }
+        else if constexpr (geo == ImageGeometry::A2D)
+        {
+            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory");
+        }
+        else
+        {
+            static_assert(!std::is_same<T, T>::value, "Invalid geometry");
+        }
+    }
+    else
+    {
+        static_assert(!std::is_same<T, T>::value, "Invalid vector size");
+    }
+}
+
+template <ImageGeometry geo>
+static __device__ uint4::Native_vec_ image_load_pck_full(typename Coordinates<geo>::type coord, surface_ptr surface)
+{
+    uint4::Native_vec_ data;
+    if constexpr (geo == ImageGeometry::_1D)
+    {
+        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory");
+    }
+    else if constexpr (geo == ImageGeometry::_2D)
+    {
+        auto s = *surface;
+        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(s) : "memory");
+    }
+    else if constexpr (geo == ImageGeometry::_3D)
+    {
+        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
+    }
+    else if constexpr (geo == ImageGeometry::A1D)
+    {
+        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
+    }
+    else if constexpr (geo == ImageGeometry::A2D)
+    {
+        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
+    }
+    else
+    {
+        static_assert(!std::is_same<decltype(coord), decltype(coord)>::value, "Invalid geometry");
+    }
+    return data;
+}
+
+template <typename T, ImageGeometry geo>
+static __device__ void image_store_pck_full_with(uint4::Native_vec_ data, T value, typename Coordinates<geo>::type coord, surface_ptr surface)
+{
+    union
+    {
+        uint4::Native_vec_ full_vec;
+        T value;
+    } u = {0};
+    u.full_vec = data;
+    u.value = value;
+    image_store_pck<uint4::Native_vec_, geo>(u.full_vec, coord, surface);
+}
+
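+// Surface handles passed to the *_indirect_* functions reserve their top 3 bits
+// for log2(bytes per pixel); the helpers below split such a handle back into the
+// actual resource pointer and the x-coordinate shift.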
+constexpr auto IMAGE_RESERVED_TOP_BITS = 3;
+
+static __device__ surface_ptr get_surface_pointer(uint64_t s)
+{
+    return (surface_ptr)((s << IMAGE_RESERVED_TOP_BITS) >> IMAGE_RESERVED_TOP_BITS);
+}
+
+static __device__ uint32_t x_coordinate_shift(uint64_t s)
+{
+    return uint32_t(s >> (64 - IMAGE_RESERVED_TOP_BITS));
+}
+
+template <typename T, ImageGeometry geo>
+static __device__ void sust_b_indirect_zero(uint64_t surf_arg, typename Coordinates<geo>::type coord, T data)
+{
+    surface_ptr surface = get_surface_pointer(surf_arg);
+    uint32_t shift_x = x_coordinate_shift(surf_arg);
+    coord.x = coord.x >> shift_x;
+    if (shift_x <= __builtin_ctz(sizeof(T))) [[likely]]
+    {
+        image_store_pck<T, geo>(data, coord, surface);
+    }
+    else
+    {
+        uint4::Native_vec_ pix = image_load_pck_full<geo>(coord, surface);
+        image_store_pck_full_with<T, geo>(pix, data, coord, surface);
+    }
+}
+
 extern "C"
 {
 #define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
@@ -620,6 +835,113 @@ extern "C"
     suld_b_a2d_vec(_v4, b32, uint4);
     // suld_b_a2d_vec(_v4, b64, ulong4);
 
+    static __device__ uint64_t repack(struct textureReference GLOBAL_SPACE *ptr)
+    {
+        uint32_t channels = uint32_t(ptr->numChannels);
+        uint32_t format_width = 0;
+        hipArray_Format format = ptr->format;
+        switch (format)
+        {
+        case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8:
+        case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8:
+            format_width = 1;
+            break;
+        case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16:
+        case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16:
+        case hipArray_Format::HIP_AD_FORMAT_HALF:
+            format_width = 2;
+            break;
+        case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32:
+        case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32:
+        case hipArray_Format::HIP_AD_FORMAT_FLOAT:
+            format_width = 4;
+            break;
+        default:
+            __builtin_unreachable();
+        }
+        uint64_t shift_size = uint64_t(__builtin_ctz(format_width * channels));
+        return uint64_t(ptr->textureObject) | (shift_size << (64 - 3));
+    }
+
+#define SUST_B_ZERO(TYPE, GEOMETRY, HIP_TYPE) \
+    void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates<GEOMETRY>::type coord, HIP_TYPE::Native_vec_ data) \
+    { \
+        sust_b_indirect_zero<HIP_TYPE::Native_vec_, GEOMETRY>(surf_arg, coord, data); \
+    } \
+    void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates<GEOMETRY>::type coord, HIP_TYPE::Native_vec_ data) \
+    { \
+        FUNC_CALL(sust_b_indirect_##TYPE##_zero) \
+        (repack(ptr), coord, data); \
+    }
+
+#define SUST_B_ZERO_ARRAY(TYPE, GEOMETRY, HIP_TYPE) \
+    void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates<GEOMETRY>::arg_type coord, HIP_TYPE::Native_vec_ data) \
+    { \
+        auto coord_array = Coordinates<GEOMETRY>::pack_layer(layer, coord); \
+        sust_b_indirect_zero<HIP_TYPE::Native_vec_, GEOMETRY>(surf_arg, coord_array, data); \
+    } \
+    void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates<GEOMETRY>::arg_type coord, HIP_TYPE::Native_vec_ data) \
+    { \
+        FUNC_CALL(sust_b_indirect_##TYPE##_zero) \
+        (repack(ptr), layer, coord, data); \
+    }
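+    // One exported handler per geometry / data-width / vector-size combination; the
+    // names match the sust_b_*_zero builtins that ptx/src/translate.rs now emits calls to.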
+    SUST_B_ZERO(1d_b8, ImageGeometry::_1D, uchar1);
+    SUST_B_ZERO(1d_b16, ImageGeometry::_1D, ushort1);
+    SUST_B_ZERO(1d_b32, ImageGeometry::_1D, uint1);
+    SUST_B_ZERO(1d_b64, ImageGeometry::_1D, ulong1);
+    SUST_B_ZERO(1d_v2_b8, ImageGeometry::_1D, uchar2);
+    SUST_B_ZERO(1d_v2_b16, ImageGeometry::_1D, ushort2);
+    SUST_B_ZERO(1d_v2_b32, ImageGeometry::_1D, uint2);
+    SUST_B_ZERO(1d_v2_b64, ImageGeometry::_1D, ulong2);
+    SUST_B_ZERO(1d_v4_b8, ImageGeometry::_1D, uchar4);
+    SUST_B_ZERO(1d_v4_b16, ImageGeometry::_1D, ushort4);
+    SUST_B_ZERO(1d_v4_b32, ImageGeometry::_1D, uint4);
+    SUST_B_ZERO(2d_b8, ImageGeometry::_2D, uchar1);
+    SUST_B_ZERO(2d_b16, ImageGeometry::_2D, ushort1);
+    SUST_B_ZERO(2d_b32, ImageGeometry::_2D, uint1);
+    SUST_B_ZERO(2d_b64, ImageGeometry::_2D, ulong1);
+    SUST_B_ZERO(2d_v2_b8, ImageGeometry::_2D, uchar2);
+    SUST_B_ZERO(2d_v2_b16, ImageGeometry::_2D, ushort2);
+    SUST_B_ZERO(2d_v2_b32, ImageGeometry::_2D, uint2);
+    SUST_B_ZERO(2d_v2_b64, ImageGeometry::_2D, ulong2);
+    SUST_B_ZERO(2d_v4_b8, ImageGeometry::_2D, uchar4);
+    SUST_B_ZERO(2d_v4_b16, ImageGeometry::_2D, ushort4);
+    SUST_B_ZERO(2d_v4_b32, ImageGeometry::_2D, uint4);
+    SUST_B_ZERO(3d_b8, ImageGeometry::_3D, uchar1);
+    SUST_B_ZERO(3d_b16, ImageGeometry::_3D, ushort1);
+    SUST_B_ZERO(3d_b32, ImageGeometry::_3D, uint1);
+    SUST_B_ZERO(3d_b64, ImageGeometry::_3D, ulong1);
+    SUST_B_ZERO(3d_v2_b8, ImageGeometry::_3D, uchar2);
+    SUST_B_ZERO(3d_v2_b16, ImageGeometry::_3D, ushort2);
+    SUST_B_ZERO(3d_v2_b32, ImageGeometry::_3D, uint2);
+    SUST_B_ZERO(3d_v2_b64, ImageGeometry::_3D, ulong2);
+    SUST_B_ZERO(3d_v4_b8, ImageGeometry::_3D, uchar4);
+    SUST_B_ZERO(3d_v4_b16, ImageGeometry::_3D, ushort4);
+    SUST_B_ZERO(3d_v4_b32, ImageGeometry::_3D, uint4);
+    SUST_B_ZERO_ARRAY(a1d_b8, ImageGeometry::A1D, uchar1);
+    SUST_B_ZERO_ARRAY(a1d_b16, ImageGeometry::A1D, ushort1);
+    SUST_B_ZERO_ARRAY(a1d_b32, ImageGeometry::A1D, uint1);
+    SUST_B_ZERO_ARRAY(a1d_b64, ImageGeometry::A1D, ulong1);
+    SUST_B_ZERO_ARRAY(a1d_v2_b8, ImageGeometry::A1D, uchar2);
+    SUST_B_ZERO_ARRAY(a1d_v2_b16, ImageGeometry::A1D, ushort2);
+    SUST_B_ZERO_ARRAY(a1d_v2_b32, ImageGeometry::A1D, uint2);
+    SUST_B_ZERO_ARRAY(a1d_v2_b64, ImageGeometry::A1D, ulong2);
+    SUST_B_ZERO_ARRAY(a1d_v4_b8, ImageGeometry::A1D, uchar4);
+    SUST_B_ZERO_ARRAY(a1d_v4_b16, ImageGeometry::A1D, ushort4);
+    SUST_B_ZERO_ARRAY(a1d_v4_b32, ImageGeometry::A1D, uint4);
+    SUST_B_ZERO_ARRAY(a2d_b8, ImageGeometry::A2D, uchar1);
+    SUST_B_ZERO_ARRAY(a2d_b16, ImageGeometry::A2D, ushort1);
+    SUST_B_ZERO_ARRAY(a2d_b32, ImageGeometry::A2D, uint1);
+    SUST_B_ZERO_ARRAY(a2d_b64, ImageGeometry::A2D, ulong1);
+    SUST_B_ZERO_ARRAY(a2d_v2_b8, ImageGeometry::A2D, uchar2);
+    SUST_B_ZERO_ARRAY(a2d_v2_b16, ImageGeometry::A2D, ushort2);
+    SUST_B_ZERO_ARRAY(a2d_v2_b32, ImageGeometry::A2D, uint2);
+    SUST_B_ZERO_ARRAY(a2d_v2_b64, ImageGeometry::A2D, ulong2);
+    SUST_B_ZERO_ARRAY(a2d_v4_b8, ImageGeometry::A2D, uchar4);
+    SUST_B_ZERO_ARRAY(a2d_v4_b16, ImageGeometry::A2D, ushort4);
+    SUST_B_ZERO_ARRAY(a2d_v4_b32, ImageGeometry::A2D, uint4);
+
 #define sust_b_1d_vec(VEC, TYPE, HIP_TYPE) \
     void FUNC(sust_b_1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
     { \
diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs
index 61a74c9..4ffe3b0 100644
--- a/ptx/src/translate.rs
+++ b/ptx/src/translate.rs
@@ -2955,7 +2955,7 @@ fn replace_instructions_with_builtins_impl<'input>(
                     vector,
                     "_",
                     sust.type_.to_ptx_name(),
-                    "_trap",
+                    "_zero",
                 ]
                 .concat();
                 statements.push(instruction_to_fn_call(
diff --git a/zluda/src/impl/surface.rs b/zluda/src/impl/surface.rs
index fcf9a52..519e76e 100644
--- a/zluda/src/impl/surface.rs
+++ b/zluda/src/impl/surface.rs
@@ -7,17 +7,42 @@ use crate::hip_call_cuda;
 use super::{hipfix, FromCuda};
 
 pub(crate) unsafe fn create(
-    p_surf_object: *mut hipSurfaceObject_t,
+    result: *mut hipSurfaceObject_t,
     p_res_desc: *const CUDA_RESOURCE_DESC,
 ) -> Result<(), CUresult> {
     if p_res_desc == ptr::null() {
         return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
     }
     let desc = to_surface_desc(*p_res_desc)?;
-    hip_call_cuda!(hipCreateSurfaceObject(p_surf_object, &desc));
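+    // Create the surface object with HIP, then encode log2(bytes per pixel) into
+    // the top 3 bits of the returned handle (consumed by zluda_ptx_impl.cpp).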
+    let mut surf_obj = mem::zeroed();
+    hip_call_cuda!(hipCreateSurfaceObject(&mut surf_obj, &desc));
+    if desc.resType != hipResourceType::hipResourceTypeArray {
+        panic!()
+    }
+    let format_size = format_size((&*desc.res.array.array).Format);
+    let channels = (&*desc.res.array.array).NumChannels;
+    let pixel_size = format_size * channels as usize;
+    let shift_amount = (pixel_size.trailing_zeros() as usize) << (64 - 3);
+    surf_obj = (surf_obj as usize | shift_amount) as _;
+    dbg!(surf_obj);
+    *result = surf_obj;
     Ok(())
 }
 
+pub(crate) fn format_size(f: hipArray_Format) -> usize {
+    match f {
+        hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8
+        | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => 1,
+        hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16
+        | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16
+        | hipArray_Format::HIP_AD_FORMAT_HALF => 2,
+        hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32
+        | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32
+        | hipArray_Format::HIP_AD_FORMAT_FLOAT => 4,
+        _ => panic!(),
+    }
+}
+
 unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result<hipResourceDesc, CUresult> {
     let res_type = mem::transmute(res_desc.resType);
     let res: hipResourceDesc__bindgen_ty_1 = match res_desc.resType {
diff --git a/zluda/src/impl/surfref.rs b/zluda/src/impl/surfref.rs
index 457f9c4..c1526f9 100644
--- a/zluda/src/impl/surfref.rs
+++ b/zluda/src/impl/surfref.rs
@@ -1,7 +1,10 @@
-use crate::{hip_call_cuda, r#impl::hipfix};
+use crate::{
+    hip_call_cuda,
+    r#impl::{hipfix, surface},
+};
 use cuda_types::{CUarray, CUresult};
 use hip_runtime_sys::*;
-use std::ptr;
+use std::{mem, ptr};
 
 pub(crate) unsafe fn set_array(
     surfref: *mut textureReference,
@@ -18,6 +21,14 @@ pub(crate) unsafe fn set_array(
         array.Format,
         array.NumChannels as i32,
     ));
+    //assert_eq!(hipError_t::hipSuccess, hipHostGetDevicePointer(&mut dev_ptr, surfref.cast(), 0));
+    dbg!(surfref);
+    // TODO: clear bits on the old textureobject
     hip_call_cuda!(hipTexRefSetArray(surfref, array, HIP_TRSA_OVERRIDE_FORMAT));
+    let format_size = surface::format_size(array.Format);
+    let pixel_size = format_size * array.NumChannels as usize;
+    let shift_amount = (pixel_size.trailing_zeros() as usize) << (64 - 3);
+    let mut surfref = &mut *surfref;
+    surfref.textureObject = (surfref.textureObject as usize | shift_amount) as _;
     Ok(())
 }
diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs
index 831e467..5057b56 100644
--- a/zluda/tests/kernel_sust.rs
+++ b/zluda/tests/kernel_sust.rs
@@ -312,7 +312,9 @@ unsafe fn byte_fill<T: Copy>(vec: &mut Vec<T>, value: u8) {
 
 fn extend_bytes_with(slice: &[u8], elm: u8, desired_length: usize) -> Vec<u8> {
     let mut result = slice.to_vec();
-    result.extend(std::iter::repeat(elm).take(desired_length - slice.len()));
+    if desired_length > slice.len() {
+        result.extend(std::iter::repeat(elm).take(desired_length - slice.len()));
+    }
     result
 }
 
@@ -337,10 +339,6 @@ unsafe fn kernel_sust_impl<
     if mem::size_of::() * CHANNELS < mem::size_of::() * SUST_N {
         return;
     }
-    // TODO: reenable those tests
-    if mem::size_of::() != mem::size_of::() || CHANNELS != SUST_N {
-        return;
-    }
     let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
     let size = 4usize;
     let random_size = rand::distributions::Uniform::<u32>::new(1, size as u32);
@@ -461,4 +459,8 @@ unsafe fn kernel_sust_impl<
     assert_eq!(expected, &*observed);
     let mut unused = mem::zeroed();
     assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS);
+    assert_eq!(
+        cuda.cuDevicePrimaryCtxRelease_v2(CUdevice_v1(0)),
+        CUresult::CUDA_SUCCESS
+    );
 }