mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-09-04 08:36:23 +00:00
Convert OpenCL host code to SVM
This commit is contained in:
parent
638786b0ec
commit
becda31524
3 changed files with 77 additions and 189 deletions
|
@ -4,6 +4,7 @@ use cuda::{CUdevice_attribute, CUuuid_st};
|
||||||
use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType};
|
use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType};
|
||||||
use std::{
|
use std::{
|
||||||
cmp,
|
cmp,
|
||||||
|
collections::HashSet,
|
||||||
ffi::c_void,
|
ffi::c_void,
|
||||||
mem,
|
mem,
|
||||||
os::raw::{c_char, c_int, c_uint},
|
os::raw::{c_char, c_int, c_uint},
|
||||||
|
@ -24,175 +25,14 @@ pub struct Device {
|
||||||
pub ocl_base: ocl_core::DeviceId,
|
pub ocl_base: ocl_core::DeviceId,
|
||||||
pub default_queue: ocl_core::CommandQueue,
|
pub default_queue: ocl_core::CommandQueue,
|
||||||
pub ocl_context: ocl_core::Context,
|
pub ocl_context: ocl_core::Context,
|
||||||
pub(crate) ocl_ext: OpenCLExtensions,
|
|
||||||
pub primary_context: context::Context,
|
pub primary_context: context::Context,
|
||||||
|
pub allocations: HashSet<*mut c_void>,
|
||||||
properties: Option<Box<l0::sys::ze_device_properties_t>>,
|
properties: Option<Box<l0::sys::ze_device_properties_t>>,
|
||||||
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
|
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
|
||||||
memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
|
memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
|
||||||
compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
|
compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
type cl_mem_properties_intel = ocl_core::ffi::cl_bitfield;
|
|
||||||
|
|
||||||
pub(crate) struct OpenCLExtensions {
|
|
||||||
pub clDeviceMemAllocINTEL: unsafe extern "system" fn(
|
|
||||||
ocl_core::ffi::cl_context,
|
|
||||||
ocl_core::ffi::cl_device_id,
|
|
||||||
*const cl_mem_properties_intel,
|
|
||||||
usize,
|
|
||||||
ocl_core::ffi::cl_uint,
|
|
||||||
*mut ocl_core::ffi::cl_int,
|
|
||||||
) -> *mut c_void,
|
|
||||||
pub clEnqueueMemcpyINTEL: unsafe extern "system" fn(
|
|
||||||
ocl_core::ffi::cl_command_queue,
|
|
||||||
ocl_core::ffi::cl_bool,
|
|
||||||
*mut c_void,
|
|
||||||
*const c_void,
|
|
||||||
usize,
|
|
||||||
ocl_core::ffi::cl_uint,
|
|
||||||
*const ocl_core::ffi::cl_event,
|
|
||||||
*mut ocl_core::ffi::cl_event,
|
|
||||||
) -> ocl_core::ffi::cl_int,
|
|
||||||
pub clMemBlockingFreeINTEL:
|
|
||||||
unsafe extern "system" fn(ocl_core::ffi::cl_context, *mut c_void) -> ocl_core::ffi::cl_int,
|
|
||||||
pub clEnqueueMemFillINTEL: unsafe extern "system" fn(
|
|
||||||
ocl_core::ffi::cl_command_queue,
|
|
||||||
*mut c_void,
|
|
||||||
*const c_void,
|
|
||||||
usize,
|
|
||||||
usize,
|
|
||||||
ocl_core::ffi::cl_uint,
|
|
||||||
*const ocl_core::ffi::cl_event,
|
|
||||||
*mut ocl_core::ffi::cl_event,
|
|
||||||
) -> ocl_core::ffi::cl_int,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl OpenCLExtensions {
|
|
||||||
fn new(plat: &ocl_core::PlatformId) -> Result<Self, CUresult> {
|
|
||||||
let clDeviceMemAllocINTEL = unsafe {
|
|
||||||
ocl_core::get_extension_function_address_for_platform(
|
|
||||||
plat,
|
|
||||||
"clDeviceMemAllocINTEL",
|
|
||||||
None,
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
let clEnqueueMemcpyINTEL = unsafe {
|
|
||||||
ocl_core::get_extension_function_address_for_platform(
|
|
||||||
plat,
|
|
||||||
"clEnqueueMemcpyINTEL",
|
|
||||||
None,
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
let clMemBlockingFreeINTEL = unsafe {
|
|
||||||
ocl_core::get_extension_function_address_for_platform(
|
|
||||||
plat,
|
|
||||||
"clMemBlockingFreeINTEL",
|
|
||||||
None,
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
let clEnqueueMemFillINTEL = unsafe {
|
|
||||||
ocl_core::get_extension_function_address_for_platform(
|
|
||||||
plat,
|
|
||||||
"clEnqueueMemFillINTEL",
|
|
||||||
None,
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
Ok(Self {
|
|
||||||
clDeviceMemAllocINTEL: unsafe { mem::transmute(clDeviceMemAllocINTEL) },
|
|
||||||
clEnqueueMemcpyINTEL: unsafe { mem::transmute(clEnqueueMemcpyINTEL) },
|
|
||||||
clMemBlockingFreeINTEL: unsafe { mem::transmute(clMemBlockingFreeINTEL) },
|
|
||||||
clEnqueueMemFillINTEL: unsafe { mem::transmute(clEnqueueMemFillINTEL) },
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub unsafe fn device_mem_alloc(
|
|
||||||
&self,
|
|
||||||
ctx: &ocl_core::Context,
|
|
||||||
device: &ocl_core::DeviceId,
|
|
||||||
size: usize,
|
|
||||||
alignment: ocl_core::ffi::cl_uint,
|
|
||||||
) -> Result<*mut c_void, CUresult> {
|
|
||||||
let mut error = 0;
|
|
||||||
let result = (self.clDeviceMemAllocINTEL)(
|
|
||||||
ctx.as_ptr(),
|
|
||||||
device.as_ptr(),
|
|
||||||
ptr::null(),
|
|
||||||
size,
|
|
||||||
alignment,
|
|
||||||
&mut error,
|
|
||||||
);
|
|
||||||
if error == 0 {
|
|
||||||
Ok(result)
|
|
||||||
} else {
|
|
||||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub unsafe fn enqueue_memcpy(
|
|
||||||
&self,
|
|
||||||
queue: &ocl_core::CommandQueue,
|
|
||||||
blocking: bool,
|
|
||||||
dst: *mut c_void,
|
|
||||||
src: *const c_void,
|
|
||||||
size: usize,
|
|
||||||
) -> Result<(), CUresult> {
|
|
||||||
let error = (self.clEnqueueMemcpyINTEL)(
|
|
||||||
queue.as_ptr(),
|
|
||||||
if blocking { 1 } else { 0 },
|
|
||||||
dst,
|
|
||||||
src,
|
|
||||||
size,
|
|
||||||
0,
|
|
||||||
ptr::null(),
|
|
||||||
ptr::null_mut(),
|
|
||||||
);
|
|
||||||
if error == 0 {
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub unsafe fn mem_blocking_free(
|
|
||||||
&self,
|
|
||||||
ctx: &ocl_core::Context,
|
|
||||||
mem_ptr: *mut c_void,
|
|
||||||
) -> Result<(), CUresult> {
|
|
||||||
let error = (self.clMemBlockingFreeINTEL)(ctx.as_ptr(), mem_ptr);
|
|
||||||
if error == 0 {
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub unsafe fn enqueue_memfill(
|
|
||||||
&self,
|
|
||||||
queue: &ocl_core::CommandQueue,
|
|
||||||
dst: *mut c_void,
|
|
||||||
pattern: *const c_void,
|
|
||||||
patternSize: usize,
|
|
||||||
size: usize,
|
|
||||||
) -> Result<ocl_core::Event, CUresult> {
|
|
||||||
let mut signal: ocl_core::ffi::cl_event = ptr::null_mut();
|
|
||||||
let error = (self.clEnqueueMemFillINTEL)(
|
|
||||||
queue.as_ptr(),
|
|
||||||
dst,
|
|
||||||
pattern,
|
|
||||||
patternSize,
|
|
||||||
size,
|
|
||||||
0,
|
|
||||||
ptr::null(),
|
|
||||||
&mut signal,
|
|
||||||
);
|
|
||||||
if error == 0 {
|
|
||||||
Ok(ocl_core::Event::from_raw(signal))
|
|
||||||
} else {
|
|
||||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
unsafe impl Send for Device {}
|
unsafe impl Send for Device {}
|
||||||
|
|
||||||
impl Device {
|
impl Device {
|
||||||
|
@ -202,7 +42,6 @@ impl Device {
|
||||||
ocl_dev: ocl_core::DeviceId,
|
ocl_dev: ocl_core::DeviceId,
|
||||||
idx: usize,
|
idx: usize,
|
||||||
) -> Result<Self, CUresult> {
|
) -> Result<Self, CUresult> {
|
||||||
let ocl_ext = OpenCLExtensions::new(&platform)?;
|
|
||||||
let mut props = ocl_core::ContextProperties::new();
|
let mut props = ocl_core::ContextProperties::new();
|
||||||
props.set_platform(platform);
|
props.set_platform(platform);
|
||||||
let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?;
|
let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?;
|
||||||
|
@ -210,13 +49,13 @@ impl Device {
|
||||||
let primary_context =
|
let primary_context =
|
||||||
context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
|
context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
ocl_ext,
|
|
||||||
index: Index(idx as c_int),
|
index: Index(idx as c_int),
|
||||||
base: l0_dev,
|
base: l0_dev,
|
||||||
ocl_base: ocl_dev,
|
ocl_base: ocl_dev,
|
||||||
default_queue: queue,
|
default_queue: queue,
|
||||||
ocl_context: ctx,
|
ocl_context: ctx,
|
||||||
primary_context,
|
primary_context,
|
||||||
|
allocations: HashSet::new(),
|
||||||
properties: None,
|
properties: None,
|
||||||
image_properties: None,
|
image_properties: None,
|
||||||
memory_properties: None,
|
memory_properties: None,
|
||||||
|
|
|
@ -3,7 +3,7 @@ use ocl_core::DeviceId;
|
||||||
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
|
use super::{stream::Stream, CUresult, GlobalState, HasLivenessCookie, LiveCheck};
|
||||||
use crate::cuda::CUfunction_attribute;
|
use crate::cuda::CUfunction_attribute;
|
||||||
use ::std::os::raw::{c_uint, c_void};
|
use ::std::os::raw::{c_uint, c_void};
|
||||||
use std::{hint, ptr};
|
use std::{hint, mem, ptr};
|
||||||
|
|
||||||
const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
|
const CU_LAUNCH_PARAM_END: *mut c_void = 0 as *mut _;
|
||||||
const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
|
const CU_LAUNCH_PARAM_BUFFER_POINTER: *mut c_void = 1 as *mut _;
|
||||||
|
@ -101,7 +101,9 @@ pub fn launch_kernel(
|
||||||
{
|
{
|
||||||
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
|
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
|
||||||
}
|
}
|
||||||
GlobalState::lock_enqueue(hstream, |queue| {
|
GlobalState::lock_stream(hstream, |stream_data| {
|
||||||
|
let dev = unsafe { &mut *(*stream_data.context).device };
|
||||||
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
|
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
|
||||||
if kernel_params != ptr::null_mut() {
|
if kernel_params != ptr::null_mut() {
|
||||||
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||||
|
@ -162,6 +164,16 @@ pub fn launch_kernel(
|
||||||
)?
|
)?
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
let buffers = dev.allocations.iter().copied().collect::<Vec<_>>();
|
||||||
|
let err = unsafe {
|
||||||
|
ocl_core::ffi::clSetKernelExecInfo(
|
||||||
|
func.base.as_ptr(),
|
||||||
|
ocl_core::ffi::CL_KERNEL_EXEC_INFO_SVM_PTRS,
|
||||||
|
buffers.len() * mem::size_of::<*mut c_void>(),
|
||||||
|
buffers.as_ptr() as *const _,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
assert_eq!(err, 0);
|
||||||
let global_dims = [
|
let global_dims = [
|
||||||
(block_dim_x * grid_dim_x) as usize,
|
(block_dim_x * grid_dim_x) as usize,
|
||||||
(block_dim_y * grid_dim_y) as usize,
|
(block_dim_y * grid_dim_y) as usize,
|
||||||
|
@ -184,7 +196,7 @@ pub fn launch_kernel(
|
||||||
)?
|
)?
|
||||||
};
|
};
|
||||||
Ok::<_, CUresult>(())
|
Ok::<_, CUresult>(())
|
||||||
})
|
})?
|
||||||
}
|
}
|
||||||
|
|
||||||
fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
|
fn round_up_to_multiple(x: usize, multiple: usize) -> usize {
|
||||||
|
|
|
@ -5,27 +5,39 @@ use super::{
|
||||||
use std::{
|
use std::{
|
||||||
ffi::c_void,
|
ffi::c_void,
|
||||||
mem::{self, size_of},
|
mem::{self, size_of},
|
||||||
|
ptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
|
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
|
||||||
let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
|
let ptr = GlobalState::lock_stream(CU_STREAM_LEGACY, |stream_data| {
|
||||||
let dev = unsafe { &*(*stream_data.context).device };
|
let dev = unsafe { &mut *(*stream_data.context).device };
|
||||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
let ptr = unsafe {
|
let ptr = unsafe {
|
||||||
dev.ocl_ext
|
ocl_core::ffi::clSVMAlloc(
|
||||||
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
|
dev.ocl_context.as_ptr(),
|
||||||
|
ocl_core::ffi::CL_MEM_READ_WRITE,
|
||||||
|
bytesize,
|
||||||
|
0,
|
||||||
|
)
|
||||||
};
|
};
|
||||||
// CUDA does the same thing and e.g. GeekBench relies on this behavior
|
// CUDA does the same thing and e.g. GeekBench relies on this behavior
|
||||||
let event = unsafe {
|
let mut event = ptr::null_mut();
|
||||||
dev.ocl_ext.enqueue_memfill(
|
let err = unsafe {
|
||||||
queue,
|
ocl_core::ffi::clEnqueueSVMMemFill(
|
||||||
|
queue.as_ptr(),
|
||||||
ptr,
|
ptr,
|
||||||
&0u8 as *const u8 as *const c_void,
|
&0u8 as *const u8 as *const c_void,
|
||||||
1,
|
1,
|
||||||
bytesize,
|
bytesize,
|
||||||
)?
|
0,
|
||||||
|
ptr::null(),
|
||||||
|
&mut event,
|
||||||
|
)
|
||||||
};
|
};
|
||||||
ocl_core::wait_for_event(&event)?;
|
assert_eq!(err, 0);
|
||||||
|
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
|
||||||
|
assert_eq!(err, 0);
|
||||||
|
dev.allocations.insert(ptr);
|
||||||
Ok::<_, CUresult>(ptr)
|
Ok::<_, CUresult>(ptr)
|
||||||
})??;
|
})??;
|
||||||
unsafe { *dptr = ptr };
|
unsafe { *dptr = ptr };
|
||||||
|
@ -36,10 +48,22 @@ pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<
|
||||||
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
|
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
|
||||||
let dev = unsafe { &*(*stream_data.context).device };
|
let dev = unsafe { &*(*stream_data.context).device };
|
||||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
unsafe {
|
let mut event = ptr::null_mut();
|
||||||
dev.ocl_ext
|
let err = unsafe {
|
||||||
.enqueue_memcpy(queue, true, dst, src, bytesize)?
|
ocl_core::ffi::clEnqueueSVMMemcpy(
|
||||||
|
queue.as_ptr(),
|
||||||
|
1,
|
||||||
|
dst,
|
||||||
|
src,
|
||||||
|
bytesize,
|
||||||
|
0,
|
||||||
|
ptr::null(),
|
||||||
|
&mut event,
|
||||||
|
)
|
||||||
};
|
};
|
||||||
|
assert_eq!(err, 0);
|
||||||
|
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
|
||||||
|
assert_eq!(err, 0);
|
||||||
Ok(())
|
Ok(())
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
|
@ -47,7 +71,8 @@ pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<
|
||||||
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
|
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
|
||||||
GlobalState::lock_current_context(|ctx| {
|
GlobalState::lock_current_context(|ctx| {
|
||||||
let dev = unsafe { &mut *ctx.device };
|
let dev = unsafe { &mut *ctx.device };
|
||||||
unsafe { dev.ocl_ext.mem_blocking_free(&dev.ocl_context, ptr)? };
|
unsafe { ocl_core::ffi::clSVMFree(dev.ocl_context.as_ptr(), ptr) };
|
||||||
|
dev.allocations.remove(&ptr);
|
||||||
Ok(())
|
Ok(())
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
|
@ -57,16 +82,22 @@ pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(),
|
||||||
let dev = unsafe { &*(*stream_data.context).device };
|
let dev = unsafe { &*(*stream_data.context).device };
|
||||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
let pattern_size = mem::size_of_val(&ui);
|
let pattern_size = mem::size_of_val(&ui);
|
||||||
let event = unsafe {
|
let mut event = ptr::null_mut();
|
||||||
dev.ocl_ext.enqueue_memfill(
|
let err = unsafe {
|
||||||
queue,
|
ocl_core::ffi::clEnqueueSVMMemFill(
|
||||||
|
queue.as_ptr(),
|
||||||
dst,
|
dst,
|
||||||
&ui as *const _ as *const _,
|
&ui as *const _ as *const _,
|
||||||
pattern_size,
|
pattern_size,
|
||||||
pattern_size * n,
|
pattern_size * n,
|
||||||
)?
|
0,
|
||||||
|
ptr::null(),
|
||||||
|
&mut event,
|
||||||
|
)
|
||||||
};
|
};
|
||||||
ocl_core::wait_for_event(&event)?;
|
assert_eq!(err, 0);
|
||||||
|
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
|
||||||
|
assert_eq!(err, 0);
|
||||||
Ok(())
|
Ok(())
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
|
@ -76,16 +107,22 @@ pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CU
|
||||||
let dev = unsafe { &*(*stream_data.context).device };
|
let dev = unsafe { &*(*stream_data.context).device };
|
||||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
let pattern_size = mem::size_of_val(&uc);
|
let pattern_size = mem::size_of_val(&uc);
|
||||||
let event = unsafe {
|
let mut event = ptr::null_mut();
|
||||||
dev.ocl_ext.enqueue_memfill(
|
let err = unsafe {
|
||||||
queue,
|
ocl_core::ffi::clEnqueueSVMMemFill(
|
||||||
|
queue.as_ptr(),
|
||||||
dst,
|
dst,
|
||||||
&uc as *const _ as *const _,
|
&uc as *const _ as *const _,
|
||||||
pattern_size,
|
pattern_size,
|
||||||
pattern_size * n,
|
pattern_size * n,
|
||||||
)?
|
0,
|
||||||
|
ptr::null(),
|
||||||
|
&mut event,
|
||||||
|
)
|
||||||
};
|
};
|
||||||
ocl_core::wait_for_event(&event)?;
|
assert_eq!(err, 0);
|
||||||
|
let err = unsafe { ocl_core::ffi::clWaitForEvents(1, &mut event) };
|
||||||
|
assert_eq!(err, 0);
|
||||||
Ok(())
|
Ok(())
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue