mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-20 00:14:45 +00:00
Finish converting to OpenCL
This commit is contained in:
parent
3d2024bf62
commit
9d4f26bd07
9 changed files with 320 additions and 183 deletions
|
@ -415,7 +415,7 @@ impl Module {
|
|||
}
|
||||
|
||||
pub struct KernelInfo {
|
||||
pub arguments_sizes: Vec<usize>,
|
||||
pub arguments_sizes: Vec<(usize, bool)>,
|
||||
pub uses_shared_mem: bool,
|
||||
}
|
||||
|
||||
|
@ -1024,7 +1024,12 @@ fn emit_function_header<'a>(
|
|||
let args_lens = func_decl
|
||||
.input_arguments
|
||||
.iter()
|
||||
.map(|param| param.v_type.size_of())
|
||||
.map(|param| {
|
||||
(
|
||||
param.v_type.size_of(),
|
||||
matches!(param.v_type, ast::Type::Pointer(..)),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
kernel_info.insert(
|
||||
name.to_string(),
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck};
|
||||
use super::{CUresult, GlobalState};
|
||||
use super::{transmute_lifetime_mut, CUresult, GlobalState};
|
||||
use crate::{cuda::CUcontext, cuda_impl};
|
||||
use l0::sys::ze_result_t;
|
||||
use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32};
|
||||
|
@ -98,14 +98,11 @@ pub struct ContextData {
|
|||
|
||||
impl ContextData {
|
||||
pub fn new(
|
||||
l0_ctx: &'static l0::Context,
|
||||
l0_dev: l0::Device,
|
||||
flags: c_uint,
|
||||
is_primary: bool,
|
||||
host_event: (l0::Event<'static>, u64),
|
||||
dev: *mut device::Device,
|
||||
) -> Result<Self, CUresult> {
|
||||
let default_stream = StreamData::new_unitialized(l0_ctx, l0_dev, host_event)?;
|
||||
let default_stream = StreamData::new_unitialized()?;
|
||||
Ok(ContextData {
|
||||
flags: AtomicU32::new(flags),
|
||||
device: dev,
|
||||
|
@ -121,8 +118,15 @@ impl ContextData {
|
|||
|
||||
impl Context {
|
||||
pub fn late_init(&mut self) {
|
||||
let ctx_data = self.as_option_mut().unwrap();
|
||||
ctx_data.default_stream.context = ctx_data as *mut _;
|
||||
let ctx_data: &'static mut _ = {
|
||||
let this = self.as_option_mut().unwrap();
|
||||
let result = { unsafe { transmute_lifetime_mut(this) } };
|
||||
drop(this);
|
||||
result
|
||||
};
|
||||
{ self.as_option_mut().unwrap() }
|
||||
.default_stream
|
||||
.late_init(ctx_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -137,11 +141,8 @@ pub fn create_v2(
|
|||
let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| {
|
||||
let dev_ptr = dev as *mut _;
|
||||
let mut ctx_box = Box::new(LiveCheck::new(ContextData::new(
|
||||
&dev.ocl_context,
|
||||
dev.base,
|
||||
flags,
|
||||
false,
|
||||
dev.host_event_pool.get(dev.base, &dev.ocl_context)?,
|
||||
dev_ptr as *mut _,
|
||||
)?));
|
||||
ctx_box.late_init();
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
use super::{context, transmute_lifetime, transmute_lifetime_mut, CUresult, GlobalState};
|
||||
use crate::cuda;
|
||||
use cuda::{CUdevice_attribute, CUuuid_st};
|
||||
use ocl_core::DeviceType;
|
||||
use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType};
|
||||
use std::{
|
||||
cmp, mem,
|
||||
cmp,
|
||||
ffi::c_void,
|
||||
mem,
|
||||
os::raw::{c_char, c_int, c_uint},
|
||||
ptr,
|
||||
sync::atomic::{AtomicU32, Ordering},
|
||||
|
@ -22,6 +24,7 @@ pub struct Device {
|
|||
pub ocl_base: ocl_core::DeviceId,
|
||||
pub default_queue: ocl_core::CommandQueue,
|
||||
pub ocl_context: ocl_core::Context,
|
||||
pub(crate) ocl_ext: OpenCLExtensions,
|
||||
pub primary_context: context::Context,
|
||||
properties: Option<Box<l0::sys::ze_device_properties_t>>,
|
||||
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
|
||||
|
@ -29,19 +32,185 @@ pub struct Device {
|
|||
compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
|
||||
}
|
||||
|
||||
// Local alias for the element type of the property list that is passed as the
// third argument of `clDeviceMemAllocINTEL`; defined here as `cl_bitfield`.
type cl_mem_properties_intel = ocl_core::ffi::cl_bitfield;
|
||||
|
||||
pub(crate) struct OpenCLExtensions {
|
||||
pub clDeviceMemAllocINTEL: unsafe extern "system" fn(
|
||||
ocl_core::ffi::cl_context,
|
||||
ocl_core::ffi::cl_device_id,
|
||||
*const cl_mem_properties_intel,
|
||||
usize,
|
||||
ocl_core::ffi::cl_uint,
|
||||
*mut ocl_core::ffi::cl_int,
|
||||
) -> *mut c_void,
|
||||
pub clEnqueueMemcpyINTEL: unsafe extern "system" fn(
|
||||
ocl_core::ffi::cl_command_queue,
|
||||
ocl_core::ffi::cl_bool,
|
||||
*mut c_void,
|
||||
*const c_void,
|
||||
usize,
|
||||
ocl_core::ffi::cl_uint,
|
||||
*const ocl_core::ffi::cl_event,
|
||||
*mut ocl_core::ffi::cl_event,
|
||||
) -> ocl_core::ffi::cl_int,
|
||||
pub clMemBlockingFreeINTEL:
|
||||
unsafe extern "system" fn(ocl_core::ffi::cl_context, *mut c_void) -> ocl_core::ffi::cl_int,
|
||||
pub clEnqueueMemFillINTEL: unsafe extern "system" fn(
|
||||
ocl_core::ffi::cl_command_queue,
|
||||
*mut c_void,
|
||||
*const c_void,
|
||||
usize,
|
||||
usize,
|
||||
ocl_core::ffi::cl_uint,
|
||||
*const ocl_core::ffi::cl_event,
|
||||
*mut ocl_core::ffi::cl_event,
|
||||
) -> ocl_core::ffi::cl_int,
|
||||
}
|
||||
|
||||
impl OpenCLExtensions {
|
||||
fn new(plat: &ocl_core::PlatformId) -> Result<Self, CUresult> {
|
||||
let clDeviceMemAllocINTEL = unsafe {
|
||||
ocl_core::get_extension_function_address_for_platform(
|
||||
plat,
|
||||
"clDeviceMemAllocINTEL",
|
||||
None,
|
||||
)?
|
||||
};
|
||||
let clEnqueueMemcpyINTEL = unsafe {
|
||||
ocl_core::get_extension_function_address_for_platform(
|
||||
plat,
|
||||
"clEnqueueMemcpyINTEL",
|
||||
None,
|
||||
)?
|
||||
};
|
||||
let clMemBlockingFreeINTEL = unsafe {
|
||||
ocl_core::get_extension_function_address_for_platform(
|
||||
plat,
|
||||
"clMemBlockingFreeINTEL",
|
||||
None,
|
||||
)?
|
||||
};
|
||||
let clEnqueueMemFillINTEL = unsafe {
|
||||
ocl_core::get_extension_function_address_for_platform(
|
||||
plat,
|
||||
"clEnqueueMemFillINTEL",
|
||||
None,
|
||||
)?
|
||||
};
|
||||
Ok(Self {
|
||||
clDeviceMemAllocINTEL: unsafe { mem::transmute(clDeviceMemAllocINTEL) },
|
||||
clEnqueueMemcpyINTEL: unsafe { mem::transmute(clEnqueueMemcpyINTEL) },
|
||||
clMemBlockingFreeINTEL: unsafe { mem::transmute(clMemBlockingFreeINTEL) },
|
||||
clEnqueueMemFillINTEL: unsafe { mem::transmute(clEnqueueMemFillINTEL) },
|
||||
})
|
||||
}
|
||||
|
||||
pub unsafe fn device_mem_alloc(
|
||||
&self,
|
||||
ctx: &ocl_core::Context,
|
||||
device: &ocl_core::DeviceId,
|
||||
size: usize,
|
||||
alignment: ocl_core::ffi::cl_uint,
|
||||
) -> Result<*mut c_void, CUresult> {
|
||||
let mut error = 0;
|
||||
let result = (self.clDeviceMemAllocINTEL)(
|
||||
ctx.as_ptr(),
|
||||
device.as_ptr(),
|
||||
ptr::null(),
|
||||
size,
|
||||
alignment,
|
||||
&mut error,
|
||||
);
|
||||
if error == 0 {
|
||||
Ok(result)
|
||||
} else {
|
||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||
}
|
||||
}
|
||||
|
||||
pub unsafe fn enqueue_memcpy(
|
||||
&self,
|
||||
queue: &ocl_core::CommandQueue,
|
||||
blocking: bool,
|
||||
dst: *mut c_void,
|
||||
src: *const c_void,
|
||||
size: usize,
|
||||
) -> Result<(), CUresult> {
|
||||
let error = (self.clEnqueueMemcpyINTEL)(
|
||||
queue.as_ptr(),
|
||||
if blocking { 1 } else { 0 },
|
||||
dst,
|
||||
src,
|
||||
size,
|
||||
0,
|
||||
ptr::null(),
|
||||
ptr::null_mut(),
|
||||
);
|
||||
if error == 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||
}
|
||||
}
|
||||
|
||||
pub unsafe fn mem_blocking_free(
|
||||
&self,
|
||||
ctx: &ocl_core::Context,
|
||||
mem_ptr: *mut c_void,
|
||||
) -> Result<(), CUresult> {
|
||||
let error = (self.clMemBlockingFreeINTEL)(ctx.as_ptr(), mem_ptr);
|
||||
if error == 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||
}
|
||||
}
|
||||
|
||||
pub unsafe fn enqueue_memfill(
|
||||
&self,
|
||||
queue: &ocl_core::CommandQueue,
|
||||
dst: *mut c_void,
|
||||
pattern: *const c_void,
|
||||
patternSize: usize,
|
||||
size: usize,
|
||||
) -> Result<ocl_core::Event, CUresult> {
|
||||
let mut signal: ocl_core::ffi::cl_event = ptr::null_mut();
|
||||
let error = (self.clEnqueueMemFillINTEL)(
|
||||
queue.as_ptr(),
|
||||
dst,
|
||||
pattern,
|
||||
patternSize,
|
||||
size,
|
||||
0,
|
||||
ptr::null(),
|
||||
&mut signal,
|
||||
);
|
||||
if error == 0 {
|
||||
Ok(ocl_core::Event::from_raw(signal))
|
||||
} else {
|
||||
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): manual `Send` assertion. Which fields block the automatic impl,
// and why moving a `Device` across threads is sound, is not visible here — confirm.
unsafe impl Send for Device {}
|
||||
|
||||
impl Device {
|
||||
pub fn new(
|
||||
drv: &l0::Driver,
|
||||
l0_dev: l0::Device,
|
||||
platform: ocl_core::PlatformId,
|
||||
ocl_dev: ocl_core::DeviceId,
|
||||
idx: usize,
|
||||
) -> Result<Self, CUresult> {
|
||||
let ctx = ocl_core::create_context(None, &[ocl_dev], None, None)?;
|
||||
let ocl_ext = OpenCLExtensions::new(&platform)?;
|
||||
let mut props = ocl_core::ContextProperties::new();
|
||||
props.set_platform(platform);
|
||||
let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?;
|
||||
let queue = ocl_core::create_command_queue(&ctx, ocl_dev, None)?;
|
||||
let primary_context = context::Context::new(context::ContextData::new());
|
||||
let primary_context =
|
||||
context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
|
||||
Ok(Self {
|
||||
ocl_ext,
|
||||
index: Index(idx as c_int),
|
||||
base: l0_dev,
|
||||
ocl_base: ocl_dev,
|
||||
|
@ -55,6 +224,10 @@ impl Device {
|
|||
})
|
||||
}
|
||||
|
||||
pub fn late_init(&mut self) {
|
||||
self.primary_context.as_option_mut().unwrap().device = self as *mut _;
|
||||
}
|
||||
|
||||
fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> {
|
||||
if let Some(ref prop) = self.properties {
|
||||
return Ok(prop);
|
||||
|
@ -207,7 +380,7 @@ pub fn get_attribute(
|
|||
& l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)
|
||||
== l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED
|
||||
{
|
||||
Ok(1)
|
||||
Ok::<_, CUresult>(1)
|
||||
} else {
|
||||
Ok(0)
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ impl HasLivenessCookie for FunctionData {
|
|||
|
||||
pub struct FunctionData {
|
||||
pub base: ocl_core::Kernel,
|
||||
pub arg_size: Vec<usize>,
|
||||
pub arg_size: Vec<(usize, bool)>,
|
||||
pub use_shared_mem: bool,
|
||||
pub legacy_args: LegacyArguments,
|
||||
}
|
||||
|
@ -73,14 +73,28 @@ pub fn launch_kernel(
|
|||
GlobalState::lock_enqueue(hstream, |queue| {
|
||||
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
|
||||
if kernel_params != ptr::null_mut() {
|
||||
for (i, arg_size) in func.arg_size.iter().enumerate() {
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
&func.base,
|
||||
i as u32,
|
||||
ocl_core::ArgVal::from_raw(*arg_size, *kernel_params.add(i), false),
|
||||
)?;
|
||||
};
|
||||
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||
if is_mem {
|
||||
let error = 0;
|
||||
unsafe {
|
||||
ocl_core::ffi::clSetKernelArgSVMPointer(
|
||||
func.base.as_ptr(),
|
||||
i as u32,
|
||||
*(*kernel_params.add(i) as *const _),
|
||||
)
|
||||
};
|
||||
if error != 0 {
|
||||
panic!("clSetKernelArgSVMPointer");
|
||||
}
|
||||
} else {
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
&func.base,
|
||||
i as u32,
|
||||
ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem),
|
||||
)?;
|
||||
};
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let mut offset = 0;
|
||||
|
@ -102,27 +116,27 @@ pub fn launch_kernel(
|
|||
match (buffer_size, buffer_ptr) {
|
||||
(Some(buffer_size), Some(buffer_ptr)) => {
|
||||
let sum_of_kernel_argument_sizes =
|
||||
func.arg_size.iter().fold(0, |offset, size_of_arg| {
|
||||
size_of_arg + round_up_to_multiple(offset, *size_of_arg)
|
||||
func.arg_size.iter().fold(0, |offset, &(size_of_arg, _)| {
|
||||
size_of_arg + round_up_to_multiple(offset, size_of_arg)
|
||||
});
|
||||
if buffer_size < sum_of_kernel_argument_sizes {
|
||||
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
|
||||
}
|
||||
let mut offset = 0;
|
||||
for (i, arg_size) in func.arg_size.iter().enumerate() {
|
||||
let buffer_offset = round_up_to_multiple(offset, *arg_size);
|
||||
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||
let buffer_offset = round_up_to_multiple(offset, arg_size);
|
||||
unsafe {
|
||||
ocl_core::set_kernel_arg(
|
||||
&func.base,
|
||||
i as u32,
|
||||
ocl_core::ArgVal::from_raw(
|
||||
*arg_size,
|
||||
arg_size,
|
||||
buffer_ptr.add(buffer_offset) as *const _,
|
||||
false,
|
||||
is_mem,
|
||||
),
|
||||
)?;
|
||||
};
|
||||
offset = buffer_offset + *arg_size;
|
||||
offset = buffer_offset + arg_size;
|
||||
}
|
||||
}
|
||||
_ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
|
||||
|
|
|
@ -1,60 +1,77 @@
|
|||
use super::{stream, CUresult, GlobalState};
|
||||
use std::{ffi::c_void, mem};
|
||||
use std::{
|
||||
ffi::c_void,
|
||||
mem::{self, size_of},
|
||||
};
|
||||
|
||||
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
|
||||
let ptr = GlobalState::lock_current_context(|ctx| {
|
||||
let dev = unsafe { &mut *ctx.device };
|
||||
Ok::<_, CUresult>(dev.ocl_context.mem_alloc_device(bytesize, 0, dev.base)?)
|
||||
Ok::<_, CUresult>(unsafe {
|
||||
dev.ocl_ext
|
||||
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
|
||||
})
|
||||
})??;
|
||||
unsafe { *dptr = ptr };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
|
||||
GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
|
||||
unsafe { cmd_list.append_memory_copy_raw(dst, src, bytesize, Some(signal), wait)? };
|
||||
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
|
||||
let dev = unsafe { &*(*stream_data.context).device };
|
||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||
unsafe {
|
||||
dev.ocl_ext
|
||||
.enqueue_memcpy(queue, true, dst, src, bytesize)?
|
||||
};
|
||||
Ok(())
|
||||
})
|
||||
})?
|
||||
}
|
||||
|
||||
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
|
||||
GlobalState::lock_current_context(|ctx| {
|
||||
let dev = unsafe { &mut *ctx.device };
|
||||
Ok::<_, CUresult>(dev.ocl_context.mem_free(ptr)?)
|
||||
})
|
||||
.map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?
|
||||
unsafe { dev.ocl_ext.mem_blocking_free(&dev.ocl_context, ptr)? };
|
||||
Ok(())
|
||||
})?
|
||||
}
|
||||
|
||||
pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(), CUresult> {
|
||||
GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
|
||||
unsafe {
|
||||
cmd_list.append_memory_fill_raw(
|
||||
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, move |stream_data| {
|
||||
let dev = unsafe { &*(*stream_data.context).device };
|
||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||
let pattern_size = mem::size_of_val(&ui);
|
||||
let event = unsafe {
|
||||
dev.ocl_ext.enqueue_memfill(
|
||||
queue,
|
||||
dst,
|
||||
&mut ui as *mut _ as *mut _,
|
||||
mem::size_of::<u32>(),
|
||||
mem::size_of::<u32>() * n,
|
||||
Some(signal),
|
||||
wait,
|
||||
)
|
||||
}?;
|
||||
&ui as *const _ as *const _,
|
||||
pattern_size,
|
||||
pattern_size * n,
|
||||
)?
|
||||
};
|
||||
ocl_core::wait_for_event(&event)?;
|
||||
Ok(())
|
||||
})
|
||||
})?
|
||||
}
|
||||
|
||||
pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CUresult> {
|
||||
GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
|
||||
unsafe {
|
||||
cmd_list.append_memory_fill_raw(
|
||||
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, move |stream_data| {
|
||||
let dev = unsafe { &*(*stream_data.context).device };
|
||||
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||
let pattern_size = mem::size_of_val(&uc);
|
||||
let event = unsafe {
|
||||
dev.ocl_ext.enqueue_memfill(
|
||||
queue,
|
||||
dst,
|
||||
&mut uc as *mut _ as *mut _,
|
||||
mem::size_of::<u8>(),
|
||||
mem::size_of::<u8>() * n,
|
||||
Some(signal),
|
||||
wait,
|
||||
)
|
||||
}?;
|
||||
&uc as *const _ as *const _,
|
||||
pattern_size,
|
||||
pattern_size * n,
|
||||
)?
|
||||
};
|
||||
ocl_core::wait_for_event(&event)?;
|
||||
Ok(())
|
||||
})
|
||||
})?
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
@ -290,15 +290,7 @@ impl GlobalState {
|
|||
let l0_dev = unsafe { (*(*stream_data.context).device).base };
|
||||
let l0_ctx = unsafe { &mut (*(*stream_data.context).device).ocl_context };
|
||||
let cmd_list = unsafe { transmute_lifetime(&stream_data.cmd_list) };
|
||||
// TODO: make new_marker drop-safe
|
||||
let (new_event, new_marker) = stream_data.get_event(l0_dev, l0_ctx)?;
|
||||
stream_data.try_reuse_finished_events()?;
|
||||
let prev_event = stream_data.get_last_event();
|
||||
let prev_event_array = prev_event.map(|e| [e]);
|
||||
let empty = [];
|
||||
let prev_event_slice = prev_event_array.as_ref().map_or(&empty[..], |arr| &arr[..]);
|
||||
f(cmd_list, &new_event, prev_event_slice)?;
|
||||
stream_data.push_event((new_event, new_marker));
|
||||
f(&stream_data.cmd_list.as_ref().unwrap())?;
|
||||
Ok(())
|
||||
})?
|
||||
}
|
||||
|
@ -350,15 +342,19 @@ pub fn init() -> Result<(), CUresult> {
|
|||
})
|
||||
.ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
|
||||
let drivers = l0::Driver::get()?;
|
||||
let devices = match drivers.into_iter().find(is_intel_gpu_driver) {
|
||||
let mut devices = match drivers.into_iter().find(is_intel_gpu_driver) {
|
||||
None => return Err(CUresult::CUDA_ERROR_UNKNOWN),
|
||||
Some(driver) => driver
|
||||
.devices()?
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(idx, l0_dev)| device::Device::new(&driver, l0_dev, device, idx).unwrap())
|
||||
.map(|(idx, l0_dev)| device::Device::new(l0_dev, platform, device, idx).unwrap())
|
||||
.collect::<Vec<_>>(),
|
||||
};
|
||||
for d in devices.iter_mut() {
|
||||
d.late_init();
|
||||
d.primary_context.late_init();
|
||||
}
|
||||
let global_heap = unsafe { os::heap_create() };
|
||||
if global_heap == ptr::null_mut() {
|
||||
return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
|
||||
|
|
|
@ -100,8 +100,19 @@ impl SpirvModule {
|
|||
)
|
||||
};
|
||||
let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
|
||||
match self.should_link_ptx_impl {
|
||||
let main_module = match self.should_link_ptx_impl {
|
||||
None => {
|
||||
ocl_core::build_program(
|
||||
&main_module,
|
||||
Some(&[dev]),
|
||||
&self.build_options,
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
main_module
|
||||
}
|
||||
Some(ptx_impl) => {
|
||||
let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
|
||||
ocl_core::compile_program(
|
||||
&main_module,
|
||||
Some(&[dev]),
|
||||
|
@ -112,20 +123,13 @@ impl SpirvModule {
|
|||
None,
|
||||
None,
|
||||
)?;
|
||||
}
|
||||
Some(ptx_impl) => {
|
||||
let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
|
||||
ocl_core::build_program(
|
||||
&main_module,
|
||||
Some(&[dev]),
|
||||
&self.build_options,
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
ocl_core::build_program(
|
||||
ocl_core::compile_program(
|
||||
&ptx_impl_prog,
|
||||
Some(&[dev]),
|
||||
&self.build_options,
|
||||
&[],
|
||||
&[],
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
|
@ -137,7 +141,7 @@ impl SpirvModule {
|
|||
None,
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
)?
|
||||
}
|
||||
};
|
||||
Ok(main_module)
|
||||
|
|
0
zluda/src/impl/ocl_ext.rs
Normal file
0
zluda/src/impl/ocl_ext.rs
Normal file
|
@ -34,118 +34,45 @@ impl HasLivenessCookie for StreamData {
|
|||
pub struct StreamData {
|
||||
pub context: *mut ContextData,
|
||||
// Immediate CommandList
|
||||
pub cmd_list: l0::CommandList<'static>,
|
||||
pub busy_events: VecDeque<(l0::Event<'static>, u64)>,
|
||||
// This could be a Vec, but I'd rather reuse earliest enqueued event not the one recently enqueued
|
||||
pub free_events: VecDeque<(l0::Event<'static>, u64)>,
|
||||
pub synchronization_event: (l0::Event<'static>, u64),
|
||||
pub cmd_list: Option<ocl_core::CommandQueue>,
|
||||
}
|
||||
|
||||
impl StreamData {
|
||||
pub fn new_unitialized(
|
||||
ctx: &'static l0::Context,
|
||||
device: l0::Device,
|
||||
host_event: (l0::Event<'static>, u64),
|
||||
) -> Result<Self, CUresult> {
|
||||
pub fn new_unitialized() -> Result<Self, CUresult> {
|
||||
Ok(StreamData {
|
||||
context: ptr::null_mut(),
|
||||
cmd_list: l0::CommandList::new_immediate(ctx, device)?,
|
||||
busy_events: VecDeque::new(),
|
||||
free_events: VecDeque::new(),
|
||||
synchronization_event: host_event,
|
||||
cmd_list: None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> {
|
||||
let l0_ctx = &mut unsafe { &mut *ctx.device }.ocl_context;
|
||||
let device = unsafe { &*ctx.device }.base;
|
||||
let synchronization_event = unsafe { &mut *ctx.device }
|
||||
.host_event_pool
|
||||
.get(device, l0_ctx)?;
|
||||
let ocl_ctx = &unsafe { &*ctx.device }.ocl_context;
|
||||
let device = unsafe { &*ctx.device }.ocl_base;
|
||||
Ok(StreamData {
|
||||
context: ctx as *mut _,
|
||||
cmd_list: l0::CommandList::new_immediate(l0_ctx, device)?,
|
||||
busy_events: VecDeque::new(),
|
||||
free_events: VecDeque::new(),
|
||||
synchronization_event,
|
||||
cmd_list: Some(ocl_core::create_command_queue::<
|
||||
&ocl_core::Context,
|
||||
ocl_core::DeviceId,
|
||||
>(ocl_ctx, device, None)?),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn try_reuse_finished_events(&mut self) -> l0::Result<()> {
|
||||
loop {
|
||||
match self.busy_events.get(0) {
|
||||
None => return Ok(()),
|
||||
Some((ev, _)) => {
|
||||
if ev.is_ready()? {
|
||||
let (ev, marker) = self.busy_events.pop_front().unwrap();
|
||||
ev.host_reset()?;
|
||||
self.free_events.push_back((ev, marker));
|
||||
} else {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn late_init(&mut self, ctx: &mut ContextData) {
|
||||
let ocl_ctx = &unsafe { &*ctx.device }.ocl_context;
|
||||
let device = unsafe { &*ctx.device }.ocl_base;
|
||||
self.context = ctx as *mut _;
|
||||
self.cmd_list = Some(
|
||||
ocl_core::create_command_queue::<&ocl_core::Context, ocl_core::DeviceId>(
|
||||
ocl_ctx, device, None,
|
||||
)
|
||||
.unwrap(),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn reuse_all_finished_events(&mut self) -> l0::Result<()> {
|
||||
self.free_events.reserve(self.busy_events.len());
|
||||
for (ev, marker) in self.busy_events.drain(..) {
|
||||
ev.host_reset()?;
|
||||
self.free_events.push_back((ev, marker));
|
||||
}
|
||||
pub fn synchronize(&mut self) -> Result<(), CUresult> {
|
||||
ocl_core::finish(self.cmd_list.as_ref().unwrap())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_last_event(&self) -> Option<&l0::Event<'static>> {
|
||||
self.busy_events.iter().next_back().map(|(ev, _)| ev)
|
||||
}
|
||||
|
||||
pub fn push_event(&mut self, ev: (l0::Event<'static>, u64)) {
|
||||
self.busy_events.push_back(ev);
|
||||
}
|
||||
|
||||
pub fn synchronize(&mut self) -> l0::Result<()> {
|
||||
let empty = [];
|
||||
let busy_event_arr = self.busy_events.back().map(|(ev, _)| [ev]);
|
||||
let wait_events = busy_event_arr.as_ref().map_or(&empty[..], |arr| &arr[..]);
|
||||
unsafe {
|
||||
self.cmd_list
|
||||
.append_barrier(Some(&self.synchronization_event.0), wait_events)?
|
||||
};
|
||||
self.synchronization_event
|
||||
.0
|
||||
.host_synchronize(u64::max_value())?;
|
||||
self.synchronization_event.0.host_reset()?;
|
||||
self.reuse_all_finished_events()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_event(
|
||||
&mut self,
|
||||
l0_dev: l0::Device,
|
||||
l0_ctx: &'static l0::Context,
|
||||
) -> l0::Result<(l0::Event<'static>, u64)> {
|
||||
self.free_events
|
||||
.pop_front()
|
||||
.map(|x| Ok(x))
|
||||
.unwrap_or_else(|| {
|
||||
let event_pool = unsafe { &mut (*(*self.context).device).device_event_pool };
|
||||
event_pool.get(l0_dev, l0_ctx)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StreamData {
|
||||
fn drop(&mut self) {
|
||||
if self.context == ptr::null_mut() {
|
||||
return;
|
||||
}
|
||||
for (_, marker) in self.busy_events.iter().chain(self.free_events.iter()) {
|
||||
let device_event_pool = unsafe { &mut (*(*self.context).device).device_event_pool };
|
||||
device_event_pool.mark_as_free(*marker);
|
||||
}
|
||||
unsafe { (&mut *self.context).streams.remove(&(&mut *self as *mut _)) };
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> {
|
||||
|
|
Loading…
Add table
Reference in a new issue