mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-08-08 01:00:14 +00:00
Finish converting to OpenCL
This commit is contained in:
parent
3d2024bf62
commit
9d4f26bd07
9 changed files with 320 additions and 183 deletions
|
@ -415,7 +415,7 @@ impl Module {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct KernelInfo {
|
pub struct KernelInfo {
|
||||||
pub arguments_sizes: Vec<usize>,
|
pub arguments_sizes: Vec<(usize, bool)>,
|
||||||
pub uses_shared_mem: bool,
|
pub uses_shared_mem: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1024,7 +1024,12 @@ fn emit_function_header<'a>(
|
||||||
let args_lens = func_decl
|
let args_lens = func_decl
|
||||||
.input_arguments
|
.input_arguments
|
||||||
.iter()
|
.iter()
|
||||||
.map(|param| param.v_type.size_of())
|
.map(|param| {
|
||||||
|
(
|
||||||
|
param.v_type.size_of(),
|
||||||
|
matches!(param.v_type, ast::Type::Pointer(..)),
|
||||||
|
)
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
kernel_info.insert(
|
kernel_info.insert(
|
||||||
name.to_string(),
|
name.to_string(),
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck};
|
use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck};
|
||||||
use super::{CUresult, GlobalState};
|
use super::{transmute_lifetime_mut, CUresult, GlobalState};
|
||||||
use crate::{cuda::CUcontext, cuda_impl};
|
use crate::{cuda::CUcontext, cuda_impl};
|
||||||
use l0::sys::ze_result_t;
|
use l0::sys::ze_result_t;
|
||||||
use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32};
|
use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32};
|
||||||
|
@ -98,14 +98,11 @@ pub struct ContextData {
|
||||||
|
|
||||||
impl ContextData {
|
impl ContextData {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
l0_ctx: &'static l0::Context,
|
|
||||||
l0_dev: l0::Device,
|
|
||||||
flags: c_uint,
|
flags: c_uint,
|
||||||
is_primary: bool,
|
is_primary: bool,
|
||||||
host_event: (l0::Event<'static>, u64),
|
|
||||||
dev: *mut device::Device,
|
dev: *mut device::Device,
|
||||||
) -> Result<Self, CUresult> {
|
) -> Result<Self, CUresult> {
|
||||||
let default_stream = StreamData::new_unitialized(l0_ctx, l0_dev, host_event)?;
|
let default_stream = StreamData::new_unitialized()?;
|
||||||
Ok(ContextData {
|
Ok(ContextData {
|
||||||
flags: AtomicU32::new(flags),
|
flags: AtomicU32::new(flags),
|
||||||
device: dev,
|
device: dev,
|
||||||
|
@ -121,8 +118,15 @@ impl ContextData {
|
||||||
|
|
||||||
impl Context {
|
impl Context {
|
||||||
pub fn late_init(&mut self) {
|
pub fn late_init(&mut self) {
|
||||||
let ctx_data = self.as_option_mut().unwrap();
|
let ctx_data: &'static mut _ = {
|
||||||
ctx_data.default_stream.context = ctx_data as *mut _;
|
let this = self.as_option_mut().unwrap();
|
||||||
|
let result = { unsafe { transmute_lifetime_mut(this) } };
|
||||||
|
drop(this);
|
||||||
|
result
|
||||||
|
};
|
||||||
|
{ self.as_option_mut().unwrap() }
|
||||||
|
.default_stream
|
||||||
|
.late_init(ctx_data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -137,11 +141,8 @@ pub fn create_v2(
|
||||||
let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| {
|
let mut ctx_box = GlobalState::lock_device(dev_idx, |dev| {
|
||||||
let dev_ptr = dev as *mut _;
|
let dev_ptr = dev as *mut _;
|
||||||
let mut ctx_box = Box::new(LiveCheck::new(ContextData::new(
|
let mut ctx_box = Box::new(LiveCheck::new(ContextData::new(
|
||||||
&dev.ocl_context,
|
|
||||||
dev.base,
|
|
||||||
flags,
|
flags,
|
||||||
false,
|
false,
|
||||||
dev.host_event_pool.get(dev.base, &dev.ocl_context)?,
|
|
||||||
dev_ptr as *mut _,
|
dev_ptr as *mut _,
|
||||||
)?));
|
)?));
|
||||||
ctx_box.late_init();
|
ctx_box.late_init();
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
use super::{context, transmute_lifetime, transmute_lifetime_mut, CUresult, GlobalState};
|
use super::{context, transmute_lifetime, transmute_lifetime_mut, CUresult, GlobalState};
|
||||||
use crate::cuda;
|
use crate::cuda;
|
||||||
use cuda::{CUdevice_attribute, CUuuid_st};
|
use cuda::{CUdevice_attribute, CUuuid_st};
|
||||||
use ocl_core::DeviceType;
|
use ocl_core::{ClDeviceIdPtr, ContextProperties, DeviceType};
|
||||||
use std::{
|
use std::{
|
||||||
cmp, mem,
|
cmp,
|
||||||
|
ffi::c_void,
|
||||||
|
mem,
|
||||||
os::raw::{c_char, c_int, c_uint},
|
os::raw::{c_char, c_int, c_uint},
|
||||||
ptr,
|
ptr,
|
||||||
sync::atomic::{AtomicU32, Ordering},
|
sync::atomic::{AtomicU32, Ordering},
|
||||||
|
@ -22,6 +24,7 @@ pub struct Device {
|
||||||
pub ocl_base: ocl_core::DeviceId,
|
pub ocl_base: ocl_core::DeviceId,
|
||||||
pub default_queue: ocl_core::CommandQueue,
|
pub default_queue: ocl_core::CommandQueue,
|
||||||
pub ocl_context: ocl_core::Context,
|
pub ocl_context: ocl_core::Context,
|
||||||
|
pub(crate) ocl_ext: OpenCLExtensions,
|
||||||
pub primary_context: context::Context,
|
pub primary_context: context::Context,
|
||||||
properties: Option<Box<l0::sys::ze_device_properties_t>>,
|
properties: Option<Box<l0::sys::ze_device_properties_t>>,
|
||||||
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
|
image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
|
||||||
|
@ -29,19 +32,185 @@ pub struct Device {
|
||||||
compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
|
compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type cl_mem_properties_intel = ocl_core::ffi::cl_bitfield;
|
||||||
|
|
||||||
|
pub(crate) struct OpenCLExtensions {
|
||||||
|
pub clDeviceMemAllocINTEL: unsafe extern "system" fn(
|
||||||
|
ocl_core::ffi::cl_context,
|
||||||
|
ocl_core::ffi::cl_device_id,
|
||||||
|
*const cl_mem_properties_intel,
|
||||||
|
usize,
|
||||||
|
ocl_core::ffi::cl_uint,
|
||||||
|
*mut ocl_core::ffi::cl_int,
|
||||||
|
) -> *mut c_void,
|
||||||
|
pub clEnqueueMemcpyINTEL: unsafe extern "system" fn(
|
||||||
|
ocl_core::ffi::cl_command_queue,
|
||||||
|
ocl_core::ffi::cl_bool,
|
||||||
|
*mut c_void,
|
||||||
|
*const c_void,
|
||||||
|
usize,
|
||||||
|
ocl_core::ffi::cl_uint,
|
||||||
|
*const ocl_core::ffi::cl_event,
|
||||||
|
*mut ocl_core::ffi::cl_event,
|
||||||
|
) -> ocl_core::ffi::cl_int,
|
||||||
|
pub clMemBlockingFreeINTEL:
|
||||||
|
unsafe extern "system" fn(ocl_core::ffi::cl_context, *mut c_void) -> ocl_core::ffi::cl_int,
|
||||||
|
pub clEnqueueMemFillINTEL: unsafe extern "system" fn(
|
||||||
|
ocl_core::ffi::cl_command_queue,
|
||||||
|
*mut c_void,
|
||||||
|
*const c_void,
|
||||||
|
usize,
|
||||||
|
usize,
|
||||||
|
ocl_core::ffi::cl_uint,
|
||||||
|
*const ocl_core::ffi::cl_event,
|
||||||
|
*mut ocl_core::ffi::cl_event,
|
||||||
|
) -> ocl_core::ffi::cl_int,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl OpenCLExtensions {
|
||||||
|
fn new(plat: &ocl_core::PlatformId) -> Result<Self, CUresult> {
|
||||||
|
let clDeviceMemAllocINTEL = unsafe {
|
||||||
|
ocl_core::get_extension_function_address_for_platform(
|
||||||
|
plat,
|
||||||
|
"clDeviceMemAllocINTEL",
|
||||||
|
None,
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
let clEnqueueMemcpyINTEL = unsafe {
|
||||||
|
ocl_core::get_extension_function_address_for_platform(
|
||||||
|
plat,
|
||||||
|
"clEnqueueMemcpyINTEL",
|
||||||
|
None,
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
let clMemBlockingFreeINTEL = unsafe {
|
||||||
|
ocl_core::get_extension_function_address_for_platform(
|
||||||
|
plat,
|
||||||
|
"clMemBlockingFreeINTEL",
|
||||||
|
None,
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
let clEnqueueMemFillINTEL = unsafe {
|
||||||
|
ocl_core::get_extension_function_address_for_platform(
|
||||||
|
plat,
|
||||||
|
"clEnqueueMemFillINTEL",
|
||||||
|
None,
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
Ok(Self {
|
||||||
|
clDeviceMemAllocINTEL: unsafe { mem::transmute(clDeviceMemAllocINTEL) },
|
||||||
|
clEnqueueMemcpyINTEL: unsafe { mem::transmute(clEnqueueMemcpyINTEL) },
|
||||||
|
clMemBlockingFreeINTEL: unsafe { mem::transmute(clMemBlockingFreeINTEL) },
|
||||||
|
clEnqueueMemFillINTEL: unsafe { mem::transmute(clEnqueueMemFillINTEL) },
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn device_mem_alloc(
|
||||||
|
&self,
|
||||||
|
ctx: &ocl_core::Context,
|
||||||
|
device: &ocl_core::DeviceId,
|
||||||
|
size: usize,
|
||||||
|
alignment: ocl_core::ffi::cl_uint,
|
||||||
|
) -> Result<*mut c_void, CUresult> {
|
||||||
|
let mut error = 0;
|
||||||
|
let result = (self.clDeviceMemAllocINTEL)(
|
||||||
|
ctx.as_ptr(),
|
||||||
|
device.as_ptr(),
|
||||||
|
ptr::null(),
|
||||||
|
size,
|
||||||
|
alignment,
|
||||||
|
&mut error,
|
||||||
|
);
|
||||||
|
if error == 0 {
|
||||||
|
Ok(result)
|
||||||
|
} else {
|
||||||
|
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn enqueue_memcpy(
|
||||||
|
&self,
|
||||||
|
queue: &ocl_core::CommandQueue,
|
||||||
|
blocking: bool,
|
||||||
|
dst: *mut c_void,
|
||||||
|
src: *const c_void,
|
||||||
|
size: usize,
|
||||||
|
) -> Result<(), CUresult> {
|
||||||
|
let error = (self.clEnqueueMemcpyINTEL)(
|
||||||
|
queue.as_ptr(),
|
||||||
|
if blocking { 1 } else { 0 },
|
||||||
|
dst,
|
||||||
|
src,
|
||||||
|
size,
|
||||||
|
0,
|
||||||
|
ptr::null(),
|
||||||
|
ptr::null_mut(),
|
||||||
|
);
|
||||||
|
if error == 0 {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn mem_blocking_free(
|
||||||
|
&self,
|
||||||
|
ctx: &ocl_core::Context,
|
||||||
|
mem_ptr: *mut c_void,
|
||||||
|
) -> Result<(), CUresult> {
|
||||||
|
let error = (self.clMemBlockingFreeINTEL)(ctx.as_ptr(), mem_ptr);
|
||||||
|
if error == 0 {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn enqueue_memfill(
|
||||||
|
&self,
|
||||||
|
queue: &ocl_core::CommandQueue,
|
||||||
|
dst: *mut c_void,
|
||||||
|
pattern: *const c_void,
|
||||||
|
patternSize: usize,
|
||||||
|
size: usize,
|
||||||
|
) -> Result<ocl_core::Event, CUresult> {
|
||||||
|
let mut signal: ocl_core::ffi::cl_event = ptr::null_mut();
|
||||||
|
let error = (self.clEnqueueMemFillINTEL)(
|
||||||
|
queue.as_ptr(),
|
||||||
|
dst,
|
||||||
|
pattern,
|
||||||
|
patternSize,
|
||||||
|
size,
|
||||||
|
0,
|
||||||
|
ptr::null(),
|
||||||
|
&mut signal,
|
||||||
|
);
|
||||||
|
if error == 0 {
|
||||||
|
Ok(ocl_core::Event::from_raw(signal))
|
||||||
|
} else {
|
||||||
|
Err(CUresult::CUDA_ERROR_UNKNOWN)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
unsafe impl Send for Device {}
|
unsafe impl Send for Device {}
|
||||||
|
|
||||||
impl Device {
|
impl Device {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
drv: &l0::Driver,
|
|
||||||
l0_dev: l0::Device,
|
l0_dev: l0::Device,
|
||||||
|
platform: ocl_core::PlatformId,
|
||||||
ocl_dev: ocl_core::DeviceId,
|
ocl_dev: ocl_core::DeviceId,
|
||||||
idx: usize,
|
idx: usize,
|
||||||
) -> Result<Self, CUresult> {
|
) -> Result<Self, CUresult> {
|
||||||
let ctx = ocl_core::create_context(None, &[ocl_dev], None, None)?;
|
let ocl_ext = OpenCLExtensions::new(&platform)?;
|
||||||
|
let mut props = ocl_core::ContextProperties::new();
|
||||||
|
props.set_platform(platform);
|
||||||
|
let ctx = ocl_core::create_context(Some(&props), &[ocl_dev], None, None)?;
|
||||||
let queue = ocl_core::create_command_queue(&ctx, ocl_dev, None)?;
|
let queue = ocl_core::create_command_queue(&ctx, ocl_dev, None)?;
|
||||||
let primary_context = context::Context::new(context::ContextData::new());
|
let primary_context =
|
||||||
|
context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
|
ocl_ext,
|
||||||
index: Index(idx as c_int),
|
index: Index(idx as c_int),
|
||||||
base: l0_dev,
|
base: l0_dev,
|
||||||
ocl_base: ocl_dev,
|
ocl_base: ocl_dev,
|
||||||
|
@ -55,6 +224,10 @@ impl Device {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn late_init(&mut self) {
|
||||||
|
self.primary_context.as_option_mut().unwrap().device = self as *mut _;
|
||||||
|
}
|
||||||
|
|
||||||
fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> {
|
fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> {
|
||||||
if let Some(ref prop) = self.properties {
|
if let Some(ref prop) = self.properties {
|
||||||
return Ok(prop);
|
return Ok(prop);
|
||||||
|
@ -207,7 +380,7 @@ pub fn get_attribute(
|
||||||
& l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)
|
& l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)
|
||||||
== l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED
|
== l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED
|
||||||
{
|
{
|
||||||
Ok(1)
|
Ok::<_, CUresult>(1)
|
||||||
} else {
|
} else {
|
||||||
Ok(0)
|
Ok(0)
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,7 @@ impl HasLivenessCookie for FunctionData {
|
||||||
|
|
||||||
pub struct FunctionData {
|
pub struct FunctionData {
|
||||||
pub base: ocl_core::Kernel,
|
pub base: ocl_core::Kernel,
|
||||||
pub arg_size: Vec<usize>,
|
pub arg_size: Vec<(usize, bool)>,
|
||||||
pub use_shared_mem: bool,
|
pub use_shared_mem: bool,
|
||||||
pub legacy_args: LegacyArguments,
|
pub legacy_args: LegacyArguments,
|
||||||
}
|
}
|
||||||
|
@ -73,14 +73,28 @@ pub fn launch_kernel(
|
||||||
GlobalState::lock_enqueue(hstream, |queue| {
|
GlobalState::lock_enqueue(hstream, |queue| {
|
||||||
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
|
let func: &mut FunctionData = unsafe { &mut *f }.as_result_mut()?;
|
||||||
if kernel_params != ptr::null_mut() {
|
if kernel_params != ptr::null_mut() {
|
||||||
for (i, arg_size) in func.arg_size.iter().enumerate() {
|
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||||
unsafe {
|
if is_mem {
|
||||||
ocl_core::set_kernel_arg(
|
let error = 0;
|
||||||
&func.base,
|
unsafe {
|
||||||
i as u32,
|
ocl_core::ffi::clSetKernelArgSVMPointer(
|
||||||
ocl_core::ArgVal::from_raw(*arg_size, *kernel_params.add(i), false),
|
func.base.as_ptr(),
|
||||||
)?;
|
i as u32,
|
||||||
};
|
*(*kernel_params.add(i) as *const _),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
if error != 0 {
|
||||||
|
panic!("clSetKernelArgSVMPointer");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
unsafe {
|
||||||
|
ocl_core::set_kernel_arg(
|
||||||
|
&func.base,
|
||||||
|
i as u32,
|
||||||
|
ocl_core::ArgVal::from_raw(arg_size, *kernel_params.add(i), is_mem),
|
||||||
|
)?;
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
|
@ -102,27 +116,27 @@ pub fn launch_kernel(
|
||||||
match (buffer_size, buffer_ptr) {
|
match (buffer_size, buffer_ptr) {
|
||||||
(Some(buffer_size), Some(buffer_ptr)) => {
|
(Some(buffer_size), Some(buffer_ptr)) => {
|
||||||
let sum_of_kernel_argument_sizes =
|
let sum_of_kernel_argument_sizes =
|
||||||
func.arg_size.iter().fold(0, |offset, size_of_arg| {
|
func.arg_size.iter().fold(0, |offset, &(size_of_arg, _)| {
|
||||||
size_of_arg + round_up_to_multiple(offset, *size_of_arg)
|
size_of_arg + round_up_to_multiple(offset, size_of_arg)
|
||||||
});
|
});
|
||||||
if buffer_size < sum_of_kernel_argument_sizes {
|
if buffer_size < sum_of_kernel_argument_sizes {
|
||||||
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
|
return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
|
||||||
}
|
}
|
||||||
let mut offset = 0;
|
let mut offset = 0;
|
||||||
for (i, arg_size) in func.arg_size.iter().enumerate() {
|
for (i, &(arg_size, is_mem)) in func.arg_size.iter().enumerate() {
|
||||||
let buffer_offset = round_up_to_multiple(offset, *arg_size);
|
let buffer_offset = round_up_to_multiple(offset, arg_size);
|
||||||
unsafe {
|
unsafe {
|
||||||
ocl_core::set_kernel_arg(
|
ocl_core::set_kernel_arg(
|
||||||
&func.base,
|
&func.base,
|
||||||
i as u32,
|
i as u32,
|
||||||
ocl_core::ArgVal::from_raw(
|
ocl_core::ArgVal::from_raw(
|
||||||
*arg_size,
|
arg_size,
|
||||||
buffer_ptr.add(buffer_offset) as *const _,
|
buffer_ptr.add(buffer_offset) as *const _,
|
||||||
false,
|
is_mem,
|
||||||
),
|
),
|
||||||
)?;
|
)?;
|
||||||
};
|
};
|
||||||
offset = buffer_offset + *arg_size;
|
offset = buffer_offset + arg_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
|
_ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE),
|
||||||
|
|
|
@ -1,60 +1,77 @@
|
||||||
use super::{stream, CUresult, GlobalState};
|
use super::{stream, CUresult, GlobalState};
|
||||||
use std::{ffi::c_void, mem};
|
use std::{
|
||||||
|
ffi::c_void,
|
||||||
|
mem::{self, size_of},
|
||||||
|
};
|
||||||
|
|
||||||
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
|
pub fn alloc_v2(dptr: *mut *mut c_void, bytesize: usize) -> Result<(), CUresult> {
|
||||||
let ptr = GlobalState::lock_current_context(|ctx| {
|
let ptr = GlobalState::lock_current_context(|ctx| {
|
||||||
let dev = unsafe { &mut *ctx.device };
|
let dev = unsafe { &mut *ctx.device };
|
||||||
Ok::<_, CUresult>(dev.ocl_context.mem_alloc_device(bytesize, 0, dev.base)?)
|
Ok::<_, CUresult>(unsafe {
|
||||||
|
dev.ocl_ext
|
||||||
|
.device_mem_alloc(&dev.ocl_context, &dev.ocl_base, bytesize, 0)?
|
||||||
|
})
|
||||||
})??;
|
})??;
|
||||||
unsafe { *dptr = ptr };
|
unsafe { *dptr = ptr };
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
|
pub fn copy_v2(dst: *mut c_void, src: *const c_void, bytesize: usize) -> Result<(), CUresult> {
|
||||||
GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
|
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, |stream_data| {
|
||||||
unsafe { cmd_list.append_memory_copy_raw(dst, src, bytesize, Some(signal), wait)? };
|
let dev = unsafe { &*(*stream_data.context).device };
|
||||||
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
|
unsafe {
|
||||||
|
dev.ocl_ext
|
||||||
|
.enqueue_memcpy(queue, true, dst, src, bytesize)?
|
||||||
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})?
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
|
pub fn free_v2(ptr: *mut c_void) -> Result<(), CUresult> {
|
||||||
GlobalState::lock_current_context(|ctx| {
|
GlobalState::lock_current_context(|ctx| {
|
||||||
let dev = unsafe { &mut *ctx.device };
|
let dev = unsafe { &mut *ctx.device };
|
||||||
Ok::<_, CUresult>(dev.ocl_context.mem_free(ptr)?)
|
unsafe { dev.ocl_ext.mem_blocking_free(&dev.ocl_context, ptr)? };
|
||||||
})
|
Ok(())
|
||||||
.map_err(|_| CUresult::CUDA_ERROR_INVALID_VALUE)?
|
})?
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(), CUresult> {
|
pub(crate) fn set_d32_v2(dst: *mut c_void, mut ui: u32, n: usize) -> Result<(), CUresult> {
|
||||||
GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
|
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, move |stream_data| {
|
||||||
unsafe {
|
let dev = unsafe { &*(*stream_data.context).device };
|
||||||
cmd_list.append_memory_fill_raw(
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
|
let pattern_size = mem::size_of_val(&ui);
|
||||||
|
let event = unsafe {
|
||||||
|
dev.ocl_ext.enqueue_memfill(
|
||||||
|
queue,
|
||||||
dst,
|
dst,
|
||||||
&mut ui as *mut _ as *mut _,
|
&ui as *const _ as *const _,
|
||||||
mem::size_of::<u32>(),
|
pattern_size,
|
||||||
mem::size_of::<u32>() * n,
|
pattern_size * n,
|
||||||
Some(signal),
|
)?
|
||||||
wait,
|
};
|
||||||
)
|
ocl_core::wait_for_event(&event)?;
|
||||||
}?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})?
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CUresult> {
|
pub(crate) fn set_d8_v2(dst: *mut c_void, mut uc: u8, n: usize) -> Result<(), CUresult> {
|
||||||
GlobalState::lock_enqueue(stream::CU_STREAM_LEGACY, |cmd_list, signal, wait| {
|
GlobalState::lock_stream(stream::CU_STREAM_LEGACY, move |stream_data| {
|
||||||
unsafe {
|
let dev = unsafe { &*(*stream_data.context).device };
|
||||||
cmd_list.append_memory_fill_raw(
|
let queue = stream_data.cmd_list.as_ref().unwrap();
|
||||||
|
let pattern_size = mem::size_of_val(&uc);
|
||||||
|
let event = unsafe {
|
||||||
|
dev.ocl_ext.enqueue_memfill(
|
||||||
|
queue,
|
||||||
dst,
|
dst,
|
||||||
&mut uc as *mut _ as *mut _,
|
&uc as *const _ as *const _,
|
||||||
mem::size_of::<u8>(),
|
pattern_size,
|
||||||
mem::size_of::<u8>() * n,
|
pattern_size * n,
|
||||||
Some(signal),
|
)?
|
||||||
wait,
|
};
|
||||||
)
|
ocl_core::wait_for_event(&event)?;
|
||||||
}?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})
|
})?
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
|
@ -290,15 +290,7 @@ impl GlobalState {
|
||||||
let l0_dev = unsafe { (*(*stream_data.context).device).base };
|
let l0_dev = unsafe { (*(*stream_data.context).device).base };
|
||||||
let l0_ctx = unsafe { &mut (*(*stream_data.context).device).ocl_context };
|
let l0_ctx = unsafe { &mut (*(*stream_data.context).device).ocl_context };
|
||||||
let cmd_list = unsafe { transmute_lifetime(&stream_data.cmd_list) };
|
let cmd_list = unsafe { transmute_lifetime(&stream_data.cmd_list) };
|
||||||
// TODO: make new_marker drop-safe
|
f(&stream_data.cmd_list.as_ref().unwrap())?;
|
||||||
let (new_event, new_marker) = stream_data.get_event(l0_dev, l0_ctx)?;
|
|
||||||
stream_data.try_reuse_finished_events()?;
|
|
||||||
let prev_event = stream_data.get_last_event();
|
|
||||||
let prev_event_array = prev_event.map(|e| [e]);
|
|
||||||
let empty = [];
|
|
||||||
let prev_event_slice = prev_event_array.as_ref().map_or(&empty[..], |arr| &arr[..]);
|
|
||||||
f(cmd_list, &new_event, prev_event_slice)?;
|
|
||||||
stream_data.push_event((new_event, new_marker));
|
|
||||||
Ok(())
|
Ok(())
|
||||||
})?
|
})?
|
||||||
}
|
}
|
||||||
|
@ -350,15 +342,19 @@ pub fn init() -> Result<(), CUresult> {
|
||||||
})
|
})
|
||||||
.ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
|
.ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
|
||||||
let drivers = l0::Driver::get()?;
|
let drivers = l0::Driver::get()?;
|
||||||
let devices = match drivers.into_iter().find(is_intel_gpu_driver) {
|
let mut devices = match drivers.into_iter().find(is_intel_gpu_driver) {
|
||||||
None => return Err(CUresult::CUDA_ERROR_UNKNOWN),
|
None => return Err(CUresult::CUDA_ERROR_UNKNOWN),
|
||||||
Some(driver) => driver
|
Some(driver) => driver
|
||||||
.devices()?
|
.devices()?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(idx, l0_dev)| device::Device::new(&driver, l0_dev, device, idx).unwrap())
|
.map(|(idx, l0_dev)| device::Device::new(l0_dev, platform, device, idx).unwrap())
|
||||||
.collect::<Vec<_>>(),
|
.collect::<Vec<_>>(),
|
||||||
};
|
};
|
||||||
|
for d in devices.iter_mut() {
|
||||||
|
d.late_init();
|
||||||
|
d.primary_context.late_init();
|
||||||
|
}
|
||||||
let global_heap = unsafe { os::heap_create() };
|
let global_heap = unsafe { os::heap_create() };
|
||||||
if global_heap == ptr::null_mut() {
|
if global_heap == ptr::null_mut() {
|
||||||
return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
|
return Err(CUresult::CUDA_ERROR_OUT_OF_MEMORY);
|
||||||
|
|
|
@ -100,8 +100,19 @@ impl SpirvModule {
|
||||||
)
|
)
|
||||||
};
|
};
|
||||||
let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
|
let main_module = ocl_core::create_program_with_il(ctx, byte_il, None)?;
|
||||||
match self.should_link_ptx_impl {
|
let main_module = match self.should_link_ptx_impl {
|
||||||
None => {
|
None => {
|
||||||
|
ocl_core::build_program(
|
||||||
|
&main_module,
|
||||||
|
Some(&[dev]),
|
||||||
|
&self.build_options,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
)?;
|
||||||
|
main_module
|
||||||
|
}
|
||||||
|
Some(ptx_impl) => {
|
||||||
|
let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
|
||||||
ocl_core::compile_program(
|
ocl_core::compile_program(
|
||||||
&main_module,
|
&main_module,
|
||||||
Some(&[dev]),
|
Some(&[dev]),
|
||||||
|
@ -112,20 +123,13 @@ impl SpirvModule {
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
)?;
|
)?;
|
||||||
}
|
ocl_core::compile_program(
|
||||||
Some(ptx_impl) => {
|
|
||||||
let ptx_impl_prog = ocl_core::create_program_with_il(ctx, ptx_impl, None)?;
|
|
||||||
ocl_core::build_program(
|
|
||||||
&main_module,
|
|
||||||
Some(&[dev]),
|
|
||||||
&self.build_options,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
)?;
|
|
||||||
ocl_core::build_program(
|
|
||||||
&ptx_impl_prog,
|
&ptx_impl_prog,
|
||||||
Some(&[dev]),
|
Some(&[dev]),
|
||||||
&self.build_options,
|
&self.build_options,
|
||||||
|
&[],
|
||||||
|
&[],
|
||||||
|
None,
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
)?;
|
)?;
|
||||||
|
@ -137,7 +141,7 @@ impl SpirvModule {
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
None,
|
None,
|
||||||
)?;
|
)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
Ok(main_module)
|
Ok(main_module)
|
||||||
|
|
0
zluda/src/impl/ocl_ext.rs
Normal file
0
zluda/src/impl/ocl_ext.rs
Normal file
|
@ -34,118 +34,45 @@ impl HasLivenessCookie for StreamData {
|
||||||
pub struct StreamData {
|
pub struct StreamData {
|
||||||
pub context: *mut ContextData,
|
pub context: *mut ContextData,
|
||||||
// Immediate CommandList
|
// Immediate CommandList
|
||||||
pub cmd_list: l0::CommandList<'static>,
|
pub cmd_list: Option<ocl_core::CommandQueue>,
|
||||||
pub busy_events: VecDeque<(l0::Event<'static>, u64)>,
|
|
||||||
// This could be a Vec, but I'd rather reuse earliest enqueued event not the one recently enqueued
|
|
||||||
pub free_events: VecDeque<(l0::Event<'static>, u64)>,
|
|
||||||
pub synchronization_event: (l0::Event<'static>, u64),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StreamData {
|
impl StreamData {
|
||||||
pub fn new_unitialized(
|
pub fn new_unitialized() -> Result<Self, CUresult> {
|
||||||
ctx: &'static l0::Context,
|
|
||||||
device: l0::Device,
|
|
||||||
host_event: (l0::Event<'static>, u64),
|
|
||||||
) -> Result<Self, CUresult> {
|
|
||||||
Ok(StreamData {
|
Ok(StreamData {
|
||||||
context: ptr::null_mut(),
|
context: ptr::null_mut(),
|
||||||
cmd_list: l0::CommandList::new_immediate(ctx, device)?,
|
cmd_list: None,
|
||||||
busy_events: VecDeque::new(),
|
|
||||||
free_events: VecDeque::new(),
|
|
||||||
synchronization_event: host_event,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> {
|
pub fn new(ctx: &mut ContextData) -> Result<Self, CUresult> {
|
||||||
let l0_ctx = &mut unsafe { &mut *ctx.device }.ocl_context;
|
let ocl_ctx = &unsafe { &*ctx.device }.ocl_context;
|
||||||
let device = unsafe { &*ctx.device }.base;
|
let device = unsafe { &*ctx.device }.ocl_base;
|
||||||
let synchronization_event = unsafe { &mut *ctx.device }
|
|
||||||
.host_event_pool
|
|
||||||
.get(device, l0_ctx)?;
|
|
||||||
Ok(StreamData {
|
Ok(StreamData {
|
||||||
context: ctx as *mut _,
|
context: ctx as *mut _,
|
||||||
cmd_list: l0::CommandList::new_immediate(l0_ctx, device)?,
|
cmd_list: Some(ocl_core::create_command_queue::<
|
||||||
busy_events: VecDeque::new(),
|
&ocl_core::Context,
|
||||||
free_events: VecDeque::new(),
|
ocl_core::DeviceId,
|
||||||
synchronization_event,
|
>(ocl_ctx, device, None)?),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn try_reuse_finished_events(&mut self) -> l0::Result<()> {
|
pub fn late_init(&mut self, ctx: &mut ContextData) {
|
||||||
loop {
|
let ocl_ctx = &unsafe { &*ctx.device }.ocl_context;
|
||||||
match self.busy_events.get(0) {
|
let device = unsafe { &*ctx.device }.ocl_base;
|
||||||
None => return Ok(()),
|
self.context = ctx as *mut _;
|
||||||
Some((ev, _)) => {
|
self.cmd_list = Some(
|
||||||
if ev.is_ready()? {
|
ocl_core::create_command_queue::<&ocl_core::Context, ocl_core::DeviceId>(
|
||||||
let (ev, marker) = self.busy_events.pop_front().unwrap();
|
ocl_ctx, device, None,
|
||||||
ev.host_reset()?;
|
)
|
||||||
self.free_events.push_back((ev, marker));
|
.unwrap(),
|
||||||
} else {
|
);
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn reuse_all_finished_events(&mut self) -> l0::Result<()> {
|
pub fn synchronize(&mut self) -> Result<(), CUresult> {
|
||||||
self.free_events.reserve(self.busy_events.len());
|
ocl_core::finish(self.cmd_list.as_ref().unwrap())?;
|
||||||
for (ev, marker) in self.busy_events.drain(..) {
|
|
||||||
ev.host_reset()?;
|
|
||||||
self.free_events.push_back((ev, marker));
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_last_event(&self) -> Option<&l0::Event<'static>> {
|
|
||||||
self.busy_events.iter().next_back().map(|(ev, _)| ev)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn push_event(&mut self, ev: (l0::Event<'static>, u64)) {
|
|
||||||
self.busy_events.push_back(ev);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn synchronize(&mut self) -> l0::Result<()> {
|
|
||||||
let empty = [];
|
|
||||||
let busy_event_arr = self.busy_events.back().map(|(ev, _)| [ev]);
|
|
||||||
let wait_events = busy_event_arr.as_ref().map_or(&empty[..], |arr| &arr[..]);
|
|
||||||
unsafe {
|
|
||||||
self.cmd_list
|
|
||||||
.append_barrier(Some(&self.synchronization_event.0), wait_events)?
|
|
||||||
};
|
|
||||||
self.synchronization_event
|
|
||||||
.0
|
|
||||||
.host_synchronize(u64::max_value())?;
|
|
||||||
self.synchronization_event.0.host_reset()?;
|
|
||||||
self.reuse_all_finished_events()?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_event(
|
|
||||||
&mut self,
|
|
||||||
l0_dev: l0::Device,
|
|
||||||
l0_ctx: &'static l0::Context,
|
|
||||||
) -> l0::Result<(l0::Event<'static>, u64)> {
|
|
||||||
self.free_events
|
|
||||||
.pop_front()
|
|
||||||
.map(|x| Ok(x))
|
|
||||||
.unwrap_or_else(|| {
|
|
||||||
let event_pool = unsafe { &mut (*(*self.context).device).device_event_pool };
|
|
||||||
event_pool.get(l0_dev, l0_ctx)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for StreamData {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
if self.context == ptr::null_mut() {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (_, marker) in self.busy_events.iter().chain(self.free_events.iter()) {
|
|
||||||
let device_event_pool = unsafe { &mut (*(*self.context).device).device_event_pool };
|
|
||||||
device_event_pool.mark_as_free(*marker);
|
|
||||||
}
|
|
||||||
unsafe { (&mut *self.context).streams.remove(&(&mut *self as *mut _)) };
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> {
|
pub(crate) fn get_ctx(hstream: *mut Stream, pctx: *mut *mut Context) -> Result<(), CUresult> {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue