Remove all use of L0

Andrzej Janik 2021-08-06 02:15:57 +02:00
commit 5bfc2a56b9
7 changed files with 112 additions and 396 deletions


@@ -3829,7 +3829,7 @@ fn emit_mul_sint(
             let src1 = builder.s_convert(dst_type_id, None, arg.src1)?;
             let src2 = builder.s_convert(dst_type_id, None, arg.src2)?;
             builder.i_mul(dst_type_id, Some(arg.dst), src1, src2)?;
-            builder.decorate(arg.dst, spirv::Decoration::NoSignedWrap, []);
+            builder.decorate(arg.dst, spirv::Decoration::NoSignedWrap, iter::empty());
         }
     }
     Ok(())
@@ -3867,7 +3867,7 @@ fn emit_mul_uint(
             let src1 = builder.u_convert(dst_type_id, None, arg.src1)?;
             let src2 = builder.u_convert(dst_type_id, None, arg.src2)?;
             builder.i_mul(dst_type_id, Some(arg.dst), src1, src2)?;
-            builder.decorate(arg.dst, spirv::Decoration::NoUnsignedWrap, []);
+            builder.decorate(arg.dst, spirv::Decoration::NoUnsignedWrap, iter::empty());
         }
     }
     Ok(())


@ -9,8 +9,6 @@ name = "zluda"
[dependencies] [dependencies]
ptx = { path = "../ptx" } ptx = { path = "../ptx" }
level_zero = { path = "../level_zero" }
level_zero-sys = { path = "../level_zero-sys" }
lazy_static = "1.4" lazy_static = "1.4"
num_enum = "0.4" num_enum = "0.4"
lz4-sys = "1.9" lz4-sys = "1.9"


@@ -1,7 +1,6 @@
 use super::{device, stream::Stream, stream::StreamData, HasLivenessCookie, LiveCheck};
 use super::{transmute_lifetime_mut, CUresult, GlobalState};
 use crate::{cuda::CUcontext, cuda_impl};
-use l0::sys::ze_result_t;
 use std::{cell::RefCell, num::NonZeroU32, os::raw::c_uint, ptr, sync::atomic::AtomicU32};
 use std::{
     collections::HashSet,
@@ -193,9 +192,9 @@ pub fn pop_current_v2(pctx: *mut *mut Context) -> CUresult {
     CUresult::CUDA_SUCCESS
 }
 
-pub fn get_current(pctx: *mut *mut Context) -> l0::Result<()> {
+pub fn get_current(pctx: *mut *mut Context) -> Result<(), CUresult> {
     if pctx == ptr::null_mut() {
-        return Err(ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT);
+        return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
     }
     let ctx = CONTEXT_STACK.with(|stack| match stack.borrow().last() {
         Some(ctx) => *ctx as *mut _,


@@ -21,26 +21,22 @@ pub struct Index(pub c_int);
 pub struct Device {
     pub index: Index,
-    pub base: l0::Device,
     pub ocl_base: ocl_core::DeviceId,
     pub default_queue: ocl_core::CommandQueue,
     pub ocl_context: ocl_core::Context,
     pub primary_context: context::Context,
     pub allocations: HashSet<*mut c_void>,
-    properties: Option<Box<l0::sys::ze_device_properties_t>>,
-    image_properties: Option<Box<l0::sys::ze_device_image_properties_t>>,
-    memory_properties: Option<Vec<l0::sys::ze_device_memory_properties_t>>,
-    compute_properties: Option<Box<l0::sys::ze_device_compute_properties_t>>,
+    pub is_amd: bool,
 }
 
 unsafe impl Send for Device {}
 
 impl Device {
     pub fn new(
-        l0_dev: l0::Device,
         platform: ocl_core::PlatformId,
         ocl_dev: ocl_core::DeviceId,
         idx: usize,
+        is_amd: bool,
     ) -> Result<Self, CUresult> {
         let mut props = ocl_core::ContextProperties::new();
         props.set_platform(platform);
@@ -50,67 +46,18 @@ impl Device {
             context::Context::new(context::ContextData::new(0, true, ptr::null_mut())?);
         Ok(Self {
             index: Index(idx as c_int),
-            base: l0_dev,
             ocl_base: ocl_dev,
             default_queue: queue,
             ocl_context: ctx,
             primary_context,
             allocations: HashSet::new(),
-            properties: None,
-            image_properties: None,
-            memory_properties: None,
-            compute_properties: None,
+            is_amd,
         })
     }
 
     pub fn late_init(&mut self) {
         self.primary_context.as_option_mut().unwrap().device = self as *mut _;
     }
-
-    fn get_properties<'a>(&'a mut self) -> l0::Result<&'a l0::sys::ze_device_properties_t> {
-        if let Some(ref prop) = self.properties {
-            return Ok(prop);
-        }
-        let mut props = Default::default();
-        self.base.get_properties(&mut props)?;
-        Ok(self.properties.get_or_insert(Box::new(props)))
-    }
-
-    fn get_image_properties(&mut self) -> l0::Result<&l0::sys::ze_device_image_properties_t> {
-        if let Some(ref prop) = self.image_properties {
-            return Ok(prop);
-        }
-        let mut props = Default::default();
-        self.base.get_image_properties(&mut props)?;
-        Ok(self.image_properties.get_or_insert(Box::new(props)))
-    }
-
-    fn get_memory_properties(&mut self) -> l0::Result<&[l0::sys::ze_device_memory_properties_t]> {
-        if let Some(ref prop) = self.memory_properties {
-            return Ok(prop);
-        }
-        match self.base.get_memory_properties() {
-            Ok(prop) => Ok(self.memory_properties.get_or_insert(prop)),
-            Err(e) => Err(e),
-        }
-    }
-
-    fn get_compute_properties(&mut self) -> l0::Result<&l0::sys::ze_device_compute_properties_t> {
-        if let Some(ref prop) = self.compute_properties {
-            return Ok(prop);
-        }
-        let mut props = Default::default();
-        self.base.get_compute_properties(&mut props)?;
-        Ok(self.compute_properties.get_or_insert(Box::new(props)))
-    }
-
-    fn get_max_simd(&mut self) -> l0::Result<u32> {
-        let props = self.get_compute_properties()?;
-        Ok(*props.subGroupSizes[0..props.numSubGroupSizes as usize]
-            .iter()
-            .max()
-            .unwrap())
-    }
 }
 
 pub fn get_count(count: *mut c_int) -> Result<(), CUresult> {
@@ -136,29 +83,30 @@ pub fn get_name(name: *mut c_char, len: i32, dev_idx: Index) -> Result<(), CUres
     if name == ptr::null_mut() || len < 0 {
         return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
     }
-    let name_ptr = GlobalState::lock_device(dev_idx, |dev| {
-        let props = dev.get_properties()?;
-        Ok::<_, l0::sys::ze_result_t>(props.name.as_ptr())
+    let name_string = GlobalState::lock_device(dev_idx, |dev| {
+        let props = ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::Name)?;
+        if let ocl_core::DeviceInfoResult::Name(name) = props {
+            Ok(name)
+        } else {
+            Err(CUresult::CUDA_ERROR_UNKNOWN)
+        }
     })??;
-    let name_len = (0..256)
-        .position(|i| unsafe { *name_ptr.add(i) } == 0)
-        .unwrap_or(256);
-    let mut dst_null_pos = cmp::min((len - 1) as usize, name_len);
-    unsafe { std::ptr::copy_nonoverlapping(name_ptr, name, dst_null_pos) };
-    if name_len + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) {
+    let mut dst_null_pos = cmp::min((len - 1) as usize, name_string.len());
+    unsafe { std::ptr::copy_nonoverlapping(name_string.as_ptr() as *const _, name, dst_null_pos) };
+    if name_string.len() + PROJECT_URL_SUFFIX_LONG.len() < (len as usize) {
         unsafe {
             std::ptr::copy_nonoverlapping(
                 PROJECT_URL_SUFFIX_LONG.as_ptr(),
-                name.add(name_len) as *mut _,
+                name.add(name_string.len()) as *mut _,
                 PROJECT_URL_SUFFIX_LONG.len(),
             )
         };
         dst_null_pos += PROJECT_URL_SUFFIX_LONG.len();
-    } else if name_len + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) {
+    } else if name_string.len() + PROJECT_URL_SUFFIX_SHORT.len() < (len as usize) {
         unsafe {
             std::ptr::copy_nonoverlapping(
                 PROJECT_URL_SUFFIX_SHORT.as_ptr(),
-                name.add(name_len) as *mut _,
+                name.add(name_string.len()) as *mut _,
                 PROJECT_URL_SUFFIX_SHORT.len(),
             )
         };
@@ -172,16 +120,15 @@ pub fn total_mem_v2(bytes: *mut usize, dev_idx: Index) -> Result<(), CUresult> {
     if bytes == ptr::null_mut() {
         return Err(CUresult::CUDA_ERROR_INVALID_VALUE);
    }
-    let mem_props = GlobalState::lock_device(dev_idx, |dev| {
-        let mem_props = dev.get_memory_properties()?;
-        Ok::<_, l0::sys::ze_result_t>(mem_props)
+    let mem_size = GlobalState::lock_device(dev_idx, |dev| {
+        let props = ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::GlobalMemSize)?;
+        if let ocl_core::DeviceInfoResult::GlobalMemSize(mem_size) = props {
+            Ok(mem_size)
+        } else {
+            Err(CUresult::CUDA_ERROR_UNKNOWN)
+        }
     })??;
-    let max_mem = mem_props
-        .iter()
-        .map(|p| p.totalSize)
-        .max()
-        .ok_or(CUresult::CUDA_ERROR_ILLEGAL_STATE)?;
-    unsafe { *bytes = max_mem as usize };
+    unsafe { *bytes = mem_size as usize };
     Ok(())
 }
@@ -213,119 +160,95 @@ pub fn get_attribute(
     }
     let value = match attrib {
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_INTEGRATED => {
-            GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_properties()?;
-                if (props.flags
-                    & l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)
-                    == l0::sys::ze_device_property_flags_t::ZE_DEVICE_PROPERTY_FLAG_INTEGRATED
-                {
-                    Ok::<_, CUresult>(1)
-                } else {
-                    Ok(0)
-                }
-            })??
-        }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => {
-            GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(props.maxHardwareContexts as i32)
-            })??
+            GlobalState::lock_device(dev_idx, |dev| if dev.is_amd { 0i32 } else { 1i32 })?
         }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => 1,
         // Streaming Multiprocessor corresponds roughly to a sub-slice (thread group can't cross either)
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_properties()?;
-                Ok::<_, l0::sys::ze_result_t>((props.numSlices * props.numSubslicesPerSlice) as i32)
+                let props =
+                    ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::MaxComputeUnits)?;
+                if let ocl_core::DeviceInfoResult::MaxComputeUnits(count) = props {
+                    Ok(count as i32)
+                } else {
+                    Err(CUresult::CUDA_ERROR_UNKNOWN)
+                }
             })??
         }
         // I honestly don't know how to answer this query
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let max_simd = dev.get_max_simd()?;
-                let props = dev.get_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(
-                    (props.numEUsPerSubslice * props.numThreadsPerEU * max_simd) as i32,
-                )
-            })??
+                if !dev.is_amd {
+                    8i32 * 7 // correct for GEN9
+                } else {
+                    4i32 * 32 // probably correct for RDNA
+                }
+            })?
         }
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::min(
-                    i32::max_value() as u32,
-                    props.maxTotalGroupSize,
-                ) as i32)
-            })??
-        }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH => {
-            GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_image_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::min(
-                    props.maxImageDims1D,
-                    c_int::max_value() as u32,
-                ) as c_int)
-            })??
-        }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X => {
-            GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::min(
-                    i32::max_value() as u32,
-                    props.maxGroupCountX,
-                ) as i32)
-            })??
-        }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y => {
-            GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::min(
-                    i32::max_value() as u32,
-                    props.maxGroupCountY,
-                ) as i32)
-            })??
-        }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z => {
-            GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(cmp::min(
-                    i32::max_value() as u32,
-                    props.maxGroupCountZ,
-                ) as i32)
+                let props = ocl_core::get_device_info(
+                    dev.ocl_base,
+                    ocl_core::DeviceInfo::MaxWorkGroupSize,
+                )?;
+                if let ocl_core::DeviceInfoResult::MaxWorkGroupSize(size) = props {
+                    Ok(size as i32)
+                } else {
+                    Err(CUresult::CUDA_ERROR_UNKNOWN)
+                }
             })??
         }
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(
-                    cmp::min(i32::max_value() as u32, props.maxGroupSizeX) as i32,
-                )
-            })??
+                let props = ocl_core::get_device_info(
+                    dev.ocl_base,
+                    ocl_core::DeviceInfo::MaxWorkItemSizes,
+                )?;
+                if let ocl_core::DeviceInfoResult::MaxWorkItemSizes(sizes) = props {
+                    Ok(sizes)
+                } else {
+                    Err(CUresult::CUDA_ERROR_UNKNOWN)
+                }
+            })??[0] as i32
         }
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(
-                    cmp::min(i32::max_value() as u32, props.maxGroupSizeY) as i32,
-                )
-            })??
+                let props = ocl_core::get_device_info(
+                    dev.ocl_base,
+                    ocl_core::DeviceInfo::MaxWorkItemSizes,
+                )?;
+                if let ocl_core::DeviceInfoResult::MaxWorkItemSizes(sizes) = props {
+                    Ok(sizes)
+                } else {
+                    Err(CUresult::CUDA_ERROR_UNKNOWN)
+                }
+            })??[1] as i32
         }
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(
-                    cmp::min(i32::max_value() as u32, props.maxGroupSizeZ) as i32,
-                )
-            })??
+                let props = ocl_core::get_device_info(
+                    dev.ocl_base,
+                    ocl_core::DeviceInfo::MaxWorkItemSizes,
+                )?;
+                if let ocl_core::DeviceInfoResult::MaxWorkItemSizes(sizes) = props {
+                    Ok(sizes)
+                } else {
+                    Err(CUresult::CUDA_ERROR_UNKNOWN)
+                }
+            })??[2] as i32
         }
         CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK => {
             GlobalState::lock_device(dev_idx, |dev| {
-                let props = dev.get_compute_properties()?;
-                Ok::<_, l0::sys::ze_result_t>(props.maxSharedLocalMemory as i32)
-            })??
-        }
-        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => {
-            GlobalState::lock_device(dev_idx, |dev| Ok::<_, CUresult>(dev.get_max_simd()? as i32))??
+                let props =
+                    ocl_core::get_device_info(dev.ocl_base, ocl_core::DeviceInfo::LocalMemSize)?;
+                if let ocl_core::DeviceInfoResult::LocalMemSize(size) = props {
+                    Ok(size)
+                } else {
+                    Err(CUresult::CUDA_ERROR_UNKNOWN)
+                }
+            })?? as i32
         }
+        CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => 32,
         _ => {
             // TODO: support more attributes for CUDA runtime
             /*
@@ -341,13 +264,9 @@ pub fn get_attribute(
 }
 
 pub fn get_uuid(uuid: *mut CUuuid_st, dev_idx: Index) -> Result<(), CUresult> {
-    let ze_uuid = GlobalState::lock_device(dev_idx, |dev| {
-        let props = dev.get_properties()?;
-        Ok::<_, l0::sys::ze_result_t>(props.uuid)
-    })??;
     unsafe {
         *uuid = CUuuid_st {
-            bytes: mem::transmute(ze_uuid.id),
+            bytes: mem::zeroed(),
         }
     };
     Ok(())
@@ -379,7 +298,7 @@ pub fn primary_ctx_get_state(
             .map(|current| current == ctx_ptr)
             .unwrap_or(false);
         let flags_value = unsafe { &*flags_ptr }.load(Ordering::Relaxed);
-        Ok::<_, l0::sys::ze_result_t>((is_active, flags_value))
+        Ok::<_, CUresult>((is_active, flags_value))
     })??;
     unsafe { *active = if is_active { 1 } else { 0 } };
     unsafe { *flags = flags_value };
@@ -399,149 +318,3 @@ pub fn primary_ctx_retain(
 pub(crate) fn primary_ctx_release_v2(_dev_idx: Index) -> CUresult {
     CUresult::CUDA_SUCCESS
 }
-
-pub struct DynamicEventPool {
-    count: usize,
-    pool_flags: l0::sys::ze_event_pool_flags_t,
-    signal_flags: l0::sys::ze_event_scope_flags_t,
-    events: Vec<DynamicEventPoolEntry>,
-}
-
-impl DynamicEventPool {
-    fn new(
-        dev: l0::Device,
-        ctx: &'static l0::Context,
-        pool_flags: l0::sys::ze_event_pool_flags_t,
-        signal_flags: l0::sys::ze_event_scope_flags_t,
-    ) -> l0::Result<Self> {
-        Ok(DynamicEventPool {
-            count: 0,
-            pool_flags,
-            signal_flags,
-            events: vec![DynamicEventPoolEntry::new(dev, ctx, pool_flags)?],
-        })
-    }
-
-    pub fn get(
-        &'static mut self,
-        dev: l0::Device,
-        ctx: &'static l0::Context,
-    ) -> l0::Result<(l0::Event<'static>, u64)> {
-        self.count += 1;
-        let events = unsafe { transmute_lifetime_mut(&mut self.events) };
-        let (global_idx, (ev, local_idx)) = {
-            for (idx, entry) in self.events.iter_mut().enumerate() {
-                if let Some((ev, local_idx)) = entry.get(self.signal_flags)? {
-                    let marker = (idx << 32) as u64 | local_idx as u64;
-                    return Ok((ev, marker));
-                }
-            }
-            events.push(DynamicEventPoolEntry::new(dev, ctx, self.pool_flags)?);
-            let global_idx = (events.len() - 1) as u64;
-            (
-                global_idx,
-                events.last_mut().unwrap().get(self.signal_flags)?.unwrap(),
-            )
-        };
-        let marker = (global_idx << 32) | local_idx as u64;
-        Ok((ev, marker))
-    }
-
-    pub fn mark_as_free(&mut self, marker: u64) {
-        let global_idx = (marker >> 32) as u32;
-        self.events[global_idx as usize].mark_as_free(marker as u32);
-        self.count -= 1;
-        // TODO: clean up empty entries
-    }
-}
-
-const DYNAMIC_EVENT_POOL_ENTRY_SIZE: usize = 448;
-const DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE: usize =
-    DYNAMIC_EVENT_POOL_ENTRY_SIZE / (mem::size_of::<u64>() * 8);
-
-#[repr(C)]
-#[repr(align(64))]
-struct DynamicEventPoolEntry {
-    event_pool: l0::EventPool<'static>,
-    bit_map: [u64; DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE],
-}
-
-impl DynamicEventPoolEntry {
-    fn new(
-        dev: l0::Device,
-        ctx: &'static l0::Context,
-        flags: l0::sys::ze_event_pool_flags_t,
-    ) -> l0::Result<Self> {
-        Ok(DynamicEventPoolEntry {
-            event_pool: l0::EventPool::new(
-                ctx,
-                flags,
-                DYNAMIC_EVENT_POOL_ENTRY_SIZE as u32,
-                Some(&[dev]),
-            )?,
-            bit_map: [0; DYNAMIC_EVENT_POOL_ENTRY_BITMAP_SIZE],
-        })
-    }
-
-    fn get(
-        &'static mut self,
-        signal: l0::sys::ze_event_scope_flags_t,
-    ) -> l0::Result<Option<(l0::Event<'static>, u32)>> {
-        for (idx, value) in self.bit_map.iter_mut().enumerate() {
-            let shift = first_index_of_zero_u64(*value);
-            if shift == 64 {
-                continue;
-            }
-            *value = *value | (1u64 << shift);
-            let entry_index = (idx as u32 * 64u32) + shift;
-            let event = l0::Event::new(
-                &self.event_pool,
-                entry_index,
-                signal,
-                l0::sys::ze_event_scope_flags_t(0),
-            )?;
-            return Ok(Some((event, entry_index)));
-        }
-        Ok(None)
-    }
-
-    fn mark_as_free(&mut self, idx: u32) {
-        let value = &mut self.bit_map[idx as usize / 64];
-        let shift = idx % 64;
-        *value = *value & !(1 << shift);
-    }
-}
-
-fn first_index_of_zero_u64(x: u64) -> u32 {
-    let x = !x;
-    (x & x.wrapping_neg()).trailing_zeros()
-}
-
-#[cfg(test)]
-mod test {
-    use std::mem;
-
-    use super::DynamicEventPoolEntry;
-    use super::super::test::CudaDriverFns;
-    use super::super::CUresult;
-
-    cuda_driver_test!(primary_ctx_default_inactive);
-
-    fn primary_ctx_default_inactive<T: CudaDriverFns>() {
-        assert_eq!(T::cuInit(0), CUresult::CUDA_SUCCESS);
-        let mut flags = u32::max_value();
-        let mut active = i32::max_value();
-        assert_eq!(
-            T::cuDevicePrimaryCtxGetState(0, &mut flags, &mut active),
-            CUresult::CUDA_SUCCESS
-        );
-        assert_eq!(flags, 0);
-        assert_eq!(active, 0);
-    }
-
-    #[test]
-    pub fn dynamic_event_pool_page_is_64b() {
-        assert_eq!(mem::size_of::<DynamicEventPoolEntry>(), 64);
-        assert_eq!(mem::align_of::<DynamicEventPoolEntry>(), 64);
-    }
-}
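
Note: every device query in the rewritten file above follows the same shape — ask ocl_core::get_device_info for one ocl_core::DeviceInfo key, unpack the matching ocl_core::DeviceInfoResult variant, and map any unexpected variant to CUDA_ERROR_UNKNOWN. A minimal standalone sketch of that pattern, assuming the ocl_core 0.x API the patch itself calls (the helper name and the String error type are illustrative, not part of the commit):

use ocl_core::{DeviceId, DeviceInfo, DeviceInfoResult};

// Hypothetical helper showing the query-and-match shape used by get_name,
// total_mem_v2 and get_attribute above; ZLUDA inlines this at each call site.
fn query_max_compute_units(dev: DeviceId) -> Result<i32, String> {
    // One OpenCL query per property...
    let info = ocl_core::get_device_info(dev, DeviceInfo::MaxComputeUnits)
        .map_err(|e| e.to_string())?;
    // ...then unpack the variant that corresponds to the requested key.
    match info {
        DeviceInfoResult::MaxComputeUnits(count) => Ok(count as i32),
        _ => Err("unexpected DeviceInfoResult variant".to_string()),
    }
}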


@ -134,30 +134,6 @@ impl<T: CudaRepr> Decuda<*mut T::Impl> for *mut T {
} }
} }
impl From<l0::sys::ze_result_t> for CUresult {
fn from(result: l0::sys::ze_result_t) -> Self {
match result {
l0::sys::ze_result_t::ZE_RESULT_SUCCESS => CUresult::CUDA_SUCCESS,
l0_sys::ze_result_t::ZE_RESULT_ERROR_UNINITIALIZED => {
CUresult::CUDA_ERROR_NOT_INITIALIZED
}
l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ENUMERATION
| l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_ARGUMENT
| l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION
| l0_sys::ze_result_t::ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION => {
CUresult::CUDA_ERROR_INVALID_VALUE
}
l0_sys::ze_result_t::ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY => {
CUresult::CUDA_ERROR_OUT_OF_MEMORY
}
l0_sys::ze_result_t::ZE_RESULT_ERROR_UNSUPPORTED_FEATURE => {
CUresult::CUDA_ERROR_NOT_SUPPORTED
}
_ => CUresult::CUDA_ERROR_UNKNOWN,
}
}
}
impl<T> From<TryLockError<T>> for CUresult { impl<T> From<TryLockError<T>> for CUresult {
fn from(_: TryLockError<T>) -> Self { fn from(_: TryLockError<T>) -> Self {
CUresult::CUDA_ERROR_ILLEGAL_STATE CUresult::CUDA_ERROR_ILLEGAL_STATE
@@ -184,13 +160,6 @@ impl Encuda for CUresult {
     }
 }
 
-impl Encuda for l0::sys::ze_result_t {
-    type To = CUresult;
-    fn encuda(self: Self) -> Self::To {
-        self.into()
-    }
-}
-
 impl Encuda for () {
     type To = CUresult;
     fn encuda(self: Self) -> Self::To {
@@ -215,7 +184,6 @@ lazy_static! {
 struct GlobalState {
     devices: Vec<Device>,
     global_heap: *mut c_void,
-    platform: ocl_core::PlatformId,
 }
 
 unsafe impl Send for GlobalState {}
@@ -282,19 +250,6 @@ impl GlobalState {
         }
     }
 
-    fn lock_enqueue(
-        stream: *mut stream::Stream,
-        f: impl FnOnce(&ocl_core::CommandQueue) -> Result<(), CUresult>,
-    ) -> Result<(), CUresult> {
-        Self::lock_stream(stream, |stream_data| {
-            let l0_dev = unsafe { (*(*stream_data.context).device).base };
-            let l0_ctx = unsafe { &mut (*(*stream_data.context).device).ocl_context };
-            let cmd_list = unsafe { transmute_lifetime(&stream_data.cmd_list) };
-            f(&stream_data.cmd_list.as_ref().unwrap())?;
-            Ok(())
-        })?
-    }
-
     fn lock_function<T>(
         func: *mut function::Function,
         f: impl FnOnce(&mut function::FunctionData) -> T,
@@ -309,11 +264,6 @@ impl GlobalState {
     }
 }
 
-// TODO: implement
-fn is_intel_gpu_driver(_: &l0::Driver) -> bool {
-    true
-}
-
 pub fn init() -> Result<(), CUresult> {
     let mut global_state = GLOBAL_STATE
         .lock()
@@ -321,36 +271,29 @@ pub fn init() -> Result<(), CUresult> {
     if global_state.is_some() {
         return Ok(());
     }
-    l0::init()?;
     let platforms = ocl_core::get_platform_ids()?;
-    let (platform, device) = platforms
+    let mut devices = platforms
         .iter()
-        .find_map(|plat| {
+        .filter_map(|plat| {
             let devices =
                 ocl_core::get_device_ids(plat, Some(ocl_core::DeviceType::GPU), None).ok()?;
             for dev in devices {
                 let vendor = ocl_core::get_device_info(dev, ocl_core::DeviceInfo::VendorId).ok()?;
-                if let ocl_core::DeviceInfoResult::VendorId(0x8086) = vendor {
-                    let dev_type =
-                        ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?;
-                    if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) = dev_type {
-                        return Some((plat.clone(), dev));
-                    }
-                }
+                let is_amd = match vendor {
+                    ocl_core::DeviceInfoResult::VendorId(0x8086) => false,
+                    ocl_core::DeviceInfoResult::VendorId(0x1002) => true,
+                    _ => continue,
+                };
+                let dev_type = ocl_core::get_device_info(dev, ocl_core::DeviceInfo::Type).ok()?;
+                if let ocl_core::DeviceInfoResult::Type(ocl_core::DeviceType::GPU) = dev_type {
+                    return Some((plat.clone(), dev, is_amd));
+                }
             }
             None
         })
-        .ok_or(CUresult::CUDA_ERROR_UNKNOWN)?;
-    let drivers = l0::Driver::get()?;
-    let mut devices = match drivers.into_iter().find(is_intel_gpu_driver) {
-        None => return Err(CUresult::CUDA_ERROR_UNKNOWN),
-        Some(driver) => driver
-            .devices()?
-            .into_iter()
-            .enumerate()
-            .map(|(idx, l0_dev)| device::Device::new(l0_dev, platform, device, idx).unwrap())
-            .collect::<Vec<_>>(),
-    };
+        .enumerate()
+        .map(|(idx, (platform, device, is_amd))| device::Device::new(platform, device, idx, is_amd))
+        .collect::<Result<Vec<_>, _>>()?;
     for d in devices.iter_mut() {
         d.late_init();
         d.primary_context.late_init();
@@ -362,7 +305,6 @@ pub fn init() -> Result<(), CUresult> {
     *global_state = Some(GlobalState {
         devices,
         global_heap,
-        platform,
     });
     drop(global_state);
     Ok(())
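
Note: with Level Zero gone, init() discovers devices purely through OpenCL — walk every platform, keep GPU devices whose vendor ID is Intel (0x8086) or AMD (0x1002), and record the vendor in the new is_amd flag that Device::new now takes. Roughly, the enumeration reduces to the following sketch (the enumerate_gpus name and the error handling are illustrative, not the commit's code):

use ocl_core::{DeviceId, DeviceInfo, DeviceInfoResult, DeviceType, PlatformId};

// Illustrative sketch of the platform/device walk performed by the new init();
// it stops at (platform, device, is_amd) triples instead of building Device values.
fn enumerate_gpus() -> Vec<(PlatformId, DeviceId, bool)> {
    let mut found = Vec::new();
    let platforms = ocl_core::get_platform_ids().unwrap_or_default();
    for plat in platforms.iter() {
        let devices = match ocl_core::get_device_ids(plat, Some(DeviceType::GPU), None) {
            Ok(devices) => devices,
            Err(_) => continue,
        };
        for dev in devices {
            // The vendor ID decides whether the device is treated as Intel or AMD.
            let is_amd = match ocl_core::get_device_info(dev, DeviceInfo::VendorId) {
                Ok(DeviceInfoResult::VendorId(0x8086)) => false,
                Ok(DeviceInfoResult::VendorId(0x1002)) => true,
                _ => continue,
            };
            found.push((plat.clone(), dev, is_amd));
        }
    }
    found
}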


@@ -128,6 +128,12 @@ impl SpirvModule {
         generic_paths.chain(std::iter::once(additional_path))
     }
 
+    #[cfg(not(target_os = "linux"))]
+    fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> {
+        Ok(())
+    }
+
+    #[cfg(target_os = "linux")]
     fn compile_amd(spirv_il: &[u8], ptx_lib: Option<&'static [u8]>) -> io::Result<()> {
         let dir = tempfile::tempdir()?;
         let mut spirv = NamedTempFile::new_in(&dir)?;


@@ -1,5 +1,3 @@
-extern crate level_zero as l0;
-extern crate level_zero_sys as l0_sys;
 #[macro_use]
 extern crate lazy_static;
 #[cfg(test)]