Add basic cuModule*, add handful of missing stuff

This commit is contained in:
Andrzej Janik 2024-11-22 18:57:19 +01:00
parent 3ec7bffdc5
commit 9f677e23c0
6 changed files with 152 additions and 248 deletions

View file

@ -9,6 +9,8 @@ name = "nvcuda"
crate-type = ["cdylib"]
[dependencies]
comgr = { path = "../comgr" }
ptx_parser = { path = "../ptx_parser" }
ptx = { path = "../ptx" }
cuda_types = { path = "../cuda_types" }
cuda_base = { path = "../cuda_base" }

View file

@ -7,3 +7,7 @@ pub(crate) unsafe fn get_limit(pvalue: *mut usize, limit: hipLimit_t) -> hipErro
/// Forwards `limit` and `value` to `hipDeviceSetLimit` and returns the HIP status
/// code unchanged.
pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> hipError_t {
// SAFETY: both arguments are handed to HIP by value; no pointer crosses the FFI
// boundary in this call.
unsafe { hipDeviceSetLimit(limit, value) }
}
/// Calls `hipDeviceSynchronize` and returns the HIP status code unchanged.
pub(crate) fn synchronize() -> hipError_t {
// SAFETY: the call takes no arguments, so there is nothing for the caller to
// keep valid across the FFI boundary.
unsafe { hipDeviceSynchronize() }
}

View file

@ -300,6 +300,10 @@ pub(crate) fn get_properties(prop: &mut cuda_types::CUdevprop, dev: hipDevice_t)
Ok(())
}
/// Asks HIP for the device count via `hipGetDeviceCount`, which writes its answer
/// through `count`, and returns the HIP status code unchanged.
pub(crate) fn get_count(count: &mut ::core::ffi::c_int) -> hipError_t {
    let count_ptr: *mut ::core::ffi::c_int = count;
    // SAFETY: `count_ptr` is derived from an exclusive reference that outlives the
    // call, so it is non-null, aligned and writable for the duration of the call.
    unsafe { hipGetDeviceCount(count_ptr) }
}
/// Converts `x` to `i32`, saturating at `i32::MAX` when the value does not fit.
fn clamp_usize(x: usize) -> i32 {
    const LIMIT: usize = i32::MAX as usize;
    if x > LIMIT {
        i32::MAX
    } else {
        // Cannot truncate: `x` was just checked to be at most `i32::MAX`.
        x as i32
    }
}

View file

@ -1,8 +1,10 @@
use cuda_types::*;
use hip_runtime_sys::*;
use std::mem::{self, ManuallyDrop};
pub(super) mod context;
pub(super) mod device;
pub(super) mod module;
#[cfg(debug_assertions)]
pub(crate) fn unimplemented() -> CUresult {
@ -66,9 +68,38 @@ macro_rules! from_cuda_transmute {
};
}
// Generates the `FromCuda` conversions for handle-backed objects. For every listed
// type (which must implement `ZludaObject`) three impls are emitted, all expressed
// in terms of that type's `CudaHandle`:
//   * handle -> handle: the handle is copied through untouched,
//   * `*mut` handle -> `&mut` handle: a null pointer is rejected with
//     `CUerror::INVALID_VALUE`,
//   * handle -> `&Object`: the handle is reinterpreted with `as_ref` and the
//     liveness cookie is validated by `LiveCheck::as_result`.
macro_rules! from_cuda_object {
($($type_:ty),*) => {
$(
// Pass-through: the caller wants the raw handle itself.
impl<'a> FromCuda<'a, <$type_ as ZludaObject>::CudaHandle> for <$type_ as ZludaObject>::CudaHandle {
fn from_cuda(handle: &'a <$type_ as ZludaObject>::CudaHandle) -> Result<<$type_ as ZludaObject>::CudaHandle, CUerror> {
Ok(*handle)
}
}
// Out-parameter: turns a pointer-to-handle into a mutable reference, refusing null.
impl<'a> FromCuda<'a, *mut <$type_ as ZludaObject>::CudaHandle> for &'a mut <$type_ as ZludaObject>::CudaHandle {
fn from_cuda(handle: &'a *mut <$type_ as ZludaObject>::CudaHandle) -> Result<&'a mut <$type_ as ZludaObject>::CudaHandle, CUerror> {
match unsafe { handle.as_mut() } {
Some(x) => Ok(x),
None => Err(CUerror::INVALID_VALUE),
}
}
}
// Object access: resolves the handle to the wrapped object after the cookie check.
// NOTE(review): the handle goes to `as_ref` without a null check — confirm that a
// null handle cannot reach this impl.
impl<'a> FromCuda<'a, <$type_ as ZludaObject>::CudaHandle> for &'a $type_ {
fn from_cuda(handle: &'a <$type_ as ZludaObject>::CudaHandle) -> Result<&'a $type_, CUerror> {
Ok(as_ref(handle).as_result()?)
}
}
)*
};
}
from_cuda_nop!(
*mut i8,
*mut usize,
*const std::ffi::c_void,
*const ::core::ffi::c_char,
i32,
u32,
usize,
@ -77,8 +108,10 @@ from_cuda_nop!(
);
from_cuda_transmute!(
CUdevice => hipDevice_t,
CUuuid => hipUUID
CUuuid => hipUUID,
CUfunction => hipFunction_t
);
from_cuda_object!(module::Module);
impl<'a> FromCuda<'a, CUlimit> for hipLimit_t {
fn from_cuda(limit: &'a CUlimit) -> Result<Self, CUerror> {
@ -91,6 +124,72 @@ impl<'a> FromCuda<'a, CUlimit> for hipLimit_t {
}
}
pub(crate) trait ZludaObject: Sized + Send + Sync {
const COOKIE: usize;
const LIVENESS_FAIL: CUerror = cuda_types::CUerror::INVALID_VALUE;
type CudaHandle: Sized;
fn drop_checked(&mut self) -> CUresult;
fn wrap(self) -> Self::CudaHandle {
unsafe { mem::transmute_copy(&LiveCheck::wrap(self)) }
}
}
/// Heap header placed in front of every wrapped `ZludaObject`.
///
/// `cookie` holds `T::COOKIE` while the object is alive and is reset to zero when
/// the object is torn down, which lets later accesses through the same address be
/// rejected.
#[repr(C)]
pub(crate) struct LiveCheck<T: ZludaObject> {
    cookie: usize,
    // `ManuallyDrop` because the payload is dropped explicitly in `drop_checked`.
    data: ManuallyDrop<T>,
}

impl<T: ZludaObject> LiveCheck<T> {
    /// Boxes `data` together with its cookie and leaks the box as a raw pointer.
    fn wrap(data: T) -> *mut Self {
        let boxed = Box::new(LiveCheck {
            cookie: T::COOKIE,
            data: ManuallyDrop::new(data),
        });
        Box::into_raw(boxed)
    }

    /// Returns the payload when the cookie matches, `T::LIVENESS_FAIL` otherwise.
    fn as_result(&self) -> Result<&T, CUerror> {
        if self.cookie != T::COOKIE {
            return Err(T::LIVENESS_FAIL);
        }
        Ok(&self.data)
    }

    /// Tears the payload down if, and only if, the header still carries `T::COOKIE`.
    ///
    /// The nested result is deliberate:
    /// * outer `Err`: the cookie did not match, so nothing was touched;
    /// * outer `Ok(inner)`: the cookie was cleared and the payload dropped, and
    ///   `inner` is the status returned by the payload's own `drop_checked`, which
    ///   the caller still has to propagate.
    #[must_use]
    fn drop_checked(&mut self) -> Result<Result<(), CUerror>, CUerror> {
        if self.cookie != T::COOKIE {
            return Err(T::LIVENESS_FAIL);
        }
        // Invalidate first so that the header never looks alive again.
        self.cookie = 0;
        let runtime_status = self.data.drop_checked();
        // SAFETY: the cookie was valid on entry and has just been cleared, so this
        // is the only place that drops the payload.
        unsafe { ManuallyDrop::drop(&mut self.data) };
        Ok(runtime_status)
    }
}
/// Reinterprets a borrowed handle as a borrowed `Box<LiveCheck<T>>`.
///
/// The box is wrapped in `ManuallyDrop`, so the view obtained here never frees the
/// allocation. No validation happens in this function; callers are expected to
/// follow up with `LiveCheck::as_result`.
pub fn as_ref<'a, T: ZludaObject>(
    handle: &'a T::CudaHandle,
) -> &'a ManuallyDrop<Box<LiveCheck<T>>> {
    // SAFETY: NOTE(review) — relies on `handle` holding a non-null pointer produced
    // by `ZludaObject::wrap`; confirm callers uphold this.
    unsafe {
        mem::transmute::<&'a T::CudaHandle, &'a ManuallyDrop<Box<LiveCheck<T>>>>(handle)
    }
}
pub fn drop_checked<T: ZludaObject>(handle: T::CudaHandle) -> Result<(), CUerror> {
let mut wrapped_object: ManuallyDrop<Box<LiveCheck<T>>> =
unsafe { mem::transmute_copy(&handle) };
let underlying_error = LiveCheck::drop_checked(&mut wrapped_object)?;
unsafe { ManuallyDrop::drop(&mut wrapped_object) };
underlying_error
}
/// Forwards `flags` to `hipInit` and returns the HIP status code unchanged.
pub(crate) fn init(flags: ::core::ffi::c_uint) -> hipError_t {
// SAFETY: `flags` is passed by value; no pointer crosses the FFI boundary.
unsafe { hipInit(flags) }
}

View file

@ -1,261 +1,53 @@
use std::borrow::Cow;
use std::collections::HashMap;
use std::ffi::{CStr, CString};
use std::fs::File;
use std::io::{self, Read, Write};
use std::ops::Add;
use std::os::raw::c_char;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::{env, fs, iter, mem, ptr, slice};
use super::ZludaObject;
use cuda_types::*;
use hip_runtime_sys::*;
use std::{ffi::CStr, mem};
use hip_runtime_sys::{
hipCtxGetCurrent, hipCtxGetDevice, hipDeviceGetAttribute, hipDeviceGetName, hipDeviceProp_t,
hipError_t, hipGetDeviceProperties, hipGetStreamDeviceId, hipModuleLoadData,
};
use tempfile::NamedTempFile;
use crate::cuda::CUmodule;
use crate::hip_call;
pub struct SpirvModule {
pub binaries: Vec<u32>,
pub kernel_info: HashMap<String, ptx::KernelInfo>,
pub should_link_ptx_impl: Option<(&'static [u8], &'static [u8])>,
pub build_options: CString,
pub(crate) struct Module {
base: hipModule_t,
}
impl SpirvModule {
pub fn new_raw<'a>(text: *const c_char) -> Result<Self, hipError_t> {
let u8_text = unsafe { CStr::from_ptr(text) };
let ptx_text = u8_text
.to_str()
.map_err(|_| hipError_t::hipErrorInvalidImage)?;
Self::new(ptx_text)
}
impl ZludaObject for Module {
const COOKIE: usize = 0xe9138bd040487d4a;
pub fn new<'a>(ptx_text: &str) -> Result<Self, hipError_t> {
let mut errors = Vec::new();
let ast = ptx::ModuleParser::new()
.parse(&mut errors, ptx_text)
.map_err(|_| hipError_t::hipErrorInvalidImage)?;
if errors.len() > 0 {
return Err(hipError_t::hipErrorInvalidImage);
}
let spirv_module =
ptx::to_spirv_module(ast).map_err(|_| hipError_t::hipErrorInvalidImage)?;
Ok(SpirvModule {
binaries: spirv_module.assemble(),
kernel_info: spirv_module.kernel_info,
should_link_ptx_impl: spirv_module.should_link_ptx_impl,
build_options: spirv_module.build_options,
})
type CudaHandle = CUmodule;
fn drop_checked(&mut self) -> CUresult {
unsafe { hipModuleUnload(self.base) }?;
Ok(())
}
}
pub(crate) fn load(module: *mut CUmodule, fname: *const i8) -> Result<(), hipError_t> {
let file_name = unsafe { CStr::from_ptr(fname) }
pub(crate) fn load_data(module: &mut CUmodule, image: *const std::ffi::c_void) -> CUresult {
let text = unsafe { CStr::from_ptr(image.cast()) }
.to_str()
.map_err(|_| hipError_t::hipErrorInvalidValue)?;
let mut file = File::open(file_name).map_err(|_| hipError_t::hipErrorFileNotFound)?;
let mut file_buffer = Vec::new();
file.read_to_end(&mut file_buffer)
.map_err(|_| hipError_t::hipErrorUnknown)?;
let result = load_data(module, file_buffer.as_ptr() as _);
drop(file_buffer);
result
}
pub(crate) fn load_data(
module: *mut CUmodule,
image: *const std::ffi::c_void,
) -> Result<(), hipError_t> {
if image == ptr::null() {
return Err(hipError_t::hipErrorInvalidValue);
}
if unsafe { *(image as *const u32) } == 0x464c457f {
return match unsafe { hipModuleLoadData(module as _, image) } {
hipError_t::hipSuccess => Ok(()),
e => Err(e),
};
}
let spirv_data = SpirvModule::new_raw(image as *const _)?;
load_data_impl(module, spirv_data)
}
pub fn load_data_impl(pmod: *mut CUmodule, spirv_data: SpirvModule) -> Result<(), hipError_t> {
.map_err(|_| CUerror::INVALID_VALUE)?;
let ast = ptx_parser::parse_module_checked(text).map_err(|_| CUerror::NO_BINARY_FOR_GPU)?;
let llvm_module = ptx::to_llvm_module(ast).map_err(|_| CUerror::UNKNOWN)?;
let mut dev = 0;
hip_call! { hipCtxGetDevice(&mut dev) };
unsafe { hipCtxGetDevice(&mut dev) }?;
let mut props = unsafe { mem::zeroed() };
hip_call! { hipGetDeviceProperties(&mut props, dev) };
let arch_binary = compile_amd(
&props,
iter::once(&spirv_data.binaries[..]),
spirv_data.should_link_ptx_impl,
unsafe { hipGetDevicePropertiesR0600(&mut props, dev) }?;
let elf_module = comgr::compile_bitcode(
unsafe { CStr::from_ptr(props.gcnArchName.as_ptr()) },
&*llvm_module.llvm_ir,
llvm_module.linked_bitcode(),
)
.map_err(|_| hipError_t::hipErrorUnknown)?;
hip_call! { hipModuleLoadData(pmod as _, arch_binary.as_ptr() as _) };
.map_err(|_| CUerror::UNKNOWN)?;
let mut hip_module = unsafe { mem::zeroed() };
unsafe { hipModuleLoadData(&mut hip_module, elf_module.as_ptr().cast()) }?;
*module = Module { base: hip_module }.wrap();
Ok(())
}
// Fallback location of the `llvm-spirv` executable; `compile_amd` uses it only when
// the `LLVM_SPIRV` environment variable is not set.
// NOTE(review): this is a hard-coded path inside a developer's home directory.
const LLVM_SPIRV: &'static str = "/home/vosen/amd/llvm-project/build/bin/llvm-spirv";
// Root directory under which `llvm/bin/llvm-link`, `llvm/bin/clang` and the
// `amdgcn/bitcode` directory are looked up.
const AMDGPU: &'static str = "/opt/rocm/";
// Value passed to clang's `-target` option.
const AMDGPU_TARGET: &'static str = "amdgcn-amd-amdhsa";
// Bitcode files under `<AMDGPU>/amdgcn/bitcode` that are handed to `llvm-link`
// regardless of the device being targeted.
const AMDGPU_BITCODE: [&'static str; 8] = [
"opencl.bc",
"ocml.bc",
"ockl.bc",
"oclc_correctly_rounded_sqrt_off.bc",
"oclc_daz_opt_on.bc",
"oclc_finite_only_off.bc",
"oclc_unsafe_math_off.bc",
"oclc_wavefrontsize64_off.bc",
];
// File-name prefix of the per-device bitcode file; `get_bitcode_paths` appends a
// slice of the device name followed by `.bc`.
const AMDGPU_BITCODE_DEVICE_PREFIX: &'static str = "oclc_isa_version_";
pub(crate) fn compile_amd<'a>(
device_pros: &hipDeviceProp_t,
spirv_il: impl Iterator<Item = &'a [u32]>,
ptx_lib: Option<(&'static [u8], &'static [u8])>,
) -> io::Result<Vec<u8>> {
let null_terminator = device_pros
.gcnArchName
.iter()
.position(|&x| x == 0)
.unwrap();
let gcn_arch_slice = unsafe {
slice::from_raw_parts(device_pros.gcnArchName.as_ptr() as _, null_terminator + 1)
};
let device_name =
if let Ok(Ok(name)) = CStr::from_bytes_with_nul(gcn_arch_slice).map(|x| x.to_str()) {
name
} else {
return Err(io::Error::new(io::ErrorKind::Other, ""));
};
let dir = tempfile::tempdir()?;
let llvm_spirv_path = match env::var("LLVM_SPIRV") {
Ok(path) => Cow::Owned(path),
Err(_) => Cow::Borrowed(LLVM_SPIRV),
};
let llvm_files = spirv_il
.map(|spirv| {
let mut spirv_file = NamedTempFile::new_in(&dir)?;
let spirv_u8 = unsafe {
slice::from_raw_parts(
spirv.as_ptr() as *const u8,
spirv.len() * mem::size_of::<u32>(),
)
};
spirv_file.write_all(spirv_u8)?;
if cfg!(debug_assertions) {
persist_file(spirv_file.path())?;
}
let llvm = NamedTempFile::new_in(&dir)?;
let to_llvm_cmd = Command::new(&*llvm_spirv_path)
//.arg("--spirv-debug")
.arg("-r")
.arg("-o")
.arg(llvm.path())
.arg(spirv_file.path())
.status()?;
assert!(to_llvm_cmd.success());
if cfg!(debug_assertions) {
persist_file(llvm.path())?;
}
Ok::<_, io::Error>(llvm)
})
.collect::<Result<Vec<_>, _>>()?;
let linked_binary = NamedTempFile::new_in(&dir)?;
let mut llvm_link = PathBuf::from(AMDGPU);
llvm_link.push("llvm");
llvm_link.push("bin");
llvm_link.push("llvm-link");
let mut linker_cmd = Command::new(&llvm_link);
linker_cmd
.arg("-o")
.arg(linked_binary.path())
.args(llvm_files.iter().map(|f| f.path()))
.args(get_bitcode_paths(device_name));
if cfg!(debug_assertions) {
linker_cmd.arg("-v");
}
let status = linker_cmd.status()?;
assert!(status.success());
if cfg!(debug_assertions) {
persist_file(linked_binary.path())?;
}
let mut ptx_lib_bitcode = NamedTempFile::new_in(&dir)?;
let compiled_binary = NamedTempFile::new_in(&dir)?;
let mut clang_exe = PathBuf::from(AMDGPU);
clang_exe.push("llvm");
clang_exe.push("bin");
clang_exe.push("clang");
let mut compiler_cmd = Command::new(&clang_exe);
compiler_cmd
.arg(format!("-mcpu={}", device_name))
.arg("-ffp-contract=off")
.arg("-nogpulib")
.arg("-mno-wavefrontsize64")
.arg("-O3")
.arg("-Xclang")
.arg("-O3")
.arg("-Xlinker")
.arg("--no-undefined")
.arg("-target")
.arg(AMDGPU_TARGET)
.arg("-o")
.arg(compiled_binary.path())
.arg("-x")
.arg("ir")
.arg(linked_binary.path());
if let Some((_, bitcode)) = ptx_lib {
ptx_lib_bitcode.write_all(bitcode)?;
compiler_cmd.arg(ptx_lib_bitcode.path());
};
if cfg!(debug_assertions) {
compiler_cmd.arg("-v");
}
let status = compiler_cmd.status()?;
assert!(status.success());
let mut result = Vec::new();
let compiled_bin_path = compiled_binary.path();
let mut compiled_binary = File::open(compiled_bin_path)?;
compiled_binary.read_to_end(&mut result)?;
if cfg!(debug_assertions) {
persist_file(compiled_bin_path)?;
}
Ok(result)
/// Destroys the module behind `hmod` by delegating to `super::drop_checked::<Module>`,
/// which validates the handle's cookie, runs `Module`'s own `drop_checked` and frees
/// the wrapper allocation. The result of that call is returned as-is.
pub(crate) fn unload(hmod: CUmodule) -> CUresult {
super::drop_checked::<Module>(hmod)
}
/// Copies the file at `path` into `/tmp/zluda`, keeping its file name and creating
/// the directory if needed. `compile_amd` calls this in debug builds to keep copies
/// of its intermediate files.
///
/// # Errors
///
/// * `io::ErrorKind::InvalidInput` when `path` has no final component (for example
///   `/` or `..`); the filesystem is not touched in that case.
/// * Any error reported by `create_dir_all` or `copy`.
fn persist_file(path: &Path) -> io::Result<()> {
    // Resolve the name before any I/O: the previous `unwrap()` panicked on paths
    // without a final component.
    let file_name = path.file_name().ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            "path has no file name component",
        )
    })?;
    let mut persistent = PathBuf::from("/tmp/zluda");
    std::fs::create_dir_all(&persistent)?;
    persistent.push(file_name);
    std::fs::copy(path, persistent)?;
    Ok(())
}
fn get_bitcode_paths(device_name: &str) -> impl Iterator<Item = PathBuf> {
let generic_paths = AMDGPU_BITCODE.iter().map(|x| {
let mut path = PathBuf::from(AMDGPU);
path.push("amdgcn");
path.push("bitcode");
path.push(x);
path
});
let suffix = if let Some(suffix_idx) = device_name.find(':') {
suffix_idx
} else {
device_name.len()
};
let mut additional_path = PathBuf::from(AMDGPU);
additional_path.push("amdgcn");
additional_path.push("bitcode");
additional_path.push(format!(
"{}{}{}",
AMDGPU_BITCODE_DEVICE_PREFIX,
&device_name[3..suffix],
".bc"
));
generic_paths.chain(std::iter::once(additional_path))
/// Forwards to `hipModuleGetFunction` using the module's underlying `hipModule_t`
/// (`hmod.base`); HIP writes the result through `hfunc`. The HIP status code is
/// returned unchanged.
pub(crate) fn get_function(
hfunc: &mut hipFunction_t,
hmod: &Module,
name: *const ::core::ffi::c_char,
) -> hipError_t {
// NOTE(review): `name` is a raw pointer passed through without a null check —
// presumably HIP validates it; confirm.
unsafe { hipModuleGetFunction(hfunc, hmod.base, name) }
}

View file

@ -27,16 +27,16 @@ macro_rules! implemented {
};
}
use cuda_base::cuda_function_declarations;
cuda_function_declarations!(
cuda_base::cuda_function_declarations!(
unimplemented,
implemented <= [
cuCtxGetLimit,
cuCtxSetLimit,
cuCtxSynchronize,
cuDeviceComputeCapability,
cuDeviceGet,
cuDeviceGetAttribute,
cuDeviceGetCount,
cuDeviceGetLuid,
cuDeviceGetName,
cuDeviceGetProperties,
@ -44,5 +44,8 @@ cuda_function_declarations!(
cuDeviceGetUuid_v2,
cuDeviceTotalMem_v2,
cuInit,
cuModuleGetFunction,
cuModuleLoadData,
cuModuleUnload,
]
);