mirror of
https://github.com/vosen/ZLUDA.git
synced 2025-04-19 16:04:44 +00:00
Merge branch 'master' into patch-2
This commit is contained in:
commit
8f69b71f60
212 changed files with 90461 additions and 60005 deletions
|
@ -0,0 +1,2 @@
|
|||
[alias]
|
||||
xtask = "run --package xtask --"
|
|
@ -8,18 +8,31 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninter
|
|||
python3 \
|
||||
ripgrep \
|
||||
git \
|
||||
ltrace
|
||||
ltrace \
|
||||
# required by llvm 17
|
||||
lsb-release software-properties-common gnupg
|
||||
|
||||
ARG LLVM_VERSION=17
|
||||
RUN wget https://apt.llvm.org/llvm.sh && \
|
||||
chmod +x llvm.sh && \
|
||||
./llvm.sh ${LLVM_VERSION}
|
||||
|
||||
# Feel free to change to a newer version if you have a newer version on your host
|
||||
ARG CUDA_VERSION=12-4
|
||||
ARG CUDA_PKG_VERSION=12-4
|
||||
# Docker <-> host driver version compatibility is newer host <-> older docker
|
||||
# We don't care about a specific driver version, so pick oldest 5XX
|
||||
ARG CUDA_DRIVER=515
|
||||
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||
RUN DEBIAN_FRONTEND=noninteractive apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||
# CUDA headers need it for interop
|
||||
libgl-dev libegl-dev libvdpau-dev \
|
||||
nvidia-utils-${CUDA_DRIVER} \
|
||||
cuda-cudart-${CUDA_VERSION}
|
||||
cuda-cudart-dev-${CUDA_PKG_VERSION} \
|
||||
cuda-nvml-dev-${CUDA_PKG_VERSION} \
|
||||
cuda-cudart-${CUDA_PKG_VERSION} \
|
||||
cuda-profiler-api-${CUDA_PKG_VERSION} \
|
||||
cuda-nvcc-${CUDA_PKG_VERSION}
|
||||
|
||||
ARG ROCM_VERSION=6.2.2
|
||||
ARG ROCM_VERSION=6.3.1
|
||||
RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
|
||||
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
|
||||
gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \
|
||||
|
@ -29,9 +42,11 @@ RUN mkdir --parents --mode=0755 /etc/apt/keyrings && \
|
|||
rocminfo \
|
||||
rocm-gdb \
|
||||
rocm-smi-lib \
|
||||
hip-runtime-amd && \
|
||||
rocm-llvm-dev \
|
||||
hip-runtime-amd \
|
||||
hip-dev && \
|
||||
echo '/opt/rocm/lib' > /etc/ld.so.conf.d/rocm.conf && \
|
||||
ldconfig
|
||||
|
||||
ENV PATH=$PATH:/opt/rocm-6.2.2/bin
|
||||
ENV PATH=$PATH:/opt/rocm-${ROCM_VERSION}/bin
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
//"hostRequirements": { "gpu": "optional" }
|
||||
"customizations": {
|
||||
"vscode": {
|
||||
"extensions": [ "mhutchie.git-graph" ],
|
||||
"extensions": [ "mhutchie.git-graph" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
7
.gitmodules
vendored
7
.gitmodules
vendored
|
@ -1,10 +1,3 @@
|
|||
[submodule "ext/spirv-tools"]
|
||||
path = ext/spirv-tools
|
||||
url = https://github.com/KhronosGroup/SPIRV-Tools
|
||||
branch = master
|
||||
[submodule "ext/spirv-headers"]
|
||||
path = ext/spirv-headers
|
||||
url = https://github.com/KhronosGroup/SPIRV-Headers
|
||||
[submodule "ext/llvm-project"]
|
||||
path = ext/llvm-project
|
||||
url = https://github.com/llvm/llvm-project
|
||||
|
|
|
@ -1,61 +0,0 @@
|
|||
# Dependencies
|
||||
|
||||
Development builds of ZLUDA require the following dependencies:
|
||||
|
||||
* CMake
|
||||
* Python 3
|
||||
|
||||
Additionally, the repository has to be cloned with Git submodules initialized. If you cloned the repo without initializing submodules, do this:
|
||||
```
|
||||
git submodule update --init --recursive
|
||||
```
|
||||
|
||||
# Tests
|
||||
|
||||
Tests should be executed with `--workspace` option to test non-default targets:
|
||||
```
|
||||
cargo test --workspace
|
||||
```
|
||||
|
||||
# Debugging
|
||||
|
||||
## Debugging CUDA applications
|
||||
|
||||
When running an application with ZLUDA quite often you will run into subtle bugs or incompatibilities in the generated GPU code. The best way to debug an application's GPU CUDA code is to use ZLUDA dumper.
|
||||
|
||||
Library `zluda_dump` can be injected into a CUDA application and produce a trace which, for every launched GPU function contains:
|
||||
* PTX source
|
||||
* Launch arguments (block size, grid size, shared memory size)
|
||||
* Dump of function arguments. Both after and before
|
||||
|
||||
Example use with GeekBench:
|
||||
```
|
||||
set ZLUDA_DUMP_KERNEL=knn_match
|
||||
set ZLUDA_DUMP_DIR=C:\temp\zluda_dump
|
||||
"<ZLUDA_PATH>\zluda_with.exe" "<ZLUDA_PATH>\zluda_dump.dll" -- "geekbench_x86_64.exe" --compute CUDA
|
||||
```
|
||||
|
||||
The example above, for every execution of GPU function `knn_match`, will save its details into the directory `C:\temp\zluda_dump`
|
||||
|
||||
This dump can be replayed with `replay.py` script from `zluda_dump` source directory. Use it like this:
|
||||
```
|
||||
python replay.py "C:\temp\zluda_dump\geekbench_x86_64.exe"
|
||||
```
|
||||
You must copy (or symlink) ZLUDA `nvcuda.dll` into PyCUDA directory, so it will run using ZLUDA. Example output:
|
||||
```
|
||||
Intel(R) Graphics [0x3e92] [github.com/vosen/ZLUDA]
|
||||
C:\temp\zluda_dump\geekbench_x86_64.exe\4140_scale_pyramid
|
||||
C:\temp\zluda_dump\geekbench_x86_64.exe\4345_convolve_1d_vertical_grayscale
|
||||
Skipping, launch block size (512) bigger than maximum block size (256)
|
||||
C:\temp\zluda_dump\geekbench_x86_64.exe\4480_scale_pyramid
|
||||
6:
|
||||
Arrays are not equal
|
||||
|
||||
Mismatched elements: 1200 / 19989588 (0.006%)
|
||||
Max absolute difference: 255
|
||||
Max relative difference: 255.
|
||||
x: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
|
||||
y: array([ 7, 6, 8, ..., 193, 195, 193], dtype=uint8)
|
||||
```
|
||||
From this output one can observe that in kernel launch 4480, 6th argument to function `scale_pyramid` differs between what was executed on an NVIDIA GPU using CUDA and Intel GPU using ZLUDA.
|
||||
__Important__: It's impossible to infer what was the type (and semantics) of argument passed to a GPU function. At our level it's a buffer of bytes and by default `replay.py` simply checks if two buffers are byte-equal. That means you will have a ton of false negatives when running `replay.py`. You should override them for your particular case in `replay.py` - it already contains some overrides for GeekBench kernels
|
1395
Cargo.lock
generated
Normal file
1395
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
60
Cargo.toml
60
Cargo.toml
|
@ -1,27 +1,33 @@
|
|||
[workspace]
|
||||
|
||||
resolver = "2"
|
||||
|
||||
members = [
|
||||
"ext/hip_runtime-sys",
|
||||
"ext/amd_comgr-sys",
|
||||
"comgr",
|
||||
"cuda_base",
|
||||
"cuda_types",
|
||||
"detours-sys",
|
||||
"level_zero-sys",
|
||||
"level_zero",
|
||||
"spirv_tools-sys",
|
||||
"zluda",
|
||||
"zluda_dump",
|
||||
"zluda_lib",
|
||||
"zluda_inject",
|
||||
"zluda_redirect",
|
||||
"zluda_ml",
|
||||
"ptx",
|
||||
"ptx_parser",
|
||||
"ptx_parser_macros",
|
||||
"ptx_parser_macros_impl",
|
||||
]
|
||||
|
||||
default-members = ["zluda_lib", "zluda_ml", "zluda_inject", "zluda_redirect"]
|
||||
[workspace]
|
||||
|
||||
resolver = "2"
|
||||
|
||||
members = [
|
||||
"ext/hip_runtime-sys",
|
||||
"ext/amd_comgr-sys",
|
||||
"comgr",
|
||||
"cuda_base",
|
||||
"cuda_types",
|
||||
"detours-sys",
|
||||
"zluda",
|
||||
"zluda_dump",
|
||||
"zluda_inject",
|
||||
"zluda_redirect",
|
||||
"zluda_ml",
|
||||
"ptx",
|
||||
"ptx_parser",
|
||||
"ptx_parser_macros",
|
||||
"ptx_parser_macros_impl",
|
||||
"xtask",
|
||||
"zluda_bindgen",
|
||||
]
|
||||
|
||||
default-members = ["zluda", "zluda_ml", "zluda_inject", "zluda_redirect"]
|
||||
|
||||
[profile.release-lto]
|
||||
inherits = "release"
|
||||
codegen-units = 1
|
||||
lto = true
|
||||
|
||||
[profile.dev.package.xtask]
|
||||
opt-level = 2
|
||||
|
|
File diff suppressed because one or more lines are too long
Before Width: | Height: | Size: 259 KiB |
60
README.md
60
README.md
|
@ -4,18 +4,23 @@
|
|||
|
||||
ZLUDA is a drop-in replacement for CUDA on non-NVIDIA GPUs. ZLUDA allows running unmodified CUDA applications on non-NVIDIA GPUs with near-native performance.
|
||||
|
||||
ZLUDA supports AMD Radeon RX 5000 series and newer GPUs (both desktop and integrated).
|
||||
|
||||

|
||||
|
||||
ZLUDA is work in progress. Follow development here and say hi on [Discord](https://discord.gg/sg6BNzXuc7). For more details see the announcement: https://vosen.github.io/ZLUDA/blog/zludas-third-life/
|
||||
|
||||
|
||||
## Usage
|
||||
**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
|
||||
**Warning**: This version of ZLUDA is under heavy development (more [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)) and right now only supports Geekbench. ZLUDA probably will not work with your application just yet.
|
||||
|
||||
### Windows
|
||||
You should have the most recent ROCm installed.\
|
||||
Run your application like this:
|
||||
```
|
||||
<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
||||
```
|
||||
You should have recent AMD GPU driver ("AMD Software: Adrenalin Edition") installed.\
|
||||
To run your application you should either:
|
||||
* (Recommended approach) Copy ZLUDA-provided `nvcuda.dll` and `nvml.dll` from `target\release` (if built from sources) or `zluda` (if downloaded a zip package) into a path which your application uses to load CUDA. Paths vary application to application, but usually it's the directory where the .exe file is located
|
||||
* Use ZLUDA launcher like below. ZLUDA launcher is known to be buggy and incomplete:
|
||||
```
|
||||
<ZLUDA_DIRECTORY>\zluda_with.exe -- <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
||||
```
|
||||
|
||||
### Linux
|
||||
|
||||
|
@ -24,33 +29,42 @@ Run your application like this:
|
|||
LD_LIBRARY_PATH=<ZLUDA_DIRECTORY> <APPLICATION> <APPLICATIONS_ARGUMENTS>
|
||||
```
|
||||
|
||||
where `<ZLUDA_DIRECTORY>` is the directory which contains ZLUDA-provided `libcuda.so`: `target/release` if you built from sources or `zluda` if you downloaded prebuilt package.
|
||||
|
||||
### MacOS
|
||||
|
||||
Not supported
|
||||
|
||||
## Building
|
||||
**Warning**: ZLUDA is under heavy development (see news [here](https://vosen.github.io/ZLUDA/blog/zludas-third-life/)). Instructions below might not work.
|
||||
|
||||
_Note_: This repo has submodules. Make sure to recurse submodules when cloning this repo, e.g.: `git clone --recursive --depth=1 https://github.com/vosen/ZLUDA.git`
|
||||
* Git
|
||||
* CMake
|
||||
* Python 3
|
||||
* Rust compiler (recent version)
|
||||
* C++ compiler
|
||||
* (Optional, but recommended) [Ninja build system](https://ninja-build.org/)
|
||||
|
||||
You should have a relatively recent version of Rust installed, then you just do:
|
||||
### Build steps
|
||||
|
||||
```
|
||||
cargo build --release
|
||||
```
|
||||
in the main directory of the project.
|
||||
### Linux
|
||||
|
||||
If you are building on Linux you must also symlink (or rename) the ZLUDA output binaries after ZLUDA build finishes:
|
||||
```
|
||||
ln -s libnvcuda.so target/release/libcuda.so
|
||||
ln -s libnvcuda.so target/release/libcuda.so.1
|
||||
ln -s libnvml.so target/release/libnvidia-ml.so
|
||||
```
|
||||
* Git clone the repo (make sure to use `--recursive` option to fetch submodules):
|
||||
`git clone --recursive --depth=1 https://github.com/vosen/ZLUDA.git`
|
||||
* Enter freshly cloned `ZLUDA` directory and build with cargo (this takes a while):
|
||||
`cargo xtask --release`
|
||||
|
||||
## Contributing
|
||||
|
||||
If you want to develop ZLUDA itself, read [CONTRIBUTING.md](CONTRIBUTING.md), it contains instructions how to set up dependencies and run tests
|
||||
ZLUDA project has a commercial backing and _does not_ accept donations.
|
||||
ZLUDA project accepts pull requests and other non-monetary contributions.
|
||||
|
||||
If you want to contribute a code fix or documentation update feel free to open a Pull Request.
|
||||
|
||||
### Getting started
|
||||
|
||||
There's no architecture document (yet). The two most important crates in ZLUDA are `ptx` (PTX compiler) and `zluda` (AMD GPU runtime). A good starting point for tinkering with the project is to run one of the `ptx` unit tests under a debugger and understand what it is doing. `cargo test -p ptx -- ::add_hip` is a simple test that adds two numbers.
|
||||
|
||||
GitHub issues tagged with ["help wanted"](https://github.com/vosen/ZLUDA/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) are tasks that are self-contained. Their level of difficulty varies and they are not always good beginner tasks, but they are defined unambiguously.
|
||||
|
||||
If you have questions feel free to ask on [#devtalk channel on Discord](https://discord.com/channels/1273316903783497778/1303329281409159270).
|
||||
|
||||
|
||||
## License
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
use amd_comgr_sys::*;
|
||||
use std::{ffi::CStr, iter, mem, ptr};
|
||||
use std::{ffi::CStr, mem, ptr};
|
||||
|
||||
struct Data(amd_comgr_data_t);
|
||||
|
||||
|
@ -133,31 +133,45 @@ pub fn compile_bitcode(
|
|||
&linking_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_BC_TO_BC,
|
||||
)?;
|
||||
let link_with_device_libs_info = ActionInfo::new()?;
|
||||
link_with_device_libs_info.set_isa_name(gcn_arch)?;
|
||||
link_with_device_libs_info.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
|
||||
// This makes no sense, but it makes ockl linking work
|
||||
link_with_device_libs_info.set_options([c"-Xclang", c"-mno-link-builtin-bitcode-postopt"].into_iter())?;
|
||||
let with_device_libs = do_action(
|
||||
&linked_data_set,
|
||||
&link_with_device_libs_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC,
|
||||
)?;
|
||||
let compile_action_info = ActionInfo::new()?;
|
||||
compile_action_info.set_isa_name(gcn_arch)?;
|
||||
compile_action_info.set_options(iter::once(c"-O3"))?;
|
||||
let reloc_data_set = do_action(
|
||||
&with_device_libs,
|
||||
&compile_action_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE,
|
||||
)?;
|
||||
let compile_to_exec = ActionInfo::new()?;
|
||||
compile_to_exec.set_isa_name(gcn_arch)?;
|
||||
compile_to_exec.set_language(amd_comgr_language_t::AMD_COMGR_LANGUAGE_LLVM_IR)?;
|
||||
let common_options = [
|
||||
// This makes no sense, but it makes ockl linking work
|
||||
c"-Xclang",
|
||||
c"-mno-link-builtin-bitcode-postopt",
|
||||
// Otherwise LLVM omits dynamic fp mode for ockl functions during linking
|
||||
// and then fails to inline them
|
||||
c"-Xclang",
|
||||
c"-fdenormal-fp-math=dynamic",
|
||||
c"-O3",
|
||||
c"-mno-wavefrontsize64",
|
||||
c"-mcumode",
|
||||
// Useful for inlining reports, combined with AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_EMIT_VERBOSE_LOGS=1 AMD_COMGR_REDIRECT_LOGS=stderr
|
||||
// c"-fsave-optimization-record=yaml",
|
||||
]
|
||||
.into_iter();
|
||||
let opt_options = if cfg!(debug_assertions) {
|
||||
//[c"-g", c"-mllvm", c"-print-before-all", c"", c""]
|
||||
[c"-g", c"", c"", c"", c""]
|
||||
} else {
|
||||
[
|
||||
c"-g0",
|
||||
// default inlining threshold times 10
|
||||
c"-mllvm",
|
||||
c"-inline-threshold=2250",
|
||||
c"-mllvm",
|
||||
c"-inlinehint-threshold=3250",
|
||||
]
|
||||
};
|
||||
compile_to_exec.set_options(common_options.chain(opt_options))?;
|
||||
let exec_data_set = do_action(
|
||||
&reloc_data_set,
|
||||
&compile_action_info,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
|
||||
&linked_data_set,
|
||||
&compile_to_exec,
|
||||
amd_comgr_action_kind_t::AMD_COMGR_ACTION_COMPILE_SOURCE_TO_EXECUTABLE,
|
||||
)?;
|
||||
let executable =
|
||||
exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
|
||||
exec_data_set.get_data(amd_comgr_data_kind_t::AMD_COMGR_DATA_KIND_EXECUTABLE, 0)?;
|
||||
executable.copy_content()
|
||||
}
|
||||
|
||||
|
|
|
@ -2,11 +2,11 @@
|
|||
name = "cuda_base"
|
||||
version = "0.0.0"
|
||||
authors = ["Andrzej Janik <vosen@vosen.pl>"]
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
quote = "1.0"
|
||||
syn = { version = "1.0", features = ["full", "visit-mut"] }
|
||||
syn = { version = "2.0", features = ["full", "visit-mut", "extra-traits"] }
|
||||
proc-macro2 = "1.0"
|
||||
rustc-hash = "1.1.0"
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
bindgen build/wrapper.h -o src/cuda.rs --no-partialeq "CUDA_HOST_NODE_PARAMS_st" --with-derive-eq --whitelist-function="^cu.*" --whitelist-var="^CU.*" --size_t-is-usize --default-enum-style=newtype --no-layout-tests --no-doc-comments --no-derive-debug --new-type-alias "^CUdevice_v\d+$|^CUdeviceptr_v\d+$" --must-use-type "cudaError_enum" -- -I/usr/local/cuda/include
|
|
@ -1,3 +1,7 @@
|
|||
#define __CUDA_API_VERSION_INTERNAL
|
||||
#include <cuda.h>
|
||||
#include <cudaGL.h>
|
||||
#include <cudaProfiler.h>
|
||||
#include <cudaGL.h>
|
||||
#include <cudaEGL.h>
|
||||
#include <vdpau/vdpau.h>
|
||||
#include <cudaVDPAU.h>
|
||||
|
|
26769
cuda_base/src/cuda.rs
26769
cuda_base/src/cuda.rs
File diff suppressed because it is too large
Load diff
|
@ -1,110 +1,26 @@
|
|||
extern crate proc_macro;
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::iter;
|
||||
|
||||
use proc_macro::TokenStream;
|
||||
use proc_macro2::Span;
|
||||
use quote::{format_ident, quote, ToTokens};
|
||||
use rustc_hash::{FxHashMap, FxHashSet};
|
||||
use quote::{quote, ToTokens};
|
||||
use rustc_hash::FxHashMap;
|
||||
use std::iter;
|
||||
use syn::parse::{Parse, ParseStream};
|
||||
use syn::punctuated::Punctuated;
|
||||
use syn::visit_mut::VisitMut;
|
||||
use syn::{
|
||||
bracketed, parse_macro_input, Abi, Fields, File, FnArg, ForeignItem, ForeignItemFn, Ident,
|
||||
Item, ItemForeignMod, LitStr, PatType, Path, PathArguments, PathSegment, ReturnType, Signature,
|
||||
Token, Type, TypeArray, TypePath, TypePtr,
|
||||
bracketed, parse_macro_input, File, ForeignItem, ForeignItemFn, Ident, Item, Path, Signature,
|
||||
Token,
|
||||
};
|
||||
|
||||
const CUDA_RS: &'static str = include_str! {"cuda.rs"};
|
||||
|
||||
// This macro copies cuda.rs as-is with some changes:
|
||||
// * All function declarations are filtered out
|
||||
// * CUdeviceptr_v2 is redefined from `unsigned long long` to `*void`
|
||||
// * `extern "C"` gets replaced by `extern "system"`
|
||||
// * CUuuid_st is redefined to use uchar instead of char
|
||||
#[proc_macro]
|
||||
pub fn cuda_type_declarations(_: TokenStream) -> TokenStream {
|
||||
let mut cuda_module = syn::parse_str::<File>(CUDA_RS).unwrap();
|
||||
cuda_module.items = cuda_module
|
||||
.items
|
||||
.into_iter()
|
||||
.filter_map(|item| match item {
|
||||
Item::ForeignMod(_) => None,
|
||||
Item::Struct(mut struct_) => {
|
||||
if "CUdeviceptr_v2" == struct_.ident.to_string() {
|
||||
match &mut struct_.fields {
|
||||
Fields::Unnamed(ref mut fields) => {
|
||||
fields.unnamed[0].ty =
|
||||
absolute_path_to_mut_ptr(&["std", "os", "raw", "c_void"])
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
} else if "CUuuid_st" == struct_.ident.to_string() {
|
||||
match &mut struct_.fields {
|
||||
Fields::Named(ref mut fields) => match fields.named[0].ty {
|
||||
Type::Array(TypeArray { ref mut elem, .. }) => {
|
||||
*elem = Box::new(Type::Path(TypePath {
|
||||
qself: None,
|
||||
path: segments_to_path(&["std", "os", "raw", "c_uchar"]),
|
||||
}))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
},
|
||||
_ => panic!(),
|
||||
}
|
||||
}
|
||||
Some(Item::Struct(struct_))
|
||||
}
|
||||
i => Some(i),
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
syn::visit_mut::visit_file_mut(&mut FixAbi, &mut cuda_module);
|
||||
cuda_module.into_token_stream().into()
|
||||
}
|
||||
|
||||
fn segments_to_path(path: &[&'static str]) -> Path {
|
||||
let mut segments = Punctuated::new();
|
||||
for ident in path {
|
||||
let ident = PathSegment {
|
||||
ident: Ident::new(ident, Span::call_site()),
|
||||
arguments: PathArguments::None,
|
||||
};
|
||||
segments.push(ident);
|
||||
}
|
||||
Path {
|
||||
leading_colon: Some(Token)),
|
||||
segments,
|
||||
}
|
||||
}
|
||||
|
||||
fn absolute_path_to_mut_ptr(path: &[&'static str]) -> Type {
|
||||
Type::Ptr(TypePtr {
|
||||
star_token: Token),
|
||||
const_token: None,
|
||||
mutability: Some(Token)),
|
||||
elem: Box::new(Type::Path(TypePath {
|
||||
qself: None,
|
||||
path: segments_to_path(path),
|
||||
})),
|
||||
})
|
||||
}
|
||||
|
||||
struct FixAbi;
|
||||
|
||||
impl VisitMut for FixAbi {
|
||||
fn visit_abi_mut(&mut self, i: &mut Abi) {
|
||||
if let Some(ref mut name) = i.name {
|
||||
*name = LitStr::new("system", Span::call_site());
|
||||
}
|
||||
}
|
||||
}
|
||||
const NVML_RS: &'static str = include_str! {"nvml.rs"};
|
||||
|
||||
// This macro accepts following arguments:
|
||||
// * `type_path`: path to the module with type definitions (in the module tree)
|
||||
// * `normal_macro`: ident for a normal macro
|
||||
// * `override_macro`: ident for an override macro
|
||||
// * `override_fns`: list of override functions
|
||||
// * zero or more:
|
||||
// * `override_macro`: ident for an override macro
|
||||
// * `override_fns`: list of override functions
|
||||
// Then macro goes through every function in rust.rs, and for every fn `foo`:
|
||||
// * if `foo` is contained in `override_fns` then pass it into `override_macro`
|
||||
// * if `foo` is not contained in `override_fns` pass it to `normal_macro`
|
||||
|
@ -116,391 +32,202 @@ impl VisitMut for FixAbi {
|
|||
// Additionally, it does a fixup of CUDA types so they get prefixed with `type_path`
|
||||
#[proc_macro]
|
||||
pub fn cuda_function_declarations(tokens: TokenStream) -> TokenStream {
|
||||
function_declarations(tokens, CUDA_RS)
|
||||
}
|
||||
|
||||
fn function_declarations(tokens: TokenStream, module: &str) -> TokenStream {
|
||||
let input = parse_macro_input!(tokens as FnDeclInput);
|
||||
let cuda_module = syn::parse_str::<File>(CUDA_RS).unwrap();
|
||||
let override_fns = input
|
||||
.override_fns
|
||||
.iter()
|
||||
.map(ToString::to_string)
|
||||
.collect::<FxHashSet<_>>();
|
||||
let (normal_macro_args, override_macro_args): (Vec<_>, Vec<_>) = cuda_module
|
||||
.items
|
||||
.into_iter()
|
||||
.filter_map(|item| match item {
|
||||
Item::ForeignMod(ItemForeignMod { mut items, .. }) => match items.pop().unwrap() {
|
||||
ForeignItem::Fn(ForeignItemFn {
|
||||
sig:
|
||||
Signature {
|
||||
ident,
|
||||
inputs,
|
||||
output,
|
||||
..
|
||||
},
|
||||
..
|
||||
}) => {
|
||||
let use_normal_macro = !override_fns.contains(&ident.to_string());
|
||||
let inputs = inputs
|
||||
.into_iter()
|
||||
.map(|fn_arg| match fn_arg {
|
||||
FnArg::Typed(mut pat_type) => {
|
||||
pat_type.ty =
|
||||
prepend_cuda_path_to_type(&input.type_path, pat_type.ty);
|
||||
FnArg::Typed(pat_type)
|
||||
}
|
||||
_ => unreachable!(),
|
||||
})
|
||||
.collect::<Punctuated<_, Token![,]>>();
|
||||
let output = match output {
|
||||
ReturnType::Type(_, type_) => type_,
|
||||
ReturnType::Default => unreachable!(),
|
||||
};
|
||||
let type_path = input.type_path.clone();
|
||||
Some((
|
||||
quote! {
|
||||
"system" fn #ident(#inputs) -> #type_path :: #output
|
||||
},
|
||||
use_normal_macro,
|
||||
))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
},
|
||||
_ => None,
|
||||
})
|
||||
.partition(|(_, use_normal_macro)| *use_normal_macro);
|
||||
let mut result = proc_macro2::TokenStream::new();
|
||||
if !normal_macro_args.is_empty() {
|
||||
let punctuated_normal_macro_args = to_punctuated::<Token![;]>(normal_macro_args);
|
||||
let macro_ = &input.normal_macro;
|
||||
result.extend(iter::once(quote! {
|
||||
#macro_ ! (#punctuated_normal_macro_args);
|
||||
}));
|
||||
let mut cuda_module = syn::parse_str::<File>(module).unwrap();
|
||||
let mut choose_macro = ChooseMacro::new(input);
|
||||
syn::visit_mut::visit_file_mut(&mut FixFnSignatures, &mut cuda_module);
|
||||
let extern_ = if let Item::ForeignMod(extern_) = cuda_module.items.pop().unwrap() {
|
||||
extern_
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
let abi = extern_.abi.name;
|
||||
for mut item in extern_.items {
|
||||
if let ForeignItem::Fn(ForeignItemFn {
|
||||
sig: Signature { ref ident, .. },
|
||||
ref mut attrs,
|
||||
..
|
||||
}) = item
|
||||
{
|
||||
*attrs = Vec::new();
|
||||
choose_macro.add(ident, quote! { #abi #item });
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
if !override_macro_args.is_empty() {
|
||||
let punctuated_override_macro_args = to_punctuated::<Token![;]>(override_macro_args);
|
||||
let macro_ = &input.override_macro;
|
||||
result.extend(iter::once(quote! {
|
||||
#macro_ ! (#punctuated_override_macro_args);
|
||||
}));
|
||||
let mut result = proc_macro2::TokenStream::new();
|
||||
for (path, items) in
|
||||
iter::once(choose_macro.default).chain(choose_macro.override_sets.into_iter())
|
||||
{
|
||||
if items.is_empty() {
|
||||
continue;
|
||||
}
|
||||
quote! {
|
||||
#path ! { #(#items)* }
|
||||
}
|
||||
.to_tokens(&mut result);
|
||||
}
|
||||
result.into()
|
||||
}
|
||||
|
||||
fn to_punctuated<P: ToTokens + Default>(
|
||||
elms: Vec<(proc_macro2::TokenStream, bool)>,
|
||||
) -> proc_macro2::TokenStream {
|
||||
let mut collection = Punctuated::<proc_macro2::TokenStream, P>::new();
|
||||
collection.extend(elms.into_iter().map(|(token_stream, _)| token_stream));
|
||||
collection.into_token_stream()
|
||||
#[proc_macro]
|
||||
pub fn nvml_function_declarations(tokens: TokenStream) -> TokenStream {
|
||||
function_declarations(tokens, NVML_RS)
|
||||
}
|
||||
|
||||
fn prepend_cuda_path_to_type(base_path: &Path, type_: Box<Type>) -> Box<Type> {
|
||||
match *type_ {
|
||||
Type::Path(mut type_path) => {
|
||||
type_path.path = prepend_cuda_path_to_path(base_path, type_path.path);
|
||||
Box::new(Type::Path(type_path))
|
||||
}
|
||||
Type::Ptr(mut type_ptr) => {
|
||||
type_ptr.elem = prepend_cuda_path_to_type(base_path, type_ptr.elem);
|
||||
Box::new(Type::Ptr(type_ptr))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
fn prepend_cuda_path_to_path(base_path: &Path, path: Path) -> Path {
|
||||
if path.leading_colon.is_some() {
|
||||
return path;
|
||||
}
|
||||
if path.segments.len() == 1 {
|
||||
let ident = path.segments[0].ident.to_string();
|
||||
if ident.starts_with("CU")
|
||||
|| ident.starts_with("cu")
|
||||
|| ident.starts_with("GL")
|
||||
|| ident == "HGPUNV"
|
||||
{
|
||||
let mut base_path = base_path.clone();
|
||||
base_path.segments.extend(path.segments);
|
||||
return base_path;
|
||||
}
|
||||
}
|
||||
path
|
||||
}
|
||||
|
||||
struct FnDeclInput {
|
||||
type_path: Path,
|
||||
normal_macro: Path,
|
||||
override_macro: Path,
|
||||
override_fns: Punctuated<Ident, Token![,]>,
|
||||
overrides: Punctuated<OverrideMacro, Token![,]>,
|
||||
}
|
||||
|
||||
impl Parse for FnDeclInput {
|
||||
fn parse(input: ParseStream) -> syn::Result<Self> {
|
||||
let type_path = input.parse::<Path>()?;
|
||||
input.parse::<Token![,]>()?;
|
||||
let normal_macro = input.parse::<Path>()?;
|
||||
input.parse::<Token![,]>()?;
|
||||
let override_macro = input.parse::<Path>()?;
|
||||
input.parse::<Token![,]>()?;
|
||||
let override_fns_content;
|
||||
bracketed!(override_fns_content in input);
|
||||
let override_fns = override_fns_content.parse_terminated(Ident::parse)?;
|
||||
let overrides = if input.is_empty() {
|
||||
Punctuated::new()
|
||||
} else {
|
||||
input.parse::<Token![,]>()?;
|
||||
input.parse_terminated(OverrideMacro::parse, Token![,])?
|
||||
};
|
||||
Ok(Self {
|
||||
type_path,
|
||||
normal_macro,
|
||||
override_macro,
|
||||
override_fns,
|
||||
overrides,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// This trait accepts following parameters:
|
||||
// * `type_path`: path to the module with type definitions (in the module tree)
|
||||
// * `trait_`: name of the trait to be derived
|
||||
// * `ignore_types`: bracketed list of types to ignore
|
||||
// * `ignore_fns`: bracketed list of fns to ignore
|
||||
#[proc_macro]
|
||||
pub fn cuda_derive_display_trait(tokens: TokenStream) -> TokenStream {
|
||||
let input = parse_macro_input!(tokens as DeriveDisplayInput);
|
||||
let cuda_module = syn::parse_str::<File>(CUDA_RS).unwrap();
|
||||
let mut derive_state = DeriveDisplayState::new(input);
|
||||
cuda_module
|
||||
.items
|
||||
.into_iter()
|
||||
.filter_map(|i| cuda_derive_display_trait_for_item(&mut derive_state, i))
|
||||
.collect::<proc_macro2::TokenStream>()
|
||||
.into()
|
||||
struct OverrideMacro {
|
||||
macro_: Path,
|
||||
functions: Punctuated<Ident, Token![,]>,
|
||||
}
|
||||
|
||||
fn cuda_derive_display_trait_for_item(
|
||||
state: &mut DeriveDisplayState,
|
||||
item: Item,
|
||||
) -> Option<proc_macro2::TokenStream> {
|
||||
let path_prefix = &state.type_path;
|
||||
let path_prefix_iter = iter::repeat(&path_prefix);
|
||||
let trait_ = &state.trait_;
|
||||
let trait_iter = iter::repeat(&state.trait_);
|
||||
match item {
|
||||
Item::Const(_) => None,
|
||||
Item::ForeignMod(ItemForeignMod { mut items, .. }) => match items.pop().unwrap() {
|
||||
ForeignItem::Fn(ForeignItemFn {
|
||||
sig: Signature { ident, inputs, .. },
|
||||
..
|
||||
}) => {
|
||||
if state.ignore_fns.contains(&ident) {
|
||||
return None;
|
||||
}
|
||||
let inputs = inputs
|
||||
.into_iter()
|
||||
.map(|fn_arg| match fn_arg {
|
||||
FnArg::Typed(mut pat_type) => {
|
||||
pat_type.ty = prepend_cuda_path_to_type(path_prefix, pat_type.ty);
|
||||
FnArg::Typed(pat_type)
|
||||
}
|
||||
_ => unreachable!(),
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let inputs_iter = inputs.iter();
|
||||
let mut arg_name_iter = inputs.iter().map(|fn_arg| match fn_arg {
|
||||
FnArg::Typed(PatType { pat, .. }) => pat,
|
||||
_ => unreachable!(),
|
||||
});
|
||||
let fn_name = format_ident!("write_{}", ident);
|
||||
let original_fn_name = ident.to_string();
|
||||
Some(match arg_name_iter.next() {
|
||||
Some(first_arg_name) => quote! {
|
||||
pub fn #fn_name(writer: &mut (impl std::io::Write + ?Sized), #(#inputs_iter,)*) -> std::io::Result<()> {
|
||||
writer.write_all(concat!("(", stringify!(#first_arg_name), ": ").as_bytes())?;
|
||||
let mut arg_idx = 0usize;
|
||||
CudaDisplay::write(&#first_arg_name, #original_fn_name, arg_idx, writer)?;
|
||||
#(
|
||||
writer.write_all(b", ")?;
|
||||
writer.write_all(concat!(stringify!(#arg_name_iter), ": ").as_bytes())?;
|
||||
CudaDisplay::write(&#arg_name_iter, #original_fn_name, arg_idx, writer)?;
|
||||
arg_idx += 1;
|
||||
)*
|
||||
writer.write_all(b")")
|
||||
}
|
||||
},
|
||||
None => quote! {
|
||||
pub fn #fn_name(writer: &mut (impl std::io::Write + ?Sized)) -> std::io::Result<()> {
|
||||
writer.write_all(b"()")
|
||||
}
|
||||
},
|
||||
})
|
||||
}
|
||||
_ => unreachable!(),
|
||||
},
|
||||
Item::Impl(mut item_impl) => {
|
||||
let enum_ = match *(item_impl.self_ty) {
|
||||
Type::Path(mut path) => path.path.segments.pop().unwrap().into_value().ident,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
let variant_ = match item_impl.items.pop().unwrap() {
|
||||
syn::ImplItem::Const(item_const) => item_const.ident,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
state.record_enum_variant(enum_, variant_);
|
||||
None
|
||||
}
|
||||
Item::Struct(item_struct) => {
|
||||
let item_struct_name = item_struct.ident.to_string();
|
||||
if state.ignore_types.contains(&item_struct.ident) {
|
||||
return None;
|
||||
}
|
||||
if item_struct_name.ends_with("_enum") {
|
||||
let enum_ = &item_struct.ident;
|
||||
let enum_iter = iter::repeat(&item_struct.ident);
|
||||
let variants = state.enums.get(&item_struct.ident).unwrap().iter();
|
||||
Some(quote! {
|
||||
impl #trait_ for #path_prefix :: #enum_ {
|
||||
fn write(&self, _fn_name: &'static str, _index: usize, writer: &mut (impl std::io::Write + ?Sized)) -> std::io::Result<()> {
|
||||
match self {
|
||||
#(& #path_prefix_iter :: #enum_iter :: #variants => writer.write_all(stringify!(#variants).as_bytes()),)*
|
||||
_ => write!(writer, "{}", self.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
} else {
|
||||
let struct_ = &item_struct.ident;
|
||||
let (first_field, rest_of_fields) = match item_struct.fields {
|
||||
Fields::Named(fields) => {
|
||||
let mut all_idents = fields.named.into_iter().filter_map(|f| {
|
||||
let f_ident = f.ident.unwrap();
|
||||
let name = f_ident.to_string();
|
||||
if name.starts_with("reserved") || name == "_unused" {
|
||||
None
|
||||
} else {
|
||||
Some(f_ident)
|
||||
}
|
||||
});
|
||||
let first = match all_idents.next() {
|
||||
Some(f) => f,
|
||||
None => return None,
|
||||
};
|
||||
(first, all_idents)
|
||||
}
|
||||
_ => return None,
|
||||
};
|
||||
Some(quote! {
|
||||
impl #trait_ for #path_prefix :: #struct_ {
|
||||
fn write(&self, _fn_name: &'static str, _index: usize, writer: &mut (impl std::io::Write + ?Sized)) -> std::io::Result<()> {
|
||||
writer.write_all(concat!("{ ", stringify!(#first_field), ": ").as_bytes())?;
|
||||
#trait_::write(&self.#first_field, "", 0, writer)?;
|
||||
#(
|
||||
writer.write_all(concat!(", ", stringify!(#rest_of_fields), ": ").as_bytes())?;
|
||||
#trait_iter::write(&self.#rest_of_fields, "", 0, writer)?;
|
||||
)*
|
||||
writer.write_all(b" }")
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Item::Type(item_type) => {
|
||||
if state.ignore_types.contains(&item_type.ident) {
|
||||
return None;
|
||||
};
|
||||
match *(item_type.ty) {
|
||||
Type::Ptr(_) => {
|
||||
let type_ = item_type.ident;
|
||||
Some(quote! {
|
||||
impl #trait_ for #path_prefix :: #type_ {
|
||||
fn write(&self, _fn_name: &'static str, _index: usize, writer: &mut (impl std::io::Write + ?Sized)) -> std::io::Result<()> {
|
||||
write!(writer, "{:p}", *self)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
Type::Path(type_path) => {
|
||||
if type_path.path.leading_colon.is_some() {
|
||||
let option_seg = type_path.path.segments.last().unwrap();
|
||||
if option_seg.ident == "Option" {
|
||||
match &option_seg.arguments {
|
||||
PathArguments::AngleBracketed(generic) => match generic.args[0] {
|
||||
syn::GenericArgument::Type(Type::BareFn(_)) => {
|
||||
let type_ = &item_type.ident;
|
||||
return Some(quote! {
|
||||
impl #trait_ for #path_prefix :: #type_ {
|
||||
fn write(&self, _fn_name: &'static str, _index: usize, writer: &mut (impl std::io::Write + ?Sized)) -> std::io::Result<()> {
|
||||
write!(writer, "{:p}", unsafe { std::mem::transmute::<#path_prefix :: #type_, *mut ::std::ffi::c_void>(*self) })
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
_ => unreachable!(),
|
||||
},
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
Item::Union(_) => None,
|
||||
Item::Use(_) => None,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
struct DeriveDisplayState {
|
||||
type_path: Path,
|
||||
trait_: Path,
|
||||
ignore_types: FxHashSet<Ident>,
|
||||
ignore_fns: FxHashSet<Ident>,
|
||||
enums: FxHashMap<Ident, Vec<Ident>>,
|
||||
}
|
||||
|
||||
impl DeriveDisplayState {
|
||||
fn new(input: DeriveDisplayInput) -> Self {
|
||||
DeriveDisplayState {
|
||||
type_path: input.type_path,
|
||||
trait_: input.trait_,
|
||||
ignore_types: input.ignore_types.into_iter().collect(),
|
||||
ignore_fns: input.ignore_fns.into_iter().collect(),
|
||||
enums: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn record_enum_variant(&mut self, enum_: Ident, variant: Ident) {
|
||||
match self.enums.entry(enum_) {
|
||||
hash_map::Entry::Occupied(mut entry) => {
|
||||
entry.get_mut().push(variant);
|
||||
}
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert(vec![variant]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parsed arguments of the display-derivation macro.
/// `DeriveDisplayState::new` consumes this value: the two paths are moved over
/// unchanged and the two identifier lists are collected into hash sets.
struct DeriveDisplayInput {
|
||||
/// Path argument, stored as `DeriveDisplayState::type_path`.
type_path: Path,
|
||||
/// Path argument, stored as `DeriveDisplayState::trait_`.
trait_: Path,
|
||||
/// Comma-separated identifiers; they end up in `DeriveDisplayState::ignore_types`.
ignore_types: Punctuated<Ident, Token![,]>,
|
||||
/// Comma-separated identifiers; they end up in `DeriveDisplayState::ignore_fns`.
ignore_fns: Punctuated<Ident, Token![,]>,
|
||||
}
|
||||
|
||||
impl Parse for DeriveDisplayInput {
|
||||
impl Parse for OverrideMacro {
|
||||
fn parse(input: ParseStream) -> syn::Result<Self> {
|
||||
let type_path = input.parse::<Path>()?;
|
||||
input.parse::<Token![,]>()?;
|
||||
let trait_ = input.parse::<Path>()?;
|
||||
input.parse::<Token![,]>()?;
|
||||
let ignore_types_buffer;
|
||||
bracketed!(ignore_types_buffer in input);
|
||||
let ignore_types = ignore_types_buffer.parse_terminated(Ident::parse)?;
|
||||
input.parse::<Token![,]>()?;
|
||||
let ignore_fns_buffer;
|
||||
bracketed!(ignore_fns_buffer in input);
|
||||
let ignore_fns = ignore_fns_buffer.parse_terminated(Ident::parse)?;
|
||||
Ok(Self {
|
||||
type_path,
|
||||
trait_,
|
||||
ignore_types,
|
||||
ignore_fns,
|
||||
})
|
||||
let macro_ = input.parse::<Path>()?;
|
||||
input.parse::<Token![<=]>()?;
|
||||
let functions_content;
|
||||
bracketed!(functions_content in input);
|
||||
let functions = functions_content.parse_terminated(Ident::parse, Token![,])?;
|
||||
Ok(Self { macro_, functions })
|
||||
}
|
||||
}
|
||||
|
||||
struct ChooseMacro {
|
||||
default: (Path, Vec<proc_macro2::TokenStream>),
|
||||
override_lookup: FxHashMap<Ident, Path>,
|
||||
override_sets: FxHashMap<Path, Vec<proc_macro2::TokenStream>>,
|
||||
}
|
||||
|
||||
impl ChooseMacro {
|
||||
fn new(input: FnDeclInput) -> Self {
|
||||
let mut override_lookup = FxHashMap::default();
|
||||
let mut override_sets = FxHashMap::default();
|
||||
for OverrideMacro { macro_, functions } in input.overrides {
|
||||
for ident in functions {
|
||||
override_lookup.insert(ident, macro_.clone());
|
||||
override_sets.insert(macro_.clone(), Vec::new());
|
||||
}
|
||||
}
|
||||
Self {
|
||||
default: (input.normal_macro, Vec::new()),
|
||||
override_lookup,
|
||||
override_sets,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, ident: &Ident, tokens: proc_macro2::TokenStream) {
|
||||
match self.override_lookup.get(ident) {
|
||||
Some(override_macro) => {
|
||||
self.override_sets
|
||||
.get_mut(override_macro)
|
||||
.unwrap()
|
||||
.push(tokens);
|
||||
}
|
||||
None => self.default.1.push(tokens),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For some reason prettyplease will append trailing comma *only*
|
||||
// if there are two or more arguments
|
||||
struct FixFnSignatures;
|
||||
|
||||
impl VisitMut for FixFnSignatures {
|
||||
fn visit_signature_mut(&mut self, s: &mut syn::Signature) {
|
||||
s.inputs.pop_punct();
|
||||
}
|
||||
}
|
||||
|
||||
const MODULES: &[&str] = &[
|
||||
"context", "device", "driver", "function", "link", "memory", "module", "pointer",
|
||||
];
|
||||
|
||||
#[proc_macro]
|
||||
pub fn cuda_normalize_fn(tokens: TokenStream) -> TokenStream {
|
||||
let mut path = parse_macro_input!(tokens as syn::Path);
|
||||
let fn_ = path
|
||||
.segments
|
||||
.pop()
|
||||
.unwrap()
|
||||
.into_tuple()
|
||||
.0
|
||||
.ident
|
||||
.to_string();
|
||||
let already_has_module = MODULES.contains(&&*path.segments.last().unwrap().ident.to_string());
|
||||
let segments: Vec<String> = split(&fn_[2..]); // skip "cu"
|
||||
let fn_path = join(segments, !already_has_module);
|
||||
quote! {
|
||||
#path #fn_path
|
||||
}
|
||||
.into()
|
||||
}
|
||||
|
||||
/// Splits a CamelCase name into lowercase-initial segments.
///
/// Every ASCII uppercase letter starts a new segment and is stored lowercased;
/// every other character is appended to the current segment. Characters that
/// precede the first uppercase letter open a segment of their own instead of
/// panicking, and an empty input yields an empty vector.
fn split(fn_: &str) -> Vec<String> {
    let mut result: Vec<String> = Vec::new();
    for c in fn_.chars() {
        match result.last_mut() {
            // Continue the current segment.
            Some(current) if !c.is_ascii_uppercase() => current.push(c),
            // Uppercase letter, or no segment exists yet: open a new one.
            _ => result.push(c.to_ascii_lowercase().to_string()),
        }
    }
    result
}
|
||||
|
||||
fn join(fn_: Vec<String>, find_module: bool) -> Punctuated<Ident, Token![::]> {
|
||||
fn full_form(segment: &str) -> Option<&[&str]> {
|
||||
Some(match segment {
|
||||
"ctx" => &["context"],
|
||||
"func" => &["function"],
|
||||
"mem" => &["memory"],
|
||||
"memcpy" => &["memory", "copy"],
|
||||
"memset" => &["memory", "set"],
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
let mut normalized: Vec<&str> = Vec::new();
|
||||
for segment in fn_.iter() {
|
||||
match full_form(segment) {
|
||||
Some(segments) => normalized.extend(segments.into_iter()),
|
||||
None => normalized.push(&*segment),
|
||||
}
|
||||
}
|
||||
if !find_module {
|
||||
return [Ident::new(&normalized.join("_"), Span::call_site())]
|
||||
.into_iter()
|
||||
.collect();
|
||||
}
|
||||
if !MODULES.contains(&normalized[0]) {
|
||||
let mut globalized = vec!["driver"];
|
||||
globalized.extend(normalized);
|
||||
normalized = globalized;
|
||||
}
|
||||
let (module, path) = normalized.split_first().unwrap();
|
||||
let path = path.join("_");
|
||||
[module, &&*path]
|
||||
.into_iter()
|
||||
.map(|s| Ident::new(s, Span::call_site()))
|
||||
.collect()
|
||||
}
|
||||
|
|
7857
cuda_base/src/nvml.rs
Normal file
7857
cuda_base/src/nvml.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -6,3 +6,4 @@ edition = "2018"
|
|||
|
||||
[dependencies]
|
||||
cuda_base = { path = "../cuda_base" }
|
||||
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
||||
|
|
8110
cuda_types/src/cuda.rs
Normal file
8110
cuda_types/src/cuda.rs
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,3 +1,2 @@
|
|||
use cuda_base::cuda_type_declarations;
|
||||
|
||||
cuda_type_declarations!();
|
||||
pub mod cuda;
|
||||
pub mod nvml;
|
4185
cuda_types/src/nvml.rs
Normal file
4185
cuda_types/src/nvml.rs
Normal file
File diff suppressed because it is too large
Load diff
1
ext/hip_runtime-sys/README
vendored
1
ext/hip_runtime-sys/README
vendored
|
@ -1 +0,0 @@
|
|||
bindgen --rust-target 1.77 /opt/rocm/include/hip/hip_runtime_api.h -o hip_runtime_api.rs --no-layout-tests --default-enum-style=newtype --allowlist-function "hip.*" --allowlist-type "hip.*" --no-derive-debug --must-use-type hipError_t --new-type-alias "^hipDeviceptr_t$" --allowlist-var "^hip.*$" -- -I/opt/rocm/include -D__HIP_PLATFORM_AMD__
|
|
@ -1,2 +0,0 @@
|
|||
#define __HIP_PLATFORM_HCC__
|
||||
#include <hip/hip_runtime_api.h>
|
7422
ext/hip_runtime-sys/src/hip_runtime_api.rs
vendored
7422
ext/hip_runtime-sys/src/hip_runtime_api.rs
vendored
File diff suppressed because it is too large
Load diff
12669
ext/hip_runtime-sys/src/lib.rs
vendored
12669
ext/hip_runtime-sys/src/lib.rs
vendored
File diff suppressed because it is too large
Load diff
1
ext/spirv-headers
vendored
1
ext/spirv-headers
vendored
|
@ -1 +0,0 @@
|
|||
Subproject commit 308bd07424350a6000f35a77b5f85cd4f3da319e
|
1
ext/spirv-tools
vendored
1
ext/spirv-tools
vendored
|
@ -1 +0,0 @@
|
|||
Subproject commit e128ab0d624ce7beb08eb9656bb260c597a46d0a
|
1
geekbench.svg
Normal file
1
geekbench.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 287 KiB |
|
@ -1,8 +0,0 @@
|
|||
[package]
|
||||
name = "level_zero-sys"
|
||||
version = "1.0.4"
|
||||
authors = ["Andrzej Janik <vosen@vosen.pl>"]
|
||||
edition = "2018"
|
||||
links = "ze_loader"
|
||||
|
||||
[lib]
|
|
@ -1,4 +0,0 @@
|
|||
sed 's/^typedef uint32_t ze_.*flags_t;$//g' include/ze_api.h > include/level_zero/ze_api.h
|
||||
sed -i -r 's/ze_(.*)_flag_t/ze_\1_flags_t/g' include/level_zero/ze_api.h
|
||||
bindgen --with-derive-default --no-default ".*format_t" --no-default ".*fd_t" --no-default ".*constants_t" --no-default ".*handle_t" --no-default ".*desc_t" --no-default ".*params_t" --size_t-is-usize --default-enum-style=newtype --bitfield-enum ".*flags_t" --whitelist-function "ze.*" --whitelist-type "ze.*" include/level_zero/ze_api.h -o src/ze_api.rs -- -Iinclude
|
||||
sed -i 's/pub struct _ze_result_t/#[must_use]\npub struct _ze_result_t/g' src/ze_api.rs
|
|
@ -1,17 +0,0 @@
|
|||
use env::VarError;
|
||||
use std::{env, path::PathBuf};
|
||||
|
||||
/// Build script: links against the `ze_loader` dynamic library and, for
/// Windows targets, tells the linker where to look for it.
///
/// Returns an error when a required Cargo-provided environment variable is
/// missing.
fn main() -> Result<(), VarError> {
    println!("cargo:rustc-link-lib=dylib=ze_loader");
    // A build script is compiled for the host, so `cfg!(windows)` would describe
    // the machine running the build. Cargo exposes the *target* configuration
    // through `CARGO_CFG_*`; boolean options are present only when set.
    let targets_windows = env::var_os("CARGO_CFG_WINDOWS").is_some();
    if targets_windows {
        let target_env = env::var("CARGO_CFG_TARGET_ENV")?;
        if target_env == "msvc" {
            // MSVC: use the import library from this crate's `lib` directory.
            let mut path = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?);
            path.push("lib");
            println!("cargo:rustc-link-search=native={}", path.display());
        } else {
            println!("cargo:rustc-link-search=native=C:\\Windows\\System32");
        };
    }
    Ok(())
}
|
1
level_zero-sys/include/level_zero/.gitignore
vendored
1
level_zero-sys/include/level_zero/.gitignore
vendored
|
@ -1 +0,0 @@
|
|||
ze_api.h
|
File diff suppressed because it is too large
Load diff
|
@ -1,316 +0,0 @@
|
|||
EXPORTS
|
||||
zeCommandListAppendBarrier
|
||||
zeCommandListAppendEventReset
|
||||
zeCommandListAppendImageCopy
|
||||
zeCommandListAppendImageCopyFromMemory
|
||||
zeCommandListAppendImageCopyRegion
|
||||
zeCommandListAppendImageCopyToMemory
|
||||
zeCommandListAppendLaunchCooperativeKernel
|
||||
zeCommandListAppendLaunchKernel
|
||||
zeCommandListAppendLaunchKernelIndirect
|
||||
zeCommandListAppendLaunchMultipleKernelsIndirect
|
||||
zeCommandListAppendMemAdvise
|
||||
zeCommandListAppendMemoryCopy
|
||||
zeCommandListAppendMemoryCopyFromContext
|
||||
zeCommandListAppendMemoryCopyRegion
|
||||
zeCommandListAppendMemoryFill
|
||||
zeCommandListAppendMemoryPrefetch
|
||||
zeCommandListAppendMemoryRangesBarrier
|
||||
zeCommandListAppendQueryKernelTimestamps
|
||||
zeCommandListAppendSignalEvent
|
||||
zeCommandListAppendWaitOnEvents
|
||||
zeCommandListAppendWriteGlobalTimestamp
|
||||
zeCommandListClose
|
||||
zeCommandListCreate
|
||||
zeCommandListCreateImmediate
|
||||
zeCommandListDestroy
|
||||
zeCommandListReset
|
||||
zeCommandQueueCreate
|
||||
zeCommandQueueDestroy
|
||||
zeCommandQueueExecuteCommandLists
|
||||
zeCommandQueueSynchronize
|
||||
zeContextCreate
|
||||
zeContextCreateEx
|
||||
zeContextDestroy
|
||||
zeContextEvictImage
|
||||
zeContextEvictMemory
|
||||
zeContextGetStatus
|
||||
zeContextMakeImageResident
|
||||
zeContextMakeMemoryResident
|
||||
zeContextSystemBarrier
|
||||
zeDeviceCanAccessPeer
|
||||
zeDeviceGet
|
||||
zeDeviceGetCacheProperties
|
||||
zeDeviceGetCommandQueueGroupProperties
|
||||
zeDeviceGetComputeProperties
|
||||
zeDeviceGetExternalMemoryProperties
|
||||
zeDeviceGetGlobalTimestamps
|
||||
zeDeviceGetImageProperties
|
||||
zeDeviceGetMemoryAccessProperties
|
||||
zeDeviceGetMemoryProperties
|
||||
zeDeviceGetModuleProperties
|
||||
zeDeviceGetP2PProperties
|
||||
zeDeviceGetProperties
|
||||
zeDeviceGetStatus
|
||||
zeDeviceGetSubDevices
|
||||
zeDriverGet
|
||||
zeDriverGetApiVersion
|
||||
zeDriverGetExtensionFunctionAddress
|
||||
zeDriverGetExtensionProperties
|
||||
zeDriverGetIpcProperties
|
||||
zeDriverGetProperties
|
||||
zeEventCreate
|
||||
zeEventDestroy
|
||||
zeEventHostReset
|
||||
zeEventHostSignal
|
||||
zeEventHostSynchronize
|
||||
zeEventPoolCloseIpcHandle
|
||||
zeEventPoolCreate
|
||||
zeEventPoolDestroy
|
||||
zeEventPoolGetIpcHandle
|
||||
zeEventPoolOpenIpcHandle
|
||||
zeEventQueryKernelTimestamp
|
||||
zeEventQueryStatus
|
||||
zeFenceCreate
|
||||
zeFenceDestroy
|
||||
zeFenceHostSynchronize
|
||||
zeFenceQueryStatus
|
||||
zeFenceReset
|
||||
zeGetCommandListProcAddrTable
|
||||
zeGetCommandQueueProcAddrTable
|
||||
zeGetContextProcAddrTable
|
||||
zeGetDeviceProcAddrTable
|
||||
zeGetDriverProcAddrTable
|
||||
zeGetEventPoolProcAddrTable
|
||||
zeGetEventProcAddrTable
|
||||
zeGetFenceProcAddrTable
|
||||
zeGetGlobalProcAddrTable
|
||||
zeGetImageProcAddrTable
|
||||
zeGetKernelExpProcAddrTable
|
||||
zeGetKernelProcAddrTable
|
||||
zeGetMemProcAddrTable
|
||||
zeGetModuleBuildLogProcAddrTable
|
||||
zeGetModuleProcAddrTable
|
||||
zeGetPhysicalMemProcAddrTable
|
||||
zeGetSamplerProcAddrTable
|
||||
zeGetVirtualMemProcAddrTable
|
||||
zeImageCreate
|
||||
zeImageDestroy
|
||||
zeImageGetProperties
|
||||
zeInit
|
||||
zeKernelCreate
|
||||
zeKernelDestroy
|
||||
zeKernelGetIndirectAccess
|
||||
zeKernelGetName
|
||||
zeKernelGetProperties
|
||||
zeKernelGetSourceAttributes
|
||||
zeKernelSetArgumentValue
|
||||
zeKernelSetCacheConfig
|
||||
zeKernelSetGlobalOffsetExp
|
||||
zeKernelSetGroupSize
|
||||
zeKernelSetIndirectAccess
|
||||
zeKernelSuggestGroupSize
|
||||
zeKernelSuggestMaxCooperativeGroupCount
|
||||
zeLoaderInit
|
||||
zeMemAllocDevice
|
||||
zeMemAllocHost
|
||||
zeMemAllocShared
|
||||
zeMemCloseIpcHandle
|
||||
zeMemFree
|
||||
zeMemGetAddressRange
|
||||
zeMemGetAllocProperties
|
||||
zeMemGetIpcHandle
|
||||
zeMemOpenIpcHandle
|
||||
zeModuleBuildLogDestroy
|
||||
zeModuleBuildLogGetString
|
||||
zeModuleCreate
|
||||
zeModuleDestroy
|
||||
zeModuleDynamicLink
|
||||
zeModuleGetFunctionPointer
|
||||
zeModuleGetGlobalPointer
|
||||
zeModuleGetKernelNames
|
||||
zeModuleGetNativeBinary
|
||||
zeModuleGetProperties
|
||||
zePhysicalMemCreate
|
||||
zePhysicalMemDestroy
|
||||
zeSamplerCreate
|
||||
zeSamplerDestroy
|
||||
zeVirtualMemFree
|
||||
zeVirtualMemGetAccessAttribute
|
||||
zeVirtualMemMap
|
||||
zeVirtualMemQueryPageSize
|
||||
zeVirtualMemReserve
|
||||
zeVirtualMemSetAccessAttribute
|
||||
zeVirtualMemUnmap
|
||||
zelGetTracerApiProcAddrTable
|
||||
zelTracerCreate
|
||||
zelTracerDestroy
|
||||
zelTracerSetEnabled
|
||||
zelTracerSetEpilogues
|
||||
zelTracerSetPrologues
|
||||
zesDeviceEnumDiagnosticTestSuites
|
||||
zesDeviceEnumEngineGroups
|
||||
zesDeviceEnumFabricPorts
|
||||
zesDeviceEnumFans
|
||||
zesDeviceEnumFirmwares
|
||||
zesDeviceEnumFrequencyDomains
|
||||
zesDeviceEnumLeds
|
||||
zesDeviceEnumMemoryModules
|
||||
zesDeviceEnumPerformanceFactorDomains
|
||||
zesDeviceEnumPowerDomains
|
||||
zesDeviceEnumPsus
|
||||
zesDeviceEnumRasErrorSets
|
||||
zesDeviceEnumSchedulers
|
||||
zesDeviceEnumStandbyDomains
|
||||
zesDeviceEnumTemperatureSensors
|
||||
zesDeviceEventRegister
|
||||
zesDeviceGetProperties
|
||||
zesDeviceGetState
|
||||
zesDevicePciGetBars
|
||||
zesDevicePciGetProperties
|
||||
zesDevicePciGetState
|
||||
zesDevicePciGetStats
|
||||
zesDeviceProcessesGetState
|
||||
zesDeviceReset
|
||||
zesDiagnosticsGetProperties
|
||||
zesDiagnosticsGetTests
|
||||
zesDiagnosticsRunTests
|
||||
zesDriverEventListen
|
||||
zesDriverEventListenEx
|
||||
zesEngineGetActivity
|
||||
zesEngineGetProperties
|
||||
zesFabricPortGetConfig
|
||||
zesFabricPortGetLinkType
|
||||
zesFabricPortGetProperties
|
||||
zesFabricPortGetState
|
||||
zesFabricPortGetThroughput
|
||||
zesFabricPortSetConfig
|
||||
zesFanGetConfig
|
||||
zesFanGetProperties
|
||||
zesFanGetState
|
||||
zesFanSetDefaultMode
|
||||
zesFanSetFixedSpeedMode
|
||||
zesFanSetSpeedTableMode
|
||||
zesFirmwareFlash
|
||||
zesFirmwareGetProperties
|
||||
zesFrequencyGetAvailableClocks
|
||||
zesFrequencyGetProperties
|
||||
zesFrequencyGetRange
|
||||
zesFrequencyGetState
|
||||
zesFrequencyGetThrottleTime
|
||||
zesFrequencyOcGetCapabilities
|
||||
zesFrequencyOcGetFrequencyTarget
|
||||
zesFrequencyOcGetIccMax
|
||||
zesFrequencyOcGetMode
|
||||
zesFrequencyOcGetTjMax
|
||||
zesFrequencyOcGetVoltageTarget
|
||||
zesFrequencyOcSetFrequencyTarget
|
||||
zesFrequencyOcSetIccMax
|
||||
zesFrequencyOcSetMode
|
||||
zesFrequencyOcSetTjMax
|
||||
zesFrequencyOcSetVoltageTarget
|
||||
zesFrequencySetRange
|
||||
zesGetDeviceProcAddrTable
|
||||
zesGetDiagnosticsProcAddrTable
|
||||
zesGetDriverProcAddrTable
|
||||
zesGetEngineProcAddrTable
|
||||
zesGetFabricPortProcAddrTable
|
||||
zesGetFanProcAddrTable
|
||||
zesGetFirmwareProcAddrTable
|
||||
zesGetFrequencyProcAddrTable
|
||||
zesGetLedProcAddrTable
|
||||
zesGetMemoryProcAddrTable
|
||||
zesGetPerformanceFactorProcAddrTable
|
||||
zesGetPowerProcAddrTable
|
||||
zesGetPsuProcAddrTable
|
||||
zesGetRasProcAddrTable
|
||||
zesGetSchedulerProcAddrTable
|
||||
zesGetStandbyProcAddrTable
|
||||
zesGetTemperatureProcAddrTable
|
||||
zesLedGetProperties
|
||||
zesLedGetState
|
||||
zesLedSetColor
|
||||
zesLedSetState
|
||||
zesMemoryGetBandwidth
|
||||
zesMemoryGetProperties
|
||||
zesMemoryGetState
|
||||
zesPerformanceFactorGetConfig
|
||||
zesPerformanceFactorGetProperties
|
||||
zesPerformanceFactorSetConfig
|
||||
zesPowerGetEnergyCounter
|
||||
zesPowerGetEnergyThreshold
|
||||
zesPowerGetLimits
|
||||
zesPowerGetProperties
|
||||
zesPowerSetEnergyThreshold
|
||||
zesPowerSetLimits
|
||||
zesPsuGetProperties
|
||||
zesPsuGetState
|
||||
zesRasGetConfig
|
||||
zesRasGetProperties
|
||||
zesRasGetState
|
||||
zesRasSetConfig
|
||||
zesSchedulerGetCurrentMode
|
||||
zesSchedulerGetProperties
|
||||
zesSchedulerGetTimeoutModeProperties
|
||||
zesSchedulerGetTimesliceModeProperties
|
||||
zesSchedulerSetComputeUnitDebugMode
|
||||
zesSchedulerSetExclusiveMode
|
||||
zesSchedulerSetTimeoutMode
|
||||
zesSchedulerSetTimesliceMode
|
||||
zesStandbyGetMode
|
||||
zesStandbyGetProperties
|
||||
zesStandbySetMode
|
||||
zesTemperatureGetConfig
|
||||
zesTemperatureGetProperties
|
||||
zesTemperatureGetState
|
||||
zesTemperatureSetConfig
|
||||
zetCommandListAppendMetricMemoryBarrier
|
||||
zetCommandListAppendMetricQueryBegin
|
||||
zetCommandListAppendMetricQueryEnd
|
||||
zetCommandListAppendMetricStreamerMarker
|
||||
zetContextActivateMetricGroups
|
||||
zetDebugAcknowledgeEvent
|
||||
zetDebugAttach
|
||||
zetDebugDetach
|
||||
zetDebugGetRegisterSetProperties
|
||||
zetDebugInterrupt
|
||||
zetDebugReadEvent
|
||||
zetDebugReadMemory
|
||||
zetDebugReadRegisters
|
||||
zetDebugResume
|
||||
zetDebugWriteMemory
|
||||
zetDebugWriteRegisters
|
||||
zetDeviceGetDebugProperties
|
||||
zetGetCommandListProcAddrTable
|
||||
zetGetContextProcAddrTable
|
||||
zetGetDebugProcAddrTable
|
||||
zetGetDeviceProcAddrTable
|
||||
zetGetKernelProcAddrTable
|
||||
zetGetMetricGroupProcAddrTable
|
||||
zetGetMetricProcAddrTable
|
||||
zetGetMetricQueryPoolProcAddrTable
|
||||
zetGetMetricQueryProcAddrTable
|
||||
zetGetMetricStreamerProcAddrTable
|
||||
zetGetModuleProcAddrTable
|
||||
zetGetTracerExpProcAddrTable
|
||||
zetKernelGetProfileInfo
|
||||
zetMetricGet
|
||||
zetMetricGetProperties
|
||||
zetMetricGroupCalculateMetricValues
|
||||
zetMetricGroupGet
|
||||
zetMetricGroupGetProperties
|
||||
zetMetricQueryCreate
|
||||
zetMetricQueryDestroy
|
||||
zetMetricQueryGetData
|
||||
zetMetricQueryPoolCreate
|
||||
zetMetricQueryPoolDestroy
|
||||
zetMetricQueryReset
|
||||
zetMetricStreamerClose
|
||||
zetMetricStreamerOpen
|
||||
zetMetricStreamerReadData
|
||||
zetModuleGetDebugInfo
|
||||
zetTracerExpCreate
|
||||
zetTracerExpDestroy
|
||||
zetTracerExpSetEnabled
|
||||
zetTracerExpSetEpilogues
|
||||
zetTracerExpSetPrologues
|
Binary file not shown.
|
@ -1,3 +0,0 @@
|
|||
#![allow(warnings)]
|
||||
pub mod ze_api;
|
||||
pub use ze_api::*;
|
File diff suppressed because it is too large
Load diff
|
@ -1,14 +0,0 @@
|
|||
[package]
|
||||
name = "level_zero"
|
||||
version = "0.1.0"
|
||||
authors = ["Andrzej Janik <vosen@vosen.pl>"]
|
||||
edition = "2018"
|
||||
|
||||
[lib]
|
||||
|
||||
[dependencies]
|
||||
level_zero-sys = { path = "../level_zero-sys" }
|
||||
|
||||
[dependencies.ocl-core]
|
||||
version = "0.11"
|
||||
features = ["opencl_version_1_2", "opencl_version_2_0", "opencl_version_2_1"]
|
|
@ -1 +0,0 @@
|
|||
More ergonomic bindings for oneAPI Level Zero
|
|
@ -1,4 +0,0 @@
|
|||
pub use level_zero_sys as sys;
|
||||
|
||||
pub mod ze;
|
||||
pub use ze::*;
|
1289
level_zero/src/ze.rs
1289
level_zero/src/ze.rs
File diff suppressed because it is too large
Load diff
|
@ -1,7 +1,10 @@
|
|||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
#include <llvm-c/Core.h>
|
||||
#include <llvm/IR/IRBuilder.h>
|
||||
#include <llvm/IR/Type.h>
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
|
@ -189,7 +192,8 @@ void LLVMZludaBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering,
|
|||
auto builder = llvm::unwrap(B);
|
||||
LLVMContext &context = builder->getContext();
|
||||
builder->CreateFence(mapFromLLVMOrdering(Ordering),
|
||||
context.getOrInsertSyncScopeID(scope));
|
||||
context.getOrInsertSyncScopeID(scope),
|
||||
Name);
|
||||
}
|
||||
|
||||
LLVM_C_EXTERN_C_END
|
|
@ -17,6 +17,10 @@ bitflags = "1.2"
|
|||
rustc-hash = "2.0.0"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
petgraph = "0.7.1"
|
||||
microlp = "0.2.10"
|
||||
int-enum = "1.1"
|
||||
unwrap_or = "1.0.1"
|
||||
|
||||
[dev-dependencies]
|
||||
hip_runtime-sys = { path = "../ext/hip_runtime-sys" }
|
||||
|
@ -24,3 +28,4 @@ comgr = { path = "../comgr" }
|
|||
tempfile = "3"
|
||||
paste = "1.0"
|
||||
cuda-driver-sys = "0.3.0"
|
||||
pretty_assertions = "1.4.1"
|
Binary file not shown.
|
@ -1,8 +1,10 @@
|
|||
// Every time this file changes it must be rebuilt, you need `rocm-llvm-dev` and `llvm-17`:
|
||||
// /opt/rocm/llvm/bin/clang -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
|
||||
// Every time this file changes it must be rebuilt, you need `rocm-llvm-dev` and `llvm-17`
|
||||
// `fdenormal-fp-math=dynamic` is required to make functions eligible for inlining
|
||||
// /opt/rocm/llvm/bin/clang -Xclang -fdenormal-fp-math=dynamic -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -nogpulib -O3 -mno-wavefrontsize64 -o zluda_ptx_impl.bc -emit-llvm -c --offload-device-only --offload-arch=gfx1010 && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc -o - | sed '/@llvm.used/d' | sed '/wchar_size/d' | sed '/llvm.module.flags/d' | sed 's/define hidden/define linkonce_odr/g' | sed 's/\"target-cpu\"=\"gfx1010\"//g' | sed -E 's/\"target-features\"=\"[^\"]+\"//g' | sed 's/ nneg / /g' | sed 's/ disjoint / /g' | llvm-as-17 - -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <hip/amd_detail/amd_device_functions.h>
|
||||
|
||||
#define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME
|
||||
|
||||
|
@ -25,19 +27,19 @@ extern "C"
|
|||
return (uint32_t)__ockl_get_local_size(member);
|
||||
}
|
||||
|
||||
size_t __ockl_get_global_id(uint32_t) __device__;
|
||||
size_t __ockl_get_group_id(uint32_t) __device__;
|
||||
uint32_t FUNC(sreg_ctaid)(uint8_t member)
|
||||
{
|
||||
return (uint32_t)__ockl_get_global_id(member);
|
||||
return (uint32_t)__ockl_get_group_id(member);
|
||||
}
|
||||
|
||||
size_t __ockl_get_global_size(uint32_t) __device__;
|
||||
size_t __ockl_get_num_groups(uint32_t) __device__;
|
||||
uint32_t FUNC(sreg_nctaid)(uint8_t member)
|
||||
{
|
||||
return (uint32_t)__ockl_get_global_size(member);
|
||||
return (uint32_t)__ockl_get_num_groups(member);
|
||||
}
|
||||
|
||||
uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __attribute__((device));
|
||||
uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__;
|
||||
uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
uint32_t pos = pos_32 & 0xFFU;
|
||||
|
@ -65,7 +67,7 @@ extern "C"
|
|||
return (base >> pos) & ((1UL << len) - 1UL);
|
||||
}
|
||||
|
||||
int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __attribute__((device));
|
||||
int32_t __ockl_bfe_i32(int32_t, uint32_t, uint32_t) __device__;
|
||||
int32_t FUNC(bfe_s32)(int32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
uint32_t pos = pos_32 & 0xFFU;
|
||||
|
@ -120,7 +122,7 @@ extern "C"
|
|||
return (base << (64U - pos - len)) >> (64U - len);
|
||||
}
|
||||
|
||||
uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __attribute__((device));
|
||||
uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __device__;
|
||||
uint32_t FUNC(bfi_b32)(uint32_t insert, uint32_t base, uint32_t pos_32, uint32_t len_32)
|
||||
{
|
||||
uint32_t pos = pos_32 & 0xFFU;
|
||||
|
@ -148,4 +150,20 @@ extern "C"
|
|||
mask = ((1UL << len) - 1UL) << (pos);
|
||||
return (~mask & base) | (mask & (insert << pos));
|
||||
}
|
||||
|
||||
// Barrier helper exported as `__zluda_ptx_impl_bar_sync` (see the FUNC macro).
// Issues a sequentially-consistent fence at "workgroup" scope and then the
// `s_barrier` builtin. The unnamed uint32_t argument is not used.
void FUNC(bar_sync)(uint32_t)
|
||||
{
|
||||
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
|
||||
// Assertion-failure helper exported as `__zluda_ptx_impl___assertfail`.
// The message, file and function arguments arrive as 64-bit integers that are
// reinterpreted as C string pointers and forwarded to `__assert_fail`.
void FUNC(__assertfail)(uint64_t message,
                        uint64_t file,
                        uint32_t line,
                        uint64_t function,
                        uint64_t char_size)
{
    // Accepted as part of the signature but not needed by `__assert_fail`.
    (void)char_size;
    const char *message_text = (const char *)message;
    const char *file_name = (const char *)file;
    const char *function_name = (const char *)function;
    __assert_fail(message_text, file_name, line, function_name);
}
|
||||
}
|
||||
|
|
|
@ -2,8 +2,8 @@ use super::*;
|
|||
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
|
@ -12,8 +12,8 @@ pub(super) fn run<'a, 'input>(
|
|||
|
||||
fn run_directive<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||
|
@ -22,13 +22,13 @@ fn run_directive<'input>(
|
|||
|
||||
fn run_method<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2,
|
||||
mut method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let is_declaration = method.body.is_none();
|
||||
let mut body = Vec::new();
|
||||
let mut remap_returns = Vec::new();
|
||||
if !method.func_decl.name.is_kernel() {
|
||||
for arg in method.func_decl.return_arguments.iter_mut() {
|
||||
if !method.is_kernel {
|
||||
for arg in method.return_arguments.iter_mut() {
|
||||
match arg.state_space {
|
||||
ptx_parser::StateSpace::Param => {
|
||||
arg.state_space = ptx_parser::StateSpace::Reg;
|
||||
|
@ -51,7 +51,7 @@ fn run_method<'input>(
|
|||
_ => return Err(error_unreachable()),
|
||||
}
|
||||
}
|
||||
for arg in method.func_decl.input_arguments.iter_mut() {
|
||||
for arg in method.input_arguments.iter_mut() {
|
||||
match arg.state_space {
|
||||
ptx_parser::StateSpace::Param => {
|
||||
arg.state_space = ptx_parser::StateSpace::Reg;
|
||||
|
@ -95,14 +95,7 @@ fn run_method<'input>(
|
|||
Ok::<_, TranslateError>(body)
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'input>(
|
||||
|
|
|
@ -65,17 +65,24 @@ impl Drop for Context {
|
|||
}
|
||||
}
|
||||
|
||||
struct Module(LLVMModuleRef);
|
||||
pub struct Module(LLVMModuleRef, Context);
|
||||
|
||||
impl Module {
|
||||
fn new(ctx: &Context, name: &CStr) -> Self {
|
||||
Self(unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) })
|
||||
fn new(ctx: Context, name: &CStr) -> Self {
|
||||
Self(
|
||||
unsafe { LLVMModuleCreateWithNameInContext(name.as_ptr(), ctx.get()) },
|
||||
ctx,
|
||||
)
|
||||
}
|
||||
|
||||
fn get(&self) -> LLVMModuleRef {
|
||||
self.0
|
||||
}
|
||||
|
||||
fn context(&self) -> &Context {
|
||||
&self.1
|
||||
}
|
||||
|
||||
fn verify(&self) -> Result<(), Message> {
|
||||
let mut err = ptr::null_mut();
|
||||
let error = unsafe {
|
||||
|
@ -92,13 +99,14 @@ impl Module {
|
|||
}
|
||||
}
|
||||
|
||||
fn write_bitcode_to_memory(&self) -> MemoryBuffer {
|
||||
pub fn write_bitcode_to_memory(&self) -> MemoryBuffer {
|
||||
let memory_buffer = unsafe { LLVMWriteBitcodeToMemoryBuffer(self.get()) };
|
||||
MemoryBuffer(memory_buffer)
|
||||
}
|
||||
|
||||
fn write_to_stderr(&self) {
|
||||
unsafe { LLVMDumpModule(self.get()) };
|
||||
pub fn print_module_to_string(&self) -> Message {
|
||||
let asm = unsafe { LLVMPrintModuleToString(self.get()) };
|
||||
Message(unsafe { CStr::from_ptr(asm) })
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -134,7 +142,7 @@ impl Drop for Builder {
|
|||
}
|
||||
}
|
||||
|
||||
struct Message(&'static CStr);
|
||||
pub struct Message(&'static CStr);
|
||||
|
||||
impl Drop for Message {
|
||||
fn drop(&mut self) {
|
||||
|
@ -150,6 +158,12 @@ impl std::fmt::Debug for Message {
|
|||
}
|
||||
}
|
||||
|
||||
impl Message {
|
||||
pub fn to_str(&self) -> &str {
|
||||
self.0.to_str().unwrap().trim()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MemoryBuffer(LLVMMemoryBufferRef);
|
||||
|
||||
impl Drop for MemoryBuffer {
|
||||
|
@ -172,22 +186,21 @@ impl Deref for MemoryBuffer {
|
|||
|
||||
pub(super) fn run<'input>(
|
||||
id_defs: GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<MemoryBuffer, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Module, TranslateError> {
|
||||
let context = Context::new();
|
||||
let module = Module::new(&context, LLVM_UNNAMED);
|
||||
let mut emit_ctx = ModuleEmitContext::new(&context, &module, &id_defs);
|
||||
let module = Module::new(context, LLVM_UNNAMED);
|
||||
let mut emit_ctx = ModuleEmitContext::new(&module, &id_defs);
|
||||
for directive in directives {
|
||||
match directive {
|
||||
Directive2::Variable(linking, variable) => emit_ctx.emit_global(linking, variable)?,
|
||||
Directive2::Method(method) => emit_ctx.emit_method(method)?,
|
||||
}
|
||||
}
|
||||
module.write_to_stderr();
|
||||
if let Err(err) = module.verify() {
|
||||
panic!("{:?}", err);
|
||||
}
|
||||
Ok(module.write_bitcode_to_memory())
|
||||
Ok(module)
|
||||
}
|
||||
|
||||
struct ModuleEmitContext<'a, 'input> {
|
||||
|
@ -199,11 +212,8 @@ struct ModuleEmitContext<'a, 'input> {
|
|||
}
|
||||
|
||||
impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
||||
fn new(
|
||||
context: &Context,
|
||||
module: &Module,
|
||||
id_defs: &'a GlobalStringIdentResolver2<'input>,
|
||||
) -> Self {
|
||||
fn new(module: &Module, id_defs: &'a GlobalStringIdentResolver2<'input>) -> Self {
|
||||
let context = module.context();
|
||||
ModuleEmitContext {
|
||||
context: context.get(),
|
||||
module: module.get(),
|
||||
|
@ -223,39 +233,51 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||
|
||||
fn emit_method(
|
||||
&mut self,
|
||||
method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
let func_decl = method.func_decl;
|
||||
let name = method
|
||||
.import_as
|
||||
.as_deref()
|
||||
.or_else(|| match func_decl.name {
|
||||
ast::MethodName::Kernel(name) => Some(name),
|
||||
ast::MethodName::Func(id) => self.id_defs.ident_map[&id].name.as_deref(),
|
||||
})
|
||||
.or_else(|| self.id_defs.ident_map[&method.name].name.as_deref())
|
||||
.ok_or_else(|| error_unreachable())?;
|
||||
let name = CString::new(name).map_err(|_| error_unreachable())?;
|
||||
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
||||
if fn_ == ptr::null_mut() {
|
||||
let fn_type = get_function_type(
|
||||
self.context,
|
||||
func_decl.return_arguments.iter().map(|v| &v.v_type),
|
||||
func_decl
|
||||
method.return_arguments.iter().map(|v| &v.v_type),
|
||||
method
|
||||
.input_arguments
|
||||
.iter()
|
||||
.map(|v| get_input_argument_type(self.context, &v.v_type, v.state_space)),
|
||||
)?;
|
||||
fn_ = unsafe { LLVMAddFunction(self.module, name.as_ptr(), fn_type) };
|
||||
self.emit_fn_attribute(fn_, "amdgpu-unsafe-fp-atomics", "true");
|
||||
self.emit_fn_attribute(fn_, "uniform-work-group-size", "true");
|
||||
self.emit_fn_attribute(fn_, "no-trapping-math", "true");
|
||||
}
|
||||
if let ast::MethodName::Func(name) = func_decl.name {
|
||||
self.resolver.register(name, fn_);
|
||||
if !method.is_kernel {
|
||||
self.resolver.register(method.name, fn_);
|
||||
self.emit_fn_attribute(fn_, "denormal-fp-math-f32", "dynamic");
|
||||
self.emit_fn_attribute(fn_, "denormal-fp-math", "dynamic");
|
||||
} else {
|
||||
self.emit_fn_attribute(
|
||||
fn_,
|
||||
"denormal-fp-math-f32",
|
||||
llvm_ftz(method.flush_to_zero_f32),
|
||||
);
|
||||
self.emit_fn_attribute(
|
||||
fn_,
|
||||
"denormal-fp-math",
|
||||
llvm_ftz(method.flush_to_zero_f16f64),
|
||||
);
|
||||
}
|
||||
for (i, param) in func_decl.input_arguments.iter().enumerate() {
|
||||
for (i, param) in method.input_arguments.iter().enumerate() {
|
||||
let value = unsafe { LLVMGetParam(fn_, i as u32) };
|
||||
let name = self.resolver.get_or_add(param.name);
|
||||
unsafe { LLVMSetValueName2(value, name.as_ptr().cast(), name.len()) };
|
||||
self.resolver.register(param.name, value);
|
||||
if func_decl.name.is_kernel() {
|
||||
if method.is_kernel {
|
||||
let attr_kind = unsafe {
|
||||
LLVMGetEnumAttributeKindForName(b"byref".as_ptr().cast(), b"byref".len())
|
||||
};
|
||||
|
@ -269,7 +291,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||
unsafe { LLVMAddAttributeAtIndex(fn_, i as u32 + 1, attr) };
|
||||
}
|
||||
}
|
||||
let call_conv = if func_decl.name.is_kernel() {
|
||||
let call_conv = if method.is_kernel {
|
||||
Self::kernel_call_convention()
|
||||
} else {
|
||||
Self::func_call_convention()
|
||||
|
@ -284,7 +306,7 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||
unsafe { LLVMAppendBasicBlockInContext(self.context, fn_, LLVM_UNNAMED.as_ptr()) };
|
||||
unsafe { LLVMPositionBuilderAtEnd(self.builder.get(), real_bb) };
|
||||
let mut method_emitter = MethodEmitContext::new(self, fn_, variables_builder);
|
||||
for var in func_decl.return_arguments {
|
||||
for var in method.return_arguments {
|
||||
method_emitter.emit_variable(var)?;
|
||||
}
|
||||
for statement in statements.iter() {
|
||||
|
@ -292,6 +314,17 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||
method_emitter.emit_label_initial(*label);
|
||||
}
|
||||
}
|
||||
let mut statements = statements.into_iter();
|
||||
if let Some(Statement::Label(label)) = statements.next() {
|
||||
method_emitter.emit_label_delayed(label)?;
|
||||
} else {
|
||||
return Err(error_unreachable());
|
||||
}
|
||||
method_emitter.emit_kernel_rounding_prelude(
|
||||
method.is_kernel,
|
||||
method.rounding_mode_f32,
|
||||
method.rounding_mode_f16f64,
|
||||
)?;
|
||||
for statement in statements {
|
||||
method_emitter.emit_statement(statement)?;
|
||||
}
|
||||
|
@ -404,6 +437,27 @@ impl<'a, 'input> ModuleEmitContext<'a, 'input> {
|
|||
ptx_parser::ScalarType::BF16x2 => todo!(),
|
||||
})
|
||||
}
|
||||
|
||||
fn emit_fn_attribute(&self, llvm_object: LLVMValueRef, key: &str, value: &str) {
|
||||
let attribute = unsafe {
|
||||
LLVMCreateStringAttribute(
|
||||
self.context,
|
||||
key.as_ptr() as _,
|
||||
key.len() as u32,
|
||||
value.as_ptr() as _,
|
||||
value.len() as u32,
|
||||
)
|
||||
};
|
||||
unsafe { LLVMAddAttributeAtIndex(llvm_object, LLVMAttributeFunctionIndex, attribute) };
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps a flush-to-zero flag to the string used as the value of the
/// `denormal-fp-math*` function attributes: `"preserve-sign"` when flushing
/// is requested, `"ieee"` otherwise.
fn llvm_ftz(ftz: bool) -> &'static str {
    match ftz {
        true => "preserve-sign",
        false => "ieee",
    }
}
|
||||
|
||||
fn get_input_argument_type(
|
||||
|
@ -462,9 +516,32 @@ impl<'a> MethodEmitContext<'a> {
|
|||
Statement::FunctionPointer(_) => todo!(),
|
||||
Statement::VectorRead(vector_read) => self.emit_vector_read(vector_read)?,
|
||||
Statement::VectorWrite(vector_write) => self.emit_vector_write(vector_write)?,
|
||||
Statement::SetMode(mode_reg) => self.emit_set_mode(mode_reg)?,
|
||||
})
|
||||
}
|
||||
|
||||
// This should be a kernel attribute, but sadly AMDGPU LLVM target does
|
||||
// not support attribute for it. So we have to set it as the first
|
||||
// instruction in the body of a kernel
|
||||
fn emit_kernel_rounding_prelude(
|
||||
&mut self,
|
||||
is_kernel: bool,
|
||||
rounding_mode_f32: ast::RoundingMode,
|
||||
rounding_mode_f16f64: ast::RoundingMode,
|
||||
) -> Result<(), TranslateError> {
|
||||
if is_kernel {
|
||||
if rounding_mode_f32 != ast::RoundingMode::NearestEven
|
||||
|| rounding_mode_f16f64 != ast::RoundingMode::NearestEven
|
||||
{
|
||||
self.emit_set_mode(ModeRegister::Rounding {
|
||||
f32: rounding_mode_f32,
|
||||
f16f64: rounding_mode_f16f64,
|
||||
})?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_variable(&mut self, var: ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
|
||||
let alloca = unsafe {
|
||||
LLVMZludaBuildAlloca(
|
||||
|
@ -517,6 +594,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ast::Instruction::Add { data, arguments } => self.emit_add(data, arguments),
|
||||
ast::Instruction::St { data, arguments } => self.emit_st(data, arguments),
|
||||
ast::Instruction::Mul { data, arguments } => self.emit_mul(data, arguments),
|
||||
ast::Instruction::Mul24 { data, arguments } => self.emit_mul24(data, arguments),
|
||||
ast::Instruction::Setp { data, arguments } => self.emit_setp(data, arguments),
|
||||
ast::Instruction::SetpBool { .. } => todo!(),
|
||||
ast::Instruction::Not { data, arguments } => self.emit_not(data, arguments),
|
||||
|
@ -529,7 +607,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ast::Instruction::Shl { data, arguments } => self.emit_shl(data, arguments),
|
||||
ast::Instruction::Ret { data } => Ok(self.emit_ret(data)),
|
||||
ast::Instruction::Cvta { data, arguments } => self.emit_cvta(data, arguments),
|
||||
ast::Instruction::Abs { .. } => todo!(),
|
||||
ast::Instruction::Abs { data, arguments } => self.emit_abs(data, arguments),
|
||||
ast::Instruction::Mad { data, arguments } => self.emit_mad(data, arguments),
|
||||
ast::Instruction::Fma { data, arguments } => self.emit_fma(data, arguments),
|
||||
ast::Instruction::Sub { data, arguments } => self.emit_sub(data, arguments),
|
||||
|
@ -539,7 +617,6 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ast::Instruction::Sqrt { data, arguments } => self.emit_sqrt(data, arguments),
|
||||
ast::Instruction::Rsqrt { data, arguments } => self.emit_rsqrt(data, arguments),
|
||||
ast::Instruction::Selp { data, arguments } => self.emit_selp(data, arguments),
|
||||
ast::Instruction::Bar { .. } => todo!(),
|
||||
ast::Instruction::Atom { data, arguments } => self.emit_atom(data, arguments),
|
||||
ast::Instruction::AtomCas { data, arguments } => self.emit_atom_cas(data, arguments),
|
||||
ast::Instruction::Div { data, arguments } => self.emit_div(data, arguments),
|
||||
|
@ -559,6 +636,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ast::Instruction::Trap {} => todo!(),
|
||||
// replaced by a function call
|
||||
ast::Instruction::Bfe { .. }
|
||||
| ast::Instruction::Bar { .. }
|
||||
| ast::Instruction::Bfi { .. }
|
||||
| ast::Instruction::Activemask { .. } => return Err(error_unreachable()),
|
||||
}
|
||||
|
@ -1117,7 +1195,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
let cos = self.emit_intrinsic(
|
||||
c"llvm.cos.f32",
|
||||
Some(arguments.dst),
|
||||
&ast::ScalarType::F32.into(),
|
||||
Some(&ast::ScalarType::F32.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
||||
)?;
|
||||
unsafe { LLVMZludaSetFastMathFlags(cos, LLVMZludaFastMathApproxFunc) }
|
||||
|
@ -1370,7 +1448,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
let sin = self.emit_intrinsic(
|
||||
c"llvm.sin.f32",
|
||||
Some(arguments.dst),
|
||||
&ast::ScalarType::F32.into(),
|
||||
Some(&ast::ScalarType::F32.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, llvm_f32)],
|
||||
)?;
|
||||
unsafe { LLVMZludaSetFastMathFlags(sin, LLVMZludaFastMathApproxFunc) }
|
||||
|
@ -1381,12 +1459,12 @@ impl<'a> MethodEmitContext<'a> {
|
|||
&mut self,
|
||||
name: &CStr,
|
||||
dst: Option<SpirvWord>,
|
||||
return_type: &ast::Type,
|
||||
return_type: Option<&ast::Type>,
|
||||
arguments: Vec<(LLVMValueRef, LLVMTypeRef)>,
|
||||
) -> Result<LLVMValueRef, TranslateError> {
|
||||
let fn_type = get_function_type(
|
||||
self.context,
|
||||
iter::once(return_type),
|
||||
return_type.into_iter(),
|
||||
arguments.iter().map(|(_, type_)| Ok(*type_)),
|
||||
)?;
|
||||
let mut fn_ = unsafe { LLVMGetNamedFunction(self.module, name.as_ptr()) };
|
||||
|
@ -1547,7 +1625,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
return self.emit_cvt_float_to_int(
|
||||
data.from,
|
||||
data.to,
|
||||
integer_rounding.unwrap_or(ast::RoundingMode::NearestEven),
|
||||
integer_rounding,
|
||||
arguments,
|
||||
Some(LLVMBuildFPToSI),
|
||||
)
|
||||
|
@ -1570,8 +1648,12 @@ impl<'a> MethodEmitContext<'a> {
|
|||
Some(LLVMBuildFPToUI),
|
||||
)
|
||||
}
|
||||
ptx_parser::CvtMode::FPFromSigned(_) => todo!(),
|
||||
ptx_parser::CvtMode::FPFromUnsigned(_) => todo!(),
|
||||
ptx_parser::CvtMode::FPFromSigned(_) => {
|
||||
return self.emit_cvt_int_to_float(data.to, arguments, LLVMBuildSIToFP)
|
||||
}
|
||||
ptx_parser::CvtMode::FPFromUnsigned(_) => {
|
||||
return self.emit_cvt_int_to_float(data.to, arguments, LLVMBuildUIToFP)
|
||||
}
|
||||
};
|
||||
let src = self.resolver.value(arguments.src)?;
|
||||
self.resolver.with_result(arguments.dst, |dst| unsafe {
|
||||
|
@ -1601,7 +1683,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
let clamped = self.emit_intrinsic(
|
||||
c"llvm.umin",
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src)?, from_llvm),
|
||||
(max, from_llvm),
|
||||
|
@ -1631,7 +1713,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
let zero_clamped = self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(zero_clamp_intrinsic.as_bytes()) },
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src)?, from_llvm),
|
||||
(zero, from_llvm),
|
||||
|
@ -1650,7 +1732,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
let fully_clamped = self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(max_clamp_intrinsic.as_bytes()) },
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![(zero_clamped, from_llvm), (max, from_llvm)],
|
||||
)?;
|
||||
let resize_fn = if to.layout().size() >= from.layout().size() {
|
||||
|
@ -1690,7 +1772,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
let rounded_float = self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
None,
|
||||
&from.into(),
|
||||
Some(&from.into()),
|
||||
vec![(
|
||||
self.resolver.value(arguments.src)?,
|
||||
get_scalar_type(self.context, from),
|
||||
|
@ -1726,6 +1808,25 @@ impl<'a> MethodEmitContext<'a> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_cvt_int_to_float(
|
||||
&mut self,
|
||||
to: ptx_parser::ScalarType,
|
||||
arguments: ptx_parser::CvtArgs<SpirvWord>,
|
||||
llvm_func: unsafe extern "C" fn(
|
||||
arg1: LLVMBuilderRef,
|
||||
Val: LLVMValueRef,
|
||||
DestTy: LLVMTypeRef,
|
||||
Name: *const i8,
|
||||
) -> LLVMValueRef,
|
||||
) -> Result<(), TranslateError> {
|
||||
let type_ = get_scalar_type(self.context, to);
|
||||
let src = self.resolver.value(arguments.src)?;
|
||||
self.resolver.with_result(arguments.dst, |dst| unsafe {
|
||||
llvm_func(self.builder, src, type_, dst)
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_rsqrt(
|
||||
&mut self,
|
||||
data: ptx_parser::TypeFtz,
|
||||
|
@ -1740,7 +1841,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||
)?;
|
||||
Ok(())
|
||||
|
@ -1761,7 +1862,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||
)?;
|
||||
Ok(())
|
||||
|
@ -1783,7 +1884,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, type_)],
|
||||
)?;
|
||||
Ok(())
|
||||
|
@ -1905,7 +2006,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![(
|
||||
self.resolver.value(arguments.src)?,
|
||||
get_scalar_type(self.context, data.type_),
|
||||
|
@ -1922,7 +2023,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
c"llvm.amdgcn.log.f32",
|
||||
Some(arguments.dst),
|
||||
&ast::ScalarType::F32.into(),
|
||||
Some(&ast::ScalarType::F32.into()),
|
||||
vec![(
|
||||
self.resolver.value(arguments.src)?,
|
||||
get_scalar_type(self.context, ast::ScalarType::F32.into()),
|
||||
|
@ -1977,7 +2078,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
Some(arguments.dst),
|
||||
&type_.into(),
|
||||
Some(&type_.into()),
|
||||
vec![(self.resolver.value(arguments.src)?, llvm_type)],
|
||||
)?;
|
||||
Ok(())
|
||||
|
@ -1994,14 +2095,14 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ptx_parser::MinMaxDetails::Float(ptx_parser::MinMaxFloat { nan: true, .. }) => {
|
||||
return Err(error_todo())
|
||||
}
|
||||
ptx_parser::MinMaxDetails::Float(ptx_parser::MinMaxFloat { .. }) => "llvm.maxnum",
|
||||
ptx_parser::MinMaxDetails::Float(ptx_parser::MinMaxFloat { .. }) => "llvm.minnum",
|
||||
};
|
||||
let intrinsic = format!("{}.{}\0", llvm_prefix, LLVMTypeDisplay(data.type_()));
|
||||
let llvm_type = get_scalar_type(self.context, data.type_());
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_().into(),
|
||||
Some(&data.type_().into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src1)?, llvm_type),
|
||||
(self.resolver.value(arguments.src2)?, llvm_type),
|
||||
|
@ -2021,14 +2122,14 @@ impl<'a> MethodEmitContext<'a> {
|
|||
ptx_parser::MinMaxDetails::Float(ptx_parser::MinMaxFloat { nan: true, .. }) => {
|
||||
return Err(error_todo())
|
||||
}
|
||||
ptx_parser::MinMaxDetails::Float(ptx_parser::MinMaxFloat { .. }) => "llvm.minnum",
|
||||
ptx_parser::MinMaxDetails::Float(ptx_parser::MinMaxFloat { .. }) => "llvm.maxnum",
|
||||
};
|
||||
let intrinsic = format!("{}.{}\0", llvm_prefix, LLVMTypeDisplay(data.type_()));
|
||||
let llvm_type = get_scalar_type(self.context, data.type_());
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_().into(),
|
||||
Some(&data.type_().into()),
|
||||
vec![
|
||||
(self.resolver.value(arguments.src1)?, llvm_type),
|
||||
(self.resolver.value(arguments.src2)?, llvm_type),
|
||||
|
@ -2046,7 +2147,7 @@ impl<'a> MethodEmitContext<'a> {
|
|||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
&data.type_.into(),
|
||||
Some(&data.type_.into()),
|
||||
vec![
|
||||
(
|
||||
self.resolver.value(arguments.src1)?,
|
||||
|
@ -2149,6 +2250,100 @@ impl<'a> MethodEmitContext<'a> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_abs(
|
||||
&mut self,
|
||||
data: ast::TypeFtz,
|
||||
arguments: ptx_parser::AbsArgs<SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
let llvm_type = get_scalar_type(self.context, data.type_);
|
||||
let src = self.resolver.value(arguments.src)?;
|
||||
let (prefix, intrinsic_arguments) = if data.type_.kind() == ast::ScalarKind::Float {
|
||||
("llvm.fabs", vec![(src, llvm_type)])
|
||||
} else {
|
||||
let pred = get_scalar_type(self.context, ast::ScalarType::Pred);
|
||||
let zero = unsafe { LLVMConstInt(pred, 0, 0) };
|
||||
("llvm.abs", vec![(src, llvm_type), (zero, pred)])
|
||||
};
|
||||
let llvm_intrinsic = format!("{}.{}\0", prefix, LLVMTypeDisplay(data.type_));
|
||||
self.emit_intrinsic(
|
||||
unsafe { CStr::from_bytes_with_nul_unchecked(llvm_intrinsic.as_bytes()) },
|
||||
Some(arguments.dst),
|
||||
Some(&data.type_.into()),
|
||||
intrinsic_arguments,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_mul24(
|
||||
&mut self,
|
||||
data: ast::Mul24Details,
|
||||
arguments: ast::Mul24Args<SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
let src1 = self.resolver.value(arguments.src1)?;
|
||||
let src2 = self.resolver.value(arguments.src2)?;
|
||||
self.emit_intrinsic(
|
||||
c"llvm.amdgcn.mul.u24",
|
||||
Some(arguments.dst),
|
||||
Some(&ast::Type::Scalar(data.type_)),
|
||||
vec![
|
||||
(src1, get_scalar_type(self.context, data.type_)),
|
||||
(src2, get_scalar_type(self.context, data.type_)),
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emit_set_mode(&mut self, mode_reg: ModeRegister) -> Result<(), TranslateError> {
|
||||
fn hwreg(reg: u32, offset: u32, size: u32) -> u32 {
|
||||
reg | (offset << 6) | ((size - 1) << 11)
|
||||
}
|
||||
fn denormal_to_value(ftz: bool) -> u32 {
|
||||
if ftz {
|
||||
0
|
||||
} else {
|
||||
3
|
||||
}
|
||||
}
|
||||
fn rounding_to_value(ftz: ast::RoundingMode) -> u32 {
|
||||
match ftz {
|
||||
ptx_parser::RoundingMode::NearestEven => 0,
|
||||
ptx_parser::RoundingMode::Zero => 3,
|
||||
ptx_parser::RoundingMode::NegativeInf => 2,
|
||||
ptx_parser::RoundingMode::PositiveInf => 1,
|
||||
}
|
||||
}
|
||||
fn merge_regs(f32: u32, f16f64: u32) -> u32 {
|
||||
f32 | f16f64 << 2
|
||||
}
|
||||
let intrinsic = c"llvm.amdgcn.s.setreg";
|
||||
let (hwreg, value) = match mode_reg {
|
||||
ModeRegister::Denormal { f32, f16f64 } => {
|
||||
let hwreg = hwreg(1, 4, 4);
|
||||
let f32 = denormal_to_value(f32);
|
||||
let f16f64 = denormal_to_value(f16f64);
|
||||
let value = merge_regs(f32, f16f64);
|
||||
(hwreg, value)
|
||||
}
|
||||
ModeRegister::Rounding { f32, f16f64 } => {
|
||||
let hwreg = hwreg(1, 0, 4);
|
||||
let f32 = rounding_to_value(f32);
|
||||
let f16f64 = rounding_to_value(f16f64);
|
||||
let value = merge_regs(f32, f16f64);
|
||||
(hwreg, value)
|
||||
}
|
||||
};
|
||||
let llvm_i32 = get_scalar_type(self.context, ast::ScalarType::B32);
|
||||
let hwreg_llvm = unsafe { LLVMConstInt(llvm_i32, hwreg as _, 0) };
|
||||
let value_llvm = unsafe { LLVMConstInt(llvm_i32, value as _, 0) };
|
||||
self.emit_intrinsic(
|
||||
intrinsic,
|
||||
None,
|
||||
None,
|
||||
vec![(hwreg_llvm, llvm_i32), (value_llvm, llvm_i32)],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*
|
||||
// Currently unused, LLVM 18 (ROCm 6.2) does not support `llvm.set.rounding`
|
||||
// Should be available in LLVM 19
|
||||
|
|
|
@ -2,8 +2,8 @@ use super::*;
|
|||
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<UnconditionalDirective<'input>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<UnconditionalDirective>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
|
@ -13,11 +13,10 @@ pub(super) fn run<'a, 'input>(
|
|||
fn run_directive<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: Directive2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||
|
@ -27,11 +26,10 @@ fn run_directive<'input>(
|
|||
fn run_method<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
method: Function2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>,
|
||||
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
|
@ -43,12 +41,18 @@ fn run_method<'input>(
|
|||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
return_arguments: method.return_arguments,
|
||||
name: method.name,
|
||||
input_arguments: method.input_arguments,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
is_kernel: method.is_kernel,
|
||||
flush_to_zero_f32: method.flush_to_zero_f32,
|
||||
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
|
||||
rounding_mode_f32: method.rounding_mode_f32,
|
||||
rounding_mode_f16f64: method.rounding_mode_f16f64,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
@ -1,30 +1,33 @@
|
|||
use super::*;
|
||||
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
||||
special_registers: &'a SpecialRegistersMap2,
|
||||
directives: Vec<UnconditionalDirective<'input>>,
|
||||
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||
let declarations = SpecialRegistersMap2::generate_declarations(resolver);
|
||||
let mut result = Vec::with_capacity(declarations.len() + directives.len());
|
||||
directives: Vec<UnconditionalDirective>,
|
||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
||||
let mut result = Vec::with_capacity(SpecialRegistersMap2::len() + directives.len());
|
||||
let mut sreg_to_function =
|
||||
FxHashMap::with_capacity_and_hasher(declarations.len(), Default::default());
|
||||
for (sreg, declaration) in declarations {
|
||||
let name = if let ast::MethodName::Func(name) = declaration.name {
|
||||
name
|
||||
} else {
|
||||
return Err(error_unreachable());
|
||||
};
|
||||
result.push(UnconditionalDirective::Method(UnconditionalFunction {
|
||||
func_decl: declaration,
|
||||
globals: Vec::new(),
|
||||
body: None,
|
||||
import_as: None,
|
||||
tuning: Vec::new(),
|
||||
linkage: ast::LinkingDirective::EXTERN,
|
||||
}));
|
||||
sreg_to_function.insert(sreg, name);
|
||||
}
|
||||
FxHashMap::with_capacity_and_hasher(SpecialRegistersMap2::len(), Default::default());
|
||||
SpecialRegistersMap2::foreach_declaration(
|
||||
resolver,
|
||||
|sreg, (return_arguments, name, input_arguments)| {
|
||||
result.push(UnconditionalDirective::Method(UnconditionalFunction {
|
||||
return_arguments,
|
||||
name,
|
||||
input_arguments,
|
||||
body: None,
|
||||
import_as: None,
|
||||
tuning: Vec::new(),
|
||||
linkage: ast::LinkingDirective::EXTERN,
|
||||
is_kernel: false,
|
||||
flush_to_zero_f32: false,
|
||||
flush_to_zero_f16f64: false,
|
||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
||||
}));
|
||||
sreg_to_function.insert(sreg, name);
|
||||
},
|
||||
);
|
||||
let mut visitor = SpecialRegisterResolver {
|
||||
resolver,
|
||||
special_registers,
|
||||
|
@ -39,8 +42,8 @@ pub(super) fn run<'a, 'input>(
|
|||
|
||||
fn run_directive<'a, 'input>(
|
||||
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
||||
directive: UnconditionalDirective<'input>,
|
||||
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||
directive: UnconditionalDirective,
|
||||
) -> Result<UnconditionalDirective, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => Directive2::Method(run_method(visitor, method)?),
|
||||
|
@ -49,8 +52,8 @@ fn run_directive<'a, 'input>(
|
|||
|
||||
fn run_method<'a, 'input>(
|
||||
visitor: &mut SpecialRegisterResolver<'a, 'input>,
|
||||
method: UnconditionalFunction<'input>,
|
||||
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||
method: UnconditionalFunction,
|
||||
) -> Result<UnconditionalFunction, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
|
@ -61,14 +64,7 @@ fn run_method<'a, 'input>(
|
|||
Ok::<_, TranslateError>(result)
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'a, 'input>(
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
use super::*;
|
||||
|
||||
pub(super) fn run<'input>(
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
let mut result = Vec::with_capacity(directives.len());
|
||||
for mut directive in directives.into_iter() {
|
||||
run_directive(&mut result, &mut directive)?;
|
||||
|
@ -12,8 +12,8 @@ pub(super) fn run<'input>(
|
|||
}
|
||||
|
||||
fn run_directive<'input>(
|
||||
result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||
directive: &mut Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
|
||||
result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
directive: &mut Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<(), TranslateError> {
|
||||
match directive {
|
||||
Directive2::Variable(..) => {}
|
||||
|
@ -23,8 +23,8 @@ fn run_directive<'input>(
|
|||
}
|
||||
|
||||
fn run_function<'input>(
|
||||
result: &mut Vec<Directive2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||
function: &mut Function2<'input, ptx_parser::Instruction<SpirvWord>, SpirvWord>,
|
||||
result: &mut Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
function: &mut Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) {
|
||||
function.body = function.body.take().map(|statements| {
|
||||
statements
|
||||
|
|
|
@ -11,8 +11,8 @@ use super::*;
|
|||
// pass, so we do nothing there
|
||||
pub(super) fn run<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
|
@ -21,8 +21,8 @@ pub(super) fn run<'a, 'input>(
|
|||
|
||||
fn run_directive<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => {
|
||||
|
@ -34,12 +34,11 @@ fn run_directive<'a, 'input>(
|
|||
|
||||
fn run_method<'a, 'input>(
|
||||
mut visitor: InsertMemSSAVisitor<'a, 'input>,
|
||||
method: Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let mut func_decl = method.func_decl;
|
||||
let is_kernel = func_decl.name.is_kernel();
|
||||
mut method: Function2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Function2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
let is_kernel = method.is_kernel;
|
||||
if is_kernel {
|
||||
for arg in func_decl.input_arguments.iter_mut() {
|
||||
for arg in method.input_arguments.iter_mut() {
|
||||
let old_name = arg.name;
|
||||
let old_space = arg.state_space;
|
||||
let new_space = ast::StateSpace::ParamEntry;
|
||||
|
@ -51,10 +50,10 @@ fn run_method<'a, 'input>(
|
|||
arg.state_space = new_space;
|
||||
}
|
||||
};
|
||||
for arg in func_decl.return_arguments.iter_mut() {
|
||||
for arg in method.return_arguments.iter_mut() {
|
||||
visitor.visit_variable(arg)?;
|
||||
}
|
||||
let return_arguments = &func_decl.return_arguments[..];
|
||||
let return_arguments = &method.return_arguments[..];
|
||||
let body = method
|
||||
.body
|
||||
.map(move |statements| {
|
||||
|
@ -65,14 +64,7 @@ fn run_method<'a, 'input>(
|
|||
Ok::<_, TranslateError>(result)
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'a, 'input>(
|
||||
|
@ -122,6 +114,13 @@ fn run_statement<'a, 'input>(
|
|||
result.push(Statement::Instruction(instruction));
|
||||
result.extend(visitor.post.drain(..).map(Statement::Instruction));
|
||||
}
|
||||
Statement::PtrAccess(ptr_access) => {
|
||||
let statement = Statement::PtrAccess(visitor.visit_ptr_access(ptr_access)?);
|
||||
let statement = statement.visit_map(visitor)?;
|
||||
result.extend(visitor.pre.drain(..).map(Statement::Instruction));
|
||||
result.push(statement);
|
||||
result.extend(visitor.post.drain(..).map(Statement::Instruction));
|
||||
}
|
||||
s => {
|
||||
let new_statement = s.visit_map(visitor)?;
|
||||
result.extend(visitor.pre.drain(..).map(Statement::Instruction));
|
||||
|
@ -259,6 +258,41 @@ impl<'a, 'input> InsertMemSSAVisitor<'a, 'input> {
|
|||
Ok(ast::Instruction::Ld { data, arguments })
|
||||
}
|
||||
|
||||
fn visit_ptr_access(
|
||||
&mut self,
|
||||
ptr_access: PtrAccess<SpirvWord>,
|
||||
) -> Result<PtrAccess<SpirvWord>, TranslateError> {
|
||||
let (old_space, new_space, name) = match self.variables.get(&ptr_access.ptr_src) {
|
||||
Some(RemapAction::LDStSpaceChange {
|
||||
old_space,
|
||||
new_space,
|
||||
name,
|
||||
}) => (*old_space, *new_space, *name),
|
||||
Some(RemapAction::PreLdPostSt { .. }) | None => return Ok(ptr_access),
|
||||
};
|
||||
if ptr_access.state_space != old_space {
|
||||
return Err(error_mismatched_type());
|
||||
}
|
||||
// Propagate space changes in dst
|
||||
let new_dst = self
|
||||
.resolver
|
||||
.register_unnamed(Some((ptr_access.underlying_type.clone(), new_space)));
|
||||
self.variables.insert(
|
||||
ptr_access.dst,
|
||||
RemapAction::LDStSpaceChange {
|
||||
old_space,
|
||||
new_space,
|
||||
name: new_dst,
|
||||
},
|
||||
);
|
||||
Ok(PtrAccess {
|
||||
ptr_src: name,
|
||||
dst: new_dst,
|
||||
state_space: new_space,
|
||||
..ptr_access
|
||||
})
|
||||
}
|
||||
|
||||
fn visit_variable(&mut self, var: &mut ast::Variable<SpirvWord>) -> Result<(), TranslateError> {
|
||||
let old_space = match var.state_space {
|
||||
space @ (ptx_parser::StateSpace::Reg | ptx_parser::StateSpace::Param) => space,
|
||||
|
|
|
@ -19,8 +19,8 @@ use ptx_parser as ast;
|
|||
*/
|
||||
pub(super) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
|
@ -29,8 +29,8 @@ pub(super) fn run<'input>(
|
|||
|
||||
fn run_directive<'a, 'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(mut method) => {
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
// Test fixture: a kernel that calls a helper function using different
// floating-point modes than the kernel itself.
// NOTE(review): this appears to be the `call_with_mode.ptx` loaded by the
// `call_with_mode` test - confirm against the file name in the repository.
.version 6.5
.target sm_50
.address_size 64

// Forward declaration; the body is defined after the kernel.
.func use_modes();

.visible .entry kernel()
{
    .reg .f32 temp;

    // f32 add, round toward zero, flush denormals to zero
    add.rz.ftz.f32 temp, temp, temp;
    call use_modes;
    // f32 add, round toward +infinity, flush denormals to zero
    add.rp.ftz.f32 temp, temp, temp;
    ret;
}

.func use_modes()
{
    .reg .f32 temp;
    .reg .pred pred;
    // Two complementary predicated branches, so both labels are reachable.
    @pred bra SET_RM;
    @!pred bra SET_RZ;
    SET_RM:
    // f32 add, round toward -infinity, no .ftz
    add.rm.f32 temp, temp, temp;
    ret;
    SET_RZ:
    // f32 add, round toward zero, no .ftz
    add.rz.f32 temp, temp, temp;
    ret;
}
|
|
@ -0,0 +1,15 @@
|
|||
// Test fixture: two adds with `.ftz` followed by two adds without it, one
// f16 and one f32 in each pair.
// NOTE(review): this appears to be the `fold_denormal.ptx` loaded by the
// `fold_denormal` test - confirm against the file name in the repository.
.version 6.5
.target sm_30
.address_size 64

.visible .entry add()
{
    // Declares temp0, temp1 and temp2.
    // NOTE(review): the registers are .f32 although two of the adds below
    // are .f16 - presumably irrelevant for what this fixture checks.
    .reg .f32 temp<3>;

    // Flush-to-zero pair
    add.ftz.f16 temp2, temp1, temp0;
    add.ftz.f32 temp2, temp1, temp0;

    // Same operations without .ftz
    add.f16 temp2, temp1, temp0;
    add.f32 temp2, temp1, temp0;
    ret;
}
|
1907
ptx/src/pass/instruction_mode_to_global_mode/mod.rs
Normal file
1907
ptx/src/pass/instruction_mode_to_global_mode/mod.rs
Normal file
File diff suppressed because it is too large
Load diff
399
ptx/src/pass/instruction_mode_to_global_mode/test.rs
Normal file
399
ptx/src/pass/instruction_mode_to_global_mode/test.rs
Normal file
|
@ -0,0 +1,399 @@
|
|||
use super::*;
|
||||
use int_enum::IntEnum;
|
||||
use strum::EnumCount;
|
||||
|
||||
// Two-valued enum with explicit `usize` discriminants (0 and 1) and
// `IntEnum` conversions.
// NOTE(review): nothing in this test file refers to `Bool`; confirm whether
// it (and the `int_enum::IntEnum` import it needs) is still required.
#[repr(usize)]
#[derive(IntEnum, Eq, PartialEq, Copy, Clone, Debug)]
enum Bool {
    False = 0,
    True = 1,
}
|
||||
|
||||
fn ftz() -> InstructionModes {
|
||||
InstructionModes {
|
||||
denormal_f32: Some(DenormalMode::FlushToZero),
|
||||
denormal_f16f64: None,
|
||||
rounding_f32: None,
|
||||
rounding_f16f64: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn preserve() -> InstructionModes {
|
||||
InstructionModes {
|
||||
denormal_f32: Some(DenormalMode::Preserve),
|
||||
denormal_f16f64: None,
|
||||
rounding_f32: None,
|
||||
rounding_f16f64: None,
|
||||
}
|
||||
}
|
||||
|
||||
// Linear graph: kernel -> ftz block -> block without modes -> ftz block.
// Only the first ftz block is expected to be a "maybe insert" candidate
// (coming from the kernel entry), and after optimization no basic block
// needs an explicit mode change: the kernel itself gets flush-to-zero.
#[test]
fn transitive_mixed() {
    let kernel_id = SpirvWord(1);
    let first_ftz_id = SpirvWord(2);
    let modeless_id = SpirvWord(3);
    let second_ftz_id = SpirvWord(4);

    let mut cfg = ControlFlowGraph::new();
    let kernel = cfg.add_entry_basic_block(kernel_id);
    cfg.add_jump(kernel, first_ftz_id);
    let first_ftz = cfg.get_or_add_basic_block(first_ftz_id);
    cfg.set_modes(first_ftz, ftz(), ftz());
    cfg.add_jump(first_ftz, modeless_id);
    let modeless = cfg.get_or_add_basic_block(modeless_id);
    cfg.add_jump(modeless, second_ftz_id);
    let second_ftz = cfg.get_or_add_basic_block(second_ftz_id);
    cfg.set_modes(second_ftz, ftz(), ftz());

    let insertions = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert!(insertions.bb_must_insert_mode.is_empty());
    assert_eq!(insertions.bb_maybe_insert_mode.len(), 1);
    assert_eq!(
        insertions.bb_maybe_insert_mode[&first_ftz_id],
        (DenormalMode::FlushToZero, iter::once(kernel_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(insertions);
    assert!(optimized.basic_blocks.is_empty());
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&kernel_id], DenormalMode::FlushToZero);
}
|
||||
|
||||
// Linear graph: kernel -> ftz block -> block without modes -> preserve block.
// The preserve block is reached with a different mode already in effect, so
// it is expected in the "must insert" set; the ftz block is only a "maybe
// insert" candidate from the kernel entry. After optimization the kernel
// gets flush-to-zero and only the preserve block keeps an explicit change.
#[test]
fn transitive_change_twice() {
    let kernel_id = SpirvWord(1);
    let ftz_id = SpirvWord(2);
    let modeless_id = SpirvWord(3);
    let preserve_id = SpirvWord(4);

    let mut cfg = ControlFlowGraph::new();
    let kernel = cfg.add_entry_basic_block(kernel_id);
    cfg.add_jump(kernel, ftz_id);
    let ftz_block = cfg.get_or_add_basic_block(ftz_id);
    cfg.set_modes(ftz_block, ftz(), ftz());
    cfg.add_jump(ftz_block, modeless_id);
    let modeless = cfg.get_or_add_basic_block(modeless_id);
    cfg.add_jump(modeless, preserve_id);
    let preserve_block = cfg.get_or_add_basic_block(preserve_id);
    cfg.set_modes(preserve_block, preserve(), preserve());

    let insertions = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert_eq!(insertions.bb_must_insert_mode.len(), 1);
    assert!(insertions.bb_must_insert_mode.contains(&preserve_id));
    assert_eq!(insertions.bb_maybe_insert_mode.len(), 1);
    assert_eq!(
        insertions.bb_maybe_insert_mode[&ftz_id],
        (DenormalMode::FlushToZero, iter::once(kernel_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(insertions);
    assert_eq!(optimized.basic_blocks, iter::once(preserve_id).collect());
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&kernel_id], DenormalMode::FlushToZero);
}
|
||||
|
||||
// Linear graph: kernel -> block without modes -> preserve block.
// The preserve block is expected to be a "maybe insert" candidate coming
// from the kernel entry; after optimization no basic block needs an explicit
// mode change and the kernel itself gets the preserve mode.
#[test]
fn transitive_change() {
    let kernel_id = SpirvWord(1);
    let modeless_id = SpirvWord(2);
    let preserve_id = SpirvWord(3);

    let mut cfg = ControlFlowGraph::new();
    let kernel = cfg.add_entry_basic_block(kernel_id);
    cfg.add_jump(kernel, modeless_id);
    let modeless = cfg.get_or_add_basic_block(modeless_id);
    cfg.add_jump(modeless, preserve_id);
    let preserve_block = cfg.get_or_add_basic_block(preserve_id);
    cfg.set_modes(preserve_block, preserve(), preserve());

    let insertions = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert!(insertions.bb_must_insert_mode.is_empty());
    assert_eq!(insertions.bb_maybe_insert_mode.len(), 1);
    assert_eq!(
        insertions.bb_maybe_insert_mode[&preserve_id],
        (DenormalMode::Preserve, iter::once(kernel_id).collect())
    );

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(insertions);
    assert!(optimized.basic_blocks.is_empty());
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&kernel_id], DenormalMode::Preserve);
}
|
||||
|
||||
// Graph with a cycle between blocks that carry no modes:
//
//   kernel -> left_ftz  -> left_modeless  <-> mid_modeless
//   kernel -> right_ftz -> right_modeless <-> mid_modeless
//
// Both ftz blocks are expected to be "maybe insert" candidates coming from
// the kernel entry; after optimization no basic block needs an explicit mode
// change and the kernel itself gets flush-to-zero.
#[test]
fn codependency() {
    let kernel_id = SpirvWord(1);
    let left_ftz_id = SpirvWord(2);
    let right_ftz_id = SpirvWord(3);
    let left_modeless_id = SpirvWord(4);
    let mid_modeless_id = SpirvWord(5);
    let right_modeless_id = SpirvWord(6);

    let mut cfg = ControlFlowGraph::new();
    let kernel = cfg.add_entry_basic_block(kernel_id);
    cfg.add_jump(kernel, left_ftz_id);
    cfg.add_jump(kernel, right_ftz_id);
    let left_ftz = cfg.get_or_add_basic_block(left_ftz_id);
    cfg.set_modes(left_ftz, ftz(), ftz());
    let right_ftz = cfg.get_or_add_basic_block(right_ftz_id);
    cfg.set_modes(right_ftz, ftz(), ftz());
    cfg.add_jump(left_ftz, left_modeless_id);
    let left_modeless = cfg.get_or_add_basic_block(left_modeless_id);
    cfg.add_jump(right_ftz, right_modeless_id);
    let right_modeless = cfg.get_or_add_basic_block(right_modeless_id);
    cfg.add_jump(left_modeless, mid_modeless_id);
    cfg.add_jump(right_modeless, mid_modeless_id);
    let mid_modeless = cfg.get_or_add_basic_block(mid_modeless_id);
    cfg.add_jump(mid_modeless, left_modeless_id);
    cfg.add_jump(mid_modeless, right_modeless_id);
    // Debugging aid: dump the graph in dot format.
    //println!(
    //    "{:?}",
    //    petgraph::dot::Dot::with_config(&cfg.graph, &[petgraph::dot::Config::EdgeNoLabel])
    //);

    let insertions = super::compute_single_mode_insertions(&cfg, |modes| modes.denormal_f32);
    assert!(insertions.bb_must_insert_mode.is_empty());
    assert_eq!(insertions.bb_maybe_insert_mode.len(), 2);
    for ftz_block_id in [left_ftz_id, right_ftz_id] {
        assert_eq!(
            insertions.bb_maybe_insert_mode[&ftz_block_id],
            (DenormalMode::FlushToZero, iter::once(kernel_id).collect())
        );
    }

    let optimized = optimize_mode_insertions::<DenormalMode, { DenormalMode::COUNT }>(insertions);
    assert!(optimized.basic_blocks.is_empty());
    assert_eq!(optimized.kernels.len(), 1);
    assert_eq!(optimized.kernels[&kernel_id], DenormalMode::FlushToZero);
}
|
||||
|
||||
static FOLD_DENORMAL_PTX: &'static str = include_str!("fold_denormal.ptx");
|
||||
|
||||
#[test]
|
||||
fn fold_denormal() {
|
||||
let method = compile_methods(FOLD_DENORMAL_PTX).pop().unwrap();
|
||||
assert_eq!(true, method.flush_to_zero_f32);
|
||||
assert_eq!(true, method.flush_to_zero_f16f64);
|
||||
let method_body = method.body.unwrap();
|
||||
assert!(matches!(
|
||||
&*method_body,
|
||||
[
|
||||
Statement::Label(..),
|
||||
Statement::Variable(..),
|
||||
Statement::Variable(..),
|
||||
Statement::Variable(..),
|
||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
||||
Statement::SetMode(ModeRegister::Denormal {
|
||||
f32: false,
|
||||
f16f64: false
|
||||
}),
|
||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
||||
Statement::Instruction(ast::Instruction::Add { .. }),
|
||||
Statement::Instruction(ast::Instruction::Ret { .. }),
|
||||
]
|
||||
));
|
||||
}
|
||||
|
||||
fn compile_methods(ptx: &str) -> Vec<Function2<ast::Instruction<SpirvWord>, SpirvWord>> {
|
||||
use crate::pass::*;
|
||||
|
||||
let module = ptx_parser::parse_module_checked(ptx).unwrap();
|
||||
let mut flat_resolver = GlobalStringIdentResolver2::new(SpirvWord(1));
|
||||
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
||||
let directives = normalize_identifiers2::run(&mut scoped_resolver, module.directives).unwrap();
|
||||
let directives = normalize_predicates2::run(&mut flat_resolver, directives).unwrap();
|
||||
let directives = expand_operands::run(&mut flat_resolver, directives).unwrap();
|
||||
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives).unwrap();
|
||||
let directives = super::run(&mut flat_resolver, directives).unwrap();
|
||||
directives
|
||||
.into_iter()
|
||||
.filter_map(|s| match s {
|
||||
Directive2::Method(m) => Some(m),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
static CALL_WITH_MODE_PTX: &'static str = include_str!("call_with_mode.ptx");

// Checks the statement layout produced for a kernel that calls a function
// whose floating-point modes differ from the kernel's own, and for the
// callee itself. Besides the shape of each body, the test verifies that
// every branch lands on the expected label.
#[test]
fn call_with_mode() {
    let methods = compile_methods(CALL_WITH_MODE_PTX);

    // methods[0] has no body (a declaration only).
    assert!(matches!(methods[0].body, None));

    // methods[1]: add, call, then three mode-setting preludes (both modes,
    // denormal only, rounding only) ahead of the second add.
    let method_1 = methods[1].body.as_ref().unwrap();
    assert!(matches!(
        &**method_1,
        [
            Statement::Label(..),
            Statement::Variable(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Call { .. }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            // Dual prelude
            Statement::SetMode(ModeRegister::Denormal {
                f32: true,
                f16f64: true
            }),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::PositiveInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Denormal prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Denormal {
                f32: true,
                f16f64: true
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Rounding prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::PositiveInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Ret { .. }),
        ]
    ));
    // The helpers return calls/labels/branches in body order, so the
    // destructuring below is positional and must match the pattern above.
    let [to_fn0] = calls(method_1);
    let [_, dual_prelude, _, _, add] = labels(method_1);
    let [post_call, post_prelude_dual, post_prelude_denormal, post_prelude_rounding] =
        branches(method_1);
    // The call targets methods[0].
    assert_eq!(methods[0].name, to_fn0);
    // After the call, control goes through the dual prelude; every prelude
    // then jumps to the block holding the second add.
    assert_eq!(post_call, dual_prelude);
    assert_eq!(post_prelude_dual, add);
    assert_eq!(post_prelude_denormal, add);
    assert_eq!(post_prelude_rounding, add);

    // methods[2]: two conditionals, the same three kinds of prelude ahead of
    // the first add, a denormal-only mode switch ahead of the second add,
    // and a single block containing `ret`.
    let method_2 = methods[2].body.as_ref().unwrap();
    assert!(matches!(
        &**method_2,
        [
            Statement::Label(..),
            Statement::Variable(..),
            Statement::Variable(..),
            Statement::Conditional(..),
            Statement::Label(..),
            Statement::Conditional(..),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            // Dual prelude
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: true
            }),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::NegativeInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Denormal prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: true
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            // Rounding prelude
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Rounding {
                f32: ast::RoundingMode::NegativeInf,
                f16f64: ast::RoundingMode::NearestEven
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::SetMode(ModeRegister::Denormal {
                f32: false,
                f16f64: true
            }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Add { .. }),
            Statement::Instruction(ast::Instruction::Bra { .. }),
            Statement::Label(..),
            Statement::Instruction(ast::Instruction::Ret { .. }),
        ]
    ));
    // Positional destructuring again: 2 conditionals, 10 labels, 7 branches.
    let [(if_rm_true, if_rm_false), (if_rz_true, if_rz_false)] = conditionals(method_2);
    let [_, conditional2, post_conditional2, prelude_dual, _, _, add1, add2_set_denormal, add2, ret] =
        labels(method_2);
    let [post_conditional2_jump, post_prelude_dual, post_prelude_denormal, post_prelude_rounding, post_add1, post_add2_set_denormal, post_add2] =
        branches(method_2);
    // First conditional: taken -> dual prelude, otherwise -> second conditional.
    assert_eq!(if_rm_true, prelude_dual);
    assert_eq!(if_rm_false, conditional2);
    // Second conditional: taken -> the block that jumps to the dual prelude,
    // otherwise -> the denormal-only switch ahead of the second add.
    assert_eq!(if_rz_true, post_conditional2);
    assert_eq!(if_rz_false, add2_set_denormal);
    assert_eq!(post_conditional2_jump, prelude_dual);
    // Every prelude continues at the first add.
    assert_eq!(post_prelude_dual, add1);
    assert_eq!(post_prelude_denormal, add1);
    assert_eq!(post_prelude_rounding, add1);
    // Both adds end by jumping to the single `ret` block.
    assert_eq!(post_add1, ret);
    assert_eq!(post_add2_set_denormal, add2);
    assert_eq!(post_add2, ret);
}
|
||||
|
||||
fn branches<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [SpirvWord; N] {
|
||||
fn_.iter()
|
||||
.filter_map(|s| match s {
|
||||
Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src },
|
||||
}) => Some(*src),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn labels<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [SpirvWord; N] {
|
||||
fn_.iter()
|
||||
.filter_map(
|
||||
|s: &Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>| match s {
|
||||
Statement::Label(label) => Some(*label),
|
||||
_ => None,
|
||||
},
|
||||
)
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn calls<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [SpirvWord; N] {
|
||||
fn_.iter()
|
||||
.filter_map(|s| match s {
|
||||
Statement::Instruction(ast::Instruction::Call {
|
||||
arguments: ast::CallArgs { func, .. },
|
||||
..
|
||||
}) => Some(*func),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn conditionals<const N: usize>(
|
||||
fn_: &Vec<Statement<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> [(SpirvWord, SpirvWord); N] {
|
||||
fn_.iter()
|
||||
.filter_map(|s| match s {
|
||||
Statement::Conditional(BrachCondition {
|
||||
if_true, if_false, ..
|
||||
}) => Some((*if_true, *if_false)),
|
||||
_ => None,
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.try_into()
|
||||
.unwrap()
|
||||
}
|
|
@ -17,10 +17,14 @@ mod expand_operands;
|
|||
mod fix_special_registers2;
|
||||
mod hoist_globals;
|
||||
mod insert_explicit_load_store;
|
||||
mod instruction_mode_to_global_mode;
|
||||
mod insert_implicit_conversions2;
|
||||
mod normalize_basic_blocks;
|
||||
mod normalize_identifiers2;
|
||||
mod normalize_predicates2;
|
||||
mod remove_unreachable_basic_blocks;
|
||||
mod replace_instructions_with_function_calls;
|
||||
mod replace_known_functions;
|
||||
mod resolve_function_pointers;
|
||||
|
||||
static ZLUDA_PTX_IMPL: &'static [u8] = include_bytes!("../../lib/zluda_ptx_impl.bc");
|
||||
|
@ -42,11 +46,15 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
|
|||
let mut scoped_resolver = ScopedResolver::new(&mut flat_resolver);
|
||||
let sreg_map = SpecialRegistersMap2::new(&mut scoped_resolver)?;
|
||||
let directives = normalize_identifiers2::run(&mut scoped_resolver, ast.directives)?;
|
||||
let directives = replace_known_functions::run(&mut flat_resolver, directives);
|
||||
let directives = normalize_predicates2::run(&mut flat_resolver, directives)?;
|
||||
let directives = resolve_function_pointers::run(directives)?;
|
||||
let directives = fix_special_registers2::run(&mut flat_resolver, &sreg_map, directives)?;
|
||||
let directives = expand_operands::run(&mut flat_resolver, directives)?;
|
||||
let directives = deparamize_functions::run(&mut flat_resolver, directives)?;
|
||||
let directives = normalize_basic_blocks::run(&mut flat_resolver, directives)?;
|
||||
let directives = remove_unreachable_basic_blocks::run(directives)?;
|
||||
let directives = instruction_mode_to_global_mode::run(&mut flat_resolver, directives)?;
|
||||
let directives = insert_explicit_load_store::run(&mut flat_resolver, directives)?;
|
||||
let directives = insert_implicit_conversions2::run(&mut flat_resolver, directives)?;
|
||||
let directives = replace_instructions_with_function_calls::run(&mut flat_resolver, directives)?;
|
||||
|
@ -59,7 +67,7 @@ pub fn to_llvm_module<'input>(ast: ast::Module<'input>) -> Result<Module, Transl
|
|||
}
|
||||
|
||||
pub struct Module {
|
||||
pub llvm_ir: emit_llvm::MemoryBuffer,
|
||||
pub llvm_ir: emit_llvm::Module,
|
||||
pub kernel_info: HashMap<String, KernelInfo>,
|
||||
}
|
||||
|
||||
|
@ -193,6 +201,20 @@ enum Statement<I, P: ast::Operand> {
|
|||
FunctionPointer(FunctionPointerDetails),
|
||||
VectorRead(VectorRead),
|
||||
VectorWrite(VectorWrite),
|
||||
SetMode(ModeRegister),
|
||||
}
|
||||
|
||||
/// Payload of `Statement::SetMode`: a floating-point mode setting, given as
/// a pair of values - one for f32 and one shared by f16 and f64.
///
/// `Debug` is derived only in test builds.
#[derive(Eq, PartialEq, Clone, Copy)]
#[cfg_attr(test, derive(Debug))]
enum ModeRegister {
    /// Denormal handling.
    // NOTE(review): presumably `true` means flush-to-zero (the
    // `instruction_mode_to_global_mode` tests expect `false` right before
    // adds without `.ftz`) - confirm against the code that emits this.
    Denormal {
        f32: bool,
        f16f64: bool,
    },
    /// Rounding mode.
    Rounding {
        f32: ast::RoundingMode,
        f16f64: ast::RoundingMode,
    },
}
|
||||
|
||||
impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
||||
|
@ -465,6 +487,7 @@ impl<T: ast::Operand<Ident = SpirvWord>> Statement<ast::Instruction<T>, T> {
|
|||
let src = visitor.visit_ident(src, None, false, false)?;
|
||||
Statement::FunctionPointer(FunctionPointerDetails { dst, src })
|
||||
}
|
||||
Statement::SetMode(mode_register) => Statement::SetMode(mode_register),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -523,7 +546,7 @@ struct FunctionPointerDetails {
|
|||
src: SpirvWord,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
#[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Debug)]
|
||||
pub struct SpirvWord(u32);
|
||||
|
||||
impl From<u32> for SpirvWord {
|
||||
|
@ -555,22 +578,27 @@ type NormalizedStatement = Statement<
|
|||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
|
||||
enum Directive2<'input, Instruction, Operand: ast::Operand> {
|
||||
enum Directive2<Instruction, Operand: ast::Operand> {
|
||||
Variable(ast::LinkingDirective, ast::Variable<SpirvWord>),
|
||||
Method(Function2<'input, Instruction, Operand>),
|
||||
Method(Function2<Instruction, Operand>),
|
||||
}
|
||||
|
||||
struct Function2<'input, Instruction, Operand: ast::Operand> {
|
||||
pub func_decl: ast::MethodDeclaration<'input, SpirvWord>,
|
||||
pub globals: Vec<ast::Variable<SpirvWord>>,
|
||||
struct Function2<Instruction, Operand: ast::Operand> {
|
||||
pub return_arguments: Vec<ast::Variable<Operand::Ident>>,
|
||||
pub name: Operand::Ident,
|
||||
pub input_arguments: Vec<ast::Variable<Operand::Ident>>,
|
||||
pub body: Option<Vec<Statement<Instruction, Operand>>>,
|
||||
is_kernel: bool,
|
||||
import_as: Option<String>,
|
||||
tuning: Vec<ast::TuningDirective>,
|
||||
linkage: ast::LinkingDirective,
|
||||
flush_to_zero_f32: bool,
|
||||
flush_to_zero_f16f64: bool,
|
||||
rounding_mode_f32: ast::RoundingMode,
|
||||
rounding_mode_f16f64: ast::RoundingMode,
|
||||
}
|
||||
|
||||
type NormalizedDirective2<'input> = Directive2<
|
||||
'input,
|
||||
type NormalizedDirective2 = Directive2<
|
||||
(
|
||||
Option<ast::PredAt<SpirvWord>>,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
|
@ -578,8 +606,7 @@ type NormalizedDirective2<'input> = Directive2<
|
|||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
|
||||
type NormalizedFunction2<'input> = Function2<
|
||||
'input,
|
||||
type NormalizedFunction2 = Function2<
|
||||
(
|
||||
Option<ast::PredAt<SpirvWord>>,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
|
@ -587,17 +614,11 @@ type NormalizedFunction2<'input> = Function2<
|
|||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
|
||||
type UnconditionalDirective<'input> = Directive2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
type UnconditionalDirective =
|
||||
Directive2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
|
||||
|
||||
type UnconditionalFunction<'input> = Function2<
|
||||
'input,
|
||||
ast::Instruction<ast::ParsedOperand<SpirvWord>>,
|
||||
ast::ParsedOperand<SpirvWord>,
|
||||
>;
|
||||
type UnconditionalFunction =
|
||||
Function2<ast::Instruction<ast::ParsedOperand<SpirvWord>>, ast::ParsedOperand<SpirvWord>>;
|
||||
|
||||
struct GlobalStringIdentResolver2<'input> {
|
||||
pub(crate) current_id: SpirvWord,
|
||||
|
@ -803,47 +824,45 @@ impl SpecialRegistersMap2 {
|
|||
self.id_to_reg.get(&id).copied()
|
||||
}
|
||||
|
||||
fn generate_declarations<'a, 'input>(
|
||||
/// Number of `PtxSpecialRegister` variants, i.e. how many items
/// `PtxSpecialRegister::iter()` yields.
fn len() -> usize {
    PtxSpecialRegister::iter().len()
}
|
||||
|
||||
fn foreach_declaration<'a, 'input>(
|
||||
resolver: &'a mut GlobalStringIdentResolver2<'input>,
|
||||
) -> impl ExactSizeIterator<
|
||||
Item = (
|
||||
mut fn_: impl FnMut(
|
||||
PtxSpecialRegister,
|
||||
ast::MethodDeclaration<'input, SpirvWord>,
|
||||
(
|
||||
Vec<ast::Variable<SpirvWord>>,
|
||||
SpirvWord,
|
||||
Vec<ast::Variable<SpirvWord>>,
|
||||
),
|
||||
),
|
||||
> + 'a {
|
||||
PtxSpecialRegister::iter().map(|sreg| {
|
||||
) {
|
||||
for sreg in PtxSpecialRegister::iter() {
|
||||
let external_fn_name = [ZLUDA_PTX_PREFIX, sreg.get_unprefixed_function_name()].concat();
|
||||
let name =
|
||||
ast::MethodName::Func(resolver.register_named(Cow::Owned(external_fn_name), None));
|
||||
let name = resolver.register_named(Cow::Owned(external_fn_name), None);
|
||||
let return_type = sreg.get_function_return_type();
|
||||
let input_type = sreg.get_function_input_type();
|
||||
(
|
||||
sreg,
|
||||
ast::MethodDeclaration {
|
||||
return_arguments: vec![ast::Variable {
|
||||
align: None,
|
||||
v_type: return_type.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver
|
||||
.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
}],
|
||||
name: name,
|
||||
input_arguments: input_type
|
||||
.into_iter()
|
||||
.map(|type_| ast::Variable {
|
||||
align: None,
|
||||
v_type: type_.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver
|
||||
.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
})
|
||||
.collect::<Vec<_>>(),
|
||||
shared_mem: None,
|
||||
},
|
||||
)
|
||||
})
|
||||
let return_arguments = vec![ast::Variable {
|
||||
align: None,
|
||||
v_type: return_type.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver.register_unnamed(Some((return_type.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
}];
|
||||
let input_arguments = input_type
|
||||
.into_iter()
|
||||
.map(|type_| ast::Variable {
|
||||
align: None,
|
||||
v_type: type_.into(),
|
||||
state_space: ast::StateSpace::Reg,
|
||||
name: resolver.register_unnamed(Some((type_.into(), ast::StateSpace::Reg))),
|
||||
array_init: Vec::new(),
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
fn_(sreg, (return_arguments, name, input_arguments));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
134
ptx/src/pass/normalize_basic_blocks.rs
Normal file
134
ptx/src/pass/normalize_basic_blocks.rs
Normal file
|
@ -0,0 +1,134 @@
|
|||
use super::*;
|
||||
|
||||
// This pass normalizes ptx modules in ways that make the mode computation pass
|
||||
// and code emission passes much simpler:
|
||||
// * Inserts label at the start of every function
|
||||
// This makes control flow graph simpler in mode computation block: we can
|
||||
// represent kernels as separate nodes with its own separate entry/exit mode
|
||||
// * Inserts label at the start of every basic block
|
||||
// * Insert explicit jumps before labels
|
||||
// * Non-.entry methods get a single `ret;` exit point - this is because mode computation
|
||||
// logic requires it. Control flow graph constructed by mode computation
|
||||
// models function calls as jumps into and then from another function.
|
||||
// If this cfg allowed multiple return basic blocks then there would be cases
|
||||
// where we want to insert mode setting instruction along the edge between
|
||||
// `ret;` and bb in the caller. This is only possible if there's a single
|
||||
// edge between the function's `ret;` and the caller
|
||||
pub(crate) fn run(
|
||||
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
|
||||
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
for directive in directives.iter_mut() {
|
||||
let (body_ref, is_kernel) = match directive {
|
||||
Directive2::Method(Function2 {
|
||||
body: Some(body), is_kernel, ..
|
||||
}) => (body, *is_kernel),
|
||||
_ => continue,
|
||||
};
|
||||
let body = std::mem::replace(body_ref, Vec::new());
|
||||
let mut result = Vec::with_capacity(body.len());
|
||||
let mut previous_instruction_was_terminator = TerminatorKind::Not;
|
||||
let mut body_iterator = body.into_iter();
|
||||
let mut return_statements = Vec::new();
|
||||
match body_iterator.next() {
|
||||
Some(Statement::Label(_)) => {}
|
||||
Some(statement) => {
|
||||
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
|
||||
result.push(statement);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
for statement in body_iterator {
|
||||
match previous_instruction_was_terminator {
|
||||
TerminatorKind::Not => match statement {
|
||||
Statement::Label(label) => {
|
||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: label },
|
||||
}))
|
||||
}
|
||||
_ => {}
|
||||
},
|
||||
TerminatorKind::Real => {
|
||||
if !matches!(statement, Statement::Label(..)) {
|
||||
result.push(Statement::Label(flat_resolver.register_unnamed(None)));
|
||||
}
|
||||
}
|
||||
TerminatorKind::Fake => match statement {
|
||||
// If there's a label after a call just reuse it
|
||||
Statement::Label(label) => {
|
||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: label },
|
||||
}))
|
||||
}
|
||||
_ => {
|
||||
let label = flat_resolver.register_unnamed(None);
|
||||
result.push(Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: label },
|
||||
}));
|
||||
result.push(Statement::Label(label));
|
||||
}
|
||||
},
|
||||
}
|
||||
match statement {
|
||||
Statement::RetValue(..) => {
|
||||
return Err(error_unreachable());
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Ret { .. }) => {
|
||||
if !is_kernel {
|
||||
return_statements.push(result.len());
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
previous_instruction_was_terminator = is_block_terminator(&statement);
|
||||
result.push(statement);
|
||||
}
|
||||
convert_from_multiple_returns_to_single_return(
|
||||
flat_resolver,
|
||||
&mut result,
|
||||
return_statements,
|
||||
)?;
|
||||
*body_ref = result;
|
||||
}
|
||||
Ok(directives)
|
||||
}
|
||||
|
||||
/// Classification of a statement with respect to ending a basic block,
/// as computed by `is_block_terminator`.
enum TerminatorKind {
    /// The statement does not end a basic block.
    Not,
    /// The statement ends a basic block: a conditional, `bra` or `ret`.
    Real,
    /// A `call`: treated as ending a basic block by this pass.
    Fake,
}
|
||||
|
||||
fn convert_from_multiple_returns_to_single_return(
|
||||
flat_resolver: &mut GlobalStringIdentResolver2<'_>,
|
||||
result: &mut Vec<Statement<ptx_parser::Instruction<SpirvWord>, SpirvWord>>,
|
||||
return_statements: Vec<usize>,
|
||||
) -> Result<(), TranslateError> {
|
||||
Ok(if return_statements.len() > 1 {
|
||||
let ret_bb = flat_resolver.register_unnamed(None);
|
||||
result.push(Statement::Label(ret_bb));
|
||||
result.push(Statement::Instruction(ast::Instruction::Ret {
|
||||
data: ast::RetData { uniform: false },
|
||||
}));
|
||||
for ret_index in return_statements {
|
||||
let statement = result.get_mut(ret_index).ok_or_else(error_unreachable)?;
|
||||
*statement = Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src: ret_bb },
|
||||
});
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn is_block_terminator(
|
||||
statement: &Statement<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> TerminatorKind {
|
||||
match statement {
|
||||
Statement::Conditional(..)
|
||||
| Statement::Instruction(ast::Instruction::Bra { .. })
|
||||
// Normally call is not a terminator, but we treat it as such because it
|
||||
// makes the "instruction modes to global modes" pass possible
|
||||
| Statement::Instruction(ast::Instruction::Ret { .. }) => TerminatorKind::Real,
|
||||
Statement::Instruction(ast::Instruction::Call { .. }) => TerminatorKind::Fake,
|
||||
_ => TerminatorKind::Not,
|
||||
}
|
||||
}
|
|
@ -4,7 +4,7 @@ use ptx_parser as ast;
|
|||
pub(crate) fn run<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
directives: Vec<ast::Directive<'input, ast::ParsedOperand<&'input str>>>,
|
||||
) -> Result<Vec<NormalizedDirective2<'input>>, TranslateError> {
|
||||
) -> Result<Vec<NormalizedDirective2>, TranslateError> {
|
||||
resolver.start_scope();
|
||||
let result = directives
|
||||
.into_iter()
|
||||
|
@ -17,7 +17,7 @@ pub(crate) fn run<'input, 'b>(
|
|||
fn run_directive<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
directive: ast::Directive<'input, ast::ParsedOperand<&'input str>>,
|
||||
) -> Result<NormalizedDirective2<'input>, TranslateError> {
|
||||
) -> Result<NormalizedDirective2, TranslateError> {
|
||||
Ok(match directive {
|
||||
ast::Directive::Variable(linking, var) => {
|
||||
NormalizedDirective2::Variable(linking, run_variable(resolver, var)?)
|
||||
|
@ -32,15 +32,11 @@ fn run_method<'input, 'b>(
|
|||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
linkage: ast::LinkingDirective,
|
||||
method: ast::Function<'input, &'input str, ast::Statement<ast::ParsedOperand<&'input str>>>,
|
||||
) -> Result<NormalizedFunction2<'input>, TranslateError> {
|
||||
let name = match method.func_directive.name {
|
||||
ast::MethodName::Kernel(name) => ast::MethodName::Kernel(name),
|
||||
ast::MethodName::Func(text) => {
|
||||
ast::MethodName::Func(resolver.add_or_get_in_current_scope_untyped(text)?)
|
||||
}
|
||||
};
|
||||
) -> Result<NormalizedFunction2, TranslateError> {
|
||||
let is_kernel = method.func_directive.name.is_kernel();
|
||||
let name = resolver.add_or_get_in_current_scope_untyped(method.func_directive.name.text())?;
|
||||
resolver.start_scope();
|
||||
let func_decl = run_function_decl(resolver, method.func_directive, name)?;
|
||||
let (return_arguments, input_arguments) = run_function_decl(resolver, method.func_directive)?;
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
|
@ -51,20 +47,25 @@ fn run_method<'input, 'b>(
|
|||
.transpose()?;
|
||||
resolver.end_scope();
|
||||
Ok(Function2 {
|
||||
func_decl,
|
||||
globals: Vec::new(),
|
||||
return_arguments,
|
||||
name,
|
||||
input_arguments,
|
||||
body,
|
||||
import_as: None,
|
||||
tuning: method.tuning,
|
||||
linkage,
|
||||
is_kernel,
|
||||
tuning: method.tuning,
|
||||
flush_to_zero_f32: false,
|
||||
flush_to_zero_f16f64: false,
|
||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
||||
})
|
||||
}
|
||||
|
||||
fn run_function_decl<'input, 'b>(
|
||||
resolver: &mut ScopedResolver<'input, 'b>,
|
||||
func_directive: ast::MethodDeclaration<'input, &'input str>,
|
||||
name: ast::MethodName<'input, SpirvWord>,
|
||||
) -> Result<ast::MethodDeclaration<'input, SpirvWord>, TranslateError> {
|
||||
) -> Result<(Vec<ast::Variable<SpirvWord>>, Vec<ast::Variable<SpirvWord>>), TranslateError> {
|
||||
assert!(func_directive.shared_mem.is_none());
|
||||
let return_arguments = func_directive
|
||||
.return_arguments
|
||||
|
@ -76,12 +77,7 @@ fn run_function_decl<'input, 'b>(
|
|||
.into_iter()
|
||||
.map(|var| run_variable(resolver, var))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
Ok(ast::MethodDeclaration {
|
||||
return_arguments,
|
||||
name,
|
||||
input_arguments,
|
||||
shared_mem: None,
|
||||
})
|
||||
Ok((return_arguments, input_arguments))
|
||||
}
|
||||
|
||||
fn run_variable<'input, 'b>(
|
||||
|
|
|
@ -3,8 +3,8 @@ use ptx_parser as ast;
|
|||
|
||||
pub(crate) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<NormalizedDirective2<'input>>,
|
||||
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||
directives: Vec<NormalizedDirective2>,
|
||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
||||
directives
|
||||
.into_iter()
|
||||
.map(|directive| run_directive(resolver, directive))
|
||||
|
@ -13,8 +13,8 @@ pub(crate) fn run<'input>(
|
|||
|
||||
fn run_directive<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directive: NormalizedDirective2<'input>,
|
||||
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||
directive: NormalizedDirective2,
|
||||
) -> Result<UnconditionalDirective, TranslateError> {
|
||||
Ok(match directive {
|
||||
Directive2::Variable(linking, var) => Directive2::Variable(linking, var),
|
||||
Directive2::Method(method) => Directive2::Method(run_method(resolver, method)?),
|
||||
|
@ -23,8 +23,8 @@ fn run_directive<'input>(
|
|||
|
||||
fn run_method<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
method: NormalizedFunction2<'input>,
|
||||
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||
method: NormalizedFunction2,
|
||||
) -> Result<UnconditionalFunction, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
|
@ -36,12 +36,18 @@ fn run_method<'input>(
|
|||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
return_arguments: method.return_arguments,
|
||||
name: method.name,
|
||||
input_arguments: method.input_arguments,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
is_kernel: method.is_kernel,
|
||||
flush_to_zero_f32: method.flush_to_zero_f32,
|
||||
flush_to_zero_f16f64: method.flush_to_zero_f16f64,
|
||||
rounding_mode_f32: method.rounding_mode_f32,
|
||||
rounding_mode_f16f64: method.rounding_mode_f16f64,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
122
ptx/src/pass/remove_unreachable_basic_blocks.rs
Normal file
122
ptx/src/pass/remove_unreachable_basic_blocks.rs
Normal file
|
@ -0,0 +1,122 @@
|
|||
use super::*;
|
||||
use petgraph::{
|
||||
graph::NodeIndex,
|
||||
visit::{Bfs, VisitMap},
|
||||
Graph,
|
||||
};
|
||||
use rustc_hash::FxHashSet;
|
||||
|
||||
pub(crate) fn run(
|
||||
mut directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
let mut reachable_funcs = FxHashSet::default();
|
||||
for directive in directives.iter_mut() {
|
||||
match directive {
|
||||
Directive2::Method(Function2 {
|
||||
body: Some(body), ..
|
||||
}) => {
|
||||
let old_body = std::mem::replace(body, Vec::new());
|
||||
let mut cfg = ControlFlowGraph::new();
|
||||
let mut old_body_iter = old_body.iter();
|
||||
let mut current_bb = match old_body_iter.next() {
|
||||
Some(Statement::Label(label)) => cfg.add_or_get_node(*label),
|
||||
_ => return Err(error_unreachable()),
|
||||
};
|
||||
let first_bb = current_bb;
|
||||
for statement in old_body_iter {
|
||||
match statement {
|
||||
Statement::Label(label) => {
|
||||
current_bb = cfg.add_or_get_node(*label);
|
||||
}
|
||||
Statement::Conditional(branch) => {
|
||||
cfg.add_branch(current_bb, branch.if_true);
|
||||
cfg.add_branch(current_bb, branch.if_false);
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Bra {
|
||||
arguments: ast::BraArgs { src },
|
||||
}) => {
|
||||
cfg.add_branch(current_bb, *src);
|
||||
}
|
||||
Statement::FunctionPointer(FunctionPointerDetails {
|
||||
src: _func, ..
|
||||
}) => {
|
||||
return Err(error_todo());
|
||||
}
|
||||
Statement::Instruction(ast::Instruction::Call {
|
||||
arguments: ast::CallArgs { func, .. },
|
||||
..
|
||||
}) => {
|
||||
reachable_funcs.insert(*func);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let mut bfs = Bfs::new(&cfg.graph, first_bb);
|
||||
while let Some(_) = bfs.next(&cfg.graph) {}
|
||||
let mut visited = true;
|
||||
*body = try_filter_to_vec(old_body.into_iter(), |statement| {
|
||||
match statement {
|
||||
Statement::Label(label) => {
|
||||
visited = bfs
|
||||
.discovered
|
||||
.is_visited(cfg.nodes.get(label).ok_or_else(error_unreachable)?);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
Ok(visited)
|
||||
})?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(directives
|
||||
.into_iter()
|
||||
.filter(|directive| match directive {
|
||||
Directive2::Variable(..) => true,
|
||||
Directive2::Method(Function2 {
|
||||
name, is_kernel, ..
|
||||
}) => *is_kernel || reachable_funcs.contains(name),
|
||||
})
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
|
||||
/// Collects into a `Vec` the items of `iter` for which `filter` returns
/// `Ok(true)`, preserving their order. Stops at the first `Err` returned by
/// `filter` and propagates it.
fn try_filter_to_vec<T, E>(
    iter: impl ExactSizeIterator<Item = T>,
    mut filter: impl FnMut(&T) -> Result<bool, E>,
) -> Result<Vec<T>, E> {
    let mut kept = Vec::with_capacity(iter.len());
    for item in iter {
        if filter(&item)? {
            kept.push(item);
        }
    }
    Ok(kept)
}
|
||||
|
||||
struct ControlFlowGraph {
|
||||
graph: Graph<SpirvWord, ()>,
|
||||
nodes: FxHashMap<SpirvWord, NodeIndex>,
|
||||
}
|
||||
|
||||
impl ControlFlowGraph {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
graph: Graph::new(),
|
||||
nodes: FxHashMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_or_get_node(&mut self, id: SpirvWord) -> NodeIndex {
|
||||
*self
|
||||
.nodes
|
||||
.entry(id)
|
||||
.or_insert_with(|| self.graph.add_node(id))
|
||||
}
|
||||
|
||||
fn add_branch(&mut self, from: NodeIndex, to: SpirvWord) -> NodeIndex {
|
||||
let to = self.add_or_get_node(to);
|
||||
self.graph.add_edge(from, to, ());
|
||||
to
|
||||
}
|
||||
}
|
|
@ -2,8 +2,8 @@ use super::*;
|
|||
|
||||
pub(super) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
directives: Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
directives: Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>,
|
||||
) -> Result<Vec<Directive2<ast::Instruction<SpirvWord>, SpirvWord>>, TranslateError> {
|
||||
let mut fn_declarations = FxHashMap::default();
|
||||
let remapped_directives = directives
|
||||
.into_iter()
|
||||
|
@ -13,17 +13,18 @@ pub(super) fn run<'input>(
|
|||
.into_iter()
|
||||
.map(|(_, (return_arguments, name, input_arguments))| {
|
||||
Directive2::Method(Function2 {
|
||||
func_decl: ast::MethodDeclaration {
|
||||
return_arguments,
|
||||
name: ast::MethodName::Func(name),
|
||||
input_arguments,
|
||||
shared_mem: None,
|
||||
},
|
||||
globals: Vec::new(),
|
||||
return_arguments,
|
||||
name: name,
|
||||
input_arguments,
|
||||
body: None,
|
||||
import_as: None,
|
||||
tuning: Vec::new(),
|
||||
linkage: ast::LinkingDirective::EXTERN,
|
||||
is_kernel: false,
|
||||
flush_to_zero_f32: false,
|
||||
flush_to_zero_f16f64: false,
|
||||
rounding_mode_f32: ptx_parser::RoundingMode::NearestEven,
|
||||
rounding_mode_f16f64: ptx_parser::RoundingMode::NearestEven,
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
@ -41,8 +42,8 @@ fn run_directive<'input>(
|
|||
Vec<ast::Variable<SpirvWord>>,
|
||||
),
|
||||
>,
|
||||
directive: Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<'input, ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
directive: Directive2<ast::Instruction<SpirvWord>, SpirvWord>,
|
||||
) -> Result<Directive2<ast::Instruction<SpirvWord>, SpirvWord>, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(mut method) => {
|
||||
|
@ -104,6 +105,9 @@ fn run_instruction<'input>(
|
|||
let name = ["bfi_", scalar_to_ptx_name(data)].concat();
|
||||
to_call(resolver, fn_declarations, name.into(), i)?
|
||||
}
|
||||
i @ ptx_parser::Instruction::Bar { .. } => {
|
||||
to_call(resolver, fn_declarations, "bar_sync".into(), i)?
|
||||
}
|
||||
i => i,
|
||||
})
|
||||
}
|
||||
|
|
33
ptx/src/pass/replace_known_functions.rs
Normal file
33
ptx/src/pass/replace_known_functions.rs
Normal file
|
@ -0,0 +1,33 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use super::{GlobalStringIdentResolver2, NormalizedDirective2, SpirvWord};
|
||||
|
||||
pub(crate) fn run<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
mut directives: Vec<NormalizedDirective2>,
|
||||
) -> Vec<NormalizedDirective2> {
|
||||
for directive in directives.iter_mut() {
|
||||
match directive {
|
||||
NormalizedDirective2::Method(func) => {
|
||||
replace_with_ptx_impl(resolver, func.name);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
directives
|
||||
}
|
||||
|
||||
fn replace_with_ptx_impl<'input>(
|
||||
resolver: &mut GlobalStringIdentResolver2<'input>,
|
||||
fn_name: SpirvWord,
|
||||
) {
|
||||
let known_names = ["__assertfail"];
|
||||
if let Some(super::IdentEntry {
|
||||
name: Some(name), ..
|
||||
}) = resolver.ident_map.get_mut(&fn_name)
|
||||
{
|
||||
if known_names.contains(&&**name) {
|
||||
*name = Cow::Owned(format!("__zluda_ptx_impl_{}", name));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,8 +3,8 @@ use ptx_parser as ast;
|
|||
use rustc_hash::FxHashSet;
|
||||
|
||||
pub(crate) fn run<'input>(
|
||||
directives: Vec<UnconditionalDirective<'input>>,
|
||||
) -> Result<Vec<UnconditionalDirective<'input>>, TranslateError> {
|
||||
directives: Vec<UnconditionalDirective>,
|
||||
) -> Result<Vec<UnconditionalDirective>, TranslateError> {
|
||||
let mut functions = FxHashSet::default();
|
||||
directives
|
||||
.into_iter()
|
||||
|
@ -14,19 +14,13 @@ pub(crate) fn run<'input>(
|
|||
|
||||
fn run_directive<'input>(
|
||||
functions: &mut FxHashSet<SpirvWord>,
|
||||
directive: UnconditionalDirective<'input>,
|
||||
) -> Result<UnconditionalDirective<'input>, TranslateError> {
|
||||
directive: UnconditionalDirective,
|
||||
) -> Result<UnconditionalDirective, TranslateError> {
|
||||
Ok(match directive {
|
||||
var @ Directive2::Variable(..) => var,
|
||||
Directive2::Method(method) => {
|
||||
{
|
||||
let func_decl = &method.func_decl;
|
||||
match func_decl.name {
|
||||
ptx_parser::MethodName::Kernel(_) => {}
|
||||
ptx_parser::MethodName::Func(name) => {
|
||||
functions.insert(name);
|
||||
}
|
||||
}
|
||||
if !method.is_kernel {
|
||||
functions.insert(method.name);
|
||||
}
|
||||
Directive2::Method(run_method(functions, method)?)
|
||||
}
|
||||
|
@ -35,8 +29,8 @@ fn run_directive<'input>(
|
|||
|
||||
fn run_method<'input>(
|
||||
functions: &mut FxHashSet<SpirvWord>,
|
||||
method: UnconditionalFunction<'input>,
|
||||
) -> Result<UnconditionalFunction<'input>, TranslateError> {
|
||||
method: UnconditionalFunction,
|
||||
) -> Result<UnconditionalFunction, TranslateError> {
|
||||
let body = method
|
||||
.body
|
||||
.map(|statements| {
|
||||
|
@ -46,14 +40,7 @@ fn run_method<'input>(
|
|||
.collect::<Result<Vec<_>, _>>()
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(Function2 {
|
||||
func_decl: method.func_decl,
|
||||
globals: method.globals,
|
||||
body,
|
||||
import_as: method.import_as,
|
||||
tuning: method.tuning,
|
||||
linkage: method.linkage,
|
||||
})
|
||||
Ok(Function2 { body, ..method })
|
||||
}
|
||||
|
||||
fn run_statement<'input>(
|
||||
|
|
24
ptx/src/test/ll/activemask.ll
Normal file
24
ptx/src/test/ll/activemask.ll
Normal file
|
@ -0,0 +1,24 @@
|
|||
declare i32 @__zluda_ptx_impl_activemask() #0
|
||||
|
||||
define amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
|
||||
%"31" = alloca i64, align 8, addrspace(5)
|
||||
%"32" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"28"
|
||||
|
||||
"28": ; preds = %1
|
||||
%"33" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"33", ptr addrspace(5) %"31", align 4
|
||||
%"34" = call i32 @__zluda_ptx_impl_activemask()
|
||||
store i32 %"34", ptr addrspace(5) %"32", align 4
|
||||
%"35" = load i64, ptr addrspace(5) %"31", align 4
|
||||
%"36" = load i32, ptr addrspace(5) %"32", align 4
|
||||
%"37" = inttoptr i64 %"35" to ptr
|
||||
store i32 %"36", ptr %"37", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
30
ptx/src/test/ll/add.ll
Normal file
30
ptx/src/test/ll/add.ll
Normal file
|
@ -0,0 +1,30 @@
|
|||
define amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i64, ptr %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = add i64 %"43", 1
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
52
ptx/src/test/ll/add_ftz.ll
Normal file
52
ptx/src/test/ll/add_ftz.ll
Normal file
|
@ -0,0 +1,52 @@
|
|||
define amdgpu_kernel void @add_ftz(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 {
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
%"43" = alloca float, align 4, addrspace(5)
|
||||
%"44" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"36"
|
||||
|
||||
"36": ; preds = %1
|
||||
%"45" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"61" = inttoptr i64 %"48" to ptr
|
||||
%"47" = load float, ptr %"61", align 4
|
||||
store float %"47", ptr addrspace(5) %"41", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"62" = inttoptr i64 %"49" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"50" = load float, ptr %"33", align 4
|
||||
store float %"50", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"51" = fadd float %"52", %"53"
|
||||
store float %"51", ptr addrspace(5) %"43", align 4
|
||||
call void @llvm.amdgcn.s.setreg(i32 6401, i32 3)
|
||||
%"55" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"54" = fadd float %"55", %"56"
|
||||
store float %"54", ptr addrspace(5) %"44", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"58" = load float, ptr addrspace(5) %"43", align 4
|
||||
%"63" = inttoptr i64 %"57" to ptr
|
||||
store float %"58", ptr %"63", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"64" = inttoptr i64 %"59" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"64", i64 4
|
||||
%"60" = load float, ptr addrspace(5) %"44", align 4
|
||||
store float %"60", ptr %"35", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind willreturn }
|
30
ptx/src/test/ll/add_non_coherent.ll
Normal file
30
ptx/src/test/ll/add_non_coherent.ll
Normal file
|
@ -0,0 +1,30 @@
|
|||
define amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr addrspace(1)
|
||||
%"40" = load i64, ptr addrspace(1) %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = add i64 %"43", 1
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr addrspace(1)
|
||||
store i64 %"45", ptr addrspace(1) %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
30
ptx/src/test/ll/add_tuning.ll
Normal file
30
ptx/src/test/ll/add_tuning.ll
Normal file
|
@ -0,0 +1,30 @@
|
|||
define amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"46" = inttoptr i64 %"41" to ptr
|
||||
%"40" = load i64, ptr %"46", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"42" = add i64 %"43", 1
|
||||
store i64 %"42", ptr addrspace(5) %"37", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"47" = inttoptr i64 %"44" to ptr
|
||||
store i64 %"45", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
36
ptx/src/test/ll/and.ll
Normal file
36
ptx/src/test/ll/and.ll
Normal file
|
@ -0,0 +1,36 @@
|
|||
define amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"50", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load i32, ptr %"31", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"52" = and i32 %"46", %"47"
|
||||
store i32 %"52", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load i32, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"48" to ptr
|
||||
store i32 %"49", ptr %"55", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
46
ptx/src/test/ll/atom_add.ll
Normal file
46
ptx/src/test/ll/atom_add.ll
Normal file
|
@ -0,0 +1,46 @@
|
|||
@shared_mem = external addrspace(3) global [1024 x i8], align 4
|
||||
|
||||
define amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i32, ptr %"56", align 4
|
||||
store i32 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
|
||||
%"47" = load i32, ptr %"32", align 4
|
||||
store i32 %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"40", align 4
|
||||
store i32 %"48", ptr addrspace(3) @shared_mem, align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%2 = atomicrmw add ptr addrspace(3) @shared_mem, i32 %"50" syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %2, ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i32, ptr addrspace(3) @shared_mem, align 4
|
||||
store i32 %"51", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"52" to ptr
|
||||
store i32 %"53", ptr %"61", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"62" = inttoptr i64 %"54" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"55" = load i32, ptr addrspace(5) %"41", align 4
|
||||
store i32 %"55", ptr %"34", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
46
ptx/src/test/ll/atom_add_float.ll
Normal file
46
ptx/src/test/ll/atom_add_float.ll
Normal file
|
@ -0,0 +1,46 @@
|
|||
@shared_mem = external addrspace(3) global [1024 x i8], align 4
|
||||
|
||||
define amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load float, ptr %"56", align 4
|
||||
store float %"44", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"57", i64 4
|
||||
%"47" = load float, ptr %"32", align 4
|
||||
store float %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"40", align 4
|
||||
store float %"48", ptr addrspace(3) @shared_mem, align 4
|
||||
%"50" = load float, ptr addrspace(5) %"41", align 4
|
||||
%2 = atomicrmw fadd ptr addrspace(3) @shared_mem, float %"50" syncscope("agent-one-as") monotonic, align 4
|
||||
store float %2, ptr addrspace(5) %"40", align 4
|
||||
%"51" = load float, ptr addrspace(3) @shared_mem, align 4
|
||||
store float %"51", ptr addrspace(5) %"41", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"52" to ptr
|
||||
store float %"53", ptr %"61", align 4
|
||||
%"54" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"62" = inttoptr i64 %"54" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"62", i64 4
|
||||
%"55" = load float, ptr addrspace(5) %"41", align 4
|
||||
store float %"55", ptr %"34", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
44
ptx/src/test/ll/atom_cas.ll
Normal file
44
ptx/src/test/ll/atom_cas.ll
Normal file
|
@ -0,0 +1,44 @@
|
|||
define amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"37"
|
||||
|
||||
"37": ; preds = %1
|
||||
%"44" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
%"45" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"41", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"57" = inttoptr i64 %"47" to ptr
|
||||
%"46" = load i32, ptr %"57", align 4
|
||||
store i32 %"46", ptr addrspace(5) %"42", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"58" = inttoptr i64 %"48" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"50" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%2 = cmpxchg ptr %"31", i32 %"50", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
|
||||
%"59" = extractvalue { i32, i1 } %2, 0
|
||||
store i32 %"59", ptr addrspace(5) %"42", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"51" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"61", i64 4
|
||||
%"52" = load i32, ptr %"34", align 4
|
||||
store i32 %"52", ptr addrspace(5) %"43", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"53" to ptr
|
||||
store i32 %"54", ptr %"62", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"63" = inttoptr i64 %"55" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"63", i64 4
|
||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
||||
store i32 %"56", ptr %"36", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
46
ptx/src/test/ll/atom_inc.ll
Normal file
46
ptx/src/test/ll/atom_inc.ll
Normal file
|
@ -0,0 +1,46 @@
|
|||
define amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"37"
|
||||
|
||||
"37": ; preds = %1
|
||||
%"45" = load i64, ptr addrspace(4) %"38", align 4
|
||||
store i64 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"46" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"59" = inttoptr i64 %"48" to ptr
|
||||
%2 = atomicrmw uinc_wrap ptr %"59", i32 101 syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %2, ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"60" = inttoptr i64 %"50" to ptr addrspace(1)
|
||||
%3 = atomicrmw uinc_wrap ptr addrspace(1) %"60", i32 101 syncscope("agent-one-as") monotonic, align 4
|
||||
store i32 %3, ptr addrspace(5) %"43", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"61" = inttoptr i64 %"52" to ptr
|
||||
%"51" = load i32, ptr %"61", align 4
|
||||
store i32 %"51", ptr addrspace(5) %"44", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"62" = inttoptr i64 %"53" to ptr
|
||||
store i32 %"54", ptr %"62", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"63" = inttoptr i64 %"55" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"63", i64 4
|
||||
%"56" = load i32, ptr addrspace(5) %"43", align 4
|
||||
store i32 %"56", ptr %"34", align 4
|
||||
%"57" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"64" = inttoptr i64 %"57" to ptr
|
||||
%"36" = getelementptr inbounds i8, ptr %"64", i64 8
|
||||
%"58" = load i32, ptr addrspace(5) %"44", align 4
|
||||
store i32 %"58", ptr %"36", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
30
ptx/src/test/ll/b64tof64.ll
Normal file
30
ptx/src/test/ll/b64tof64.ll
Normal file
|
@ -0,0 +1,30 @@
|
|||
define amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca double, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load double, ptr addrspace(4) %"31", align 8
|
||||
store double %"37", ptr addrspace(5) %"33", align 8
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load double, ptr addrspace(5) %"33", align 8
|
||||
%"46" = bitcast double %"40" to i64
|
||||
store i64 %"46", ptr addrspace(5) %"34", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i64, ptr %"47", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
91
ptx/src/test/ll/bench.ll
Normal file
91
ptx/src/test/ll/bench.ll
Normal file
|
@ -0,0 +1,91 @@
|
|||
declare i32 @__zluda_ptx_impl_sreg_tid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ntid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_ctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_nctaid(i8) #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_clock() #0
|
||||
|
||||
declare i32 @__zluda_ptx_impl_sreg_lanemask_lt() #0
|
||||
|
||||
define amdgpu_kernel void @bench(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #1 {
|
||||
%"57" = alloca i64, align 8, addrspace(5)
|
||||
%"58" = alloca i64, align 8, addrspace(5)
|
||||
%"59" = alloca float, align 4, addrspace(5)
|
||||
%"60" = alloca float, align 4, addrspace(5)
|
||||
%"61" = alloca float, align 4, addrspace(5)
|
||||
%"62" = alloca float, align 4, addrspace(5)
|
||||
%"63" = alloca i32, align 4, addrspace(5)
|
||||
%"64" = alloca i1, align 1, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"97"
|
||||
|
||||
"97": ; preds = %1
|
||||
%"65" = load i64, ptr addrspace(4) %"55", align 4
|
||||
store i64 %"65", ptr addrspace(5) %"57", align 4
|
||||
%"66" = load i64, ptr addrspace(4) %"56", align 4
|
||||
store i64 %"66", ptr addrspace(5) %"58", align 4
|
||||
%"68" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"91" = inttoptr i64 %"68" to ptr
|
||||
%"67" = load float, ptr %"91", align 4
|
||||
store float %"67", ptr addrspace(5) %"59", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"92" = inttoptr i64 %"69" to ptr
|
||||
%"39" = getelementptr inbounds i8, ptr %"92", i64 4
|
||||
%"70" = load float, ptr %"39", align 4
|
||||
store float %"70", ptr addrspace(5) %"60", align 4
|
||||
%"71" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"93" = inttoptr i64 %"71" to ptr
|
||||
%"41" = getelementptr inbounds i8, ptr %"93", i64 8
|
||||
%"72" = load float, ptr %"41", align 4
|
||||
store float %"72", ptr addrspace(5) %"61", align 4
|
||||
%"73" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"94" = inttoptr i64 %"73" to ptr
|
||||
%"43" = getelementptr inbounds i8, ptr %"94", i64 12
|
||||
%"74" = load float, ptr %"43", align 4
|
||||
store float %"74", ptr addrspace(5) %"62", align 4
|
||||
store i32 0, ptr addrspace(5) %"63", align 4
|
||||
br label %"10"
|
||||
|
||||
"10": ; preds = %"21", %"97"
|
||||
%"77" = load float, ptr addrspace(5) %"59", align 4
|
||||
%"78" = load float, ptr addrspace(5) %"60", align 4
|
||||
call void asm sideeffect "s_denorm_mode 0", "~{mode}"()
|
||||
%"76" = fmul float %"77", %"78"
|
||||
store float %"76", ptr addrspace(5) %"59", align 4
|
||||
%"80" = load float, ptr addrspace(5) %"61", align 4
|
||||
%"81" = load float, ptr addrspace(5) %"62", align 4
|
||||
call void asm sideeffect "s_denorm_mode 11", "~{mode}"()
|
||||
%"79" = fmul float %"80", %"81"
|
||||
store float %"79", ptr addrspace(5) %"61", align 4
|
||||
%"83" = load i32, ptr addrspace(5) %"63", align 4
|
||||
%"82" = add i32 %"83", 1
|
||||
store i32 %"82", ptr addrspace(5) %"63", align 4
|
||||
%"85" = load i32, ptr addrspace(5) %"63", align 4
|
||||
%"84" = icmp eq i32 %"85", 100000000
|
||||
store i1 %"84", ptr addrspace(5) %"64", align 1
|
||||
%"86" = load i1, ptr addrspace(5) %"64", align 1
|
||||
br i1 %"86", label %"11", label %"21"
|
||||
|
||||
"21": ; preds = %"10"
|
||||
br label %"10"
|
||||
|
||||
"11": ; preds = %"10"
|
||||
%"87" = load i64, ptr addrspace(5) %"58", align 4
|
||||
%"88" = load float, ptr addrspace(5) %"59", align 4
|
||||
%"95" = inttoptr i64 %"87" to ptr
|
||||
store float %"88", ptr %"95", align 4
|
||||
%"89" = load i64, ptr addrspace(5) %"58", align 4
|
||||
%"96" = inttoptr i64 %"89" to ptr
|
||||
%"48" = getelementptr inbounds i8, ptr %"96", i64 4
|
||||
%"90" = load float, ptr addrspace(5) %"61", align 4
|
||||
store float %"90", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
46
ptx/src/test/ll/bfe.ll
Normal file
46
ptx/src/test/ll/bfe.ll
Normal file
|
@ -0,0 +1,46 @@
|
|||
declare i32 @__zluda_ptx_impl_bfe_u32(i32, i32, i32) #0
|
||||
|
||||
define amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #1 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
%"41" = alloca i32, align 4, addrspace(5)
|
||||
%"42" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load i32, ptr %"57", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"40", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"47" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"48" = load i32, ptr %"32", align 4
|
||||
store i32 %"48", ptr addrspace(5) %"41", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"59" = inttoptr i64 %"49" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
|
||||
%"50" = load i32, ptr %"34", align 4
|
||||
store i32 %"50", ptr addrspace(5) %"42", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load i32, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load i32, ptr addrspace(5) %"42", align 4
|
||||
%"51" = call i32 @__zluda_ptx_impl_bfe_u32(i32 %"52", i32 %"53", i32 %"54")
|
||||
store i32 %"51", ptr addrspace(5) %"40", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"56" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"60" = inttoptr i64 %"55" to ptr
|
||||
store i32 %"56", ptr %"60", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
53
ptx/src/test/ll/bfi.ll
Normal file
53
ptx/src/test/ll/bfi.ll
Normal file
|
@ -0,0 +1,53 @@
|
|||
declare i32 @__zluda_ptx_impl_bfi_b32(i32, i32, i32, i32) #0
|
||||
|
||||
define amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 {
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
%"42" = alloca i64, align 8, addrspace(5)
|
||||
%"43" = alloca i32, align 4, addrspace(5)
|
||||
%"44" = alloca i32, align 4, addrspace(5)
|
||||
%"45" = alloca i32, align 4, addrspace(5)
|
||||
%"46" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"38"
|
||||
|
||||
"38": ; preds = %1
|
||||
%"47" = load i64, ptr addrspace(4) %"39", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"41", align 4
|
||||
%"48" = load i64, ptr addrspace(4) %"40", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"42", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"64" = inttoptr i64 %"50" to ptr
|
||||
%"49" = load i32, ptr %"64", align 4
|
||||
store i32 %"49", ptr addrspace(5) %"43", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"65" = inttoptr i64 %"51" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"65", i64 4
|
||||
%"52" = load i32, ptr %"33", align 4
|
||||
store i32 %"52", ptr addrspace(5) %"44", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"66" = inttoptr i64 %"53" to ptr
|
||||
%"35" = getelementptr inbounds i8, ptr %"66", i64 8
|
||||
%"54" = load i32, ptr %"35", align 4
|
||||
store i32 %"54", ptr addrspace(5) %"45", align 4
|
||||
%"55" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"67" = inttoptr i64 %"55" to ptr
|
||||
%"37" = getelementptr inbounds i8, ptr %"67", i64 12
|
||||
%"56" = load i32, ptr %"37", align 4
|
||||
store i32 %"56", ptr addrspace(5) %"46", align 4
|
||||
%"58" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"59" = load i32, ptr addrspace(5) %"44", align 4
|
||||
%"60" = load i32, ptr addrspace(5) %"45", align 4
|
||||
%"61" = load i32, ptr addrspace(5) %"46", align 4
|
||||
%"68" = call i32 @__zluda_ptx_impl_bfi_b32(i32 %"58", i32 %"59", i32 %"60", i32 %"61")
|
||||
store i32 %"68", ptr addrspace(5) %"43", align 4
|
||||
%"62" = load i64, ptr addrspace(5) %"42", align 4
|
||||
%"63" = load i32, ptr addrspace(5) %"43", align 4
|
||||
%"71" = inttoptr i64 %"62" to ptr
|
||||
store i32 %"63", ptr %"71", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
34
ptx/src/test/ll/block.ll
Normal file
34
ptx/src/test/ll/block.ll
Normal file
|
@ -0,0 +1,34 @@
|
|||
define amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"46" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"42" = load i64, ptr %"51", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"44" = add i64 %"45", 1
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"46", align 4
|
||||
%"47" = add i64 %"48", 1
|
||||
store i64 %"47", ptr addrspace(5) %"46", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"50" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"52" = inttoptr i64 %"49" to ptr
|
||||
store i64 %"50", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
36
ptx/src/test/ll/bra.ll
Normal file
36
ptx/src/test/ll/bra.ll
Normal file
|
@ -0,0 +1,36 @@
|
|||
define amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca i64, align 8, addrspace(5)
|
||||
%"41" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"43" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"45" to ptr
|
||||
%"44" = load i64, ptr %"50", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"40", align 4
|
||||
br label %"10"
|
||||
|
||||
"10": ; preds = %"35"
|
||||
%"47" = load i64, ptr addrspace(5) %"40", align 4
|
||||
%"46" = add i64 %"47", 1
|
||||
store i64 %"46", ptr addrspace(5) %"41", align 4
|
||||
br label %"12"
|
||||
|
||||
"12": ; preds = %"10"
|
||||
%"48" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"49" = load i64, ptr addrspace(5) %"41", align 4
|
||||
%"51" = inttoptr i64 %"48" to ptr
|
||||
store i64 %"49", ptr %"51", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
33
ptx/src/test/ll/brev.ll
Normal file
33
ptx/src/test/ll/brev.ll
Normal file
|
@ -0,0 +1,33 @@
|
|||
define amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i32, ptr %"43", align 4
|
||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call i32 @llvm.bitreverse.i32(i32 %"40")
|
||||
store i32 %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store i32 %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.bitreverse.i32(i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
64
ptx/src/test/ll/call.ll
Normal file
64
ptx/src/test/ll/call.ll
Normal file
|
@ -0,0 +1,64 @@
|
|||
define i64 @incr(i64 %"43") #0 {
|
||||
%"63" = alloca i64, align 8, addrspace(5)
|
||||
%"64" = alloca i64, align 8, addrspace(5)
|
||||
%"65" = alloca i64, align 8, addrspace(5)
|
||||
%"66" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"46"
|
||||
|
||||
"46": ; preds = %1
|
||||
store i64 %"43", ptr addrspace(5) %"65", align 4
|
||||
%"67" = load i64, ptr addrspace(5) %"65", align 4
|
||||
store i64 %"67", ptr addrspace(5) %"66", align 4
|
||||
%"69" = load i64, ptr addrspace(5) %"66", align 4
|
||||
%"68" = add i64 %"69", 1
|
||||
store i64 %"68", ptr addrspace(5) %"66", align 4
|
||||
%"70" = load i64, ptr addrspace(5) %"66", align 4
|
||||
store i64 %"70", ptr addrspace(5) %"64", align 4
|
||||
%"71" = load i64, ptr addrspace(5) %"64", align 4
|
||||
store i64 %"71", ptr addrspace(5) %"63", align 4
|
||||
%2 = load i64, ptr addrspace(5) %"63", align 4
|
||||
ret i64 %2
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #1 {
|
||||
%"50" = alloca i64, align 8, addrspace(5)
|
||||
%"51" = alloca i64, align 8, addrspace(5)
|
||||
%"52" = alloca i64, align 8, addrspace(5)
|
||||
%"57" = alloca i64, align 8, addrspace(5)
|
||||
%"58" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"44"
|
||||
|
||||
"44": ; preds = %1
|
||||
%"53" = load i64, ptr addrspace(4) %"48", align 4
|
||||
store i64 %"53", ptr addrspace(5) %"50", align 4
|
||||
%"54" = load i64, ptr addrspace(4) %"49", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"51", align 4
|
||||
%"56" = load i64, ptr addrspace(5) %"50", align 4
|
||||
%"72" = inttoptr i64 %"56" to ptr addrspace(1)
|
||||
%"55" = load i64, ptr addrspace(1) %"72", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"52", align 4
|
||||
%"59" = load i64, ptr addrspace(5) %"52", align 4
|
||||
store i64 %"59", ptr addrspace(5) %"57", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"57", align 4
|
||||
%"41" = call i64 @incr(i64 %"40")
|
||||
br label %"45"
|
||||
|
||||
"45": ; preds = %"44"
|
||||
store i64 %"41", ptr addrspace(5) %"58", align 4
|
||||
%"60" = load i64, ptr addrspace(5) %"58", align 4
|
||||
store i64 %"60", ptr addrspace(5) %"52", align 4
|
||||
%"61" = load i64, ptr addrspace(5) %"51", align 4
|
||||
%"62" = load i64, ptr addrspace(5) %"52", align 4
|
||||
%"75" = inttoptr i64 %"61" to ptr addrspace(1)
|
||||
store i64 %"62", ptr addrspace(1) %"75", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
155
ptx/src/test/ll/call_rnd.ll
Normal file
155
ptx/src/test/ll/call_rnd.ll
Normal file
|
@ -0,0 +1,155 @@
|
|||
define float @add_rm(float %"79", float %"80") #0 {
|
||||
%"128" = alloca float, align 4, addrspace(5)
|
||||
%"129" = alloca float, align 4, addrspace(5)
|
||||
%"130" = alloca float, align 4, addrspace(5)
|
||||
%"131" = alloca float, align 4, addrspace(5)
|
||||
%"132" = alloca float, align 4, addrspace(5)
|
||||
%"133" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"89"
|
||||
|
||||
"89": ; preds = %1
|
||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 2)
|
||||
br label %"87"
|
||||
|
||||
"87": ; preds = %"89"
|
||||
store float %"79", ptr addrspace(5) %"130", align 4
|
||||
store float %"80", ptr addrspace(5) %"131", align 4
|
||||
%"134" = load float, ptr addrspace(5) %"130", align 4
|
||||
store float %"134", ptr addrspace(5) %"132", align 4
|
||||
%"135" = load float, ptr addrspace(5) %"131", align 4
|
||||
store float %"135", ptr addrspace(5) %"133", align 4
|
||||
%"137" = load float, ptr addrspace(5) %"132", align 4
|
||||
%"138" = load float, ptr addrspace(5) %"133", align 4
|
||||
%"136" = fadd float %"137", %"138"
|
||||
store float %"136", ptr addrspace(5) %"132", align 4
|
||||
%"139" = load float, ptr addrspace(5) %"132", align 4
|
||||
store float %"139", ptr addrspace(5) %"129", align 4
|
||||
%"140" = load float, ptr addrspace(5) %"129", align 4
|
||||
store float %"140", ptr addrspace(5) %"128", align 4
|
||||
%2 = load float, ptr addrspace(5) %"128", align 4
|
||||
ret float %2
|
||||
}
|
||||
|
||||
define float @add_rp(float %"82", float %"83") #0 {
|
||||
%"141" = alloca float, align 4, addrspace(5)
|
||||
%"142" = alloca float, align 4, addrspace(5)
|
||||
%"143" = alloca float, align 4, addrspace(5)
|
||||
%"144" = alloca float, align 4, addrspace(5)
|
||||
%"145" = alloca float, align 4, addrspace(5)
|
||||
%"146" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"88"
|
||||
|
||||
"88": ; preds = %1
|
||||
store float %"82", ptr addrspace(5) %"143", align 4
|
||||
store float %"83", ptr addrspace(5) %"144", align 4
|
||||
%"147" = load float, ptr addrspace(5) %"143", align 4
|
||||
store float %"147", ptr addrspace(5) %"145", align 4
|
||||
%"148" = load float, ptr addrspace(5) %"144", align 4
|
||||
store float %"148", ptr addrspace(5) %"146", align 4
|
||||
%"150" = load float, ptr addrspace(5) %"145", align 4
|
||||
%"151" = load float, ptr addrspace(5) %"146", align 4
|
||||
%"149" = fadd float %"150", %"151"
|
||||
store float %"149", ptr addrspace(5) %"145", align 4
|
||||
%"152" = load float, ptr addrspace(5) %"145", align 4
|
||||
store float %"152", ptr addrspace(5) %"142", align 4
|
||||
%"153" = load float, ptr addrspace(5) %"142", align 4
|
||||
store float %"153", ptr addrspace(5) %"141", align 4
|
||||
%2 = load float, ptr addrspace(5) %"141", align 4
|
||||
ret float %2
|
||||
}
|
||||
|
||||
define amdgpu_kernel void @call_rnd(ptr addrspace(4) byref(i64) %"92", ptr addrspace(4) byref(i64) %"93") #1 {
|
||||
%"94" = alloca i64, align 8, addrspace(5)
|
||||
%"95" = alloca i64, align 8, addrspace(5)
|
||||
%"96" = alloca float, align 4, addrspace(5)
|
||||
%"97" = alloca float, align 4, addrspace(5)
|
||||
%"98" = alloca float, align 4, addrspace(5)
|
||||
%"99" = alloca float, align 4, addrspace(5)
|
||||
%"100" = alloca float, align 4, addrspace(5)
|
||||
%"101" = alloca float, align 4, addrspace(5)
|
||||
%"102" = alloca float, align 4, addrspace(5)
|
||||
%"103" = alloca float, align 4, addrspace(5)
|
||||
%"104" = alloca float, align 4, addrspace(5)
|
||||
%"105" = alloca float, align 4, addrspace(5)
|
||||
%"106" = alloca float, align 4, addrspace(5)
|
||||
%"107" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"84"
|
||||
|
||||
"84": ; preds = %1
|
||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 1)
|
||||
%"108" = load i64, ptr addrspace(4) %"92", align 4
|
||||
store i64 %"108", ptr addrspace(5) %"94", align 4
|
||||
%"109" = load i64, ptr addrspace(4) %"93", align 4
|
||||
store i64 %"109", ptr addrspace(5) %"95", align 4
|
||||
%"111" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"154" = inttoptr i64 %"111" to ptr
|
||||
%"110" = load float, ptr %"154", align 4
|
||||
store float %"110", ptr addrspace(5) %"96", align 4
|
||||
%"112" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"155" = inttoptr i64 %"112" to ptr
|
||||
%"59" = getelementptr inbounds i8, ptr %"155", i64 4
|
||||
%"113" = load float, ptr %"59", align 4
|
||||
store float %"113", ptr addrspace(5) %"97", align 4
|
||||
%"114" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"156" = inttoptr i64 %"114" to ptr
|
||||
%"61" = getelementptr inbounds i8, ptr %"156", i64 8
|
||||
%"115" = load float, ptr %"61", align 4
|
||||
store float %"115", ptr addrspace(5) %"98", align 4
|
||||
%"116" = load i64, ptr addrspace(5) %"94", align 4
|
||||
%"157" = inttoptr i64 %"116" to ptr
|
||||
%"63" = getelementptr inbounds i8, ptr %"157", i64 12
|
||||
%"117" = load float, ptr %"63", align 4
|
||||
store float %"117", ptr addrspace(5) %"99", align 4
|
||||
%"118" = load float, ptr addrspace(5) %"96", align 4
|
||||
store float %"118", ptr addrspace(5) %"102", align 4
|
||||
%"119" = load float, ptr addrspace(5) %"97", align 4
|
||||
store float %"119", ptr addrspace(5) %"103", align 4
|
||||
%"72" = load float, ptr addrspace(5) %"102", align 4
|
||||
%"73" = load float, ptr addrspace(5) %"103", align 4
|
||||
%"74" = call float @add_rp(float %"72", float %"73")
|
||||
br label %"85"
|
||||
|
||||
"85": ; preds = %"84"
|
||||
store float %"74", ptr addrspace(5) %"104", align 4
|
||||
%"120" = load float, ptr addrspace(5) %"104", align 4
|
||||
store float %"120", ptr addrspace(5) %"100", align 4
|
||||
%"121" = load i64, ptr addrspace(5) %"95", align 4
|
||||
%"122" = load float, ptr addrspace(5) %"100", align 4
|
||||
%"158" = inttoptr i64 %"121" to ptr
|
||||
store float %"122", ptr %"158", align 4
|
||||
%"123" = load float, ptr addrspace(5) %"98", align 4
|
||||
store float %"123", ptr addrspace(5) %"105", align 4
|
||||
%"124" = load float, ptr addrspace(5) %"99", align 4
|
||||
store float %"124", ptr addrspace(5) %"106", align 4
|
||||
%"75" = load float, ptr addrspace(5) %"105", align 4
|
||||
%"76" = load float, ptr addrspace(5) %"106", align 4
|
||||
%"77" = call float @add_rm(float %"75", float %"76")
|
||||
br label %"86"
|
||||
|
||||
"86": ; preds = %"85"
|
||||
store float %"77", ptr addrspace(5) %"107", align 4
|
||||
%"125" = load float, ptr addrspace(5) %"107", align 4
|
||||
store float %"125", ptr addrspace(5) %"101", align 4
|
||||
%"126" = load i64, ptr addrspace(5) %"95", align 4
|
||||
%"159" = inttoptr i64 %"126" to ptr
|
||||
%"65" = getelementptr inbounds i8, ptr %"159", i64 4
|
||||
%"127" = load float, ptr addrspace(5) %"101", align 4
|
||||
store float %"127", ptr %"65", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #2
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #2 = { nocallback nofree nosync nounwind willreturn }
|
33
ptx/src/test/ll/clz.ll
Normal file
33
ptx/src/test/ll/clz.ll
Normal file
|
@ -0,0 +1,33 @@
|
|||
define amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load i32, ptr %"43", align 4
|
||||
store i32 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"44" = call i32 @llvm.ctlz.i32(i32 %"40", i1 false)
|
||||
store i32 %"44", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"34", align 4
|
||||
%"45" = inttoptr i64 %"41" to ptr
|
||||
store i32 %"42", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.ctlz.i32(i32, i1 immarg) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
50
ptx/src/test/ll/const.ll
Normal file
50
ptx/src/test/ll/const.ll
Normal file
|
@ -0,0 +1,50 @@
|
|||
@constparams = addrspace(4) global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
|
||||
|
||||
define amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
|
||||
%"48" = alloca i64, align 8, addrspace(5)
|
||||
%"49" = alloca i64, align 8, addrspace(5)
|
||||
%"50" = alloca i16, align 2, addrspace(5)
|
||||
%"51" = alloca i16, align 2, addrspace(5)
|
||||
%"52" = alloca i16, align 2, addrspace(5)
|
||||
%"53" = alloca i16, align 2, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"45"
|
||||
|
||||
"45": ; preds = %1
|
||||
%"54" = load i64, ptr addrspace(4) %"46", align 4
|
||||
store i64 %"54", ptr addrspace(5) %"48", align 4
|
||||
%"55" = load i64, ptr addrspace(4) %"47", align 4
|
||||
store i64 %"55", ptr addrspace(5) %"49", align 4
|
||||
%"56" = load i16, ptr addrspace(4) @constparams, align 2
|
||||
store i16 %"56", ptr addrspace(5) %"50", align 2
|
||||
%"57" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
|
||||
store i16 %"57", ptr addrspace(5) %"51", align 2
|
||||
%"58" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
|
||||
store i16 %"58", ptr addrspace(5) %"52", align 2
|
||||
%"59" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
|
||||
store i16 %"59", ptr addrspace(5) %"53", align 2
|
||||
%"60" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"61" = load i16, ptr addrspace(5) %"50", align 2
|
||||
%"72" = inttoptr i64 %"60" to ptr
|
||||
store i16 %"61", ptr %"72", align 2
|
||||
%"62" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"74" = inttoptr i64 %"62" to ptr
|
||||
%"40" = getelementptr inbounds i8, ptr %"74", i64 2
|
||||
%"63" = load i16, ptr addrspace(5) %"51", align 2
|
||||
store i16 %"63", ptr %"40", align 2
|
||||
%"64" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"76" = inttoptr i64 %"64" to ptr
|
||||
%"42" = getelementptr inbounds i8, ptr %"76", i64 4
|
||||
%"65" = load i16, ptr addrspace(5) %"52", align 2
|
||||
store i16 %"65", ptr %"42", align 2
|
||||
%"66" = load i64, ptr addrspace(5) %"49", align 4
|
||||
%"78" = inttoptr i64 %"66" to ptr
|
||||
%"44" = getelementptr inbounds i8, ptr %"78", i64 6
|
||||
%"67" = load i16, ptr addrspace(5) %"53", align 2
|
||||
store i16 %"67", ptr %"44", align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
29
ptx/src/test/ll/constant_f32.ll
Normal file
29
ptx/src/test/ll/constant_f32.ll
Normal file
|
@ -0,0 +1,29 @@
|
|||
define amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"39" to ptr
|
||||
%"38" = load float, ptr %"44", align 4
|
||||
store float %"38", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load float, ptr addrspace(5) %"35", align 4
|
||||
%"40" = fmul float %"41", 5.000000e-01
|
||||
store float %"40", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"43" = load float, ptr addrspace(5) %"35", align 4
|
||||
%"45" = inttoptr i64 %"42" to ptr
|
||||
store float %"43", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
29
ptx/src/test/ll/constant_negative.ll
Normal file
29
ptx/src/test/ll/constant_negative.ll
Normal file
|
@ -0,0 +1,29 @@
|
|||
define amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"39" to ptr
|
||||
%"38" = load i32, ptr %"44", align 4
|
||||
store i32 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"41" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"40" = mul i32 %"41", -1
|
||||
store i32 %"40", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"43" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"45" = inttoptr i64 %"42" to ptr
|
||||
store i32 %"43", ptr %"45", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
33
ptx/src/test/ll/cos.ll
Normal file
33
ptx/src/test/ll/cos.ll
Normal file
|
@ -0,0 +1,33 @@
|
|||
define amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load float, ptr %"43", align 4
|
||||
store float %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call afn float @llvm.cos.f32(float %"40")
|
||||
store float %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store float %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.cos.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
30
ptx/src/test/ll/cvt_f64_f32.ll
Normal file
30
ptx/src/test/ll/cvt_f64_f32.ll
Normal file
|
@ -0,0 +1,30 @@
|
|||
define amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca float, align 4, addrspace(5)
|
||||
%"36" = alloca double, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
|
||||
%"39" = load float, ptr addrspace(1) %"45", align 4
|
||||
store float %"39", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"35", align 4
|
||||
%"41" = fpext float %"42" to double
|
||||
store double %"41", ptr addrspace(5) %"36", align 8
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load double, ptr addrspace(5) %"36", align 8
|
||||
%"46" = inttoptr i64 %"43" to ptr
|
||||
store double %"44", ptr %"46", align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
49
ptx/src/test/ll/cvt_rni.ll
Normal file
49
ptx/src/test/ll/cvt_rni.ll
Normal file
|
@ -0,0 +1,49 @@
|
|||
define amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca float, align 4, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load float, ptr %"55", align 4
|
||||
store float %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
|
||||
%"46" = load float, ptr %"31", align 4
|
||||
store float %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"39", align 4
|
||||
%2 = call float @llvm.roundeven.f32(float %"48")
|
||||
%"47" = freeze float %2
|
||||
store float %"47", ptr addrspace(5) %"39", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"40", align 4
|
||||
%3 = call float @llvm.roundeven.f32(float %"50")
|
||||
%"49" = freeze float %3
|
||||
store float %"49", ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"57" = inttoptr i64 %"51" to ptr
|
||||
store float %"52", ptr %"57", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"54" = load float, ptr addrspace(5) %"40", align 4
|
||||
store float %"54", ptr %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.roundeven.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
54
ptx/src/test/ll/cvt_rzi.ll
Normal file
54
ptx/src/test/ll/cvt_rzi.ll
Normal file
|
@ -0,0 +1,54 @@
|
|||
define amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca float, align 4, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
call void @llvm.amdgcn.s.setreg(i32 6145, i32 3)
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"55" = inttoptr i64 %"44" to ptr
|
||||
%"43" = load float, ptr %"55", align 4
|
||||
store float %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"56" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"56", i64 4
|
||||
%"46" = load float, ptr %"31", align 4
|
||||
store float %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load float, ptr addrspace(5) %"39", align 4
|
||||
%2 = call float @llvm.trunc.f32(float %"48")
|
||||
%"47" = freeze float %2
|
||||
store float %"47", ptr addrspace(5) %"39", align 4
|
||||
%"50" = load float, ptr addrspace(5) %"40", align 4
|
||||
%3 = call float @llvm.trunc.f32(float %"50")
|
||||
%"49" = freeze float %3
|
||||
store float %"49", ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load float, ptr addrspace(5) %"39", align 4
|
||||
%"57" = inttoptr i64 %"51" to ptr
|
||||
store float %"52", ptr %"57", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"53" to ptr
|
||||
%"33" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"54" = load float, ptr addrspace(5) %"40", align 4
|
||||
store float %"54", ptr %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn
|
||||
declare void @llvm.amdgcn.s.setreg(i32 immarg, i32) #1
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.trunc.f32(float) #2
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind willreturn }
|
||||
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
32
ptx/src/test/ll/cvt_s16_s8.ll
Normal file
32
ptx/src/test/ll/cvt_s16_s8.ll
Normal file
|
@ -0,0 +1,32 @@
|
|||
define amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
%"36" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"45" = inttoptr i64 %"40" to ptr addrspace(1)
|
||||
%"39" = load i32, ptr addrspace(1) %"45", align 4
|
||||
store i32 %"39", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"36", align 4
|
||||
%2 = trunc i32 %"42" to i8
|
||||
%"46" = sext i8 %2 to i16
|
||||
%"41" = sext i16 %"46" to i32
|
||||
store i32 %"41", ptr addrspace(5) %"35", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"48" = inttoptr i64 %"43" to ptr
|
||||
store i32 %"44", ptr %"48", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
55
ptx/src/test/ll/cvt_s32_f32.ll
Normal file
55
ptx/src/test/ll/cvt_s32_f32.ll
Normal file
|
@ -0,0 +1,55 @@
|
|||
define amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
|
||||
%"37" = alloca i64, align 8, addrspace(5)
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i32, align 4, addrspace(5)
|
||||
%"40" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
%"41" = load i64, ptr addrspace(4) %"35", align 4
|
||||
store i64 %"41", ptr addrspace(5) %"37", align 4
|
||||
%"42" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"42", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"56" = inttoptr i64 %"44" to ptr
|
||||
%"55" = load float, ptr %"56", align 4
|
||||
%"43" = bitcast float %"55" to i32
|
||||
store i32 %"43", ptr addrspace(5) %"39", align 4
|
||||
%"45" = load i64, ptr addrspace(5) %"37", align 4
|
||||
%"57" = inttoptr i64 %"45" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"57", i64 4
|
||||
%"58" = load float, ptr %"31", align 4
|
||||
%"46" = bitcast float %"58" to i32
|
||||
store i32 %"46", ptr addrspace(5) %"40", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"60" = bitcast i32 %"48" to float
|
||||
%2 = call float @llvm.ceil.f32(float %"60")
|
||||
%3 = fptosi float %2 to i32
|
||||
%"59" = freeze i32 %3
|
||||
store i32 %"59", ptr addrspace(5) %"39", align 4
|
||||
%"50" = load i32, ptr addrspace(5) %"40", align 4
|
||||
%"62" = bitcast i32 %"50" to float
|
||||
%4 = call float @llvm.ceil.f32(float %"62")
|
||||
%5 = fptosi float %4 to i32
|
||||
%"61" = freeze i32 %5
|
||||
store i32 %"61", ptr addrspace(5) %"40", align 4
|
||||
%"51" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"52" = load i32, ptr addrspace(5) %"39", align 4
|
||||
%"63" = inttoptr i64 %"51" to ptr addrspace(1)
|
||||
store i32 %"52", ptr addrspace(1) %"63", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"65" = inttoptr i64 %"53" to ptr addrspace(1)
|
||||
%"33" = getelementptr inbounds i8, ptr addrspace(1) %"65", i64 4
|
||||
%"54" = load i32, ptr addrspace(5) %"40", align 4
|
||||
store i32 %"54", ptr addrspace(1) %"33", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.ceil.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
30
ptx/src/test/ll/cvt_s64_s32.ll
Normal file
30
ptx/src/test/ll/cvt_s64_s32.ll
Normal file
|
@ -0,0 +1,30 @@
|
|||
define amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i32, align 4, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"37" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"46" = inttoptr i64 %"40" to ptr
|
||||
%"45" = load i32, ptr %"46", align 4
|
||||
store i32 %"45", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i32, ptr addrspace(5) %"35", align 4
|
||||
%"41" = sext i32 %"42" to i64
|
||||
store i64 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"44" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"47" = inttoptr i64 %"43" to ptr
|
||||
store i64 %"44", ptr %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
41
ptx/src/test/ll/cvt_sat_s_u.ll
Normal file
41
ptx/src/test/ll/cvt_sat_s_u.ll
Normal file
|
@ -0,0 +1,41 @@
|
|||
define amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 {
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i32, align 4, addrspace(5)
|
||||
%"37" = alloca i32, align 4, addrspace(5)
|
||||
%"38" = alloca i32, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"31"
|
||||
|
||||
"31": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"49" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load i32, ptr %"49", align 4
|
||||
store i32 %"41", ptr addrspace(5) %"36", align 4
|
||||
%"44" = load i32, ptr addrspace(5) %"36", align 4
|
||||
%2 = call i32 @llvm.smax.i32(i32 %"44", i32 0)
|
||||
%3 = call i32 @llvm.umin.i32(i32 %2, i32 -1)
|
||||
store i32 %3, ptr addrspace(5) %"37", align 4
|
||||
%"46" = load i32, ptr addrspace(5) %"37", align 4
|
||||
store i32 %"46", ptr addrspace(5) %"38", align 4
|
||||
%"47" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"48" = load i32, ptr addrspace(5) %"38", align 4
|
||||
%"50" = inttoptr i64 %"47" to ptr
|
||||
store i32 %"48", ptr %"50", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.smax.i32(i32, i32) #1
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i32 @llvm.umin.i32(i32, i32) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
34
ptx/src/test/ll/cvta.ll
Normal file
34
ptx/src/test/ll/cvta.ll
Normal file
|
@ -0,0 +1,34 @@
|
|||
define amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%2 = inttoptr i64 %"38" to ptr
|
||||
%"45" = addrspacecast ptr %2 to ptr addrspace(1)
|
||||
store ptr addrspace(1) %"45", ptr addrspace(5) %"32", align 8
|
||||
%"40" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%3 = inttoptr i64 %"40" to ptr
|
||||
%"47" = addrspacecast ptr %3 to ptr addrspace(1)
|
||||
store ptr addrspace(1) %"47", ptr addrspace(5) %"33", align 8
|
||||
%"42" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"49" = inttoptr i64 %"42" to ptr addrspace(1)
|
||||
%"41" = load float, ptr addrspace(1) %"49", align 4
|
||||
store float %"41", ptr addrspace(5) %"34", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"50" = inttoptr i64 %"43" to ptr addrspace(1)
|
||||
store float %"44", ptr addrspace(1) %"50", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
36
ptx/src/test/ll/div_approx.ll
Normal file
36
ptx/src/test/ll/div_approx.ll
Normal file
|
@ -0,0 +1,36 @@
|
|||
define amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"33", ptr addrspace(4) byref(i64) %"34") #0 {
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
%"37" = alloca float, align 4, addrspace(5)
|
||||
%"38" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"32"
|
||||
|
||||
"32": ; preds = %1
|
||||
%"39" = load i64, ptr addrspace(4) %"33", align 4
|
||||
store i64 %"39", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(4) %"34", align 4
|
||||
store i64 %"40", ptr addrspace(5) %"36", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"50" = inttoptr i64 %"42" to ptr
|
||||
%"41" = load float, ptr %"50", align 4
|
||||
store float %"41", ptr addrspace(5) %"37", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"51" = inttoptr i64 %"43" to ptr
|
||||
%"31" = getelementptr inbounds i8, ptr %"51", i64 4
|
||||
%"44" = load float, ptr %"31", align 4
|
||||
store float %"44", ptr addrspace(5) %"38", align 4
|
||||
%"46" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"47" = load float, ptr addrspace(5) %"38", align 4
|
||||
%"45" = fdiv arcp afn float %"46", %"47"
|
||||
store float %"45", ptr addrspace(5) %"37", align 4
|
||||
%"48" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"49" = load float, ptr addrspace(5) %"37", align 4
|
||||
%"52" = inttoptr i64 %"48" to ptr
|
||||
store float %"49", ptr %"52", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
33
ptx/src/test/ll/ex2.ll
Normal file
33
ptx/src/test/ll/ex2.ll
Normal file
|
@ -0,0 +1,33 @@
|
|||
define amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
|
||||
%"32" = alloca i64, align 8, addrspace(5)
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"29"
|
||||
|
||||
"29": ; preds = %1
|
||||
%"35" = load i64, ptr addrspace(4) %"30", align 4
|
||||
store i64 %"35", ptr addrspace(5) %"32", align 4
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"38" = load i64, ptr addrspace(5) %"32", align 4
|
||||
%"43" = inttoptr i64 %"38" to ptr
|
||||
%"37" = load float, ptr %"43", align 4
|
||||
store float %"37", ptr addrspace(5) %"34", align 4
|
||||
%"40" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"39" = call float @llvm.amdgcn.exp2.f32(float %"40")
|
||||
store float %"39", ptr addrspace(5) %"34", align 4
|
||||
%"41" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"42" = load float, ptr addrspace(5) %"34", align 4
|
||||
%"44" = inttoptr i64 %"41" to ptr
|
||||
store float %"42", ptr %"44", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.amdgcn.exp2.f32(float) #1
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
32
ptx/src/test/ll/extern_shared.ll
Normal file
32
ptx/src/test/ll/extern_shared.ll
Normal file
|
@ -0,0 +1,32 @@
|
|||
@shared_mem = external addrspace(3) global [0 x i32]
|
||||
|
||||
define amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 {
|
||||
%"33" = alloca i64, align 8, addrspace(5)
|
||||
%"34" = alloca i64, align 8, addrspace(5)
|
||||
%"35" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"30"
|
||||
|
||||
"30": ; preds = %1
|
||||
%"36" = load i64, ptr addrspace(4) %"31", align 4
|
||||
store i64 %"36", ptr addrspace(5) %"33", align 4
|
||||
%"37" = load i64, ptr addrspace(4) %"32", align 4
|
||||
store i64 %"37", ptr addrspace(5) %"34", align 4
|
||||
%"39" = load i64, ptr addrspace(5) %"33", align 4
|
||||
%"44" = inttoptr i64 %"39" to ptr addrspace(1)
|
||||
%"38" = load i64, ptr addrspace(1) %"44", align 4
|
||||
store i64 %"38", ptr addrspace(5) %"35", align 4
|
||||
%"40" = load i64, ptr addrspace(5) %"35", align 4
|
||||
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
|
||||
%"41" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"41", ptr addrspace(5) %"35", align 4
|
||||
%"42" = load i64, ptr addrspace(5) %"34", align 4
|
||||
%"43" = load i64, ptr addrspace(5) %"35", align 4
|
||||
%"47" = inttoptr i64 %"42" to ptr addrspace(1)
|
||||
store i64 %"43", ptr addrspace(1) %"47", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
55
ptx/src/test/ll/extern_shared_call.ll
Normal file
55
ptx/src/test/ll/extern_shared_call.ll
Normal file
|
@ -0,0 +1,55 @@
|
|||
; Externally defined, zero-length i32 array in address space 3 with 4-byte alignment.
; The functions in this file access its start as a single i64.
@shared_mem = external addrspace(3) global [0 x i32], align 4
|
||||
|
||||
; Helper (not a kernel): takes no arguments and returns nothing. It loads an i64
; from the start of @shared_mem, adds 2, and stores the result back to @shared_mem.
define void @incr_shared_2_global() #0 {
|
||||
%"36" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"33"
|
||||
|
||||
"33": ; preds = %1
|
||||
; Load the current i64 from @shared_mem into the stack slot %"36".
%"37" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"37", ptr addrspace(5) %"36", align 4
|
||||
; Increment by 2; the value goes through the stack slot between every step.
%"39" = load i64, ptr addrspace(5) %"36", align 4
|
||||
%"38" = add i64 %"39", 2
|
||||
store i64 %"38", ptr addrspace(5) %"36", align 4
|
||||
; Write the incremented value back to @shared_mem.
%"40" = load i64, ptr addrspace(5) %"36", align 4
|
||||
store i64 %"40", ptr addrspace(3) @shared_mem, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Kernel @extern_shared_call: takes two byref(i64) arguments in address space 4, each
; holding an integer that is converted to an addrspace(1) pointer.
;   %"41" - slot holding the address the i64 is read from.
;   %"42" - slot holding the address the i64 is written to.
; The input value is stored to @shared_mem, @incr_shared_2_global is called, and the
; value then found in @shared_mem is written to the output address.
define amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 {
|
||||
%"43" = alloca i64, align 8, addrspace(5)
|
||||
%"44" = alloca i64, align 8, addrspace(5)
|
||||
%"45" = alloca i64, align 8, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"34"
|
||||
|
||||
"34": ; preds = %1
|
||||
; Copy both kernel arguments into the addrspace(5) stack slots %"43" and %"44".
%"46" = load i64, ptr addrspace(4) %"41", align 4
|
||||
store i64 %"46", ptr addrspace(5) %"43", align 4
|
||||
%"47" = load i64, ptr addrspace(4) %"42", align 4
|
||||
store i64 %"47", ptr addrspace(5) %"44", align 4
|
||||
; Read the input i64 through the addrspace(1) pointer built from the first argument.
%"49" = load i64, ptr addrspace(5) %"43", align 4
|
||||
%"56" = inttoptr i64 %"49" to ptr addrspace(1)
|
||||
%"48" = load i64, ptr addrspace(1) %"56", align 4
|
||||
store i64 %"48", ptr addrspace(5) %"45", align 4
|
||||
; Publish the value in @shared_mem, then call the helper with no arguments.
%"50" = load i64, ptr addrspace(5) %"45", align 4
|
||||
store i64 %"50", ptr addrspace(3) @shared_mem, align 4
|
||||
call void @incr_shared_2_global()
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %"34"
|
||||
; Reload @shared_mem after the call and write it through the pointer built from the second argument.
%"51" = load i64, ptr addrspace(3) @shared_mem, align 4
|
||||
store i64 %"51", ptr addrspace(5) %"45", align 4
|
||||
%"52" = load i64, ptr addrspace(5) %"44", align 4
|
||||
%"53" = load i64, ptr addrspace(5) %"45", align 4
|
||||
%"59" = inttoptr i64 %"52" to ptr addrspace(1)
|
||||
store i64 %"53", ptr addrspace(1) %"59", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Attribute group #0 (used by @incr_shared_2_global): both denormal settings are "dynamic".
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
; Attribute group #1 (used by the kernel): both denormal settings are "preserve-sign".
attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
47
ptx/src/test/ll/fma.ll
Normal file
47
ptx/src/test/ll/fma.ll
Normal file
|
@ -0,0 +1,47 @@
|
|||
; Kernel @fma: takes two byref(i64) arguments in address space 4, each holding an
; integer that is converted to a pointer with no explicit address space.
;   %"36" - slot holding the input address; three floats are read at byte offsets 0, 4 and 8.
;   %"37" - slot holding the output address; one float is written there.
; The result written is llvm.fma.f32(in[0], in[1], in[2]).
define amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
|
||||
%"38" = alloca i64, align 8, addrspace(5)
|
||||
%"39" = alloca i64, align 8, addrspace(5)
|
||||
%"40" = alloca float, align 4, addrspace(5)
|
||||
%"41" = alloca float, align 4, addrspace(5)
|
||||
%"42" = alloca float, align 4, addrspace(5)
|
||||
br label %1
|
||||
|
||||
1: ; preds = %0
|
||||
br label %"35"
|
||||
|
||||
"35": ; preds = %1
|
||||
; Copy both kernel arguments into the addrspace(5) stack slots %"38" and %"39".
%"43" = load i64, ptr addrspace(4) %"36", align 4
|
||||
store i64 %"43", ptr addrspace(5) %"38", align 4
|
||||
%"44" = load i64, ptr addrspace(4) %"37", align 4
|
||||
store i64 %"44", ptr addrspace(5) %"39", align 4
|
||||
; First operand: float at byte offset 0 of the input address.
%"46" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"57" = inttoptr i64 %"46" to ptr
|
||||
%"45" = load float, ptr %"57", align 4
|
||||
store float %"45", ptr addrspace(5) %"40", align 4
|
||||
; Second operand: float at byte offset 4 of the input address.
%"47" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"58" = inttoptr i64 %"47" to ptr
|
||||
%"32" = getelementptr inbounds i8, ptr %"58", i64 4
|
||||
%"48" = load float, ptr %"32", align 4
|
||||
store float %"48", ptr addrspace(5) %"41", align 4
|
||||
; Third operand: float at byte offset 8 of the input address.
%"49" = load i64, ptr addrspace(5) %"38", align 4
|
||||
%"59" = inttoptr i64 %"49" to ptr
|
||||
%"34" = getelementptr inbounds i8, ptr %"59", i64 8
|
||||
%"50" = load float, ptr %"34", align 4
|
||||
store float %"50", ptr addrspace(5) %"42", align 4
|
||||
; Compute the fused multiply-add of the three operands; the result overwrites slot %"40".
%"52" = load float, ptr addrspace(5) %"40", align 4
|
||||
%"53" = load float, ptr addrspace(5) %"41", align 4
|
||||
%"54" = load float, ptr addrspace(5) %"42", align 4
|
||||
%"51" = call float @llvm.fma.f32(float %"52", float %"53", float %"54")
|
||||
store float %"51", ptr addrspace(5) %"40", align 4
|
||||
; Write the result through the pointer built from the second argument.
%"55" = load i64, ptr addrspace(5) %"39", align 4
|
||||
%"56" = load float, ptr addrspace(5) %"40", align 4
|
||||
%"60" = inttoptr i64 %"55" to ptr
|
||||
store float %"56", ptr %"60", align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare float @llvm.fma.f32(float, float, float) #1
|
||||
|
||||
; Attribute group #0 (used by the kernel): "denormal-fp-math" is "preserve-sign" while
; the f32-specific "denormal-fp-math-f32" is "ieee".
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
|
||||
; Attribute group #1 (used by the @llvm.fma.f32 declaration).
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue