Merge branch 'master' into meshroom

# Conflicts:
#	ptx/lib/zluda_ptx_impl.bc
#	ptx/src/ast.rs
#	ptx/src/emit.rs
#	ptx/src/ptx.lalrpop
#	ptx/src/test/spirv_run/mod.rs
#	ptx/src/translate.rs
#	zluda/src/cuda.rs
#	zluda/src/impl/surface.rs
This commit is contained in:
Andrzej Janik 2024-05-16 02:23:29 +02:00
commit 922692d2fa
175 changed files with 8932 additions and 5498 deletions

67
.github/workflows/rust.yml vendored Normal file
View file

@ -0,0 +1,67 @@
# CI workflow: builds ZLUDA on Linux and Windows and uploads the packaged
# artifacts. Reconstructed with valid YAML indentation (the structure was
# flattened by extraction); content is otherwise unchanged.
name: Rust

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

env:
  CARGO_TERM_COLOR: always
  # ROCm release used for the Linux HIP toolchain packages below.
  ROCM_VERSION: "5.7.3"

jobs:
  build_lin:
    name: Build and publish (Linux)
    runs-on: ubuntu-20.04
    steps:
      # Free up disk space on the hosted runner before installing ROCm.
      - uses: jlumbroso/free-disk-space@main
      - name: Install ROCm
        run: |
          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
          sudo sh -c 'wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null'
          sudo sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ env.ROCM_VERSION }} focal main > /etc/apt/sources.list.d/rocm.list'
          sudo apt-get update
          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-smi-lib hip-runtime-amd comgr hipblaslt-dev hipfft-dev rocblas-dev rocsolver-dev rocsparse-dev miopen-hip-dev rocm-device-libs
          echo 'export PATH="$PATH:/opt/rocm/bin"' | sudo tee /etc/profile.d/rocm.sh
          echo '/opt/rocm/lib' | sudo tee /etc/ld.so.conf.d/rocm.conf
          sudo ldconfig
      - uses: actions/checkout@v4
        with:
          submodules: true
      - uses: Swatinem/rust-cache@v2
      - name: Build
        # We use tar to unpack .tar.gz we've created because Github actions/upload-artifact
        # is broken and will _always_ zip your artifact (even if it is a single file).
        # See here: https://github.com/actions/upload-artifact/issues/39
        # and here: https://github.com/actions/upload-artifact/issues/109
        run: |
          cargo xtask zip -r
          tar -xzf target/release/zluda.tar.gz -C target/release
      # https://stackoverflow.com/a/64195658
      - name: Set revision hash
        run: echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: zluda-linux-${{ env.SHORT_SHA }}
          path: target/release/zluda
  build_win:
    name: Build and publish (Windows)
    runs-on: windows-2019
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - uses: Swatinem/rust-cache@v2
      - name: Build
        run: |
          cargo xtask zip -r
          Expand-Archive -Path target/release/zluda.zip -DestinationPath target/release
      # https://stackoverflow.com/a/74033027
      - name: Set revision hash
        run: echo "SHORT_SHA=$("${{ github.sha }}".SubString(0, 7))" >> $env:GITHUB_ENV
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: zluda-windows-${{ env.SHORT_SHA }}
          path: target/release/zluda

1
.gitignore vendored
View file

@ -1,5 +1,4 @@
target/
Cargo.lock
.vscode/
.idea/

2561
Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

View file

@ -58,4 +58,4 @@ opt-level = 3
opt-level = 3
[profile.dev.package.xtask]
opt-level = 3
opt-level = 2

View file

@ -1,57 +0,0 @@
# cargo-make task definitions for building the ZLUDA workspace.
[config]
# Run tasks once for the whole workspace, not per member crate.
default_to_workspace = false
# Disable cargo-make's built-in task set; only the tasks below exist.
skip_core_tasks = true

# `build` dispatches to the platform-specific debug-build task.
[tasks.build]
run_task = [
    { name = "build-windows", condition = { platforms = ["windows"] } },
    { name = "build-linux", condition = { platforms = ["linux"] } },
]

# Debug build of the Windows component set (injection/redirection based).
[tasks.build-windows]
command = "cargo"
args = [
    "build",
    "-p", "offline_compiler",
    "-p", "zluda_dump",
    "-p", "zluda_inject",
    "-p", "zluda_lib",
    "-p", "zluda_ml",
    "-p", "zluda_redirect",
]

# Debug build of the Linux component set (library-replacement based).
[tasks.build-linux]
command = "cargo"
args = [
    "build",
    "-p", "offline_compiler",
    "-p", "zluda_blas",
    "-p", "zluda_blaslt",
    "-p", "zluda_ccl",
    "-p", "zluda_dnn",
    "-p", "zluda_dump",
    "-p", "zluda_fft",
    "-p", "zluda_lib",
    "-p", "zluda_ml",
    "-p", "zluda_sparse",
]

# Release build; NOTE(review): package list matches build-linux — presumably
# Linux-oriented, confirm whether Windows release builds use a separate path.
[tasks.build-release]
command = "cargo"
args = [
    "build",
    "--release",
    "-p", "offline_compiler",
    "-p", "zluda_blas",
    "-p", "zluda_blaslt",
    "-p", "zluda_ccl",
    "-p", "zluda_dnn",
    "-p", "zluda_dump",
    "-p", "zluda_fft",
    "-p", "zluda_lib",
    "-p", "zluda_ml",
    "-p", "zluda_sparse",
]

# Plain `cargo make` runs the debug build.
[tasks.default]
alias = "build"

View file

@ -66,7 +66,7 @@ If an application fails to start under ZLUDA or crashes please check [Known Issu
- If both integrated AMD GPU and dedicated AMD GPU are present in the system, ZLUDA uses the integrated GPU.
This is a bug in underying ROCm/HIP runtime. You can work around it by disabling the integrated GPU.
This is a bug in underlying ROCm/HIP runtime. You can work around it by disabling the integrated GPU.
On Windows we recommend you use environment variable `HIP_VISIBLE_DEVICES=1` environment variable (more [here](https://rocmdocs.amd.com/en/latest/conceptual/gpu-isolation.html#hip-visible-devices)) or disable it system-wide in Device Manager.
@ -235,10 +235,6 @@ Performance is currently much lower than the native HIP backend, see the discuss
This is a ROCm/HIP bug. Currently, CompuBench tests have to be run one at a time.
- Some tests output black screen.
This is due to a bug (or an unintended hardware feature) in CompuBench that just happens to work on NVIDIA GPUs.
#### V-Ray Benchmark
- Currently, ZLUDA crashes when running V-Ray benchmark. Nonetheless, certain "lucky" older combinations of ZLUDA and ROCm/HIP are known to run V-Ray Benchmark successfully.

View file

@ -92,13 +92,16 @@ If you are dumping original CUDA use:
### Linux
Known bug: when dumping from original CUDA you should remove (or rename) all the files in `<ZLUDA_DIRECTORY>/dump` except `libcuda.so` and `libcuda.so.1`.
Use it like this:
If dumping from ZLUDA use it like this:
```
LD_LIBRARY_PATH="<ZLUDA_DIRECTORY>/dump:$LD_LIBRARY_PATH" <APPLICATION> <APPLICATION_ARGUMENTS>
```
If dumping from NVIDIA CUDA use it like this:
```
LD_LIBRARY_PATH="<ZLUDA_DIRECTORY>/dump_nvidia:$LD_LIBRARY_PATH" <APPLICATION> <APPLICATION_ARGUMENTS>
```
### Result
If all went well you should see lines like this in the console output and in the log file specified by `ZLUDA_DUMP_DIR`:

View file

@ -3,7 +3,7 @@ extern crate convert_case;
use convert_case::{Case, Casing, StateConverter};
use std::{
env,
env, io,
path::PathBuf,
process::{Command, Stdio},
};
@ -17,8 +17,9 @@ fn main() {
.map(|comp| comp.from_case(Case::Snake));
let msvc = is_msvc();
let (llvm_dir, additonal_cmake_file) = get_llvm_dir();
let out_dir = build_cmake_targets(llvm_components.clone(), llvm_dir, additonal_cmake_file);
emit_compile_and_linking_information(llvm_components, out_dir, msvc)
let (cmake_profile, out_dir) =
build_cmake_targets(llvm_components.clone(), llvm_dir, additonal_cmake_file);
emit_compile_and_linking_information(llvm_components, cmake_profile, out_dir, msvc)
}
fn is_msvc() -> bool {
@ -41,11 +42,20 @@ fn build_cmake_targets<'a>(
components: impl Iterator<Item = StateConverter<'a, &'static str>>,
llvm_dir: PathBuf,
additional_cmake_file: PathBuf,
) -> PathBuf {
) -> (String, PathBuf) {
let mut cmake = Config::new(llvm_dir);
use_ninja(&mut cmake);
cmake
.always_configure(true)
// Should be detected automatically, but we have reports of
// LLVM finding ZLIB on Windows and then failing to link it.
// Out of caution we explicitly disable all autodetectable components
.define("LLVM_ENABLE_LIBXML2", "OFF")
.define("LLVM_ENABLE_ZLIB", "OFF")
.define("LLVM_ENABLE_ZSTD", "OFF")
.define("LLVM_ENABLE_CURL", "OFF")
.define("LLVM_ENABLE_HTTPLIB", "OFF")
.define("LLVM_ENABLE_LIBEDIT", "OFF")
.define("LLVM_ENABLE_TERMINFO", "OFF")
.define("LLVM_BUILD_TOOLS", "OFF")
.define("LLVM_TARGETS_TO_BUILD", "")
@ -57,7 +67,10 @@ fn build_cmake_targets<'a>(
.build_target(&format!("LLVM{}", component.to_case(Case::Pascal)))
.build();
}
cmake.build_target("llvm-config").build()
(
cmake.get_profile().to_string(),
cmake.build_target("llvm-config").build(),
)
}
fn use_ninja(cmake: &mut Config) {
@ -76,31 +89,27 @@ fn use_ninja(cmake: &mut Config) {
}
fn emit_compile_and_linking_information<'a>(
llvm_components: impl Iterator<Item = StateConverter<'a, &'static str>>,
llvm_components: impl Iterator<Item = StateConverter<'a, &'static str>> + Clone,
cmake_profile: String,
out_dir: PathBuf,
is_msvc: bool,
) {
let mut llvm_config_path = out_dir.clone();
llvm_config_path.push("build");
llvm_config_path.push("bin");
llvm_config_path.push("llvm-config");
let mut llvm_config_cmd = Command::new(&llvm_config_path);
llvm_config_cmd.args([
"--cxxflags",
"--ldflags",
"--libdir",
"--libnames",
"--system-libs",
"--link-static",
]);
for component in llvm_components {
llvm_config_cmd.arg(&component.to_case(Case::Flat));
}
let llvm_config_output = llvm_config_cmd
.stdin(Stdio::null())
.stderr(Stdio::null())
.output()
.unwrap();
// MSBuild uses a different output path from ninja or Makefile.
// Not sure how to query CMake about it, so we just try once with
// ninja/Makefile path and then once with MSBuild path
let llvm_config_output = execute_llvm_config(
&out_dir,
&["build", "bin", "llvm-config"],
llvm_components.clone(),
)
.or_else(|_| {
execute_llvm_config(
&out_dir,
&["build", &*cmake_profile, "bin", "llvm-config"],
llvm_components,
)
})
.unwrap();
if !llvm_config_output.status.success() {
panic!()
}
@ -138,3 +147,28 @@ fn emit_compile_and_linking_information<'a>(
println!("cargo:rustc-link-lib=stdc++");
}
}
fn execute_llvm_config<'a>(
out_dir: &PathBuf,
llvm_config_exe_relative: &[&str],
llvm_components: impl Iterator<Item = StateConverter<'a, &'static str>>,
) -> io::Result<std::process::Output> {
let mut llvm_config_path = out_dir.clone();
llvm_config_path.extend(llvm_config_exe_relative);
let mut llvm_config_cmd = Command::new(&llvm_config_path);
llvm_config_cmd.args([
"--cxxflags",
"--ldflags",
"--libdir",
"--libnames",
"--system-libs",
"--link-static",
]);
for component in llvm_components {
llvm_config_cmd.arg(&component.to_case(Case::Flat));
}
llvm_config_cmd
.stdin(Stdio::null())
.stderr(Stdio::null())
.output()
}

View file

@ -18,3 +18,4 @@ features = [
[package.metadata.zluda]
debug_only = true
skip_zip = true

Binary file not shown.

View file

@ -1,5 +1,5 @@
// Compile and disassemble:
// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
// Compile to binary:
// /opt/rocm/llvm/bin/clang -x ir -target amdgcn-amd-amdhsa -Xlinker --no-undefined zluda_ptx_impl.bc -mno-wavefrontsize64 -mcpu=gfx1030
// Decompile:
@ -11,6 +11,7 @@
// https://llvm.org/docs/AMDGPUUsage.html
#include <cstdint>
#include <bit>
#include <hip/hip_runtime.h>
#define HIP_NO_HALF
#include <hip/amd_detail/amd_hip_fp16.h>
@ -155,6 +156,399 @@ static __device__ float4::Native_vec_ __pack_to_float4(const T &t)
return result;
}
// 8 x u32 = 256 bits: sized to hold an AMDGPU image resource descriptor,
// which the inline asm below consumes via an "s" (scalar register) operand.
typedef uint32_t uint8 __attribute__((ext_vector_type(8)));
// Native 3 x u32 vector. HIP's uint3 is a plain struct; inline asm "v"
// operands need an ext_vector_type, hence this private alias.
typedef uint32_t zluda_uint3 __attribute__((ext_vector_type(3)));
// Pointer (constant address space) to a surface resource descriptor.
typedef uint8 CONSTANT_SPACE *surface_ptr;

// Reinterprets the bits of `f` as a `To`:
// - equal sizes: plain std::bit_cast;
// - To larger:   value is placed in the low bytes, remainder zero-filled
//                (the union is zero-initialized before writing `f`);
// - To smaller:  truncated to the leading bytes of `From` (low-order bytes
//                on little-endian targets such as AMDGPU).
template <typename To, typename From>
static __device__ To transmute(From f)
{
    if constexpr (sizeof(To) == sizeof(From))
    {
        return std::bit_cast<To>(f);
    }
    else if constexpr (sizeof(To) > sizeof(From))
    {
        union
        {
            To t;
            From f;
        } u = {To{0}};
        u.f = f;
        return u.t;
    }
    else if constexpr (sizeof(To) < sizeof(From))
    {
        union
        {
            From f;
            To t;
        } u = {From{f}};
        return u.t;
    }
    else
    {
        // Unreachable: all size orderings are covered above. The dependent
        // sizeof(To) == 0 keeps the static_assert from firing eagerly.
        static_assert(sizeof(To) == 0);
    }
}
// Surface geometries supported by the packed image load/store paths:
// 1D, 2D, 3D plus 1D/2D layered arrays.
enum class ImageGeometry
{
    _1D,
    _2D,
    _3D,
    A1D,
    A2D
};

// Maps a geometry to the coordinate vector type consumed by
// image_load_pck/image_store_pck. Array geometries additionally expose
// `arg_type` — the caller-visible coordinate without the layer — and
// pack_layer(), which appends the layer index as the trailing component
// (the form the dim:*_ARRAY instructions below are given).
// clang-format off
template <ImageGeometry> struct Coordinates;
template <> struct Coordinates<ImageGeometry::_1D> { using type = uint1::Native_vec_; };
template <> struct Coordinates<ImageGeometry::_2D> { using type = uint2::Native_vec_; };
template <> struct Coordinates<ImageGeometry::_3D> { using type = uint4::Native_vec_; };
template <> struct Coordinates<ImageGeometry::A1D>
{
    using type = uint2::Native_vec_; using arg_type = uint1::Native_vec_;
    static __device__ type pack_layer(uint32_t layer, arg_type coord)
    {
        return type { coord.x, layer };
    }
};
template <> struct Coordinates<ImageGeometry::A2D>
{
    using type = zluda_uint3; using arg_type = uint2::Native_vec_;
    static __device__ type pack_layer(uint32_t layer, arg_type coord)
    {
        return type { coord.x, coord.y, layer };
    }
};
// clang-format on
// Stores `value` to the surface at `coord` via image_store_pck — a "packed"
// store that writes raw dwords with no format conversion. The dmask selects
// how many dwords are written (0x1 = 1, 0x3 = 2, 0xf = 4) and `value` is
// first transmuted to that dword count. The 256-bit descriptor `*surface`
// goes through the "s" constraint (scalar registers); 3D coordinates are
// transmuted to zluda_uint3 because asm "v" operands need a native vector.
template <typename T, ImageGeometry geo>
static __device__ void image_store_pck(T value, typename Coordinates<geo>::type coord, surface_ptr surface)
{
    // Up to 4 bytes: single-dword store (dmask:0x1).
    if constexpr (sizeof(T) <= sizeof(uint))
    {
        uint value_dword = transmute<uint>(value);
        if constexpr (geo == ImageGeometry::_1D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D unorm" : : "v"(value_dword), "v"(coord.x), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_2D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_3D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:3D unorm" : : "v"(value_dword), "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A1D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A2D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory");
        }
        else
        {
            static_assert(sizeof(T) == 0, "Invalid geometry");
        }
    }
    // 8 bytes: two-dword store (dmask:0x3).
    else if constexpr (sizeof(T) == sizeof(uint2::Native_vec_))
    {
        uint2::Native_vec_ value_dword2 = transmute<uint2::Native_vec_>(value);
        if constexpr (geo == ImageGeometry::_1D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D unorm" : : "v"(value_dword2), "v"(coord.x), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_2D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_3D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:3D unorm" : : "v"(value_dword2), "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A1D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A2D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory");
        }
        else
        {
            static_assert(sizeof(T) == 0, "Invalid geometry");
        }
    }
    // 16 bytes: four-dword store (dmask:0xf).
    else if constexpr (sizeof(T) == sizeof(uint4::Native_vec_))
    {
        uint4::Native_vec_ value_dword4 = transmute<uint4::Native_vec_>(value);
        if constexpr (geo == ImageGeometry::_1D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D unorm" : : "v"(value_dword4), "v"(coord.x), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_2D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_3D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:3D unorm" : : "v"(value_dword4), "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A1D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A2D)
        {
            asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory");
        }
        else
        {
            static_assert(sizeof(T) == 0, "Invalid geometry");
        }
    }
    else
    {
        static_assert(sizeof(T) == 0, "Invalid vector size");
    }
}
// Loads a value of type T from the surface at `coord` via image_load_pck
// (raw dwords, no format conversion). dmask selects how many dwords are
// read (0x1/0x3/0xf for <=4/8/16 bytes of T) and the result is transmuted
// back to T. Each asm block appends "s_waitcnt vmcnt(0)" so the loaded
// registers are valid before the value is consumed.
template <typename T, ImageGeometry geo>
static __device__ T image_load_pck(typename Coordinates<geo>::type coord, surface_ptr surface)
{
    // Up to 4 bytes: single-dword load (dmask:0x1).
    if constexpr (sizeof(T) <= sizeof(uint))
    {
        uint data;
        if constexpr (geo == ImageGeometry::_1D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_2D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_3D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A1D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A2D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else
        {
            static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry");
        }
        return transmute<T>(data);
    }
    // 8 bytes: two-dword load (dmask:0x3).
    else if constexpr (sizeof(T) == sizeof(uint2::Native_vec_))
    {
        uint2::Native_vec_ data;
        if constexpr (geo == ImageGeometry::_1D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_2D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_3D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A1D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A2D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else
        {
            static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry");
        }
        return transmute<T>(data);
    }
    // 16 bytes: four-dword load (dmask:0xf).
    else if constexpr (sizeof(T) == sizeof(uint4::Native_vec_))
    {
        uint4::Native_vec_ data;
        if constexpr (geo == ImageGeometry::_1D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_2D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::_3D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A1D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else if constexpr (geo == ImageGeometry::A2D)
        {
            asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
        }
        else
        {
            static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry");
        }
        return transmute<T>(data);
    }
    else
    {
        static_assert(sizeof(T) == 0, "Invalid vector size");
    }
}
// Loads a full four-dword pixel (dmask:0xf) regardless of the element type.
// Used by sust_b_zero's read-modify-write path: the whole pixel is fetched,
// a sub-pixel value is merged in (image_store_pck_full_with), and the pixel
// is written back.
template <ImageGeometry geo>
static __device__ uint4::Native_vec_ image_load_pck_full(typename Coordinates<geo>::type coord, surface_ptr surface)
{
    uint4::Native_vec_ data;
    if constexpr (geo == ImageGeometry::_1D)
    {
        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory");
    }
    else if constexpr (geo == ImageGeometry::_2D)
    {
        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
    }
    else if constexpr (geo == ImageGeometry::_3D)
    {
        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute<zluda_uint3>(coord)), "s"(*surface) : "memory");
    }
    else if constexpr (geo == ImageGeometry::A1D)
    {
        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
    }
    else if constexpr (geo == ImageGeometry::A2D)
    {
        asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory");
    }
    else
    {
        static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry");
    }
    return data;
}
// Merges `value` into the previously-loaded four-dword pixel `data` and
// writes the whole pixel back with a full (dmask:0xf) packed store.
// `value` always replaces the low-order bytes of the pixel.
template <typename T, ImageGeometry geo>
static __device__ void image_store_pck_full_with(uint4::Native_vec_ data, T value, typename Coordinates<geo>::type coord, surface_ptr surface)
{
    // We avoid unions for types smaller than sizeof(uint32_t),
    // because in those cases we get this garbage:
    //    ds_write_b128 v2, v[5:8]
    //    ds_write_b16 v2, v9
    //    ds_read_b128 v[5:8], v2
    // tested with ROCm 5.7.1 on gfx1030
    if constexpr (sizeof(T) == sizeof(uint8_t))
    {
        // Replace the low byte of dword 0 via shift-mask instead of a union.
        uint32_t x = uint32_t(std::bit_cast<uint8_t>(value));
        uint32_t data_0 = ((data[0]) >> 8) << 8;
        data[0] = data_0 | x;
    }
    else if constexpr (sizeof(T) == sizeof(uint16_t))
    {
        // Replace the low half of dword 0 via shift-mask instead of a union.
        uint32_t x = uint32_t(std::bit_cast<uint16_t>(value));
        uint32_t data_0 = ((data[0]) >> 16) << 16;
        data[0] = data_0 | x;
    }
    else
    {
        // Dword-sized or larger: overlay `value` onto the pixel via a union.
        union
        {
            uint4::Native_vec_ full_vec;
            T value;
        } u = {0};
        u.full_vec = data;
        u.value = value;
        data = u.full_vec;
    }
    image_store_pck<uint4::Native_vec_, geo>(data, coord, surface);
}
// Indirect surface handles reserve their top 3 bits for metadata (the x
// coordinate shift, see x_coordinate_shift(uint64_t) below); the remaining
// bits are the descriptor address.
constexpr auto IMAGE_RESERVED_TOP_BITS = 3;

// Recovers the descriptor pointer from an indirect surface handle.
// Shifting left then (logically) right clears the reserved top bits.
static __device__ surface_ptr get_surface_pointer(uint64_t s)
{
    return (surface_ptr)((s << IMAGE_RESERVED_TOP_BITS) >> IMAGE_RESERVED_TOP_BITS);
}

// For reference-style surfaces the descriptor address is stored directly in
// the textureReference's textureObject field.
static __device__ surface_ptr get_surface_pointer(struct textureReference GLOBAL_SPACE *surf_ref)
{
    return (surface_ptr)(surf_ref->textureObject);
}

// For indirect handles, the x coordinate shift is packed into the handle's
// reserved top bits.
static __device__ uint32_t x_coordinate_shift(uint64_t s)
{
    return uint32_t(s >> (64 - IMAGE_RESERVED_TOP_BITS));
}

// For reference-style surfaces, computes the shift as log2 of the texel size
// in bytes (ctz of channel-width * channel-count). NOTE(review): callers use
// this to scale a byte-based x coordinate down to a texel index — confirm
// against the PTX suld/sust byte-addressing convention.
static __device__ uint32_t x_coordinate_shift(struct textureReference GLOBAL_SPACE *ptr)
{
    uint32_t channels = uint32_t(ptr->numChannels);
    uint32_t format_width = 0;
    hipArray_Format format = ptr->format;
    switch (format)
    {
    case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8:
    case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8:
        format_width = 1;
        break;
    case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16:
    case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16:
    case hipArray_Format::HIP_AD_FORMAT_HALF:
        format_width = 2;
        break;
    case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32:
    case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32:
    case hipArray_Format::HIP_AD_FORMAT_FLOAT:
        format_width = 4;
        break;
    default:
        __builtin_unreachable();
    }
    return uint32_t(__builtin_ctz(format_width * channels));
}
// Surface load ("suld.b", zero clamp mode): resolves the descriptor from
// either an indirect handle (uint64_t) or a textureReference pointer,
// scales the x coordinate down by the per-surface shift, and performs a
// packed (format-less) image load of T.
template <typename T, ImageGeometry geo, typename Surface>
static __device__ T suld_b_zero(Surface surf_arg, typename Coordinates<geo>::type coord)
{
    surface_ptr surface = get_surface_pointer(surf_arg);
    uint32_t shift_x = x_coordinate_shift(surf_arg);
    coord.x = coord.x >> shift_x;
    return image_load_pck<T, geo>(coord, surface);
}
// Surface store ("sust.b", zero clamp mode). After resolving the descriptor
// and scaling the x coordinate, either stores T directly — when the store
// width covers at least one whole texel (shift_x <= log2(sizeof(T))) — or,
// for stores narrower than a texel, falls back to a read-modify-write of the
// full four-dword pixel so neighboring bytes are preserved.
template <typename T, ImageGeometry geo, typename Surface>
static __device__ void sust_b_zero(Surface surf_arg, typename Coordinates<geo>::type coord, T data)
{
    surface_ptr surface = get_surface_pointer(surf_arg);
    uint32_t shift_x = x_coordinate_shift(surf_arg);
    coord.x = coord.x >> shift_x;
    if (shift_x <= __builtin_ctz(sizeof(T))) [[likely]]
    {
        image_store_pck<T, geo>(data, coord, surface);
    }
    else
    {
        uint4::Native_vec_ pixel = image_load_pck_full<geo>(coord, surface);
        image_store_pck_full_with<T, geo>(pixel, data, coord, surface);
    }
}
extern "C"
{
#define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \
@ -660,179 +1054,101 @@ extern "C"
suld_b_a2d_vec(_v4, b32, uint4);
// suld_b_a2d_vec(_v4, b64, ulong4);
#define sust_b_1d_vec(VEC, TYPE, HIP_TYPE) \
void FUNC(sust_b_1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; \
int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_1D(i, byte_coord, tmp); \
} \
void FUNC(sust_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
HIP_TYPE hip_data; \
hip_data.data = data; \
surf1Dwrite(hip_data, surfObj, coord.x); \
#define SUST_B_ZERO(TYPE, GEOMETRY, HIP_TYPE) \
HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates<GEOMETRY>::type coord) \
{ \
return suld_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(surf_arg, coord); \
} \
void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates<GEOMETRY>::type coord, HIP_TYPE::Native_vec_ data) \
{ \
sust_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(surf_arg, coord, data); \
} \
HIP_TYPE::Native_vec_ FUNC(suld_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates<GEOMETRY>::type coord) \
{ \
return suld_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(ptr, coord); \
} \
void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates<GEOMETRY>::type coord, HIP_TYPE::Native_vec_ data) \
{ \
sust_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(ptr, coord, data); \
}
sust_b_1d_vec(, b8, uchar1);
sust_b_1d_vec(, b16, ushort1);
sust_b_1d_vec(, b32, uint1);
// sust_b_1d_vec(, b64, ulong1);
sust_b_1d_vec(_v2, b8, uchar2);
sust_b_1d_vec(_v2, b16, ushort2);
sust_b_1d_vec(_v2, b32, uint2);
// sust_b_1d_vec(_v2, b64, ulong2);
sust_b_1d_vec(_v4, b8, uchar4);
sust_b_1d_vec(_v4, b16, ushort4);
sust_b_1d_vec(_v4, b32, uint4);
// sust_b_1d_vec(_v4, b64, ulong4);
#define sust_b_2d_vec(VEC, TYPE, HIP_TYPE) \
void FUNC(sust_b_2d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; \
int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_2D(i, int2(byte_coord, coord.y).data, tmp); \
} \
void FUNC(sust_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
HIP_TYPE hip_data; \
hip_data.data = data; \
surf2Dwrite(hip_data, surfObj, coord.x, coord.y); \
#define SUST_B_ZERO_ARRAY(TYPE, GEOMETRY, HIP_TYPE) \
HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates<GEOMETRY>::arg_type coord) \
{ \
auto coord_array = Coordinates<GEOMETRY>::pack_layer(layer, coord); \
return suld_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(surf_arg, coord_array); \
} \
void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates<GEOMETRY>::arg_type coord, HIP_TYPE::Native_vec_ data) \
{ \
auto coord_array = Coordinates<GEOMETRY>::pack_layer(layer, coord); \
sust_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(surf_arg, coord_array, data); \
} \
HIP_TYPE::Native_vec_ FUNC(suld_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates<GEOMETRY>::arg_type coord) \
{ \
auto coord_array = Coordinates<GEOMETRY>::pack_layer(layer, coord); \
return suld_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(ptr, coord_array); \
} \
void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates<GEOMETRY>::arg_type coord, HIP_TYPE::Native_vec_ data) \
{ \
auto coord_array = Coordinates<GEOMETRY>::pack_layer(layer, coord); \
sust_b_zero<HIP_TYPE::Native_vec_, GEOMETRY>(ptr, coord_array, data); \
}
sust_b_2d_vec(, b8, uchar1);
sust_b_2d_vec(, b16, ushort1);
sust_b_2d_vec(, b32, uint1);
// sust_b_2d_vec(, b64, ulong1);
sust_b_2d_vec(_v2, b8, uchar2);
sust_b_2d_vec(_v2, b16, ushort2);
sust_b_2d_vec(_v2, b32, uint2);
// sust_b_2d_vec(_v2, b64, ulong2);
sust_b_2d_vec(_v4, b8, uchar4);
sust_b_2d_vec(_v4, b16, ushort4);
sust_b_2d_vec(_v4, b32, uint4);
// sust_b_2d_vec(_v4, b64, ulong4);
#define sust_b_3d_vec(VEC, TYPE, HIP_TYPE) \
void FUNC(sust_b_3d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int4::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; \
int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_3D(i, int4(byte_coord, coord.y, coord.z, 0).data, tmp); \
} \
void FUNC(sust_b_indirect_3d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int4::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
__HIP_SURFACE_OBJECT_PARAMETERS_INIT; \
int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_3D(i, int4(byte_coord, coord.y, coord.z, 0).data, tmp); \
}
sust_b_3d_vec(, b8, uchar1);
sust_b_3d_vec(, b16, ushort1);
sust_b_3d_vec(, b32, uint1);
// sust_b_3d_vec(, b64, ulong1);
sust_b_3d_vec(_v2, b8, uchar2);
sust_b_3d_vec(_v2, b16, ushort2);
sust_b_3d_vec(_v2, b32, uint2);
// sust_b_3d_vec(_v2, b64, ulong2);
sust_b_3d_vec(_v4, b8, uchar4);
sust_b_3d_vec(_v4, b16, ushort4);
sust_b_3d_vec(_v4, b32, uint4);
// sust_b_3d_vec(_v4, b64, ulong4);
// Emits the PTX `sust.b.a1d{VEC}.{TYPE}.trap` implementation (surface store to a
// 1-D layered array) for one vector-width / payload-type combination.
// Each expansion defines two entry points:
//   - sust_b_a1d{VEC}_{TYPE}_trap:          store through a textureReference
//     (the image handle is taken from ptr->textureObject)
//   - sust_b_indirect_a1d{VEC}_{TYPE}_trap: store through a raw 64-bit surface
//     handle (bindless path)
// Both convert the x coordinate to a byte offset with __hipGetPixelAddr using the
// image's channel type/order, repack the payload into a float4 via
// __pack_to_float4, and store with __ockl_image_store_1Da at (byte_coord, layer).
// NOTE(review): despite the `_trap` suffix, no explicit bounds check is visible
// here — presumably out-of-range handling is done by the OCKL intrinsic; confirm.
#define sust_b_a1d_vec(VEC, TYPE, HIP_TYPE) \
void FUNC(sust_b_a1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, uint layer, int x, HIP_TYPE::Native_vec_ data) \
{ \
hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; /* sampler is unused for surface stores */ \
int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1Da(i), __ockl_image_channel_order_1Da(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_1Da(i, int2(byte_coord, int(layer)).data, tmp); \
} \
void FUNC(sust_b_indirect_a1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, uint layer, int x, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
__HIP_SURFACE_OBJECT_PARAMETERS_INIT; \
int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1Da(i), __ockl_image_channel_order_1Da(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_1Da(i, int2(byte_coord, int(layer)).data, tmp); \
}
// Instantiate the 1-D layered surface-store wrappers (sust.b.a1d) for untyped
// 8/16/32-bit payloads at vector widths 1, 2 and 4. The b64 variants are left
// commented out, mirroring the other geometries — presumably 64-bit channels
// are not representable in the float4 repacking; NOTE(review): confirm.
sust_b_a1d_vec(, b8, uchar1);
sust_b_a1d_vec(, b16, ushort1);
sust_b_a1d_vec(, b32, uint1);
// sust_b_a1d_vec(, b64, ulong1);
sust_b_a1d_vec(_v2, b8, uchar2);
sust_b_a1d_vec(_v2, b16, ushort2);
sust_b_a1d_vec(_v2, b32, uint2);
// sust_b_a1d_vec(_v2, b64, ulong2);
sust_b_a1d_vec(_v4, b8, uchar4);
sust_b_a1d_vec(_v4, b16, ushort4);
sust_b_a1d_vec(_v4, b32, uint4);
// sust_b_a1d_vec(_v4, b64, ulong4);
// Emits the PTX `sust.b.a2d{VEC}.{TYPE}.trap` implementation (surface store to a
// 2-D layered array) for one vector-width / payload-type combination.
// Structure is identical to sust_b_a1d_vec above, except the store address is
// (byte_coord, y, layer) packed into an int4 and the 2Da channel-query/store
// OCKL intrinsics are used. Two entry points per expansion:
//   - sust_b_a2d{VEC}_{TYPE}_trap:          via textureReference->textureObject
//   - sust_b_indirect_a2d{VEC}_{TYPE}_trap: via a raw 64-bit surface handle
// NOTE(review): only the x coordinate goes through __hipGetPixelAddr (byte
// addressing); y and layer are passed as element indices — this matches the
// other geometries in this file, but confirm against the OCKL store contract.
#define sust_b_a2d_vec(VEC, TYPE, HIP_TYPE) \
void FUNC(sust_b_a2d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, uint layer, int x, int y, HIP_TYPE::Native_vec_ data) \
{ \
hipTextureObject_t textureObject = ptr->textureObject; \
TEXTURE_OBJECT_PARAMETERS_INIT; \
(void)s; /* sampler is unused for surface stores */ \
int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2Da(i), __ockl_image_channel_order_2Da(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_2Da(i, int4(byte_coord, y, int(layer), 0).data, tmp); \
} \
void FUNC(sust_b_indirect_a2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, uint layer, int x, int y, HIP_TYPE::Native_vec_ data) \
{ \
hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \
__HIP_SURFACE_OBJECT_PARAMETERS_INIT; \
int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2Da(i), __ockl_image_channel_order_2Da(i)); \
HIP_TYPE hip_data; \
hip_data.data = data; \
auto tmp = __pack_to_float4(hip_data); \
__ockl_image_store_2Da(i, int4(byte_coord, y, int(layer), 0).data, tmp); \
}
// Instantiate the 2-D layered surface-store wrappers (sust.b.a2d) for untyped
// 8/16/32-bit payloads at vector widths 1, 2 and 4; b64 variants disabled as in
// the other geometries — NOTE(review): presumably unsupported by the float4
// repacking path; confirm.
sust_b_a2d_vec(, b8, uchar1);
sust_b_a2d_vec(, b16, ushort1);
sust_b_a2d_vec(, b32, uint1);
// sust_b_a2d_vec(, b64, ulong1);
sust_b_a2d_vec(_v2, b8, uchar2);
sust_b_a2d_vec(_v2, b16, ushort2);
sust_b_a2d_vec(_v2, b32, uint2);
// sust_b_a2d_vec(_v2, b64, ulong2);
sust_b_a2d_vec(_v4, b8, uchar4);
sust_b_a2d_vec(_v4, b16, ushort4);
sust_b_a2d_vec(_v4, b32, uint4);
// sust_b_a2d_vec(_v4, b64, ulong4);
// Instantiations of SUST_B_ZERO / SUST_B_ZERO_ARRAY for every supported
// geometry (1d/2d/3d and the a1d/a2d layered forms) and payload combination.
// The macros themselves are defined earlier in this file (outside this view);
// NOTE(review): the name suggests the `.zero` out-of-bounds clamp variant of
// sust.b, as opposed to the `_trap` wrappers above — confirm against the macro
// definitions. Unlike the `_trap` set, scalar and v2 b64 payloads ARE
// instantiated here; only the v4_b64 combination is absent for every geometry —
// presumably a 4x64-bit element exceeds the maximum surface element size;
// confirm before adding it.
SUST_B_ZERO(1d_b8, ImageGeometry::_1D, uchar1);
SUST_B_ZERO(1d_b16, ImageGeometry::_1D, ushort1);
SUST_B_ZERO(1d_b32, ImageGeometry::_1D, uint1);
SUST_B_ZERO(1d_b64, ImageGeometry::_1D, ulong1);
SUST_B_ZERO(1d_v2_b8, ImageGeometry::_1D, uchar2);
SUST_B_ZERO(1d_v2_b16, ImageGeometry::_1D, ushort2);
SUST_B_ZERO(1d_v2_b32, ImageGeometry::_1D, uint2);
SUST_B_ZERO(1d_v2_b64, ImageGeometry::_1D, ulong2);
SUST_B_ZERO(1d_v4_b8, ImageGeometry::_1D, uchar4);
SUST_B_ZERO(1d_v4_b16, ImageGeometry::_1D, ushort4);
SUST_B_ZERO(1d_v4_b32, ImageGeometry::_1D, uint4);
SUST_B_ZERO(2d_b8, ImageGeometry::_2D, uchar1);
SUST_B_ZERO(2d_b16, ImageGeometry::_2D, ushort1);
SUST_B_ZERO(2d_b32, ImageGeometry::_2D, uint1);
SUST_B_ZERO(2d_b64, ImageGeometry::_2D, ulong1);
SUST_B_ZERO(2d_v2_b8, ImageGeometry::_2D, uchar2);
SUST_B_ZERO(2d_v2_b16, ImageGeometry::_2D, ushort2);
SUST_B_ZERO(2d_v2_b32, ImageGeometry::_2D, uint2);
SUST_B_ZERO(2d_v2_b64, ImageGeometry::_2D, ulong2);
SUST_B_ZERO(2d_v4_b8, ImageGeometry::_2D, uchar4);
SUST_B_ZERO(2d_v4_b16, ImageGeometry::_2D, ushort4);
SUST_B_ZERO(2d_v4_b32, ImageGeometry::_2D, uint4);
SUST_B_ZERO(3d_b8, ImageGeometry::_3D, uchar1);
SUST_B_ZERO(3d_b16, ImageGeometry::_3D, ushort1);
SUST_B_ZERO(3d_b32, ImageGeometry::_3D, uint1);
SUST_B_ZERO(3d_b64, ImageGeometry::_3D, ulong1);
SUST_B_ZERO(3d_v2_b8, ImageGeometry::_3D, uchar2);
SUST_B_ZERO(3d_v2_b16, ImageGeometry::_3D, ushort2);
SUST_B_ZERO(3d_v2_b32, ImageGeometry::_3D, uint2);
SUST_B_ZERO(3d_v2_b64, ImageGeometry::_3D, ulong2);
SUST_B_ZERO(3d_v4_b8, ImageGeometry::_3D, uchar4);
SUST_B_ZERO(3d_v4_b16, ImageGeometry::_3D, ushort4);
SUST_B_ZERO(3d_v4_b32, ImageGeometry::_3D, uint4);
SUST_B_ZERO_ARRAY(a1d_b8, ImageGeometry::A1D, uchar1);
SUST_B_ZERO_ARRAY(a1d_b16, ImageGeometry::A1D, ushort1);
SUST_B_ZERO_ARRAY(a1d_b32, ImageGeometry::A1D, uint1);
SUST_B_ZERO_ARRAY(a1d_b64, ImageGeometry::A1D, ulong1);
SUST_B_ZERO_ARRAY(a1d_v2_b8, ImageGeometry::A1D, uchar2);
SUST_B_ZERO_ARRAY(a1d_v2_b16, ImageGeometry::A1D, ushort2);
SUST_B_ZERO_ARRAY(a1d_v2_b32, ImageGeometry::A1D, uint2);
SUST_B_ZERO_ARRAY(a1d_v2_b64, ImageGeometry::A1D, ulong2);
SUST_B_ZERO_ARRAY(a1d_v4_b8, ImageGeometry::A1D, uchar4);
SUST_B_ZERO_ARRAY(a1d_v4_b16, ImageGeometry::A1D, ushort4);
SUST_B_ZERO_ARRAY(a1d_v4_b32, ImageGeometry::A1D, uint4);
SUST_B_ZERO_ARRAY(a2d_b8, ImageGeometry::A2D, uchar1);
SUST_B_ZERO_ARRAY(a2d_b16, ImageGeometry::A2D, ushort1);
SUST_B_ZERO_ARRAY(a2d_b32, ImageGeometry::A2D, uint1);
SUST_B_ZERO_ARRAY(a2d_b64, ImageGeometry::A2D, ulong1);
SUST_B_ZERO_ARRAY(a2d_v2_b8, ImageGeometry::A2D, uchar2);
SUST_B_ZERO_ARRAY(a2d_v2_b16, ImageGeometry::A2D, ushort2);
SUST_B_ZERO_ARRAY(a2d_v2_b32, ImageGeometry::A2D, uint2);
SUST_B_ZERO_ARRAY(a2d_v2_b64, ImageGeometry::A2D, ulong2);
SUST_B_ZERO_ARRAY(a2d_v4_b8, ImageGeometry::A2D, uchar4);
SUST_B_ZERO_ARRAY(a2d_v4_b16, ImageGeometry::A2D, ushort4);
SUST_B_ZERO_ARRAY(a2d_v4_b32, ImageGeometry::A2D, uint4);
__device__ static inline bool is_upper_warp()
{
@ -984,6 +1300,7 @@ extern "C"
default:
return 0;
}
return 2;
case 'l':
switch (s[1])
{
@ -1013,17 +1330,18 @@ extern "C"
case 'X':
case 'n':
len = 8;
return 2;
break;
default:
return 0;
}
return 3;
default:
return 0;
}
return 2;
default:
return 0;
}
return 1;
}
__device__ static bool parse_printf_specifier(const char *s, uint8_t &len)
@ -1117,8 +1435,36 @@ extern "C"
char c = *(s++);
if (c == 0)
break;
if (c == '%')
if (c != '%')
continue;
// %% requires no additional handling
if (*s == '%')
{
s++;
continue;
}
// %s uses __ockl_printf_append_string_n
// https://github.com/ROCm/ROCm-Device-Libs/blob/rocm-5.7.x/ockl/src/services.cl#L343
if (*s == 's')
{
s++;
const char *value = (const char *)read_valist(valist_ptr, valist_offset, 8);
handle = __ockl_printf_append_string_n(handle, value, strlen_plus_one(value), 0);
continue;
}
// Keep scanning until we figure out the length of this specifier or if we reach the end of the string
while (*s != 0) {
// "The width is not specified in the format string, but as an additional integer value argument preceding the argument that has to be formatted."
if (*s == '*') {
s++;
uint64_t value = read_valist(valist_ptr, valist_offset, 4);
handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0);
continue;
}
uint8_t len = 0;
if (parse_printf_specifier(s, len))
{
@ -1130,16 +1476,22 @@ extern "C"
if (specifier_with_length)
{
s += specifier_with_length;
}
if (len > 0)
{
uint64_t value = read_valist(valist_ptr, valist_offset, len);
handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0);
} else {
// Assume the unknown character is a sub-specifier and move on
s++;
continue;
}
}
if (len > 0)
{
uint64_t value = read_valist(valist_ptr, valist_offset, len);
handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0);
}
break;
}
}
return (uint32_t)__ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1);
__ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1);
return 1;
}

View file

@ -380,6 +380,7 @@ pub enum Instruction<P: ArgParams> {
},
MadCC {
type_: ScalarType,
is_hi: bool,
arg: Arg4<P>,
},
Fma(ArithFloat, Arg4<P>),
@ -476,6 +477,7 @@ pub enum Instruction<P: ArgParams> {
Red(AtomDetails, Arg2St<P>),
Nanosleep(Arg1<P>),
Isspacep(StateSpace, Arg2<P>),
Sad(ScalarType, Arg4<P>),
}
#[derive(Copy, Clone)]

View file

@ -7,12 +7,13 @@ use std::ffi::CStr;
use std::fmt::Display;
use std::io::Write;
use std::ptr::null_mut;
use std::{convert, iter, mem, ptr};
use std::{iter, mem, ptr};
use zluda_llvm::core::*;
use zluda_llvm::prelude::*;
use zluda_llvm::zluda::*;
use zluda_llvm::*;
use crate::ast::SetpData;
use crate::translate::{
self, Arg4CarryOut, ConstType, ConversionKind, DenormSummary, ExpandedArgParams, FPDenormMode,
MadCCDetails, MadCDetails, TranslationModule, TypeKind, TypeParts,
@ -156,7 +157,7 @@ impl NamedIdGenerator {
if let Some(id) = id {
self.register_result(id, func)
} else {
func(b"\0".as_ptr() as _)
func(LLVM_UNNAMED)
}
}
@ -497,10 +498,12 @@ fn emit_function_variable(
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let llvm_type = get_llvm_type(ctx, &variable.type_)?;
let addr_space = get_llvm_address_space(&ctx.constants, variable.state_space)?;
let value = ctx.names.register_result(variable.name, |name| unsafe {
LLVMZludaBuildAlloca(builder, llvm_type, addr_space, name)
});
let value = emit_alloca(
ctx,
llvm_type,
get_llvm_address_space(&ctx.constants, variable.state_space)?,
Some(variable.name),
);
match variable.initializer {
None => {}
Some(init) => {
@ -523,12 +526,27 @@ fn emit_method<'a, 'input>(
let llvm_method = emit_method_declaration(ctx, &method)?;
emit_linkage_for_method(&method, is_kernel, llvm_method);
emit_tuning(ctx, llvm_method, &method.tuning);
for statement in method.body.iter().flat_map(convert::identity) {
let statements = match method.body {
Some(statements) => statements,
None => return Ok(()),
};
// Initial BB that holds all the variable declarations
let bb_with_variables =
unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) };
// Rest of the code
let starting_bb =
unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) };
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), starting_bb) };
for statement in statements.iter() {
register_basic_blocks(ctx, llvm_method, statement);
}
for statement in method.body.into_iter().flatten() {
for statement in statements.into_iter() {
emit_statement(ctx, is_kernel, statement)?;
}
// happens if there is a post-ret trailing label
terminate_current_block_if_needed(ctx, None);
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), bb_with_variables) };
unsafe { LLVMBuildBr(ctx.builder.get(), starting_bb) };
Ok(())
}
@ -607,7 +625,6 @@ fn emit_statement(
is_kernel: bool,
statement: crate::translate::ExpandedStatement,
) -> Result<(), TranslateError> {
start_synthetic_basic_block_if_needed(ctx, &statement);
Ok(match statement {
crate::translate::Statement::Label(label) => emit_label(ctx, label)?,
crate::translate::Statement::Variable(var) => emit_function_variable(ctx, var)?,
@ -625,8 +642,8 @@ fn emit_statement(
crate::translate::Statement::MadC(MadCDetails { type_, is_hi, arg }) => {
emit_inst_madc(ctx, type_, is_hi, &arg)?
}
crate::translate::Statement::MadCC(MadCCDetails { type_, arg }) => {
emit_inst_madcc(ctx, type_, &arg)?
crate::translate::Statement::MadCC(MadCCDetails { type_, is_hi, arg }) => {
emit_inst_madcc(ctx, type_, is_hi, &arg)?
}
crate::translate::Statement::AddC(type_, arg) => emit_inst_add_c(ctx, type_, &arg)?,
crate::translate::Statement::AddCC(type_, arg) => {
@ -752,27 +769,6 @@ fn emit_ret_value(
Ok(())
}
fn start_synthetic_basic_block_if_needed(
ctx: &mut EmitContext,
statement: &crate::translate::ExpandedStatement,
) {
let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) };
if current_block == ptr::null_mut() {
return;
}
let terminator = unsafe { LLVMGetBasicBlockTerminator(current_block) };
if terminator == ptr::null_mut() {
return;
}
if let crate::translate::Statement::Label(..) = statement {
return;
}
let new_block =
unsafe { LLVMCreateBasicBlockInContext(ctx.context.get(), b"\0".as_ptr() as _) };
unsafe { LLVMInsertExistingBasicBlockAfterInsertBlock(ctx.builder.get(), new_block) };
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) };
}
fn emit_ptr_access(
ctx: &mut EmitContext,
ptr_access: &crate::translate::PtrAccess<crate::translate::ExpandedArgParams>,
@ -1076,7 +1072,7 @@ fn emit_value_copy(
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let type_ = get_llvm_type(ctx, type_)?;
let temp_value = unsafe { LLVMBuildAlloca(builder, type_, LLVM_UNNAMED) };
let temp_value = emit_alloca(ctx, type_, ctx.constants.private_space, None);
unsafe { LLVMBuildStore(builder, src, temp_value) };
ctx.names.register_result(dst, |dst| unsafe {
LLVMBuildLoad2(builder, type_, temp_value, dst)
@ -1084,6 +1080,28 @@ fn emit_value_copy(
Ok(())
}
// From "Performance Tips for Frontend Authors" (https://llvm.org/docs/Frontend/PerformanceTips.html):
// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca
// instructions that are in the entry basic block. Given SSA is the canonical form expected by much
// of the optimizer; if allocas can not be eliminated by Mem2Reg or SROA, the optimizer is likely to
// be less effective than it could be."
fn emit_alloca(
ctx: &mut EmitContext,
type_: LLVMTypeRef,
addr_space: u32,
name: Option<Id>,
) -> LLVMValueRef {
let builder = ctx.builder.get();
let current_bb = unsafe { LLVMGetInsertBlock(builder) };
let variables_bb = unsafe { LLVMGetFirstBasicBlock(LLVMGetBasicBlockParent(current_bb)) };
unsafe { LLVMPositionBuilderAtEnd(builder, variables_bb) };
let result = ctx.names.register_result_option(name, |name| unsafe {
LLVMZludaBuildAlloca(builder, type_, addr_space, name)
});
unsafe { LLVMPositionBuilderAtEnd(builder, current_bb) };
result
}
fn emit_instruction(
ctx: &mut EmitContext,
is_kernel: bool,
@ -1142,6 +1160,7 @@ fn emit_instruction(
ast::Instruction::Set(details, arg) => emit_inst_set(ctx, details, arg)?,
ast::Instruction::Red(details, arg) => emit_inst_red(ctx, details, arg)?,
ast::Instruction::Isspacep(space, arg) => emit_inst_isspacep(ctx, *space, arg)?,
ast::Instruction::Sad(type_, arg) => emit_inst_sad(ctx, *type_, arg)?,
// replaced by function calls or Statement variants
ast::Instruction::Activemask { .. }
| ast::Instruction::Bar(..)
@ -1230,6 +1249,36 @@ fn emit_inst_isspacep_impl(
)
}
fn emit_inst_sad(
ctx: &mut EmitContext,
type_: ast::ScalarType,
arg: &ast::Arg4<ExpandedArgParams>,
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let less_than = emit_inst_setp_int(
ctx,
&SetpData {
typ: type_,
flush_to_zero: None,
cmp_op: ast::SetpCompareOp::Greater,
},
None,
arg.src1,
arg.src2,
)?;
let a = ctx.names.value(arg.src1)?;
let b = ctx.names.value(arg.src2)?;
let a_minus_b = unsafe { LLVMBuildSub(builder, a, b, LLVM_UNNAMED) };
let b_minus_a = unsafe { LLVMBuildSub(builder, b, a, LLVM_UNNAMED) };
let a_or_b = unsafe { LLVMBuildSelect(builder, less_than, a_minus_b, b_minus_a, LLVM_UNNAMED) };
let src3 = ctx.names.value(arg.src3)?;
ctx.names.register_result(arg.dst, |dst_name| unsafe {
LLVMBuildAdd(builder, src3, a_or_b, dst_name)
});
Ok(())
}
fn emit_inst_red(
ctx: &mut EmitContext,
details: &ast::AtomDetails,
@ -1359,9 +1408,6 @@ fn emit_int_trap(ctx: &mut EmitContext) -> Result<(), TranslateError> {
0,
LLVM_UNNAMED,
);
// llvm.trap is not a terminator,
// LLVM might fail with an unterminated basic block if we don't insert unreachable
LLVMBuildUnreachable(builder);
}
Ok(())
}
@ -2149,16 +2195,17 @@ fn emit_inst_mad_lo(
)
}
// TODO: support mad.hi.cc
fn emit_inst_madcc(
ctx: &mut EmitContext,
type_: ast::ScalarType,
is_hi: bool,
arg: &Arg4CarryOut<ExpandedArgParams>,
) -> Result<(), TranslateError> {
let builder = ctx.builder.get();
let src1 = ctx.names.value(arg.src1)?;
let src2 = ctx.names.value(arg.src2)?;
let mul_result = unsafe { LLVMBuildMul(builder, src1, src2, LLVM_UNNAMED) };
let mul_result = if is_hi {
emit_inst_mul_hi_impl(ctx, type_, None, arg.src1, arg.src2)?
} else {
emit_inst_mul_low_impl(ctx, None, arg.src1, arg.src2, LLVMBuildMul)?
};
emit_inst_addsub_cc_impl(
ctx,
"add",
@ -2246,29 +2293,6 @@ fn emit_inst_madc(
mul_result,
args.src3,
)
/*
let src3 = ctx.names.value(args.src3)?;
let add_no_carry = unsafe { LLVMBuildAdd(builder, mul_result, src3, LLVM_UNNAMED) };
let carry_flag = ctx.names.value(args.carry_in)?;
let llvm_type = get_llvm_type(ctx, &ast::Type::Scalar(type_))?;
let carry_flag = unsafe { LLVMBuildZExt(builder, carry_flag, llvm_type, LLVM_UNNAMED) };
if let Some(carry_out) = args.carry_out {
emit_inst_addsub_cc_impl(
ctx,
"add",
type_,
args.dst,
carry_out,
add_no_carry,
carry_flag,
)?;
} else {
ctx.names.register_result(args.dst, |dst| unsafe {
LLVMBuildAdd(builder, add_no_carry, carry_flag, dst)
});
}
Ok(())
*/
}
fn emit_inst_add_c(
@ -3559,12 +3583,12 @@ fn emit_store_var(
fn emit_label(ctx: &mut EmitContext, label: Id) -> Result<(), TranslateError> {
let new_block = unsafe { LLVMValueAsBasicBlock(ctx.names.value(label)?) };
terminate_current_block_if_needed(ctx, new_block);
terminate_current_block_if_needed(ctx, Some(new_block));
unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) };
Ok(())
}
fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasicBlockRef) {
fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: Option<LLVMBasicBlockRef>) {
let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) };
if current_block == ptr::null_mut() {
return;
@ -3573,7 +3597,10 @@ fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasic
if terminator != ptr::null_mut() {
return;
}
unsafe { LLVMBuildBr(ctx.builder.get(), new_block) };
match new_block {
Some(new_block) => unsafe { LLVMBuildBr(ctx.builder.get(), new_block) },
None => unsafe { LLVMBuildUnreachable(ctx.builder.get()) },
};
}
fn emit_method_declaration<'input>(

View file

@ -227,6 +227,7 @@ match {
"rem",
"ret",
"rsqrt",
"sad",
"selp",
"set",
"setp",
@ -309,6 +310,7 @@ ExtendedID : &'input str = {
"rem",
"ret",
"rsqrt",
"sad",
"selp",
"set",
"setp",
@ -846,6 +848,7 @@ Instruction: ast::Instruction<ast::ParsedArgParams<'input>> = {
InstRed,
InstNanosleep,
InstIsspacep,
InstSad
};
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld
@ -1523,7 +1526,12 @@ InstMad: ast::Instruction<ast::ParsedArgParams<'input>> = {
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc
InstMadCC: ast::Instruction<ast::ParsedArgParams<'input>> = {
"mad" ".lo" ".cc" <type_:IntType3264> <arg:Arg4> => ast::Instruction::MadCC{<>},
"mad" ".lo" ".cc" <type_:IntType3264> <arg:Arg4> => {
ast::Instruction::MadCC { type_, arg, is_hi: false }
},
"mad" ".hi" ".cc" <type_:IntType3264> <arg:Arg4> => {
ast::Instruction::MadCC { type_, arg, is_hi: true }
},
};
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc
@ -2435,6 +2443,15 @@ InstIsspacep: ast::Instruction<ast::ParsedArgParams<'input>> = {
}
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad
InstSad: ast::Instruction<ast::ParsedArgParams<'input>> = {
"sad" <type_:IntType> <a:Arg4> => {
ast::Instruction::Sad(type_, a)
}
}
NegTypeFtz: ast::ScalarType = {
".f16" => ast::ScalarType::F16,
".f16x2" => ast::ScalarType::F16x2,

View file

@ -1,44 +1,44 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
"38":
define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"13" to ptr
%"30" = load i32, ptr %"31", align 4
store i32 %"30", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"15" to ptr
%"40" = getelementptr inbounds i8, ptr %"32", i64 4
%"33" = load i32, ptr %"40", align 4
store i32 %"33", ptr addrspace(5) %"7", align 4
%"17" = load i32, ptr addrspace(5) %"6", align 4
%"16" = call i32 @llvm.abs.i32(i32 %"17", i1 false)
store i32 %"16", ptr addrspace(5) %"6", align 4
%"19" = load i32, ptr addrspace(5) %"7", align 4
%"18" = call i32 @llvm.abs.i32(i32 %"19", i1 false)
store i32 %"18", ptr addrspace(5) %"7", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i32, ptr addrspace(5) %"6", align 4
%"34" = inttoptr i64 %"20" to ptr
store i32 %"21", ptr %"34", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"7", align 4
%"36" = inttoptr i64 %"22" to ptr
%"42" = getelementptr inbounds i8, ptr %"36", i64 4
store i32 %"23", ptr %"42", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"12" to ptr
%"29" = load i32, ptr %"30", align 4
store i32 %"29", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"38" = getelementptr inbounds i8, ptr %"31", i64 4
%"32" = load i32, ptr %"38", align 4
store i32 %"32", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"15" = call i32 @llvm.abs.i32(i32 %"16", i1 false)
store i32 %"15", ptr addrspace(5) %"6", align 4
%"18" = load i32, ptr addrspace(5) %"7", align 4
%"17" = call i32 @llvm.abs.i32(i32 %"18", i1 false)
store i32 %"17", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"33" = inttoptr i64 %"19" to ptr
store i32 %"20", ptr %"33", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"35" = inttoptr i64 %"21" to ptr
%"40" = getelementptr inbounds i8, ptr %"35", i64 4
store i32 %"22", ptr %"40", align 4
ret void
}

View file

@ -3,22 +3,22 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__activemask() #0
define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"12", ptr addrspace(4) byref(i64) %"13") #1 {
"16":
define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"11", ptr addrspace(4) byref(i64) %"12") #1 {
%"6" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"6", align 1
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i32, align 4, addrspace(5)
%"8" = load i64, ptr addrspace(4) %"13", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = call i32 @__zluda_ptx_impl__activemask()
store i32 %"9", ptr addrspace(5) %"5", align 4
%"10" = load i64, ptr addrspace(5) %"4", align 8
%"11" = load i32, ptr addrspace(5) %"5", align 4
%"14" = inttoptr i64 %"10" to ptr
store i32 %"11", ptr %"14", align 4
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"6", align 1
%"7" = load i64, ptr addrspace(4) %"12", align 8
store i64 %"7", ptr addrspace(5) %"4", align 8
%"8" = call i32 @__zluda_ptx_impl__activemask()
store i32 %"8", ptr addrspace(5) %"5", align 4
%"9" = load i64, ptr addrspace(5) %"4", align 8
%"10" = load i32, ptr addrspace(5) %"5", align 4
%"13" = inttoptr i64 %"9" to ptr
store i32 %"10", ptr %"13", align 4
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
"23":
define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"21", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"14" = add i64 %"15", 1
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"22", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"20", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"13" = add i64 %"14", 1
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"15" to ptr
store i64 %"16", ptr %"21", align 8
ret void
}

View file

@ -3,34 +3,34 @@ target triple = "amdgcn-amd-amdhsa"
@PI = protected addrspace(1) externally_initialized global float 0x400921FB60000000, align 4
define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"21", ptr addrspace(4) byref(i64) %"22") #0 {
"25":
define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
%"8" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"21", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"23" = inttoptr i64 %"14" to ptr
%"13" = load float, ptr %"23", align 4
store float %"13", ptr addrspace(5) %"7", align 4
%"15" = load float, ptr addrspace(1) @PI, align 4
store float %"15", ptr addrspace(5) %"8", align 4
%"17" = load float, ptr addrspace(5) %"7", align 4
%"18" = load float, ptr addrspace(5) %"8", align 4
%"16" = fadd float %"17", %"18"
store float %"16", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"6", align 8
%"20" = load float, ptr addrspace(5) %"7", align 4
%"24" = inttoptr i64 %"19" to ptr
store float %"20", ptr %"24", align 4
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"22" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"22", align 4
store float %"12", ptr addrspace(5) %"7", align 4
%"14" = load float, ptr addrspace(1) @PI, align 4
store float %"14", ptr addrspace(5) %"8", align 4
%"16" = load float, ptr addrspace(5) %"7", align 4
%"17" = load float, ptr addrspace(5) %"8", align 4
%"15" = fadd float %"16", %"17"
store float %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"6", align 8
%"19" = load float, ptr addrspace(5) %"7", align 4
%"23" = inttoptr i64 %"18" to ptr
store float %"19", ptr %"23", align 4
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
"23":
define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr addrspace(1)
%"12" = load i64, ptr addrspace(1) %"21", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"14" = add i64 %"15", 1
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"16" to ptr addrspace(1)
store i64 %"17", ptr addrspace(1) %"22", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr addrspace(1)
%"11" = load i64, ptr addrspace(1) %"20", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"13" = add i64 %"14", 1
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"15" to ptr addrspace(1)
store i64 %"16", ptr addrspace(1) %"21", align 8
ret void
}

View file

@ -1,47 +1,47 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
"39":
define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%"32" = ptrtoint ptr addrspace(4) %"27" to i64
%0 = alloca i64, align 8, addrspace(5)
store i64 %"32", ptr addrspace(5) %0, align 8
%"31" = load i64, ptr addrspace(5) %0, align 8
store i64 %"31", ptr addrspace(5) %"4", align 8
%"34" = ptrtoint ptr addrspace(4) %"28" to i64
%1 = alloca i64, align 8, addrspace(5)
store i64 %"34", ptr addrspace(5) %1, align 8
%"33" = load i64, ptr addrspace(5) %1, align 8
store i64 %"33", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"35" = inttoptr i64 %"13" to ptr addrspace(4)
%2 = alloca i64, align 8, addrspace(5)
br label %3
3: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"31" = ptrtoint ptr addrspace(4) %"26" to i64
store i64 %"31", ptr addrspace(5) %1, align 8
%"30" = load i64, ptr addrspace(5) %1, align 8
store i64 %"30", ptr addrspace(5) %"4", align 8
%"33" = ptrtoint ptr addrspace(4) %"27" to i64
store i64 %"33", ptr addrspace(5) %2, align 8
%"32" = load i64, ptr addrspace(5) %2, align 8
store i64 %"32", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"34" = inttoptr i64 %"12" to ptr addrspace(4)
%"39" = getelementptr inbounds i8, ptr addrspace(4) %"34", i64 0
%"11" = load i64, ptr addrspace(4) %"39", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"35" = inttoptr i64 %"14" to ptr addrspace(4)
%"41" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0
%"12" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"12", ptr addrspace(5) %"4", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"36" = inttoptr i64 %"15" to ptr addrspace(4)
%"43" = getelementptr inbounds i8, ptr addrspace(4) %"36", i64 0
%"14" = load i64, ptr addrspace(4) %"43", align 8
store i64 %"14", ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"37" = inttoptr i64 %"17" to ptr
%"16" = load i64, ptr %"37", align 8
store i64 %"16", ptr addrspace(5) %"6", align 8
%"19" = load i64, ptr addrspace(5) %"6", align 8
%"18" = add i64 %"19", 1
store i64 %"18", ptr addrspace(5) %"7", align 8
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i64, ptr addrspace(5) %"7", align 8
%"38" = inttoptr i64 %"20" to ptr
store i64 %"21", ptr %"38", align 8
%"13" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"36" = inttoptr i64 %"16" to ptr
%"15" = load i64, ptr %"36", align 8
store i64 %"15", ptr addrspace(5) %"6", align 8
%"18" = load i64, ptr addrspace(5) %"6", align 8
%"17" = add i64 %"18", 1
store i64 %"17", ptr addrspace(5) %"7", align 8
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i64, ptr addrspace(5) %"7", align 8
%"37" = inttoptr i64 %"19" to ptr
store i64 %"20", ptr %"37", align 8
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
"23":
define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"21", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"14" = add i64 %"15", 1
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"22", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"20", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"13" = add i64 %"14", 1
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"15" to ptr
store i64 %"16", ptr %"21", align 8
ret void
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 {
"69":
define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"53", ptr addrspace(4) byref(i64) %"54") #0 {
%"13" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"13", align 1
%"14" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"14", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
@ -16,70 +12,74 @@ define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"54",
%"10" = alloca i32, align 4, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
%"12" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"13", align 1
%"14" = load i64, ptr addrspace(4) %"53", align 8
store i64 %"14", ptr addrspace(5) %"4", align 8
%"15" = load i64, ptr addrspace(4) %"54", align 8
store i64 %"15", ptr addrspace(5) %"4", align 8
%"16" = load i64, ptr addrspace(4) %"55", align 8
store i64 %"16", ptr addrspace(5) %"5", align 8
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"57" = inttoptr i64 %"18" to ptr
%"56" = load i32, ptr %"57", align 4
store i32 %"56", ptr addrspace(5) %"9", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"58" = inttoptr i64 %"20" to ptr
%"71" = getelementptr inbounds i8, ptr %"58", i64 4
%"59" = load i32, ptr %"71", align 4
store i32 %"59", ptr addrspace(5) %"10", align 4
%"22" = load i64, ptr addrspace(5) %"4", align 8
%"60" = inttoptr i64 %"22" to ptr
%"73" = getelementptr inbounds i8, ptr %"60", i64 8
%"21" = load i32, ptr %"73", align 4
store i32 %"21", ptr addrspace(5) %"11", align 4
%"24" = load i64, ptr addrspace(5) %"4", align 8
%"61" = inttoptr i64 %"24" to ptr
%"75" = getelementptr inbounds i8, ptr %"61", i64 12
%"23" = load i32, ptr %"75", align 4
store i32 %"23", ptr addrspace(5) %"12", align 4
%"27" = load i32, ptr addrspace(5) %"9", align 4
%"28" = load i32, ptr addrspace(5) %"10", align 4
%0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"27", i32 %"28")
%"25" = extractvalue { i32, i1 } %0, 0
%"26" = extractvalue { i32, i1 } %0, 1
store i32 %"25", ptr addrspace(5) %"6", align 4
store i1 %"26", ptr addrspace(5) %"13", align 1
%"31" = load i1, ptr addrspace(5) %"13", align 1
%"32" = load i32, ptr addrspace(5) %"6", align 4
%"33" = load i32, ptr addrspace(5) %"11", align 4
%1 = zext i1 %"31" to i32
%2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"32", i32 %"33")
%3 = extractvalue { i32, i1 } %2, 0
%4 = extractvalue { i32, i1 } %2, 1
%5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1)
%"29" = extractvalue { i32, i1 } %5, 0
%6 = extractvalue { i32, i1 } %5, 1
%"30" = xor i1 %4, %6
store i32 %"29", ptr addrspace(5) %"7", align 4
store i1 %"30", ptr addrspace(5) %"13", align 1
%"35" = load i1, ptr addrspace(5) %"13", align 1
%"36" = load i32, ptr addrspace(5) %"7", align 4
%"37" = load i32, ptr addrspace(5) %"12", align 4
%7 = zext i1 %"35" to i32
%8 = add i32 %"36", %"37"
%"34" = add i32 %8, %7
store i32 %"34", ptr addrspace(5) %"8", align 4
%"38" = load i64, ptr addrspace(5) %"5", align 8
%"39" = load i32, ptr addrspace(5) %"6", align 4
%"66" = inttoptr i64 %"38" to ptr
store i32 %"39", ptr %"66", align 4
%"40" = load i64, ptr addrspace(5) %"5", align 8
%"41" = load i32, ptr addrspace(5) %"7", align 4
%"67" = inttoptr i64 %"40" to ptr
%"77" = getelementptr inbounds i8, ptr %"67", i64 4
store i32 %"41", ptr %"77", align 4
%"42" = load i64, ptr addrspace(5) %"5", align 8
%"43" = load i32, ptr addrspace(5) %"8", align 4
%"68" = inttoptr i64 %"42" to ptr
%"79" = getelementptr inbounds i8, ptr %"68", i64 8
store i32 %"43", ptr %"79", align 4
store i64 %"15", ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"56" = inttoptr i64 %"17" to ptr
%"55" = load i32, ptr %"56", align 4
store i32 %"55", ptr addrspace(5) %"9", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"57" = inttoptr i64 %"19" to ptr
%"69" = getelementptr inbounds i8, ptr %"57", i64 4
%"58" = load i32, ptr %"69", align 4
store i32 %"58", ptr addrspace(5) %"10", align 4
%"21" = load i64, ptr addrspace(5) %"4", align 8
%"59" = inttoptr i64 %"21" to ptr
%"71" = getelementptr inbounds i8, ptr %"59", i64 8
%"20" = load i32, ptr %"71", align 4
store i32 %"20", ptr addrspace(5) %"11", align 4
%"23" = load i64, ptr addrspace(5) %"4", align 8
%"60" = inttoptr i64 %"23" to ptr
%"73" = getelementptr inbounds i8, ptr %"60", i64 12
%"22" = load i32, ptr %"73", align 4
store i32 %"22", ptr addrspace(5) %"12", align 4
%"26" = load i32, ptr addrspace(5) %"9", align 4
%"27" = load i32, ptr addrspace(5) %"10", align 4
%2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"26", i32 %"27")
%"24" = extractvalue { i32, i1 } %2, 0
%"25" = extractvalue { i32, i1 } %2, 1
store i32 %"24", ptr addrspace(5) %"6", align 4
store i1 %"25", ptr addrspace(5) %"13", align 1
%"30" = load i1, ptr addrspace(5) %"13", align 1
%"31" = load i32, ptr addrspace(5) %"6", align 4
%"32" = load i32, ptr addrspace(5) %"11", align 4
%3 = zext i1 %"30" to i32
%4 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"31", i32 %"32")
%5 = extractvalue { i32, i1 } %4, 0
%6 = extractvalue { i32, i1 } %4, 1
%7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %5, i32 %3)
%"28" = extractvalue { i32, i1 } %7, 0
%8 = extractvalue { i32, i1 } %7, 1
%"29" = xor i1 %6, %8
store i32 %"28", ptr addrspace(5) %"7", align 4
store i1 %"29", ptr addrspace(5) %"13", align 1
%"34" = load i1, ptr addrspace(5) %"13", align 1
%"35" = load i32, ptr addrspace(5) %"7", align 4
%"36" = load i32, ptr addrspace(5) %"12", align 4
%9 = zext i1 %"34" to i32
%10 = add i32 %"35", %"36"
%"33" = add i32 %10, %9
store i32 %"33", ptr addrspace(5) %"8", align 4
%"37" = load i64, ptr addrspace(5) %"5", align 8
%"38" = load i32, ptr addrspace(5) %"6", align 4
%"65" = inttoptr i64 %"37" to ptr
store i32 %"38", ptr %"65", align 4
%"39" = load i64, ptr addrspace(5) %"5", align 8
%"40" = load i32, ptr addrspace(5) %"7", align 4
%"66" = inttoptr i64 %"39" to ptr
%"75" = getelementptr inbounds i8, ptr %"66", i64 4
store i32 %"40", ptr %"75", align 4
%"41" = load i64, ptr addrspace(5) %"5", align 8
%"42" = load i32, ptr addrspace(5) %"8", align 4
%"67" = inttoptr i64 %"41" to ptr
%"77" = getelementptr inbounds i8, ptr %"67", i64 8
store i32 %"42", ptr %"77", align 4
ret void
}

View file

@ -1,63 +1,63 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @addc_cc2(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
"51":
define protected amdgpu_kernel void @addc_cc2(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"11" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1)
%"42" = extractvalue { i32, i1 } %0, 0
%"13" = extractvalue { i32, i1 } %0, 1
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"40", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1)
%"41" = extractvalue { i32, i1 } %2, 0
%"12" = extractvalue { i32, i1 } %2, 1
store i32 %"41", ptr addrspace(5) %"6", align 4
store i1 %"12", ptr addrspace(5) %"9", align 1
%"15" = load i1, ptr addrspace(5) %"9", align 1
%3 = zext i1 %"15" to i32
%4 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -4, i32 -4)
%5 = extractvalue { i32, i1 } %4, 0
%6 = extractvalue { i32, i1 } %4, 1
%7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %5, i32 %3)
%"42" = extractvalue { i32, i1 } %7, 0
%8 = extractvalue { i32, i1 } %7, 1
%"14" = xor i1 %6, %8
store i32 %"42", ptr addrspace(5) %"6", align 4
store i1 %"13", ptr addrspace(5) %"9", align 1
%"16" = load i1, ptr addrspace(5) %"9", align 1
%1 = zext i1 %"16" to i32
%2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -4, i32 -4)
%3 = extractvalue { i32, i1 } %2, 0
%4 = extractvalue { i32, i1 } %2, 1
%5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1)
%"43" = extractvalue { i32, i1 } %5, 0
%6 = extractvalue { i32, i1 } %5, 1
%"15" = xor i1 %4, %6
store i32 %"43", ptr addrspace(5) %"6", align 4
store i1 %"15", ptr addrspace(5) %"9", align 1
%"18" = load i1, ptr addrspace(5) %"9", align 1
%7 = zext i1 %"18" to i32
%"44" = add i32 0, %7
store i32 %"44", ptr addrspace(5) %"7", align 4
%"21" = load i1, ptr addrspace(5) %"9", align 1
%8 = zext i1 %"21" to i32
%9 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1)
%10 = extractvalue { i32, i1 } %9, 0
%11 = extractvalue { i32, i1 } %9, 1
%12 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %10, i32 %8)
%"45" = extractvalue { i32, i1 } %12, 0
%13 = extractvalue { i32, i1 } %12, 1
%"20" = xor i1 %11, %13
store i32 %"45", ptr addrspace(5) %"6", align 4
store i1 %"20", ptr addrspace(5) %"9", align 1
%"23" = load i1, ptr addrspace(5) %"9", align 1
%14 = zext i1 %"23" to i32
%"46" = add i32 0, %14
store i32 %"46", ptr addrspace(5) %"8", align 4
%"24" = load i64, ptr addrspace(5) %"5", align 8
%"25" = load i32, ptr addrspace(5) %"7", align 4
%"47" = inttoptr i64 %"24" to ptr
store i32 %"25", ptr %"47", align 4
%"26" = load i64, ptr addrspace(5) %"5", align 8
%"27" = load i32, ptr addrspace(5) %"8", align 4
%"49" = inttoptr i64 %"26" to ptr
%"53" = getelementptr inbounds i8, ptr %"49", i64 4
store i32 %"27", ptr %"53", align 4
store i1 %"14", ptr addrspace(5) %"9", align 1
%"17" = load i1, ptr addrspace(5) %"9", align 1
%9 = zext i1 %"17" to i32
%"43" = add i32 0, %9
store i32 %"43", ptr addrspace(5) %"7", align 4
%"20" = load i1, ptr addrspace(5) %"9", align 1
%10 = zext i1 %"20" to i32
%11 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1)
%12 = extractvalue { i32, i1 } %11, 0
%13 = extractvalue { i32, i1 } %11, 1
%14 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %12, i32 %10)
%"44" = extractvalue { i32, i1 } %14, 0
%15 = extractvalue { i32, i1 } %14, 1
%"19" = xor i1 %13, %15
store i32 %"44", ptr addrspace(5) %"6", align 4
store i1 %"19", ptr addrspace(5) %"9", align 1
%"22" = load i1, ptr addrspace(5) %"9", align 1
%16 = zext i1 %"22" to i32
%"45" = add i32 0, %16
store i32 %"45", ptr addrspace(5) %"8", align 4
%"23" = load i64, ptr addrspace(5) %"5", align 8
%"24" = load i32, ptr addrspace(5) %"7", align 4
%"46" = inttoptr i64 %"23" to ptr
store i32 %"24", ptr %"46", align 4
%"25" = load i64, ptr addrspace(5) %"5", align 8
%"26" = load i32, ptr addrspace(5) %"8", align 4
%"48" = inttoptr i64 %"25" to ptr
%"51" = getelementptr inbounds i8, ptr %"48", i64 4
store i32 %"26", ptr %"51", align 4
ret void
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 {
"59":
define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 {
%"22" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"22", align 1
%"23" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"23", align 1
%"7" = alloca i1, align 1, addrspace(5)
%"8" = alloca double, align 8, addrspace(5)
%"9" = alloca double, align 8, addrspace(5)
@ -14,47 +10,51 @@ define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr
%"11" = alloca i64, align 8, addrspace(5)
%"12" = alloca i64, align 8, addrspace(5)
%"13" = alloca i64, align 8, addrspace(5)
%"47" = alloca i64, align 8, addrspace(5)
%"49" = alloca [4 x i32], align 16, addrspace(5)
%"51" = load i64, ptr addrspace(4) %"43", align 8
store i64 %"51", ptr addrspace(5) %"10", align 8
%"52" = load i64, ptr addrspace(4) %"44", align 8
store i64 %"52", ptr addrspace(5) %"11", align 8
%"53" = load i64, ptr addrspace(4) %"45", align 8
store i64 %"53", ptr addrspace(5) %"12", align 8
%"54" = load i64, ptr addrspace(4) %"46", align 8
store i64 %"54", ptr addrspace(5) %"13", align 8
%"29" = load i64, ptr addrspace(5) %"12", align 8
%"30" = load i64, ptr addrspace(5) %"13", align 8
%"28" = icmp sge i64 %"29", %"30"
store i1 %"28", ptr addrspace(5) %"7", align 1
%"31" = load i1, ptr addrspace(5) %"7", align 1
br i1 %"31", label %"6", label %"18"
%"46" = alloca i64, align 8, addrspace(5)
%"48" = alloca [4 x i32], align 16, addrspace(5)
br label %1
"18": ; preds = %"59"
1: ; preds = %0
store i1 false, ptr addrspace(5) %"22", align 1
%"50" = load i64, ptr addrspace(4) %"42", align 8
store i64 %"50", ptr addrspace(5) %"10", align 8
%"51" = load i64, ptr addrspace(4) %"43", align 8
store i64 %"51", ptr addrspace(5) %"11", align 8
%"52" = load i64, ptr addrspace(4) %"44", align 8
store i64 %"52", ptr addrspace(5) %"12", align 8
%"53" = load i64, ptr addrspace(4) %"45", align 8
store i64 %"53", ptr addrspace(5) %"13", align 8
%"28" = load i64, ptr addrspace(5) %"12", align 8
%"29" = load i64, ptr addrspace(5) %"13", align 8
%"27" = icmp sge i64 %"28", %"29"
store i1 %"27", ptr addrspace(5) %"7", align 1
%"30" = load i1, ptr addrspace(5) %"7", align 1
br i1 %"30", label %"6", label %"18"
"18": ; preds = %1
%"31" = load i64, ptr addrspace(5) %"11", align 8
%"59" = getelementptr inbounds i8, ptr addrspace(5) %"46", i64 0
store i64 %"31", ptr addrspace(5) %"59", align 8
%"32" = load i64, ptr addrspace(5) %"11", align 8
%"61" = getelementptr inbounds i8, ptr addrspace(5) %"47", i64 0
store i64 %"32", ptr addrspace(5) %"61", align 8
%"33" = load i64, ptr addrspace(5) %"11", align 8
%0 = inttoptr i64 %"33" to ptr
%"21" = call [4 x i32] %0()
store [4 x i32] %"21", ptr addrspace(5) %"49", align 4
%"63" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0
%"19" = load <2 x double>, ptr addrspace(5) %"63", align 16
%"34" = extractelement <2 x double> %"19", i32 0
%"35" = extractelement <2 x double> %"19", i32 1
store double %"34", ptr addrspace(5) %"8", align 8
store double %"35", ptr addrspace(5) %"9", align 8
%"36" = load double, ptr addrspace(5) %"8", align 8
%"37" = load double, ptr addrspace(5) %"9", align 8
%1 = insertelement <2 x double> undef, double %"36", i32 0
%"20" = insertelement <2 x double> %1, double %"37", i32 1
%"38" = load i64, ptr addrspace(5) %"10", align 8
%"58" = inttoptr i64 %"38" to ptr addrspace(1)
store <2 x double> %"20", ptr addrspace(1) %"58", align 16
%2 = inttoptr i64 %"32" to ptr
%"21" = call [4 x i32] %2()
store [4 x i32] %"21", ptr addrspace(5) %"48", align 4
%"61" = getelementptr inbounds i8, ptr addrspace(5) %"48", i64 0
%"19" = load <2 x double>, ptr addrspace(5) %"61", align 16
%"33" = extractelement <2 x double> %"19", i32 0
%"34" = extractelement <2 x double> %"19", i32 1
store double %"33", ptr addrspace(5) %"8", align 8
store double %"34", ptr addrspace(5) %"9", align 8
%"35" = load double, ptr addrspace(5) %"8", align 8
%"36" = load double, ptr addrspace(5) %"9", align 8
%3 = insertelement <2 x double> undef, double %"35", i32 0
%"20" = insertelement <2 x double> %3, double %"36", i32 1
%"37" = load i64, ptr addrspace(5) %"10", align 8
%"57" = inttoptr i64 %"37" to ptr addrspace(1)
store <2 x double> %"20", ptr addrspace(1) %"57", align 16
br label %"6"
"6": ; preds = %"18", %"59"
"6": ; preds = %"18", %1
ret void
}

View file

@ -7,12 +7,8 @@ target triple = "amdgcn-amd-amdhsa"
declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0
define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"58", ptr addrspace(4) byref(i64) %"59") #1 {
"74":
define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #1 {
%"33" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"33", align 1
%"34" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"34", align 1
%"14" = alloca i64, align 8, addrspace(5)
%"15" = alloca i64, align 8, addrspace(5)
%"16" = alloca i64, align 8, addrspace(5)
@ -20,63 +16,67 @@ define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64)
%"18" = alloca i1, align 1, addrspace(5)
%"19" = alloca i64, align 8, addrspace(5)
%"20" = alloca i32, align 4, addrspace(5)
%"60" = alloca i64, align 8, addrspace(5)
%"61" = alloca i64, align 8, addrspace(5)
%"62" = alloca i32, align 4, addrspace(5)
%"63" = alloca i64, align 8, addrspace(5)
%"64" = alloca i64, align 8, addrspace(5)
%"35" = load i64, ptr addrspace(4) %"58", align 8
store i64 %"35", ptr addrspace(5) %"14", align 8
%"36" = load i64, ptr addrspace(4) %"59", align 8
store i64 %"36", ptr addrspace(5) %"15", align 8
%"38" = load i64, ptr addrspace(5) %"14", align 8
%"66" = inttoptr i64 %"38" to ptr
%"37" = load i64, ptr %"66", align 8
store i64 %"37", ptr addrspace(5) %"16", align 8
%"40" = load i64, ptr addrspace(5) %"16", align 8
%"39" = icmp uge i64 %"40", 1
store i1 %"39", ptr addrspace(5) %"18", align 1
%"41" = load i1, ptr addrspace(5) %"18", align 1
br i1 %"41", label %"13", label %"27"
"27": ; preds = %"74"
%0 = alloca i64, align 8, addrspace(5)
store i64 ptrtoint (ptr addrspace(1) @0 to i64), ptr addrspace(5) %0, align 8
%"67" = load i64, ptr addrspace(5) %0, align 8
store i64 %"67", ptr addrspace(5) %"19", align 8
%"43" = load i64, ptr addrspace(5) %"19", align 8
store i64 %"43", ptr addrspace(5) %"60", align 8
%"59" = alloca i64, align 8, addrspace(5)
%1 = alloca i64, align 8, addrspace(5)
store i64 ptrtoint (ptr addrspace(1) @1 to i64), ptr addrspace(5) %1, align 8
%"69" = load i64, ptr addrspace(5) %1, align 8
store i64 %"69", ptr addrspace(5) %"19", align 8
%"45" = load i64, ptr addrspace(5) %"19", align 8
store i64 %"45", ptr addrspace(5) %"61", align 8
store i32 1, ptr addrspace(5) %"62", align 4
%"60" = alloca i64, align 8, addrspace(5)
%2 = alloca i64, align 8, addrspace(5)
store i64 ptrtoint (ptr addrspace(1) @2 to i64), ptr addrspace(5) %2, align 8
%"71" = load i64, ptr addrspace(5) %2, align 8
store i64 %"71", ptr addrspace(5) %"19", align 8
%"47" = load i64, ptr addrspace(5) %"19", align 8
store i64 %"47", ptr addrspace(5) %"63", align 8
%"76" = getelementptr inbounds i8, ptr addrspace(5) %"64", i64 0
store i64 1, ptr addrspace(5) %"76", align 8
%"28" = load i64, ptr addrspace(5) %"60", align 8
%"29" = load i64, ptr addrspace(5) %"61", align 8
%"30" = load i32, ptr addrspace(5) %"62", align 4
%"31" = load i64, ptr addrspace(5) %"63", align 8
%"32" = load i64, ptr addrspace(5) %"64", align 8
%"61" = alloca i32, align 4, addrspace(5)
%"62" = alloca i64, align 8, addrspace(5)
%3 = alloca i64, align 8, addrspace(5)
%"63" = alloca i64, align 8, addrspace(5)
br label %4
4: ; preds = %0
store i1 false, ptr addrspace(5) %"33", align 1
%"34" = load i64, ptr addrspace(4) %"57", align 8
store i64 %"34", ptr addrspace(5) %"14", align 8
%"35" = load i64, ptr addrspace(4) %"58", align 8
store i64 %"35", ptr addrspace(5) %"15", align 8
%"37" = load i64, ptr addrspace(5) %"14", align 8
%"65" = inttoptr i64 %"37" to ptr
%"36" = load i64, ptr %"65", align 8
store i64 %"36", ptr addrspace(5) %"16", align 8
%"39" = load i64, ptr addrspace(5) %"16", align 8
%"38" = icmp uge i64 %"39", 1
store i1 %"38", ptr addrspace(5) %"18", align 1
%"40" = load i1, ptr addrspace(5) %"18", align 1
br i1 %"40", label %"13", label %"27"
"27": ; preds = %4
store i64 ptrtoint (ptr addrspace(1) @0 to i64), ptr addrspace(5) %1, align 8
%"66" = load i64, ptr addrspace(5) %1, align 8
store i64 %"66", ptr addrspace(5) %"19", align 8
%"42" = load i64, ptr addrspace(5) %"19", align 8
store i64 %"42", ptr addrspace(5) %"59", align 8
store i64 ptrtoint (ptr addrspace(1) @1 to i64), ptr addrspace(5) %2, align 8
%"68" = load i64, ptr addrspace(5) %2, align 8
store i64 %"68", ptr addrspace(5) %"19", align 8
%"44" = load i64, ptr addrspace(5) %"19", align 8
store i64 %"44", ptr addrspace(5) %"60", align 8
store i32 1, ptr addrspace(5) %"61", align 4
store i64 ptrtoint (ptr addrspace(1) @2 to i64), ptr addrspace(5) %3, align 8
%"70" = load i64, ptr addrspace(5) %3, align 8
store i64 %"70", ptr addrspace(5) %"19", align 8
%"46" = load i64, ptr addrspace(5) %"19", align 8
store i64 %"46", ptr addrspace(5) %"62", align 8
%"74" = getelementptr inbounds i8, ptr addrspace(5) %"63", i64 0
store i64 1, ptr addrspace(5) %"74", align 8
%"28" = load i64, ptr addrspace(5) %"59", align 8
%"29" = load i64, ptr addrspace(5) %"60", align 8
%"30" = load i32, ptr addrspace(5) %"61", align 4
%"31" = load i64, ptr addrspace(5) %"62", align 8
%"32" = load i64, ptr addrspace(5) %"63", align 8
call void @__zluda_ptx_impl____assertfail(i64 %"28", i64 %"29", i32 %"30", i64 %"31", i64 %"32")
br label %"13"
"13": ; preds = %"27", %"74"
%"49" = load i64, ptr addrspace(5) %"16", align 8
%"48" = add i64 %"49", 1
store i64 %"48", ptr addrspace(5) %"17", align 8
%"50" = load i64, ptr addrspace(5) %"15", align 8
%"51" = load i64, ptr addrspace(5) %"17", align 8
%"73" = inttoptr i64 %"50" to ptr
store i64 %"51", ptr %"73", align 8
"13": ; preds = %"27", %4
%"48" = load i64, ptr addrspace(5) %"16", align 8
%"47" = add i64 %"48", 1
store i64 %"47", ptr addrspace(5) %"17", align 8
%"49" = load i64, ptr addrspace(5) %"15", align 8
%"50" = load i64, ptr addrspace(5) %"17", align 8
%"72" = inttoptr i64 %"49" to ptr
store i64 %"50", ptr %"72", align 8
ret void
}

View file

@ -1,37 +1,37 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"31":
define protected amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"25", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"15" to ptr
%"33" = getelementptr inbounds i8, ptr %"26", i64 4
%"14" = load i32, ptr %"33", align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i32, ptr addrspace(5) %"6", align 4
%"18" = load i32, ptr addrspace(5) %"7", align 4
%"27" = and i32 %"17", %"18"
store i32 %"27", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"30" = inttoptr i64 %"19" to ptr
store i32 %"20", ptr %"30", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"24", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"14" to ptr
%"31" = getelementptr inbounds i8, ptr %"25", i64 4
%"13" = load i32, ptr %"31", align 4
store i32 %"13", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"17" = load i32, ptr addrspace(5) %"7", align 4
%"26" = and i32 %"16", %"17"
store i32 %"26", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"29" = inttoptr i64 %"18" to ptr
store i32 %"19", ptr %"29", align 4
ret void
}

View file

@ -3,62 +3,62 @@ target triple = "amdgcn-amd-amdhsa"
declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0
define protected amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"63", ptr addrspace(4) byref(i64) %"64") #1 {
"82":
define protected amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"62", ptr addrspace(4) byref(i64) %"63") #1 {
%"35" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"35", align 1
%"36" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"36", align 1
%"15" = alloca i64, align 8, addrspace(5)
%"16" = alloca i64, align 8, addrspace(5)
%"17" = alloca i64, align 8, addrspace(5)
%"18" = alloca i64, align 8, addrspace(5)
%"19" = alloca i32, align 4, addrspace(5)
%"65" = alloca i64, align 8, addrspace(5)
%"67" = alloca i64, align 8, addrspace(5)
%"69" = alloca i32, align 4, addrspace(5)
%"71" = alloca i64, align 8, addrspace(5)
%"73" = alloca i64, align 8, addrspace(5)
%1 = alloca i32, align 4, addrspace(5)
%"64" = alloca i64, align 8, addrspace(5)
%"66" = alloca i64, align 8, addrspace(5)
%"68" = alloca i32, align 4, addrspace(5)
%"70" = alloca i64, align 8, addrspace(5)
%"72" = alloca i64, align 8, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"35", align 1
%"36" = load i64, ptr addrspace(4) %"62", align 8
store i64 %"36", ptr addrspace(5) %"15", align 8
%"37" = load i64, ptr addrspace(4) %"63", align 8
store i64 %"37", ptr addrspace(5) %"15", align 8
%"38" = load i64, ptr addrspace(4) %"64", align 8
store i64 %"38", ptr addrspace(5) %"16", align 8
%0 = alloca i32, align 4, addrspace(5)
store i32 0, ptr addrspace(5) %0, align 4
%"75" = load i32, ptr addrspace(5) %0, align 4
store i32 %"75", ptr addrspace(5) %"19", align 4
store i64 %"37", ptr addrspace(5) %"16", align 8
store i32 0, ptr addrspace(5) %1, align 4
%"74" = load i32, ptr addrspace(5) %1, align 4
store i32 %"74", ptr addrspace(5) %"19", align 4
%"39" = load i64, ptr addrspace(5) %"15", align 8
%"82" = getelementptr inbounds i8, ptr addrspace(5) %"64", i64 0
store i64 %"39", ptr addrspace(5) %"82", align 8
%"40" = load i64, ptr addrspace(5) %"15", align 8
%"84" = getelementptr inbounds i8, ptr addrspace(5) %"65", i64 0
%"84" = getelementptr inbounds i8, ptr addrspace(5) %"66", i64 0
store i64 %"40", ptr addrspace(5) %"84", align 8
%"41" = load i64, ptr addrspace(5) %"15", align 8
%"86" = getelementptr inbounds i8, ptr addrspace(5) %"67", i64 0
store i64 %"41", ptr addrspace(5) %"86", align 8
%"42" = load i32, ptr addrspace(5) %"19", align 4
%"88" = getelementptr inbounds i8, ptr addrspace(5) %"69", i64 0
store i32 %"42", ptr addrspace(5) %"88", align 4
%"41" = load i32, ptr addrspace(5) %"19", align 4
%"86" = getelementptr inbounds i8, ptr addrspace(5) %"68", i64 0
store i32 %"41", ptr addrspace(5) %"86", align 4
%"42" = load i64, ptr addrspace(5) %"15", align 8
%"88" = getelementptr inbounds i8, ptr addrspace(5) %"70", i64 0
store i64 %"42", ptr addrspace(5) %"88", align 8
%"43" = load i64, ptr addrspace(5) %"15", align 8
%"90" = getelementptr inbounds i8, ptr addrspace(5) %"71", i64 0
%"90" = getelementptr inbounds i8, ptr addrspace(5) %"72", i64 0
store i64 %"43", ptr addrspace(5) %"90", align 8
%"44" = load i64, ptr addrspace(5) %"15", align 8
%"92" = getelementptr inbounds i8, ptr addrspace(5) %"73", i64 0
store i64 %"44", ptr addrspace(5) %"92", align 8
%"30" = load i64, ptr addrspace(5) %"65", align 8
%"31" = load i64, ptr addrspace(5) %"67", align 8
%"32" = load i32, ptr addrspace(5) %"69", align 4
%"33" = load i64, ptr addrspace(5) %"71", align 8
%"34" = load i64, ptr addrspace(5) %"73", align 8
%"30" = load i64, ptr addrspace(5) %"64", align 8
%"31" = load i64, ptr addrspace(5) %"66", align 8
%"32" = load i32, ptr addrspace(5) %"68", align 4
%"33" = load i64, ptr addrspace(5) %"70", align 8
%"34" = load i64, ptr addrspace(5) %"72", align 8
call void @__zluda_ptx_impl____assertfail(i64 %"30", i64 %"31", i32 %"32", i64 %"33", i64 %"34")
%"46" = load i64, ptr addrspace(5) %"15", align 8
%"80" = inttoptr i64 %"46" to ptr
%"45" = load i64, ptr %"80", align 8
store i64 %"45", ptr addrspace(5) %"17", align 8
%"48" = load i64, ptr addrspace(5) %"17", align 8
%"47" = add i64 %"48", 1
store i64 %"47", ptr addrspace(5) %"18", align 8
%"49" = load i64, ptr addrspace(5) %"16", align 8
%"50" = load i64, ptr addrspace(5) %"18", align 8
%"81" = inttoptr i64 %"49" to ptr
store i64 %"50", ptr %"81", align 8
%"45" = load i64, ptr addrspace(5) %"15", align 8
%"79" = inttoptr i64 %"45" to ptr
%"44" = load i64, ptr %"79", align 8
store i64 %"44", ptr addrspace(5) %"17", align 8
%"47" = load i64, ptr addrspace(5) %"17", align 8
%"46" = add i64 %"47", 1
store i64 %"46", ptr addrspace(5) %"18", align 8
%"48" = load i64, ptr addrspace(5) %"16", align 8
%"49" = load i64, ptr addrspace(5) %"18", align 8
%"80" = inttoptr i64 %"48" to ptr
store i64 %"49", ptr %"80", align 8
ret void
}

View file

@ -3,45 +3,45 @@ target triple = "amdgcn-amd-amdhsa"
@"4" = private addrspace(3) global [1024 x i8] undef, align 4
define protected amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
"38":
define protected amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"31" = inttoptr i64 %"14" to ptr
%"13" = load i32, ptr %"31", align 4
store i32 %"13", ptr addrspace(5) %"7", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"32" = inttoptr i64 %"16" to ptr
%"40" = getelementptr inbounds i8, ptr %"32", i64 4
%"15" = load i32, ptr %"40", align 4
store i32 %"15", ptr addrspace(5) %"8", align 4
%"17" = load i32, ptr addrspace(5) %"7", align 4
store i32 %"17", ptr addrspace(3) @"4", align 4
%"19" = load i32, ptr addrspace(5) %"8", align 4
%"18" = atomicrmw add ptr addrspace(3) @"4", i32 %"19" syncscope("agent-one-as") monotonic, align 4
store i32 %"18", ptr addrspace(5) %"7", align 4
%"20" = load i32, ptr addrspace(3) @"4", align 4
store i32 %"20", ptr addrspace(5) %"8", align 4
%"21" = load i64, ptr addrspace(5) %"6", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"36" = inttoptr i64 %"21" to ptr
store i32 %"22", ptr %"36", align 4
%"23" = load i64, ptr addrspace(5) %"6", align 8
%"24" = load i32, ptr addrspace(5) %"8", align 4
%"37" = inttoptr i64 %"23" to ptr
%"42" = getelementptr inbounds i8, ptr %"37", i64 4
store i32 %"24", ptr %"42", align 4
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"30", align 4
store i32 %"12", ptr addrspace(5) %"7", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"31" = inttoptr i64 %"15" to ptr
%"38" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load i32, ptr %"38", align 4
store i32 %"14", ptr addrspace(5) %"8", align 4
%"16" = load i32, ptr addrspace(5) %"7", align 4
store i32 %"16", ptr addrspace(3) @"4", align 4
%"18" = load i32, ptr addrspace(5) %"8", align 4
%"17" = atomicrmw add ptr addrspace(3) @"4", i32 %"18" syncscope("agent-one-as") monotonic, align 4
store i32 %"17", ptr addrspace(5) %"7", align 4
%"19" = load i32, ptr addrspace(3) @"4", align 4
store i32 %"19", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"6", align 8
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"35" = inttoptr i64 %"20" to ptr
store i32 %"21", ptr %"35", align 4
%"22" = load i64, ptr addrspace(5) %"6", align 8
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"36" = inttoptr i64 %"22" to ptr
%"40" = getelementptr inbounds i8, ptr %"36", i64 4
store i32 %"23", ptr %"40", align 4
ret void
}

View file

@ -3,46 +3,46 @@ target triple = "amdgcn-amd-amdhsa"
@"4" = private addrspace(3) global [1024 x i8] undef, align 4
define protected amdgpu_kernel void @atom_add_f16(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
"38":
define protected amdgpu_kernel void @atom_add_f16(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca half, align 2, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"26", align 8
store i64 %"9", ptr addrspace(5) %"5", align 8
%"10" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"29" = inttoptr i64 %"13" to ptr
%"40" = getelementptr inbounds i8, ptr %"29", i64 2
%"30" = load i16, ptr %"40", align 2
%"12" = bitcast i16 %"30" to half
store half %"12", ptr addrspace(5) %"7", align 2
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load half, ptr addrspace(5) %"7", align 2
%"31" = inttoptr i64 %"15" to ptr
%"14" = atomicrmw fadd ptr %"31", half %"16" syncscope("agent-one-as") monotonic, align 2
store half %"14", ptr addrspace(5) %"7", align 2
%"17" = load i64, ptr addrspace(5) %"6", align 8
%"18" = load half, ptr addrspace(5) %"7", align 2
%"32" = inttoptr i64 %"17" to ptr
%"33" = bitcast half %"18" to i16
store i16 %"33", ptr %"32", align 2
%"20" = load i64, ptr addrspace(5) %"5", align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"12" = load i64, ptr addrspace(5) %"5", align 8
%"28" = inttoptr i64 %"12" to ptr
%"38" = getelementptr inbounds i8, ptr %"28", i64 2
%"29" = load i16, ptr %"38", align 2
%"11" = bitcast i16 %"29" to half
store half %"11", ptr addrspace(5) %"7", align 2
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load half, ptr addrspace(5) %"7", align 2
%"30" = inttoptr i64 %"14" to ptr
%"13" = atomicrmw fadd ptr %"30", half %"15" syncscope("agent-one-as") monotonic, align 2
store half %"13", ptr addrspace(5) %"7", align 2
%"16" = load i64, ptr addrspace(5) %"6", align 8
%"17" = load half, ptr addrspace(5) %"7", align 2
%"31" = inttoptr i64 %"16" to ptr
%"32" = bitcast half %"17" to i16
store i16 %"32", ptr %"31", align 2
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"34" = inttoptr i64 %"19" to ptr
%"33" = load i16, ptr %"34", align 2
%"18" = bitcast i16 %"33" to half
store half %"18", ptr addrspace(5) %"7", align 2
%"20" = load i64, ptr addrspace(5) %"6", align 8
%"21" = load half, ptr addrspace(5) %"7", align 2
%"35" = inttoptr i64 %"20" to ptr
%"34" = load i16, ptr %"35", align 2
%"19" = bitcast i16 %"34" to half
store half %"19", ptr addrspace(5) %"7", align 2
%"21" = load i64, ptr addrspace(5) %"6", align 8
%"22" = load half, ptr addrspace(5) %"7", align 2
%"36" = inttoptr i64 %"21" to ptr
%"42" = getelementptr inbounds i8, ptr %"36", i64 2
%"37" = bitcast half %"22" to i16
store i16 %"37", ptr %"42", align 2
%"40" = getelementptr inbounds i8, ptr %"35", i64 2
%"36" = bitcast half %"21" to i16
store i16 %"36", ptr %"40", align 2
ret void
}

View file

@ -3,45 +3,45 @@ target triple = "amdgcn-amd-amdhsa"
@"4" = private addrspace(3) global [1024 x i8] undef, align 4
define protected amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
"38":
define protected amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
%"8" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"31" = inttoptr i64 %"14" to ptr
%"13" = load float, ptr %"31", align 4
store float %"13", ptr addrspace(5) %"7", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"32" = inttoptr i64 %"16" to ptr
%"40" = getelementptr inbounds i8, ptr %"32", i64 4
%"15" = load float, ptr %"40", align 4
store float %"15", ptr addrspace(5) %"8", align 4
%"17" = load float, ptr addrspace(5) %"7", align 4
store float %"17", ptr addrspace(3) @"4", align 4
%"19" = load float, ptr addrspace(5) %"8", align 4
%"18" = atomicrmw fadd ptr addrspace(3) @"4", float %"19" syncscope("agent-one-as") monotonic, align 4
store float %"18", ptr addrspace(5) %"7", align 4
%"20" = load float, ptr addrspace(3) @"4", align 4
store float %"20", ptr addrspace(5) %"8", align 4
%"21" = load i64, ptr addrspace(5) %"6", align 8
%"22" = load float, ptr addrspace(5) %"7", align 4
%"36" = inttoptr i64 %"21" to ptr
store float %"22", ptr %"36", align 4
%"23" = load i64, ptr addrspace(5) %"6", align 8
%"24" = load float, ptr addrspace(5) %"8", align 4
%"37" = inttoptr i64 %"23" to ptr
%"42" = getelementptr inbounds i8, ptr %"37", i64 4
store float %"24", ptr %"42", align 4
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"30", align 4
store float %"12", ptr addrspace(5) %"7", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"31" = inttoptr i64 %"15" to ptr
%"38" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load float, ptr %"38", align 4
store float %"14", ptr addrspace(5) %"8", align 4
%"16" = load float, ptr addrspace(5) %"7", align 4
store float %"16", ptr addrspace(3) @"4", align 4
%"18" = load float, ptr addrspace(5) %"8", align 4
%"17" = atomicrmw fadd ptr addrspace(3) @"4", float %"18" syncscope("agent-one-as") monotonic, align 4
store float %"17", ptr addrspace(5) %"7", align 4
%"19" = load float, ptr addrspace(3) @"4", align 4
store float %"19", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"6", align 8
%"21" = load float, ptr addrspace(5) %"7", align 4
%"35" = inttoptr i64 %"20" to ptr
store float %"21", ptr %"35", align 4
%"22" = load i64, ptr addrspace(5) %"6", align 8
%"23" = load float, ptr addrspace(5) %"8", align 4
%"36" = inttoptr i64 %"22" to ptr
%"40" = getelementptr inbounds i8, ptr %"36", i64 4
store float %"23", ptr %"40", align 4
ret void
}

View file

@ -1,45 +1,45 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 {
"39":
define protected amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"31", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"32", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"33" = inttoptr i64 %"15" to ptr
%"41" = getelementptr inbounds i8, ptr %"33", i64 4
%0 = cmpxchg ptr %"41", i32 %"16", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
%"34" = extractvalue { i32, i1 } %0, 0
store i32 %"34", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"31", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"15" = load i32, ptr addrspace(5) %"6", align 4
%"32" = inttoptr i64 %"14" to ptr
%"39" = getelementptr inbounds i8, ptr %"32", i64 4
%2 = cmpxchg ptr %"39", i32 %"15", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4
%"33" = extractvalue { i32, i1 } %2, 0
store i32 %"33", ptr addrspace(5) %"6", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"35" = inttoptr i64 %"17" to ptr
%"41" = getelementptr inbounds i8, ptr %"35", i64 4
%"16" = load i32, ptr %"41", align 4
store i32 %"16", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"36" = inttoptr i64 %"18" to ptr
%"43" = getelementptr inbounds i8, ptr %"36", i64 4
%"17" = load i32, ptr %"43", align 4
store i32 %"17", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"37" = inttoptr i64 %"19" to ptr
store i32 %"20", ptr %"37", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"38" = inttoptr i64 %"21" to ptr
%"45" = getelementptr inbounds i8, ptr %"38", i64 4
store i32 %"22", ptr %"45", align 4
store i32 %"19", ptr %"36", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"37" = inttoptr i64 %"20" to ptr
%"43" = getelementptr inbounds i8, ptr %"37", i64 4
store i32 %"21", ptr %"43", align 4
ret void
}

View file

@ -5,47 +5,47 @@ declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0
declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1), i32) #0
define protected amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #1 {
"39":
define protected amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"31", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"32", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"33" = inttoptr i64 %"14" to ptr
%"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"33", i32 101)
store i32 %"13", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"34" = inttoptr i64 %"16" to ptr addrspace(1)
%"15" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1) %"34", i32 101)
store i32 %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"13" to ptr
%"12" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"32", i32 101)
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"33" = inttoptr i64 %"15" to ptr addrspace(1)
%"14" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1) %"33", i32 101)
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"34" = inttoptr i64 %"17" to ptr
%"16" = load i32, ptr %"34", align 4
store i32 %"16", ptr addrspace(5) %"8", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"35" = inttoptr i64 %"18" to ptr
%"17" = load i32, ptr %"35", align 4
store i32 %"17", ptr addrspace(5) %"8", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"36" = inttoptr i64 %"19" to ptr
store i32 %"20", ptr %"36", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"37" = inttoptr i64 %"21" to ptr
%"49" = getelementptr inbounds i8, ptr %"37", i64 4
store i32 %"22", ptr %"49", align 4
%"23" = load i64, ptr addrspace(5) %"5", align 8
%"24" = load i32, ptr addrspace(5) %"8", align 4
%"38" = inttoptr i64 %"23" to ptr
%"51" = getelementptr inbounds i8, ptr %"38", i64 8
store i32 %"24", ptr %"51", align 4
store i32 %"19", ptr %"35", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"36" = inttoptr i64 %"20" to ptr
%"47" = getelementptr inbounds i8, ptr %"36", i64 4
store i32 %"21", ptr %"47", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"37" = inttoptr i64 %"22" to ptr
%"49" = getelementptr inbounds i8, ptr %"37", i64 8
store i32 %"23", ptr %"49", align 4
ret void
}

View file

@ -1,27 +1,27 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @atom_ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
"19":
define protected amdgpu_kernel void @atom_ld_st(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"14", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"15", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"16" = inttoptr i64 %"11" to ptr
%"10" = load atomic i32, ptr %"16" syncscope("agent-one-as") acquire, align 4
store i32 %"10", ptr addrspace(5) %"6", align 4
%"12" = load i64, ptr addrspace(5) %"5", align 8
%"13" = load i32, ptr addrspace(5) %"6", align 4
%"17" = inttoptr i64 %"12" to ptr
%"11" = load atomic i32, ptr %"17" syncscope("agent-one-as") acquire, align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"14" = load i32, ptr addrspace(5) %"6", align 4
%"18" = inttoptr i64 %"13" to ptr
store atomic i32 %"14", ptr %"18" syncscope("agent-one-as") release, align 4
store atomic i32 %"13", ptr %"17" syncscope("agent-one-as") release, align 4
ret void
}

View file

@ -1,36 +1,36 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @atom_ld_st_vec(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 {
"24":
define protected amdgpu_kernel void @atom_ld_st_vec(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"12", ptr addrspace(5) %"4", align 8
%"13" = load i64, ptr addrspace(4) %"21", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"22" = inttoptr i64 %"14" to ptr
%0 = load atomic i128, ptr %"22" syncscope("agent-one-as") acquire, align 16
%"8" = bitcast i128 %0 to <2 x i64>
%"15" = extractelement <2 x i64> %"8", i32 0
%"16" = extractelement <2 x i64> %"8", i32 1
store i64 %"15", ptr addrspace(5) %"6", align 8
store i64 %"16", ptr addrspace(5) %"7", align 8
%"17" = load i64, ptr addrspace(5) %"6", align 8
%"18" = load i64, ptr addrspace(5) %"7", align 8
%1 = insertelement <2 x i64> undef, i64 %"17", i32 0
%"9" = insertelement <2 x i64> %1, i64 %"18", i32 1
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"23" = inttoptr i64 %"19" to ptr
%2 = bitcast <2 x i64> %"9" to i128
store atomic i128 %2, ptr %"23" syncscope("agent-one-as") release, align 16
store i64 %"12", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr
%2 = load atomic i128, ptr %"21" syncscope("agent-one-as") acquire, align 16
%"8" = bitcast i128 %2 to <2 x i64>
%"14" = extractelement <2 x i64> %"8", i32 0
%"15" = extractelement <2 x i64> %"8", i32 1
store i64 %"14", ptr addrspace(5) %"6", align 8
store i64 %"15", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"6", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%3 = insertelement <2 x i64> undef, i64 %"16", i32 0
%"9" = insertelement <2 x i64> %3, i64 %"17", i32 1
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"22" = inttoptr i64 %"18" to ptr
%4 = bitcast <2 x i64> %"9" to i128
store atomic i128 %4, ptr %"22" syncscope("agent-one-as") release, align 16
ret void
}

View file

@ -1,38 +1,38 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @atom_max_u32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"31":
define protected amdgpu_kernel void @atom_max_u32(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"24", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"14" = load i32, ptr addrspace(5) %"6", align 4
%"25" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"25", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load i32, ptr addrspace(5) %"6", align 4
%"26" = inttoptr i64 %"14" to ptr
store i32 %"15", ptr %"26", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"27" = inttoptr i64 %"17" to ptr
%"33" = getelementptr inbounds i8, ptr %"27", i64 4
%"16" = load i32, ptr %"33", align 4
store i32 %"16", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"7", align 4
%"29" = inttoptr i64 %"19" to ptr
%"28" = atomicrmw umax ptr %"29", i32 %"20" syncscope("agent-one-as") monotonic, align 4
store i32 %"28", ptr addrspace(5) %"6", align 4
store i32 %"14", ptr %"25", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"16" to ptr
%"31" = getelementptr inbounds i8, ptr %"26", i64 4
%"15" = load i32, ptr %"31", align 4
store i32 %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"7", align 4
%"28" = inttoptr i64 %"18" to ptr
%"27" = atomicrmw umax ptr %"28", i32 %"19" syncscope("agent-one-as") monotonic, align 4
store i32 %"27", ptr addrspace(5) %"6", align 4
ret void
}

View file

@ -1,34 +1,34 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"24":
define protected amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca double, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%"10" = load double, ptr addrspace(4) %"18", align 8
store double %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load double, ptr addrspace(5) %"4", align 8
%"21" = bitcast double %"13" to i64
%0 = alloca i64, align 8, addrspace(5)
store i64 %"21", ptr addrspace(5) %0, align 8
%"12" = load i64, ptr addrspace(5) %0, align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%1 = alloca i64, align 8, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load double, ptr addrspace(4) %"17", align 8
store double %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"12" = load double, ptr addrspace(5) %"4", align 8
%"20" = bitcast double %"12" to i64
store i64 %"20", ptr addrspace(5) %1, align 8
%"11" = load i64, ptr addrspace(5) %1, align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"21" = inttoptr i64 %"14" to ptr
%"13" = load i64, ptr %"21", align 8
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"15" to ptr
%"14" = load i64, ptr %"22", align 8
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"6", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"23" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"23", align 8
store i64 %"16", ptr %"22", align 8
ret void
}

View file

@ -4,11 +4,11 @@ target triple = "amdgcn-amd-amdhsa"
declare void @__zluda_ptx_impl__barrier_sync(i32) #0
define protected amdgpu_kernel void @barrier() #1 {
"5":
%"2" = alloca i1, align 1, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"2", align 1
%"3" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"3", align 1
call void @__zluda_ptx_impl__barrier_sync(i32 0)
ret void
}

View file

@ -3,44 +3,44 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__bfe_u32(i32, i32, i32) #0
define protected amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
"35":
define protected amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"13" = load i32, ptr %"31", align 4
store i32 %"13", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"16" to ptr
%"42" = getelementptr inbounds i8, ptr %"32", i64 4
%"15" = load i32, ptr %"42", align 4
store i32 %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"33" = inttoptr i64 %"18" to ptr
%"44" = getelementptr inbounds i8, ptr %"33", i64 8
%"17" = load i32, ptr %"44", align 4
store i32 %"17", ptr addrspace(5) %"8", align 4
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"22" = load i32, ptr addrspace(5) %"8", align 4
%"19" = call i32 @__zluda_ptx_impl__bfe_u32(i32 %"20", i32 %"21", i32 %"22")
store i32 %"19", ptr addrspace(5) %"6", align 4
%"23" = load i64, ptr addrspace(5) %"5", align 8
%"24" = load i32, ptr addrspace(5) %"6", align 4
%"34" = inttoptr i64 %"23" to ptr
store i32 %"24", ptr %"34", align 4
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"30", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"15" to ptr
%"40" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load i32, ptr %"40", align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"17" to ptr
%"42" = getelementptr inbounds i8, ptr %"32", i64 8
%"16" = load i32, ptr %"42", align 4
store i32 %"16", ptr addrspace(5) %"8", align 4
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"20" = load i32, ptr addrspace(5) %"7", align 4
%"21" = load i32, ptr addrspace(5) %"8", align 4
%"18" = call i32 @__zluda_ptx_impl__bfe_u32(i32 %"19", i32 %"20", i32 %"21")
store i32 %"18", ptr addrspace(5) %"6", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"6", align 4
%"33" = inttoptr i64 %"22" to ptr
store i32 %"23", ptr %"33", align 4
ret void
}

View file

@ -3,51 +3,51 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__bfi_b32(i32, i32, i32, i32) #0
define protected amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 {
"45":
define protected amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #1 {
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = load i64, ptr addrspace(4) %"34", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"35", align 8
store i64 %"12", ptr addrspace(5) %"4", align 8
%"13" = load i64, ptr addrspace(4) %"36", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"37" = inttoptr i64 %"15" to ptr
%"14" = load i32, ptr %"37", align 4
store i32 %"14", ptr addrspace(5) %"6", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"38" = inttoptr i64 %"17" to ptr
%"53" = getelementptr inbounds i8, ptr %"38", i64 4
%"16" = load i32, ptr %"53", align 4
store i32 %"16", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"39" = inttoptr i64 %"19" to ptr
%"55" = getelementptr inbounds i8, ptr %"39", i64 8
%"18" = load i32, ptr %"55", align 4
store i32 %"18", ptr addrspace(5) %"8", align 4
%"21" = load i64, ptr addrspace(5) %"4", align 8
%"40" = inttoptr i64 %"21" to ptr
%"57" = getelementptr inbounds i8, ptr %"40", i64 12
%"20" = load i32, ptr %"57", align 4
store i32 %"20", ptr addrspace(5) %"9", align 4
%"23" = load i32, ptr addrspace(5) %"6", align 4
%"24" = load i32, ptr addrspace(5) %"7", align 4
%"25" = load i32, ptr addrspace(5) %"8", align 4
%"26" = load i32, ptr addrspace(5) %"9", align 4
%"41" = call i32 @__zluda_ptx_impl__bfi_b32(i32 %"23", i32 %"24", i32 %"25", i32 %"26")
store i32 %"41", ptr addrspace(5) %"6", align 4
%"27" = load i64, ptr addrspace(5) %"5", align 8
%"28" = load i32, ptr addrspace(5) %"6", align 4
%"44" = inttoptr i64 %"27" to ptr
store i32 %"28", ptr %"44", align 4
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"36" = inttoptr i64 %"14" to ptr
%"13" = load i32, ptr %"36", align 4
store i32 %"13", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"37" = inttoptr i64 %"16" to ptr
%"51" = getelementptr inbounds i8, ptr %"37", i64 4
%"15" = load i32, ptr %"51", align 4
store i32 %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"38" = inttoptr i64 %"18" to ptr
%"53" = getelementptr inbounds i8, ptr %"38", i64 8
%"17" = load i32, ptr %"53", align 4
store i32 %"17", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"39" = inttoptr i64 %"20" to ptr
%"55" = getelementptr inbounds i8, ptr %"39", i64 12
%"19" = load i32, ptr %"55", align 4
store i32 %"19", ptr addrspace(5) %"9", align 4
%"22" = load i32, ptr addrspace(5) %"6", align 4
%"23" = load i32, ptr addrspace(5) %"7", align 4
%"24" = load i32, ptr addrspace(5) %"8", align 4
%"25" = load i32, ptr addrspace(5) %"9", align 4
%"40" = call i32 @__zluda_ptx_impl__bfi_b32(i32 %"22", i32 %"23", i32 %"24", i32 %"25")
store i32 %"40", ptr addrspace(5) %"6", align 4
%"26" = load i64, ptr addrspace(5) %"5", align 8
%"27" = load i32, ptr addrspace(5) %"6", align 4
%"43" = inttoptr i64 %"26" to ptr
store i32 %"27", ptr %"43", align 4
ret void
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
"53":
define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"13" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"13", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
@ -15,56 +11,60 @@ define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"42", pt
%"9" = alloca i32, align 4, addrspace(5)
%"10" = alloca i32, align 4, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"12", align 1
%"13" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"13", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(4) %"42", align 8
store i64 %"14", ptr addrspace(5) %"4", align 8
%"15" = load i64, ptr addrspace(4) %"43", align 8
store i64 %"15", ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"44" = inttoptr i64 %"17" to ptr
%"16" = load i32, ptr %"44", align 4
store i32 %"16", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"45" = inttoptr i64 %"19" to ptr
%"55" = getelementptr inbounds i8, ptr %"45", i64 4
%"18" = load i32, ptr %"55", align 4
store i32 %"18", ptr addrspace(5) %"7", align 4
%"21" = load i64, ptr addrspace(5) %"4", align 8
%"46" = inttoptr i64 %"21" to ptr
%"57" = getelementptr inbounds i8, ptr %"46", i64 8
%"20" = load i32, ptr %"57", align 4
store i32 %"20", ptr addrspace(5) %"8", align 4
%"23" = load i32, ptr addrspace(5) %"6", align 4
%0 = icmp eq i32 %"23", 0
%1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true)
%2 = sub i32 31, %1
%"47" = select i1 %0, i32 -1, i32 %2
store i32 %"47", ptr addrspace(5) %"9", align 4
%"25" = load i32, ptr addrspace(5) %"7", align 4
%3 = icmp eq i32 %"25", 0
%4 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true)
%5 = sub i32 31, %4
%"48" = select i1 %3, i32 -1, i32 %5
store i32 %"48", ptr addrspace(5) %"10", align 4
%"27" = load i32, ptr addrspace(5) %"8", align 4
%6 = icmp eq i32 %"27", 0
%7 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true)
%8 = sub i32 31, %7
%"49" = select i1 %6, i32 -1, i32 %8
store i32 %"49", ptr addrspace(5) %"11", align 4
%"28" = load i64, ptr addrspace(5) %"5", align 8
%"29" = load i32, ptr addrspace(5) %"9", align 4
%"50" = inttoptr i64 %"28" to ptr
store i32 %"29", ptr %"50", align 4
%"30" = load i64, ptr addrspace(5) %"5", align 8
%"31" = load i32, ptr addrspace(5) %"10", align 4
%"51" = inttoptr i64 %"30" to ptr
%"59" = getelementptr inbounds i8, ptr %"51", i64 4
store i32 %"31", ptr %"59", align 4
%"32" = load i64, ptr addrspace(5) %"5", align 8
%"33" = load i32, ptr addrspace(5) %"11", align 4
%"52" = inttoptr i64 %"32" to ptr
%"61" = getelementptr inbounds i8, ptr %"52", i64 8
store i32 %"33", ptr %"61", align 4
store i64 %"14", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"43" = inttoptr i64 %"16" to ptr
%"15" = load i32, ptr %"43", align 4
store i32 %"15", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"44" = inttoptr i64 %"18" to ptr
%"53" = getelementptr inbounds i8, ptr %"44", i64 4
%"17" = load i32, ptr %"53", align 4
store i32 %"17", ptr addrspace(5) %"7", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"45" = inttoptr i64 %"20" to ptr
%"55" = getelementptr inbounds i8, ptr %"45", i64 8
%"19" = load i32, ptr %"55", align 4
store i32 %"19", ptr addrspace(5) %"8", align 4
%"22" = load i32, ptr addrspace(5) %"6", align 4
%2 = icmp eq i32 %"22", 0
%3 = call i32 @llvm.ctlz.i32(i32 %"22", i1 true)
%4 = sub i32 31, %3
%"46" = select i1 %2, i32 -1, i32 %4
store i32 %"46", ptr addrspace(5) %"9", align 4
%"24" = load i32, ptr addrspace(5) %"7", align 4
%5 = icmp eq i32 %"24", 0
%6 = call i32 @llvm.ctlz.i32(i32 %"24", i1 true)
%7 = sub i32 31, %6
%"47" = select i1 %5, i32 -1, i32 %7
store i32 %"47", ptr addrspace(5) %"10", align 4
%"26" = load i32, ptr addrspace(5) %"8", align 4
%8 = icmp eq i32 %"26", 0
%9 = call i32 @llvm.ctlz.i32(i32 %"26", i1 true)
%10 = sub i32 31, %9
%"48" = select i1 %8, i32 -1, i32 %10
store i32 %"48", ptr addrspace(5) %"11", align 4
%"27" = load i64, ptr addrspace(5) %"5", align 8
%"28" = load i32, ptr addrspace(5) %"9", align 4
%"49" = inttoptr i64 %"27" to ptr
store i32 %"28", ptr %"49", align 4
%"29" = load i64, ptr addrspace(5) %"5", align 8
%"30" = load i32, ptr addrspace(5) %"10", align 4
%"50" = inttoptr i64 %"29" to ptr
%"57" = getelementptr inbounds i8, ptr %"50", i64 4
store i32 %"30", ptr %"57", align 4
%"31" = load i64, ptr addrspace(5) %"5", align 8
%"32" = load i32, ptr addrspace(5) %"11", align 4
%"51" = inttoptr i64 %"31" to ptr
%"59" = getelementptr inbounds i8, ptr %"51", i64 8
store i32 %"32", ptr %"59", align 4
ret void
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
"53":
define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"13" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"13", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
@ -15,53 +11,57 @@ define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64)
%"9" = alloca i32, align 4, addrspace(5)
%"10" = alloca i32, align 4, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"12", align 1
%"13" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"13", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(4) %"42", align 8
store i64 %"14", ptr addrspace(5) %"4", align 8
%"15" = load i64, ptr addrspace(4) %"43", align 8
store i64 %"15", ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"44" = inttoptr i64 %"17" to ptr
%"16" = load i32, ptr %"44", align 4
store i32 %"16", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"45" = inttoptr i64 %"19" to ptr
%"55" = getelementptr inbounds i8, ptr %"45", i64 4
%"18" = load i32, ptr %"55", align 4
store i32 %"18", ptr addrspace(5) %"7", align 4
%"21" = load i64, ptr addrspace(5) %"4", align 8
%"46" = inttoptr i64 %"21" to ptr
%"57" = getelementptr inbounds i8, ptr %"46", i64 8
%"20" = load i32, ptr %"57", align 4
store i32 %"20", ptr addrspace(5) %"8", align 4
%"23" = load i32, ptr addrspace(5) %"6", align 4
%0 = icmp eq i32 %"23", 0
%1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true)
%"47" = select i1 %0, i32 -1, i32 %1
store i32 %"47", ptr addrspace(5) %"9", align 4
%"25" = load i32, ptr addrspace(5) %"7", align 4
%2 = icmp eq i32 %"25", 0
%3 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true)
%"48" = select i1 %2, i32 -1, i32 %3
store i32 %"48", ptr addrspace(5) %"10", align 4
%"27" = load i32, ptr addrspace(5) %"8", align 4
%4 = icmp eq i32 %"27", 0
%5 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true)
%"49" = select i1 %4, i32 -1, i32 %5
store i32 %"49", ptr addrspace(5) %"11", align 4
%"28" = load i64, ptr addrspace(5) %"5", align 8
%"29" = load i32, ptr addrspace(5) %"9", align 4
%"50" = inttoptr i64 %"28" to ptr
store i32 %"29", ptr %"50", align 4
%"30" = load i64, ptr addrspace(5) %"5", align 8
%"31" = load i32, ptr addrspace(5) %"10", align 4
%"51" = inttoptr i64 %"30" to ptr
%"59" = getelementptr inbounds i8, ptr %"51", i64 4
store i32 %"31", ptr %"59", align 4
%"32" = load i64, ptr addrspace(5) %"5", align 8
%"33" = load i32, ptr addrspace(5) %"11", align 4
%"52" = inttoptr i64 %"32" to ptr
%"61" = getelementptr inbounds i8, ptr %"52", i64 8
store i32 %"33", ptr %"61", align 4
store i64 %"14", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"43" = inttoptr i64 %"16" to ptr
%"15" = load i32, ptr %"43", align 4
store i32 %"15", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"44" = inttoptr i64 %"18" to ptr
%"53" = getelementptr inbounds i8, ptr %"44", i64 4
%"17" = load i32, ptr %"53", align 4
store i32 %"17", ptr addrspace(5) %"7", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"45" = inttoptr i64 %"20" to ptr
%"55" = getelementptr inbounds i8, ptr %"45", i64 8
%"19" = load i32, ptr %"55", align 4
store i32 %"19", ptr addrspace(5) %"8", align 4
%"22" = load i32, ptr addrspace(5) %"6", align 4
%2 = icmp eq i32 %"22", 0
%3 = call i32 @llvm.ctlz.i32(i32 %"22", i1 true)
%"46" = select i1 %2, i32 -1, i32 %3
store i32 %"46", ptr addrspace(5) %"9", align 4
%"24" = load i32, ptr addrspace(5) %"7", align 4
%4 = icmp eq i32 %"24", 0
%5 = call i32 @llvm.ctlz.i32(i32 %"24", i1 true)
%"47" = select i1 %4, i32 -1, i32 %5
store i32 %"47", ptr addrspace(5) %"10", align 4
%"26" = load i32, ptr addrspace(5) %"8", align 4
%6 = icmp eq i32 %"26", 0
%7 = call i32 @llvm.ctlz.i32(i32 %"26", i1 true)
%"48" = select i1 %6, i32 -1, i32 %7
store i32 %"48", ptr addrspace(5) %"11", align 4
%"27" = load i64, ptr addrspace(5) %"5", align 8
%"28" = load i32, ptr addrspace(5) %"9", align 4
%"49" = inttoptr i64 %"27" to ptr
store i32 %"28", ptr %"49", align 4
%"29" = load i64, ptr addrspace(5) %"5", align 8
%"30" = load i32, ptr addrspace(5) %"10", align 4
%"50" = inttoptr i64 %"29" to ptr
%"57" = getelementptr inbounds i8, ptr %"50", i64 4
store i32 %"30", ptr %"57", align 4
%"31" = load i64, ptr addrspace(5) %"5", align 8
%"32" = load i32, ptr addrspace(5) %"11", align 4
%"51" = inttoptr i64 %"31" to ptr
%"59" = getelementptr inbounds i8, ptr %"51", i64 8
store i32 %"32", ptr %"59", align 4
ret void
}

View file

@ -1,35 +1,35 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"27":
define protected amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"14" to ptr
%"13" = load i64, ptr %"25", align 8
store i64 %"13", ptr addrspace(5) %"6", align 8
%"16" = load i64, ptr addrspace(5) %"6", align 8
%"15" = add i64 %"16", 1
store i64 %"15", ptr addrspace(5) %"7", align 8
%"18" = load i64, ptr addrspace(5) %"8", align 8
%"17" = add i64 %"18", 1
store i64 %"17", ptr addrspace(5) %"8", align 8
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i64, ptr addrspace(5) %"7", align 8
%"26" = inttoptr i64 %"19" to ptr
store i64 %"20", ptr %"26", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"24", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"14" = add i64 %"15", 1
store i64 %"14", ptr addrspace(5) %"7", align 8
%"17" = load i64, ptr addrspace(5) %"8", align 8
%"16" = add i64 %"17", 1
store i64 %"16", ptr addrspace(5) %"8", align 8
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i64, ptr addrspace(5) %"7", align 8
%"25" = inttoptr i64 %"18" to ptr
store i64 %"19", ptr %"25", align 8
ret void
}

View file

@ -1,43 +1,43 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 {
"29":
define protected amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
%"9" = alloca i64, align 8, addrspace(5)
%"10" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"12", ptr addrspace(5) %"7", align 8
%"13" = load i64, ptr addrspace(4) %"25", align 8
store i64 %"13", ptr addrspace(5) %"7", align 8
%"14" = load i64, ptr addrspace(4) %"26", align 8
store i64 %"14", ptr addrspace(5) %"8", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"27" = inttoptr i64 %"16" to ptr
%"15" = load i64, ptr %"27", align 8
store i64 %"15", ptr addrspace(5) %"9", align 8
store i64 %"13", ptr addrspace(5) %"8", align 8
%"15" = load i64, ptr addrspace(5) %"7", align 8
%"26" = inttoptr i64 %"15" to ptr
%"14" = load i64, ptr %"26", align 8
store i64 %"14", ptr addrspace(5) %"9", align 8
br label %"4"
"4": ; preds = %"29"
%"18" = load i64, ptr addrspace(5) %"9", align 8
%"17" = add i64 %"18", 1
store i64 %"17", ptr addrspace(5) %"10", align 8
"4": ; preds = %1
%"17" = load i64, ptr addrspace(5) %"9", align 8
%"16" = add i64 %"17", 1
store i64 %"16", ptr addrspace(5) %"10", align 8
br label %"6"
0: ; No predecessors!
%"20" = load i64, ptr addrspace(5) %"9", align 8
%"19" = add i64 %"20", 2
store i64 %"19", ptr addrspace(5) %"10", align 8
"5": ; No predecessors!
%"19" = load i64, ptr addrspace(5) %"9", align 8
%"18" = add i64 %"19", 2
store i64 %"18", ptr addrspace(5) %"10", align 8
br label %"6"
"6": ; preds = %0, %"4"
%"21" = load i64, ptr addrspace(5) %"8", align 8
%"22" = load i64, ptr addrspace(5) %"10", align 8
%"28" = inttoptr i64 %"21" to ptr
store i64 %"22", ptr %"28", align 8
"6": ; preds = %"5", %"4"
%"20" = load i64, ptr addrspace(5) %"8", align 8
%"21" = load i64, ptr addrspace(5) %"10", align 8
%"27" = inttoptr i64 %"20" to ptr
store i64 %"21", ptr %"27", align 8
ret void
}

View file

@ -1,30 +1,30 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
"21":
define protected amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"19", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i32, ptr addrspace(5) %"6", align 4
%"13" = call i32 @llvm.bitreverse.i32(i32 %"14")
store i32 %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"20" = inttoptr i64 %"15" to ptr
store i32 %"16", ptr %"20", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"18" = inttoptr i64 %"11" to ptr
%"10" = load i32, ptr %"18", align 4
store i32 %"10", ptr addrspace(5) %"6", align 4
%"13" = load i32, ptr addrspace(5) %"6", align 4
%"12" = call i32 @llvm.bitreverse.i32(i32 %"13")
store i32 %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load i32, ptr addrspace(5) %"6", align 4
%"19" = inttoptr i64 %"14" to ptr
store i32 %"15", ptr %"19", align 4
ret void
}

View file

@ -1,63 +1,63 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define private i64 @incr(i64 %"31") #0 {
"51":
define private i64 @incr(i64 %"29") #0 {
%"18" = alloca i64, align 8, addrspace(5)
%"17" = alloca i64, align 8, addrspace(5)
%"21" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"21", align 1
%"22" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"22", align 1
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
%"20" = alloca i1, align 1, addrspace(5)
%"42" = alloca i64, align 8, addrspace(5)
%"43" = alloca i64, align 8, addrspace(5)
%"14" = alloca i64, align 8, addrspace(5)
store i64 %"31", ptr addrspace(5) %"18", align 8
%"32" = load i64, ptr addrspace(5) %"18", align 8
store i64 %"32", ptr addrspace(5) %"45", align 8
%"33" = load i64, ptr addrspace(5) %"45", align 8
store i64 %"33", ptr addrspace(5) %"14", align 8
%"35" = load i64, ptr addrspace(5) %"14", align 8
%"34" = add i64 %"35", 1
store i64 %"34", ptr addrspace(5) %"14", align 8
%"36" = load i64, ptr addrspace(5) %"14", align 8
store i64 %"36", ptr addrspace(5) %"44", align 8
%"37" = load i64, ptr addrspace(5) %"44", align 8
store i64 %"37", ptr addrspace(5) %"17", align 8
%"38" = load i64, ptr addrspace(5) %"17", align 8
ret i64 %"38"
br label %1
1: ; preds = %0
store i64 %"29", ptr addrspace(5) %"18", align 8
store i1 false, ptr addrspace(5) %"20", align 1
%"30" = load i64, ptr addrspace(5) %"18", align 8
store i64 %"30", ptr addrspace(5) %"43", align 8
%"31" = load i64, ptr addrspace(5) %"43", align 8
store i64 %"31", ptr addrspace(5) %"14", align 8
%"33" = load i64, ptr addrspace(5) %"14", align 8
%"32" = add i64 %"33", 1
store i64 %"32", ptr addrspace(5) %"14", align 8
%"34" = load i64, ptr addrspace(5) %"14", align 8
store i64 %"34", ptr addrspace(5) %"42", align 8
%"35" = load i64, ptr addrspace(5) %"42", align 8
store i64 %"35", ptr addrspace(5) %"17", align 8
%"36" = load i64, ptr addrspace(5) %"17", align 8
ret i64 %"36"
}
define protected amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
"50":
define protected amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
%"19" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"19", align 1
%"20" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"20", align 1
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
%"9" = alloca i64, align 8, addrspace(5)
%"42" = alloca i64, align 8, addrspace(5)
%"43" = alloca i64, align 8, addrspace(5)
%"23" = load i64, ptr addrspace(4) %"40", align 8
store i64 %"23", ptr addrspace(5) %"7", align 8
%"24" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"24", ptr addrspace(5) %"8", align 8
%"26" = load i64, ptr addrspace(5) %"7", align 8
%"46" = inttoptr i64 %"26" to ptr addrspace(1)
%"25" = load i64, ptr addrspace(1) %"46", align 8
store i64 %"25", ptr addrspace(5) %"9", align 8
%"27" = load i64, ptr addrspace(5) %"9", align 8
store i64 %"27", ptr addrspace(5) %"42", align 8
%"15" = load i64, ptr addrspace(5) %"42", align 8
%"40" = alloca i64, align 8, addrspace(5)
%"41" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"19", align 1
%"21" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"21", ptr addrspace(5) %"7", align 8
%"22" = load i64, ptr addrspace(4) %"39", align 8
store i64 %"22", ptr addrspace(5) %"8", align 8
%"24" = load i64, ptr addrspace(5) %"7", align 8
%"44" = inttoptr i64 %"24" to ptr addrspace(1)
%"23" = load i64, ptr addrspace(1) %"44", align 8
store i64 %"23", ptr addrspace(5) %"9", align 8
%"25" = load i64, ptr addrspace(5) %"9", align 8
store i64 %"25", ptr addrspace(5) %"40", align 8
%"15" = load i64, ptr addrspace(5) %"40", align 8
%"16" = call i64 @incr(i64 %"15")
store i64 %"16", ptr addrspace(5) %"43", align 8
%"28" = load i64, ptr addrspace(5) %"43", align 8
store i64 %"28", ptr addrspace(5) %"9", align 8
%"29" = load i64, ptr addrspace(5) %"8", align 8
%"30" = load i64, ptr addrspace(5) %"9", align 8
%"49" = inttoptr i64 %"29" to ptr addrspace(1)
store i64 %"30", ptr addrspace(1) %"49", align 8
store i64 %"16", ptr addrspace(5) %"41", align 8
%"26" = load i64, ptr addrspace(5) %"41", align 8
store i64 %"26", ptr addrspace(5) %"9", align 8
%"27" = load i64, ptr addrspace(5) %"8", align 8
%"28" = load i64, ptr addrspace(5) %"9", align 8
%"47" = inttoptr i64 %"27" to ptr addrspace(1)
store i64 %"28", ptr addrspace(1) %"47", align 8
ret void
}

View file

@ -1,68 +1,68 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define private [2 x i32] @incr(i64 %"23") #0 {
"58":
define private [2 x i32] @incr(i64 %"21") #0 {
%"16" = alloca i64, align 8, addrspace(5)
%"15" = alloca [2 x i32], align 4, addrspace(5)
%"19" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"19", align 1
%"20" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"20", align 1
%"44" = alloca [2 x i32], align 4, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
%"42" = alloca [2 x i32], align 4, addrspace(5)
%"43" = alloca i64, align 8, addrspace(5)
%"4" = alloca i64, align 8, addrspace(5)
store i64 %"23", ptr addrspace(5) %"16", align 8
%"24" = load i64, ptr addrspace(5) %"16", align 8
store i64 %"24", ptr addrspace(5) %"45", align 8
%"25" = load i64, ptr addrspace(5) %"45", align 8
store i64 %"25", ptr addrspace(5) %"4", align 8
%"27" = load i64, ptr addrspace(5) %"4", align 8
%"26" = add i64 %"27", 1
store i64 %"26", ptr addrspace(5) %"4", align 8
%"28" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"28", ptr addrspace(5) %"44", align 8
%"29" = load [2 x i32], ptr addrspace(5) %"44", align 4
store [2 x i32] %"29", ptr addrspace(5) %"15", align 4
%"30" = load [2 x i32], ptr addrspace(5) %"15", align 4
ret [2 x i32] %"30"
br label %1
1: ; preds = %0
store i64 %"21", ptr addrspace(5) %"16", align 8
store i1 false, ptr addrspace(5) %"19", align 1
%"22" = load i64, ptr addrspace(5) %"16", align 8
store i64 %"22", ptr addrspace(5) %"43", align 8
%"23" = load i64, ptr addrspace(5) %"43", align 8
store i64 %"23", ptr addrspace(5) %"4", align 8
%"25" = load i64, ptr addrspace(5) %"4", align 8
%"24" = add i64 %"25", 1
store i64 %"24", ptr addrspace(5) %"4", align 8
%"26" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"26", ptr addrspace(5) %"42", align 8
%"27" = load [2 x i32], ptr addrspace(5) %"42", align 4
store [2 x i32] %"27", ptr addrspace(5) %"15", align 4
%"28" = load [2 x i32], ptr addrspace(5) %"15", align 4
ret [2 x i32] %"28"
}
define protected amdgpu_kernel void @call_bug(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
"59":
%"21" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"21", align 1
%"22" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"22", align 1
define protected amdgpu_kernel void @call_bug(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 {
%"20" = alloca i1, align 1, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
%"9" = alloca i64, align 8, addrspace(5)
%"10" = alloca i64, align 8, addrspace(5)
%"11" = alloca i64, align 8, addrspace(5)
%"48" = alloca i64, align 8, addrspace(5)
%"49" = alloca [2 x i32], align 4, addrspace(5)
%"31" = load i64, ptr addrspace(4) %"46", align 8
store i64 %"31", ptr addrspace(5) %"8", align 8
%"32" = load i64, ptr addrspace(4) %"47", align 8
store i64 %"32", ptr addrspace(5) %"9", align 8
%"34" = load i64, ptr addrspace(5) %"8", align 8
%"52" = inttoptr i64 %"34" to ptr addrspace(1)
%"33" = load i64, ptr addrspace(1) %"52", align 8
store i64 %"33", ptr addrspace(5) %"10", align 8
%"35" = load i64, ptr addrspace(5) %"10", align 8
store i64 %"35", ptr addrspace(5) %"48", align 8
%"46" = alloca i64, align 8, addrspace(5)
%"47" = alloca [2 x i32], align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"20", align 1
%"29" = load i64, ptr addrspace(4) %"44", align 8
store i64 %"29", ptr addrspace(5) %"8", align 8
%"30" = load i64, ptr addrspace(4) %"45", align 8
store i64 %"30", ptr addrspace(5) %"9", align 8
%"32" = load i64, ptr addrspace(5) %"8", align 8
%"50" = inttoptr i64 %"32" to ptr addrspace(1)
%"31" = load i64, ptr addrspace(1) %"50", align 8
store i64 %"31", ptr addrspace(5) %"10", align 8
%"33" = load i64, ptr addrspace(5) %"10", align 8
store i64 %"33", ptr addrspace(5) %"46", align 8
store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"11", align 8
%"17" = load i64, ptr addrspace(5) %"48", align 8
%"37" = load i64, ptr addrspace(5) %"11", align 8
%0 = inttoptr i64 %"37" to ptr
%"18" = call [2 x i32] %0(i64 %"17")
store [2 x i32] %"18", ptr addrspace(5) %"49", align 4
%"61" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0
%"38" = load i64, ptr addrspace(5) %"61", align 8
store i64 %"38", ptr addrspace(5) %"10", align 8
%"39" = load i64, ptr addrspace(5) %"9", align 8
%"40" = load i64, ptr addrspace(5) %"10", align 8
%"57" = inttoptr i64 %"39" to ptr addrspace(1)
store i64 %"40", ptr addrspace(1) %"57", align 8
%"17" = load i64, ptr addrspace(5) %"46", align 8
%"35" = load i64, ptr addrspace(5) %"11", align 8
%2 = inttoptr i64 %"35" to ptr
%"18" = call [2 x i32] %2(i64 %"17")
store [2 x i32] %"18", ptr addrspace(5) %"47", align 4
%"57" = getelementptr inbounds i8, ptr addrspace(5) %"47", i64 0
%"36" = load i64, ptr addrspace(5) %"57", align 8
store i64 %"36", ptr addrspace(5) %"10", align 8
%"37" = load i64, ptr addrspace(5) %"9", align 8
%"38" = load i64, ptr addrspace(5) %"10", align 8
%"55" = inttoptr i64 %"37" to ptr addrspace(1)
store i64 %"38", ptr addrspace(1) %"55", align 8
ret void
}

View file

@ -3,43 +3,39 @@ target triple = "amdgcn-amd-amdhsa"
%struct.i64i32 = type { i64, i32 }
define private %struct.i64i32 @"1"(i32 %"41", i32 %"42") #0 {
"64":
define private %struct.i64i32 @"1"(i32 %"39", i32 %"40") #0 {
%"18" = alloca i32, align 4, addrspace(5)
%"19" = alloca i32, align 4, addrspace(5)
%"16" = alloca i64, align 8, addrspace(5)
%"17" = alloca i32, align 4, addrspace(5)
%"23" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"23", align 1
%"24" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"24", align 1
%"22" = alloca i1, align 1, addrspace(5)
%"20" = alloca i32, align 4, addrspace(5)
store i32 %"41", ptr addrspace(5) %"18", align 4
store i32 %"42", ptr addrspace(5) %"19", align 4
%"44" = load i32, ptr addrspace(5) %"18", align 4
%"45" = load i32, ptr addrspace(5) %"19", align 4
%"43" = add i32 %"44", %"45"
store i32 %"43", ptr addrspace(5) %"20", align 4
%"47" = load i32, ptr addrspace(5) %"20", align 4
%"46" = zext i32 %"47" to i64
store i64 %"46", ptr addrspace(5) %"16", align 8
%"49" = load i32, ptr addrspace(5) %"18", align 4
%"50" = load i32, ptr addrspace(5) %"19", align 4
%"48" = mul i32 %"49", %"50"
store i32 %"48", ptr addrspace(5) %"17", align 4
%"51" = load i64, ptr addrspace(5) %"16", align 8
%"52" = load i32, ptr addrspace(5) %"17", align 4
%0 = insertvalue %struct.i64i32 undef, i64 %"51", 0
%1 = insertvalue %struct.i64i32 %0, i32 %"52", 1
ret %struct.i64i32 %1
br label %1
1: ; preds = %0
store i32 %"39", ptr addrspace(5) %"18", align 4
store i32 %"40", ptr addrspace(5) %"19", align 4
store i1 false, ptr addrspace(5) %"22", align 1
%"42" = load i32, ptr addrspace(5) %"18", align 4
%"43" = load i32, ptr addrspace(5) %"19", align 4
%"41" = add i32 %"42", %"43"
store i32 %"41", ptr addrspace(5) %"20", align 4
%"45" = load i32, ptr addrspace(5) %"20", align 4
%"44" = zext i32 %"45" to i64
store i64 %"44", ptr addrspace(5) %"16", align 8
%"47" = load i32, ptr addrspace(5) %"18", align 4
%"48" = load i32, ptr addrspace(5) %"19", align 4
%"46" = mul i32 %"47", %"48"
store i32 %"46", ptr addrspace(5) %"17", align 4
%"49" = load i64, ptr addrspace(5) %"16", align 8
%"50" = load i32, ptr addrspace(5) %"17", align 4
%2 = insertvalue %struct.i64i32 undef, i64 %"49", 0
%3 = insertvalue %struct.i64i32 %2, i32 %"50", 1
ret %struct.i64i32 %3
}
define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #0 {
"63":
define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #0 {
%"21" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"21", align 1
%"22" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"22", align 1
%"9" = alloca i64, align 8, addrspace(5)
%"10" = alloca i64, align 8, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
@ -47,38 +43,42 @@ define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i6
%"13" = alloca i64, align 8, addrspace(5)
%"14" = alloca i64, align 8, addrspace(5)
%"15" = alloca i32, align 4, addrspace(5)
%"25" = load i64, ptr addrspace(4) %"57", align 8
store i64 %"25", ptr addrspace(5) %"9", align 8
%"26" = load i64, ptr addrspace(4) %"58", align 8
store i64 %"26", ptr addrspace(5) %"10", align 8
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"21", align 1
%"23" = load i64, ptr addrspace(4) %"55", align 8
store i64 %"23", ptr addrspace(5) %"9", align 8
%"24" = load i64, ptr addrspace(4) %"56", align 8
store i64 %"24", ptr addrspace(5) %"10", align 8
%"26" = load i64, ptr addrspace(5) %"9", align 8
%"57" = inttoptr i64 %"26" to ptr addrspace(1)
%"25" = load i32, ptr addrspace(1) %"57", align 4
store i32 %"25", ptr addrspace(5) %"11", align 4
%"28" = load i64, ptr addrspace(5) %"9", align 8
%"59" = inttoptr i64 %"28" to ptr addrspace(1)
%"27" = load i32, ptr addrspace(1) %"59", align 4
store i32 %"27", ptr addrspace(5) %"11", align 4
%"30" = load i64, ptr addrspace(5) %"9", align 8
%"60" = inttoptr i64 %"30" to ptr addrspace(1)
%"66" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 4
%"29" = load i32, ptr addrspace(1) %"66", align 4
store i32 %"29", ptr addrspace(5) %"12", align 4
%"33" = load i32, ptr addrspace(5) %"11", align 4
%"34" = load i32, ptr addrspace(5) %"12", align 4
%0 = call %struct.i64i32 @"1"(i32 %"33", i32 %"34")
%"31" = extractvalue %struct.i64i32 %0, 0
%"32" = extractvalue %struct.i64i32 %0, 1
store i64 %"31", ptr addrspace(5) %"13", align 8
store i32 %"32", ptr addrspace(5) %"15", align 4
%"36" = load i32, ptr addrspace(5) %"15", align 4
%"35" = zext i32 %"36" to i64
store i64 %"35", ptr addrspace(5) %"14", align 8
%"58" = inttoptr i64 %"28" to ptr addrspace(1)
%"62" = getelementptr inbounds i8, ptr addrspace(1) %"58", i64 4
%"27" = load i32, ptr addrspace(1) %"62", align 4
store i32 %"27", ptr addrspace(5) %"12", align 4
%"31" = load i32, ptr addrspace(5) %"11", align 4
%"32" = load i32, ptr addrspace(5) %"12", align 4
%2 = call %struct.i64i32 @"1"(i32 %"31", i32 %"32")
%"29" = extractvalue %struct.i64i32 %2, 0
%"30" = extractvalue %struct.i64i32 %2, 1
store i64 %"29", ptr addrspace(5) %"13", align 8
store i32 %"30", ptr addrspace(5) %"15", align 4
%"34" = load i32, ptr addrspace(5) %"15", align 4
%"33" = zext i32 %"34" to i64
store i64 %"33", ptr addrspace(5) %"14", align 8
%"35" = load i64, ptr addrspace(5) %"10", align 8
%"36" = load i64, ptr addrspace(5) %"13", align 8
%"59" = inttoptr i64 %"35" to ptr addrspace(1)
store i64 %"36", ptr addrspace(1) %"59", align 8
%"37" = load i64, ptr addrspace(5) %"10", align 8
%"38" = load i64, ptr addrspace(5) %"13", align 8
%"61" = inttoptr i64 %"37" to ptr addrspace(1)
store i64 %"38", ptr addrspace(1) %"61", align 8
%"39" = load i64, ptr addrspace(5) %"10", align 8
%"40" = load i64, ptr addrspace(5) %"14", align 8
%"62" = inttoptr i64 %"39" to ptr addrspace(1)
%"68" = getelementptr inbounds i8, ptr addrspace(1) %"62", i64 8
store i64 %"40", ptr addrspace(1) %"68", align 8
%"38" = load i64, ptr addrspace(5) %"14", align 8
%"60" = inttoptr i64 %"37" to ptr addrspace(1)
%"64" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 8
store i64 %"38", ptr addrspace(1) %"64", align 8
ret void
}

View file

@ -1,67 +1,67 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define private i64 @incr(i64 %"35") #0 {
"56":
define private i64 @incr(i64 %"33") #0 {
%"20" = alloca i64, align 8, addrspace(5)
%"19" = alloca i64, align 8, addrspace(5)
%"23" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"23", align 1
%"24" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"24", align 1
%"48" = alloca i64, align 8, addrspace(5)
%"49" = alloca i64, align 8, addrspace(5)
%"22" = alloca i1, align 1, addrspace(5)
%"46" = alloca i64, align 8, addrspace(5)
%"47" = alloca i64, align 8, addrspace(5)
%"16" = alloca i64, align 8, addrspace(5)
store i64 %"35", ptr addrspace(5) %"20", align 8
%"36" = load i64, ptr addrspace(5) %"20", align 8
store i64 %"36", ptr addrspace(5) %"49", align 8
%"37" = load i64, ptr addrspace(5) %"49", align 8
store i64 %"37", ptr addrspace(5) %"16", align 8
%"39" = load i64, ptr addrspace(5) %"16", align 8
%"38" = add i64 %"39", 1
store i64 %"38", ptr addrspace(5) %"16", align 8
%"40" = load i64, ptr addrspace(5) %"16", align 8
store i64 %"40", ptr addrspace(5) %"48", align 8
%"41" = load i64, ptr addrspace(5) %"48", align 8
store i64 %"41", ptr addrspace(5) %"19", align 8
%"42" = load i64, ptr addrspace(5) %"19", align 8
ret i64 %"42"
br label %1
1: ; preds = %0
store i64 %"33", ptr addrspace(5) %"20", align 8
store i1 false, ptr addrspace(5) %"22", align 1
%"34" = load i64, ptr addrspace(5) %"20", align 8
store i64 %"34", ptr addrspace(5) %"47", align 8
%"35" = load i64, ptr addrspace(5) %"47", align 8
store i64 %"35", ptr addrspace(5) %"16", align 8
%"37" = load i64, ptr addrspace(5) %"16", align 8
%"36" = add i64 %"37", 1
store i64 %"36", ptr addrspace(5) %"16", align 8
%"38" = load i64, ptr addrspace(5) %"16", align 8
store i64 %"38", ptr addrspace(5) %"46", align 8
%"39" = load i64, ptr addrspace(5) %"46", align 8
store i64 %"39", ptr addrspace(5) %"19", align 8
%"40" = load i64, ptr addrspace(5) %"19", align 8
ret i64 %"40"
}
define protected amdgpu_kernel void @callprototype(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 {
"55":
define protected amdgpu_kernel void @callprototype(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 {
%"21" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"21", align 1
%"22" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"22", align 1
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
%"9" = alloca i64, align 8, addrspace(5)
%"10" = alloca i64, align 8, addrspace(5)
%"46" = alloca i64, align 8, addrspace(5)
%"47" = alloca i64, align 8, addrspace(5)
%"25" = load i64, ptr addrspace(4) %"44", align 8
store i64 %"25", ptr addrspace(5) %"7", align 8
%"26" = load i64, ptr addrspace(4) %"45", align 8
store i64 %"26", ptr addrspace(5) %"8", align 8
%"28" = load i64, ptr addrspace(5) %"7", align 8
%"50" = inttoptr i64 %"28" to ptr addrspace(1)
%"27" = load i64, ptr addrspace(1) %"50", align 8
store i64 %"27", ptr addrspace(5) %"9", align 8
%"29" = load i64, ptr addrspace(5) %"9", align 8
store i64 %"29", ptr addrspace(5) %"46", align 8
%"44" = alloca i64, align 8, addrspace(5)
%"45" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"21", align 1
%"23" = load i64, ptr addrspace(4) %"42", align 8
store i64 %"23", ptr addrspace(5) %"7", align 8
%"24" = load i64, ptr addrspace(4) %"43", align 8
store i64 %"24", ptr addrspace(5) %"8", align 8
%"26" = load i64, ptr addrspace(5) %"7", align 8
%"48" = inttoptr i64 %"26" to ptr addrspace(1)
%"25" = load i64, ptr addrspace(1) %"48", align 8
store i64 %"25", ptr addrspace(5) %"9", align 8
%"27" = load i64, ptr addrspace(5) %"9", align 8
store i64 %"27", ptr addrspace(5) %"44", align 8
store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"10", align 8
%"17" = load i64, ptr addrspace(5) %"46", align 8
%"31" = load i64, ptr addrspace(5) %"10", align 8
%0 = inttoptr i64 %"31" to ptr
%"18" = call i64 %0(i64 %"17")
store i64 %"18", ptr addrspace(5) %"47", align 8
%"32" = load i64, ptr addrspace(5) %"47", align 8
store i64 %"32", ptr addrspace(5) %"9", align 8
%"33" = load i64, ptr addrspace(5) %"8", align 8
%"34" = load i64, ptr addrspace(5) %"9", align 8
%"54" = inttoptr i64 %"33" to ptr addrspace(1)
store i64 %"34", ptr addrspace(1) %"54", align 8
%"17" = load i64, ptr addrspace(5) %"44", align 8
%"29" = load i64, ptr addrspace(5) %"10", align 8
%2 = inttoptr i64 %"29" to ptr
%"18" = call i64 %2(i64 %"17")
store i64 %"18", ptr addrspace(5) %"45", align 8
%"30" = load i64, ptr addrspace(5) %"45", align 8
store i64 %"30", ptr addrspace(5) %"9", align 8
%"31" = load i64, ptr addrspace(5) %"8", align 8
%"32" = load i64, ptr addrspace(5) %"9", align 8
%"52" = inttoptr i64 %"31" to ptr addrspace(1)
store i64 %"32", ptr addrspace(1) %"52", align 8
ret void
}

View file

@ -1,51 +0,0 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @carry_mixed(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
"44":
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"11" = load i64, ptr addrspace(4) %"35", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
%"36" = extractvalue { i32, i1 } %0, 0
%"13" = extractvalue { i32, i1 } %0, 1
store i32 %"36", ptr addrspace(5) %"6", align 4
store i1 %"13", ptr addrspace(5) %"10", align 1
%"15" = load i1, ptr addrspace(5) %"10", align 1
%1 = zext i1 %"15" to i32
%"37" = sub i32 2, %1
store i32 %"37", ptr addrspace(5) %"7", align 4
%2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
%"38" = extractvalue { i32, i1 } %2, 0
%"17" = extractvalue { i32, i1 } %2, 1
store i32 %"38", ptr addrspace(5) %"6", align 4
store i1 %"17", ptr addrspace(5) %"10", align 1
%"19" = load i1, ptr addrspace(5) %"9", align 1
%3 = zext i1 %"19" to i32
%"39" = add i32 1, %3
store i32 %"39", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"40" = inttoptr i64 %"20" to ptr
store i32 %"21", ptr %"40", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"42" = inttoptr i64 %"22" to ptr
%"46" = getelementptr inbounds i8, ptr %"42", i64 4
store i32 %"23", ptr %"46", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }

View file

@ -1,32 +0,0 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry carry_mixed(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 unused;
.reg .b32 carry_out_1;
.reg .b32 carry_out_2;
ld.param.u64 out_addr, [output];
// set carry with sub
sub.cc.s32 unused, 0, 1;
// write carry with sub
subc.s32 carry_out_1, 2, 0;
// set carry with sub
sub.cc.s32 unused, 0, 1;
// fail writing carry with add
addc.s32 carry_out_2, 1, 0;
st.s32 [out_addr], carry_out_1;
st.s32 [out_addr+4], carry_out_2;
ret;
}

View file

@ -0,0 +1,259 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @carry_set_all(ptr addrspace(4) byref(i64) %"208", ptr addrspace(4) byref(i64) %"209") #0 {
%"22" = alloca i1, align 1, addrspace(5)
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
%"10" = alloca i32, align 4, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
%"12" = alloca i32, align 4, addrspace(5)
%"13" = alloca i32, align 4, addrspace(5)
%"14" = alloca i32, align 4, addrspace(5)
%"15" = alloca i32, align 4, addrspace(5)
%"16" = alloca i32, align 4, addrspace(5)
%"17" = alloca i32, align 4, addrspace(5)
%"18" = alloca i32, align 4, addrspace(5)
%"19" = alloca i32, align 4, addrspace(5)
%"20" = alloca i32, align 4, addrspace(5)
%"21" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"22", align 1
%"37" = load i64, ptr addrspace(4) %"209", align 8
store i64 %"37", ptr addrspace(5) %"5", align 8
%2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0)
%"210" = extractvalue { i32, i1 } %2, 0
%"23" = extractvalue { i32, i1 } %2, 1
store i32 %"210", ptr addrspace(5) %"6", align 4
%"39" = xor i1 %"23", true
store i1 %"39", ptr addrspace(5) %"22", align 1
%"41" = load i1, ptr addrspace(5) %"22", align 1
%3 = zext i1 %"41" to i32
%"211" = add i32 0, %3
store i32 %"211", ptr addrspace(5) %"6", align 4
%"42" = load i1, ptr addrspace(5) %"22", align 1
%"24" = xor i1 %"42", true
%4 = zext i1 %"24" to i32
%"212" = sub i32 0, %4
store i32 %"212", ptr addrspace(5) %"7", align 4
%5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
%"213" = extractvalue { i32, i1 } %5, 0
%"25" = extractvalue { i32, i1 } %5, 1
store i32 %"213", ptr addrspace(5) %"8", align 4
%"45" = xor i1 %"25", true
store i1 %"45", ptr addrspace(5) %"22", align 1
%"47" = load i1, ptr addrspace(5) %"22", align 1
%6 = zext i1 %"47" to i32
%"214" = add i32 0, %6
store i32 %"214", ptr addrspace(5) %"8", align 4
%"48" = load i1, ptr addrspace(5) %"22", align 1
%"26" = xor i1 %"48", true
%7 = zext i1 %"26" to i32
%"215" = sub i32 0, %7
store i32 %"215", ptr addrspace(5) %"9", align 4
%8 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0)
%"216" = extractvalue { i32, i1 } %8, 0
%"51" = extractvalue { i32, i1 } %8, 1
store i32 %"216", ptr addrspace(5) %"10", align 4
store i1 %"51", ptr addrspace(5) %"22", align 1
%"53" = load i1, ptr addrspace(5) %"22", align 1
%9 = zext i1 %"53" to i32
%"217" = add i32 0, %9
store i32 %"217", ptr addrspace(5) %"10", align 4
%"54" = load i1, ptr addrspace(5) %"22", align 1
%"27" = xor i1 %"54", true
%10 = zext i1 %"27" to i32
%"218" = sub i32 0, %10
store i32 %"218", ptr addrspace(5) %"11", align 4
%11 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1)
%"219" = extractvalue { i32, i1 } %11, 0
%"57" = extractvalue { i32, i1 } %11, 1
store i32 %"219", ptr addrspace(5) %"12", align 4
store i1 %"57", ptr addrspace(5) %"22", align 1
%"59" = load i1, ptr addrspace(5) %"22", align 1
%12 = zext i1 %"59" to i32
%"220" = add i32 0, %12
store i32 %"220", ptr addrspace(5) %"12", align 4
%"60" = load i1, ptr addrspace(5) %"22", align 1
%"28" = xor i1 %"60", true
%13 = zext i1 %"28" to i32
%"221" = sub i32 0, %13
store i32 %"221", ptr addrspace(5) %"13", align 4
%14 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0)
%"222" = extractvalue { i32, i1 } %14, 0
%"63" = extractvalue { i32, i1 } %14, 1
store i32 %"222", ptr addrspace(5) %"14", align 4
store i1 %"63", ptr addrspace(5) %"22", align 1
%"65" = load i1, ptr addrspace(5) %"22", align 1
%15 = zext i1 %"65" to i32
%"223" = add i32 0, %15
store i32 %"223", ptr addrspace(5) %"14", align 4
%"66" = load i1, ptr addrspace(5) %"22", align 1
%"29" = xor i1 %"66", true
%16 = zext i1 %"29" to i32
%"224" = sub i32 0, %16
store i32 %"224", ptr addrspace(5) %"15", align 4
%17 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1)
%"225" = extractvalue { i32, i1 } %17, 0
%"69" = extractvalue { i32, i1 } %17, 1
store i32 %"225", ptr addrspace(5) %"16", align 4
store i1 %"69", ptr addrspace(5) %"22", align 1
%"71" = load i1, ptr addrspace(5) %"22", align 1
%18 = zext i1 %"71" to i32
%"226" = add i32 0, %18
store i32 %"226", ptr addrspace(5) %"16", align 4
%"72" = load i1, ptr addrspace(5) %"22", align 1
%"30" = xor i1 %"72", true
%19 = zext i1 %"30" to i32
%"227" = sub i32 0, %19
store i32 %"227", ptr addrspace(5) %"17", align 4
%20 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0)
%"228" = extractvalue { i32, i1 } %20, 0
%"75" = extractvalue { i32, i1 } %20, 1
store i32 %"228", ptr addrspace(5) %"18", align 4
store i1 %"75", ptr addrspace(5) %"22", align 1
%"76" = load i1, ptr addrspace(5) %"22", align 1
%"31" = xor i1 %"76", true
%21 = zext i1 %"31" to i32
%22 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0)
%23 = extractvalue { i32, i1 } %22, 0
%24 = extractvalue { i32, i1 } %22, 1
%25 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %23, i32 %21)
%"229" = extractvalue { i32, i1 } %25, 0
%26 = extractvalue { i32, i1 } %25, 1
%"32" = xor i1 %24, %26
store i32 %"229", ptr addrspace(5) %"18", align 4
%"78" = xor i1 %"32", true
store i1 %"78", ptr addrspace(5) %"22", align 1
%"80" = load i1, ptr addrspace(5) %"22", align 1
%27 = zext i1 %"80" to i32
%"230" = add i32 0, %27
store i32 %"230", ptr addrspace(5) %"18", align 4
%"81" = load i1, ptr addrspace(5) %"22", align 1
%"33" = xor i1 %"81", true
%28 = zext i1 %"33" to i32
%"231" = sub i32 0, %28
store i32 %"231", ptr addrspace(5) %"19", align 4
%29 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0)
%"232" = extractvalue { i32, i1 } %29, 0
%"84" = extractvalue { i32, i1 } %29, 1
store i32 %"232", ptr addrspace(5) %"20", align 4
store i1 %"84", ptr addrspace(5) %"22", align 1
%"85" = load i1, ptr addrspace(5) %"22", align 1
%"34" = xor i1 %"85", true
%30 = zext i1 %"34" to i32
%31 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1)
%32 = extractvalue { i32, i1 } %31, 0
%33 = extractvalue { i32, i1 } %31, 1
%34 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %32, i32 %30)
%"233" = extractvalue { i32, i1 } %34, 0
%35 = extractvalue { i32, i1 } %34, 1
%"35" = xor i1 %33, %35
store i32 %"233", ptr addrspace(5) %"20", align 4
%"87" = xor i1 %"35", true
store i1 %"87", ptr addrspace(5) %"22", align 1
%"89" = load i1, ptr addrspace(5) %"22", align 1
%36 = zext i1 %"89" to i32
%"234" = add i32 0, %36
store i32 %"234", ptr addrspace(5) %"20", align 4
%"90" = load i1, ptr addrspace(5) %"22", align 1
%"36" = xor i1 %"90", true
%37 = zext i1 %"36" to i32
%"235" = sub i32 0, %37
store i32 %"235", ptr addrspace(5) %"21", align 4
%"92" = load i64, ptr addrspace(5) %"5", align 8
%"93" = load i32, ptr addrspace(5) %"6", align 4
%"236" = inttoptr i64 %"92" to ptr
store i32 %"93", ptr %"236", align 4
%"94" = load i64, ptr addrspace(5) %"5", align 8
%"95" = load i32, ptr addrspace(5) %"8", align 4
%"238" = inttoptr i64 %"94" to ptr
%"269" = getelementptr inbounds i8, ptr %"238", i64 4
store i32 %"95", ptr %"269", align 4
%"96" = load i64, ptr addrspace(5) %"5", align 8
%"97" = load i32, ptr addrspace(5) %"10", align 4
%"240" = inttoptr i64 %"96" to ptr
%"271" = getelementptr inbounds i8, ptr %"240", i64 8
store i32 %"97", ptr %"271", align 4
%"98" = load i64, ptr addrspace(5) %"5", align 8
%"99" = load i32, ptr addrspace(5) %"12", align 4
%"242" = inttoptr i64 %"98" to ptr
%"273" = getelementptr inbounds i8, ptr %"242", i64 12
store i32 %"99", ptr %"273", align 4
%"100" = load i64, ptr addrspace(5) %"5", align 8
%"101" = load i32, ptr addrspace(5) %"14", align 4
%"244" = inttoptr i64 %"100" to ptr
%"275" = getelementptr inbounds i8, ptr %"244", i64 16
store i32 %"101", ptr %"275", align 4
%"102" = load i64, ptr addrspace(5) %"5", align 8
%"103" = load i32, ptr addrspace(5) %"16", align 4
%"246" = inttoptr i64 %"102" to ptr
%"277" = getelementptr inbounds i8, ptr %"246", i64 20
store i32 %"103", ptr %"277", align 4
%"104" = load i64, ptr addrspace(5) %"5", align 8
%"105" = load i32, ptr addrspace(5) %"18", align 4
%"248" = inttoptr i64 %"104" to ptr
%"279" = getelementptr inbounds i8, ptr %"248", i64 24
store i32 %"105", ptr %"279", align 4
%"106" = load i64, ptr addrspace(5) %"5", align 8
%"107" = load i32, ptr addrspace(5) %"20", align 4
%"250" = inttoptr i64 %"106" to ptr
%"281" = getelementptr inbounds i8, ptr %"250", i64 28
store i32 %"107", ptr %"281", align 4
%"108" = load i64, ptr addrspace(5) %"5", align 8
%"109" = load i32, ptr addrspace(5) %"7", align 4
%"252" = inttoptr i64 %"108" to ptr
%"283" = getelementptr inbounds i8, ptr %"252", i64 32
store i32 %"109", ptr %"283", align 4
%"110" = load i64, ptr addrspace(5) %"5", align 8
%"111" = load i32, ptr addrspace(5) %"9", align 4
%"254" = inttoptr i64 %"110" to ptr
%"285" = getelementptr inbounds i8, ptr %"254", i64 36
store i32 %"111", ptr %"285", align 4
%"112" = load i64, ptr addrspace(5) %"5", align 8
%"113" = load i32, ptr addrspace(5) %"11", align 4
%"256" = inttoptr i64 %"112" to ptr
%"287" = getelementptr inbounds i8, ptr %"256", i64 40
store i32 %"113", ptr %"287", align 4
%"114" = load i64, ptr addrspace(5) %"5", align 8
%"115" = load i32, ptr addrspace(5) %"13", align 4
%"258" = inttoptr i64 %"114" to ptr
%"289" = getelementptr inbounds i8, ptr %"258", i64 44
store i32 %"115", ptr %"289", align 4
%"116" = load i64, ptr addrspace(5) %"5", align 8
%"117" = load i32, ptr addrspace(5) %"15", align 4
%"260" = inttoptr i64 %"116" to ptr
%"291" = getelementptr inbounds i8, ptr %"260", i64 48
store i32 %"117", ptr %"291", align 4
%"118" = load i64, ptr addrspace(5) %"5", align 8
%"119" = load i32, ptr addrspace(5) %"17", align 4
%"262" = inttoptr i64 %"118" to ptr
%"293" = getelementptr inbounds i8, ptr %"262", i64 52
store i32 %"119", ptr %"293", align 4
%"120" = load i64, ptr addrspace(5) %"5", align 8
%"121" = load i32, ptr addrspace(5) %"19", align 4
%"264" = inttoptr i64 %"120" to ptr
%"295" = getelementptr inbounds i8, ptr %"264", i64 56
store i32 %"121", ptr %"295", align 4
%"122" = load i64, ptr addrspace(5) %"5", align 8
%"123" = load i32, ptr addrspace(5) %"21", align 4
%"266" = inttoptr i64 %"122" to ptr
%"297" = getelementptr inbounds i8, ptr %"266", i64 60
store i32 %"123", ptr %"297", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }

View file

@ -0,0 +1,84 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry carry_set_all(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .b32 carry1_add;
.reg .b32 carry1_sub;
.reg .b32 carry2_add;
.reg .b32 carry2_sub;
.reg .b32 carry3_add;
.reg .b32 carry3_sub;
.reg .b32 carry4_add;
.reg .b32 carry4_sub;
.reg .b32 carry5_add;
.reg .b32 carry5_sub;
.reg .b32 carry6_add;
.reg .b32 carry6_sub;
.reg .b32 carry7_add;
.reg .b32 carry7_sub;
.reg .b32 carry8_add;
.reg .b32 carry8_sub;
ld.param.u64 out_addr, [output];
sub.cc.u32 carry1_add, 0, 0;
addc.u32 carry1_add, 0, 0;
subc.u32 carry1_sub, 0, 0;
sub.cc.u32 carry2_add, 0, 1;
addc.u32 carry2_add, 0, 0;
subc.u32 carry2_sub, 0, 0;
add.cc.u32 carry3_add, 0, 0;
addc.u32 carry3_add, 0, 0;
subc.u32 carry3_sub, 0, 0;
add.cc.u32 carry4_add, 4294967295, 4294967295;
addc.u32 carry4_add, 0, 0;
subc.u32 carry4_sub, 0, 0;
mad.lo.cc.u32 carry5_add, 0, 0, 0;
addc.u32 carry5_add, 0, 0;
subc.u32 carry5_sub, 0, 0;
mad.lo.cc.u32 carry6_add, 1, 4294967295, 4294967295;
addc.u32 carry6_add, 0, 0;
subc.u32 carry6_sub, 0, 0;
add.cc.u32 carry7_add, 0, 0;
subc.cc.u32 carry7_add, 0, 0;
addc.u32 carry7_add, 0, 0;
subc.u32 carry7_sub, 0, 0;
add.cc.u32 carry8_add, 0, 0;
subc.cc.u32 carry8_add, 0, 1;
addc.u32 carry8_add, 0, 0;
subc.u32 carry8_sub, 0, 0;
st.u32 [out_addr], carry1_add;
st.u32 [out_addr+4], carry2_add;
st.u32 [out_addr+8], carry3_add;
st.u32 [out_addr+12], carry4_add;
st.u32 [out_addr+16], carry5_add;
st.u32 [out_addr+20], carry6_add;
st.u32 [out_addr+24], carry7_add;
st.u32 [out_addr+28], carry8_add;
st.u32 [out_addr+32], carry1_sub;
st.u32 [out_addr+36], carry2_sub;
st.u32 [out_addr+40], carry3_sub;
st.u32 [out_addr+44], carry4_sub;
st.u32 [out_addr+48], carry5_sub;
st.u32 [out_addr+52], carry6_sub;
st.u32 [out_addr+56], carry7_sub;
st.u32 [out_addr+60], carry8_sub;
ret;
}

View file

@ -1,30 +1,30 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
"21":
define protected amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"19", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i32, ptr addrspace(5) %"6", align 4
%0 = call i32 @llvm.ctlz.i32(i32 %"14", i1 false)
store i32 %0, ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"20" = inttoptr i64 %"15" to ptr
store i32 %"16", ptr %"20", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"18" = inttoptr i64 %"11" to ptr
%"10" = load i32, ptr %"18", align 4
store i32 %"10", ptr addrspace(5) %"6", align 4
%"13" = load i32, ptr addrspace(5) %"6", align 4
%2 = call i32 @llvm.ctlz.i32(i32 %"13", i1 false)
store i32 %2, ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load i32, ptr addrspace(5) %"6", align 4
%"19" = inttoptr i64 %"14" to ptr
store i32 %"15", ptr %"19", align 4
ret void
}

View file

@ -3,49 +3,49 @@ target triple = "amdgcn-amd-amdhsa"
@constparams = protected addrspace(4) externally_initialized global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8
define protected amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 {
"53":
define protected amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 {
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i16, align 2, addrspace(5)
%"8" = alloca i16, align 2, addrspace(5)
%"9" = alloca i16, align 2, addrspace(5)
%"10" = alloca i16, align 2, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = load i64, ptr addrspace(4) %"38", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(4) %"39", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(4) %"40", align 8
store i64 %"14", ptr addrspace(5) %"6", align 8
%"15" = load i16, ptr addrspace(4) @constparams, align 2
store i16 %"15", ptr addrspace(5) %"7", align 2
%"16" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
store i16 %"16", ptr addrspace(5) %"8", align 2
%"17" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
store i16 %"17", ptr addrspace(5) %"9", align 2
%"18" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
store i16 %"18", ptr addrspace(5) %"10", align 2
%"19" = load i64, ptr addrspace(5) %"6", align 8
%"20" = load i16, ptr addrspace(5) %"7", align 2
%"45" = inttoptr i64 %"19" to ptr
store i16 %"20", ptr %"45", align 2
%"21" = load i64, ptr addrspace(5) %"6", align 8
%"22" = load i16, ptr addrspace(5) %"8", align 2
%"47" = inttoptr i64 %"21" to ptr
%"61" = getelementptr inbounds i8, ptr %"47", i64 2
store i16 %"22", ptr %"61", align 2
%"23" = load i64, ptr addrspace(5) %"6", align 8
%"24" = load i16, ptr addrspace(5) %"9", align 2
%"49" = inttoptr i64 %"23" to ptr
%"63" = getelementptr inbounds i8, ptr %"49", i64 4
store i16 %"24", ptr %"63", align 2
%"25" = load i64, ptr addrspace(5) %"6", align 8
%"26" = load i16, ptr addrspace(5) %"10", align 2
%"51" = inttoptr i64 %"25" to ptr
%"65" = getelementptr inbounds i8, ptr %"51", i64 6
store i16 %"26", ptr %"65", align 2
store i64 %"13", ptr addrspace(5) %"6", align 8
%"14" = load i16, ptr addrspace(4) @constparams, align 2
store i16 %"14", ptr addrspace(5) %"7", align 2
%"15" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2
store i16 %"15", ptr addrspace(5) %"8", align 2
%"16" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2
store i16 %"16", ptr addrspace(5) %"9", align 2
%"17" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2
store i16 %"17", ptr addrspace(5) %"10", align 2
%"18" = load i64, ptr addrspace(5) %"6", align 8
%"19" = load i16, ptr addrspace(5) %"7", align 2
%"44" = inttoptr i64 %"18" to ptr
store i16 %"19", ptr %"44", align 2
%"20" = load i64, ptr addrspace(5) %"6", align 8
%"21" = load i16, ptr addrspace(5) %"8", align 2
%"46" = inttoptr i64 %"20" to ptr
%"59" = getelementptr inbounds i8, ptr %"46", i64 2
store i16 %"21", ptr %"59", align 2
%"22" = load i64, ptr addrspace(5) %"6", align 8
%"23" = load i16, ptr addrspace(5) %"9", align 2
%"48" = inttoptr i64 %"22" to ptr
%"61" = getelementptr inbounds i8, ptr %"48", i64 4
store i16 %"23", ptr %"61", align 2
%"24" = load i64, ptr addrspace(5) %"6", align 8
%"25" = load i16, ptr addrspace(5) %"10", align 2
%"50" = inttoptr i64 %"24" to ptr
%"63" = getelementptr inbounds i8, ptr %"50", i64 6
store i16 %"25", ptr %"63", align 2
ret void
}

View file

@ -1,30 +1,30 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
define protected amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"20", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load float, ptr addrspace(5) %"6", align 4
%"13" = fmul float %"14", 5.000000e-01
store float %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load float, ptr addrspace(5) %"6", align 4
%"21" = inttoptr i64 %"15" to ptr
store float %"16", ptr %"21", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"11" to ptr
%"10" = load float, ptr %"19", align 4
store float %"10", ptr addrspace(5) %"6", align 4
%"13" = load float, ptr addrspace(5) %"6", align 4
%"12" = fmul float %"13", 5.000000e-01
store float %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load float, ptr addrspace(5) %"6", align 4
%"20" = inttoptr i64 %"14" to ptr
store float %"15", ptr %"20", align 4
ret void
}

View file

@ -1,30 +1,30 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
define protected amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"20", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i32, ptr addrspace(5) %"6", align 4
%"13" = mul i32 %"14", -1
store i32 %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"21" = inttoptr i64 %"15" to ptr
store i32 %"16", ptr %"21", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"11" to ptr
%"10" = load i32, ptr %"19", align 4
store i32 %"10", ptr addrspace(5) %"6", align 4
%"13" = load i32, ptr addrspace(5) %"6", align 4
%"12" = mul i32 %"13", -1
store i32 %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load i32, ptr addrspace(5) %"6", align 4
%"20" = inttoptr i64 %"14" to ptr
store i32 %"15", ptr %"20", align 4
ret void
}

View file

@ -1,30 +1,30 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
"21":
define protected amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"19", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load float, ptr addrspace(5) %"6", align 4
%"13" = call afn float @llvm.cos.f32(float %"14")
store float %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load float, ptr addrspace(5) %"6", align 4
%"20" = inttoptr i64 %"15" to ptr
store float %"16", ptr %"20", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"18" = inttoptr i64 %"11" to ptr
%"10" = load float, ptr %"18", align 4
store float %"10", ptr addrspace(5) %"6", align 4
%"13" = load float, ptr addrspace(5) %"6", align 4
%"12" = call afn float @llvm.cos.f32(float %"13")
store float %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load float, ptr addrspace(5) %"6", align 4
%"19" = inttoptr i64 %"14" to ptr
store float %"15", ptr %"19", align 4
ret void
}

View file

@ -3,69 +3,69 @@ target triple = "amdgcn-amd-amdhsa"
declare float @__zluda_ptx_impl__cvt_sat_f32_f32(float) #0
define protected amdgpu_kernel void @cvt_clamp(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 {
"57":
define protected amdgpu_kernel void @cvt_clamp(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #1 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"46", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"47", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"48", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"49" = inttoptr i64 %"12" to ptr addrspace(1)
%"11" = load float, ptr addrspace(1) %"49", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load float, ptr addrspace(5) %"6", align 4
%"13" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"14")
store float %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load float, ptr addrspace(5) %"6", align 4
%"50" = inttoptr i64 %"15" to ptr addrspace(1)
store float %"16", ptr addrspace(1) %"50", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"51" = inttoptr i64 %"18" to ptr addrspace(1)
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"48" = inttoptr i64 %"11" to ptr addrspace(1)
%"10" = load float, ptr addrspace(1) %"48", align 4
store float %"10", ptr addrspace(5) %"6", align 4
%"13" = load float, ptr addrspace(5) %"6", align 4
%"12" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"13")
store float %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load float, ptr addrspace(5) %"6", align 4
%"49" = inttoptr i64 %"14" to ptr addrspace(1)
store float %"15", ptr addrspace(1) %"49", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"50" = inttoptr i64 %"17" to ptr addrspace(1)
%"60" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 4
%"16" = load float, ptr addrspace(1) %"60", align 4
store float %"16", ptr addrspace(5) %"6", align 4
%"19" = load float, ptr addrspace(5) %"6", align 4
%"18" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"19")
store float %"18", ptr addrspace(5) %"6", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load float, ptr addrspace(5) %"6", align 4
%"51" = inttoptr i64 %"20" to ptr addrspace(1)
%"62" = getelementptr inbounds i8, ptr addrspace(1) %"51", i64 4
%"17" = load float, ptr addrspace(1) %"62", align 4
store float %"17", ptr addrspace(5) %"6", align 4
%"20" = load float, ptr addrspace(5) %"6", align 4
%"19" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"20")
store float %"19", ptr addrspace(5) %"6", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load float, ptr addrspace(5) %"6", align 4
%"52" = inttoptr i64 %"21" to ptr addrspace(1)
%"64" = getelementptr inbounds i8, ptr addrspace(1) %"52", i64 4
store float %"22", ptr addrspace(1) %"64", align 4
%"24" = load i64, ptr addrspace(5) %"4", align 8
%"53" = inttoptr i64 %"24" to ptr addrspace(1)
store float %"21", ptr addrspace(1) %"62", align 4
%"23" = load i64, ptr addrspace(5) %"4", align 8
%"52" = inttoptr i64 %"23" to ptr addrspace(1)
%"64" = getelementptr inbounds i8, ptr addrspace(1) %"52", i64 8
%"22" = load float, ptr addrspace(1) %"64", align 4
store float %"22", ptr addrspace(5) %"6", align 4
%"25" = load float, ptr addrspace(5) %"6", align 4
%"24" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"25")
store float %"24", ptr addrspace(5) %"6", align 4
%"26" = load i64, ptr addrspace(5) %"5", align 8
%"27" = load float, ptr addrspace(5) %"6", align 4
%"53" = inttoptr i64 %"26" to ptr addrspace(1)
%"66" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 8
%"23" = load float, ptr addrspace(1) %"66", align 4
store float %"23", ptr addrspace(5) %"6", align 4
%"26" = load float, ptr addrspace(5) %"6", align 4
%"25" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"26")
store float %"25", ptr addrspace(5) %"6", align 4
%"27" = load i64, ptr addrspace(5) %"5", align 8
%"28" = load float, ptr addrspace(5) %"6", align 4
%"54" = inttoptr i64 %"27" to ptr addrspace(1)
%"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8
store float %"28", ptr addrspace(1) %"68", align 4
%"30" = load i64, ptr addrspace(5) %"4", align 8
%"55" = inttoptr i64 %"30" to ptr addrspace(1)
store float %"27", ptr addrspace(1) %"66", align 4
%"29" = load i64, ptr addrspace(5) %"4", align 8
%"54" = inttoptr i64 %"29" to ptr addrspace(1)
%"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 12
%"28" = load float, ptr addrspace(1) %"68", align 4
store float %"28", ptr addrspace(5) %"6", align 4
%"31" = load float, ptr addrspace(5) %"6", align 4
%"30" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"31")
store float %"30", ptr addrspace(5) %"6", align 4
%"32" = load i64, ptr addrspace(5) %"5", align 8
%"33" = load float, ptr addrspace(5) %"6", align 4
%"55" = inttoptr i64 %"32" to ptr addrspace(1)
%"70" = getelementptr inbounds i8, ptr addrspace(1) %"55", i64 12
%"29" = load float, ptr addrspace(1) %"70", align 4
store float %"29", ptr addrspace(5) %"6", align 4
%"32" = load float, ptr addrspace(5) %"6", align 4
%"31" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"32")
store float %"31", ptr addrspace(5) %"6", align 4
%"33" = load i64, ptr addrspace(5) %"5", align 8
%"34" = load float, ptr addrspace(5) %"6", align 4
%"56" = inttoptr i64 %"33" to ptr addrspace(1)
%"72" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 12
store float %"34", ptr addrspace(1) %"72", align 4
store float %"33", ptr addrspace(1) %"70", align 4
ret void
}

View file

@ -1,32 +1,32 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_f32_f16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"23":
define protected amdgpu_kernel void @cvt_f32_f16(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca half, align 2, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr addrspace(1)
%"20" = load i16, ptr addrspace(1) %"21", align 2
%"12" = bitcast i16 %"20" to half
store half %"12", ptr addrspace(5) %"6", align 2
%"15" = load half, ptr addrspace(5) %"6", align 2
%"14" = fpext half %"15" to float
store float %"14", ptr addrspace(5) %"7", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load float, ptr addrspace(5) %"7", align 4
%"22" = inttoptr i64 %"16" to ptr
store float %"17", ptr %"22", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr addrspace(1)
%"19" = load i16, ptr addrspace(1) %"20", align 2
%"11" = bitcast i16 %"19" to half
store half %"11", ptr addrspace(5) %"6", align 2
%"14" = load half, ptr addrspace(5) %"6", align 2
%"13" = fpext half %"14" to float
store float %"13", ptr addrspace(5) %"7", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load float, ptr addrspace(5) %"7", align 4
%"21" = inttoptr i64 %"15" to ptr
store float %"16", ptr %"21", align 4
ret void
}

View file

@ -9,80 +9,80 @@ declare float @__zluda_ptx_impl__cvt_rp_f32_s32(i32) #0
declare float @__zluda_ptx_impl__cvt_rz_f32_s32(i32) #0
define protected amdgpu_kernel void @cvt_f32_s32(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #1 {
"76":
define protected amdgpu_kernel void @cvt_f32_s32(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #1 {
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = load i64, ptr addrspace(4) %"49", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"50", align 8
store i64 %"12", ptr addrspace(5) %"4", align 8
%"13" = load i64, ptr addrspace(4) %"51", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"53" = inttoptr i64 %"15" to ptr
%"52" = load i32, ptr %"53", align 4
store i32 %"52", ptr addrspace(5) %"6", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"54" = inttoptr i64 %"17" to ptr
%"90" = getelementptr inbounds i8, ptr %"54", i64 4
%"55" = load i32, ptr %"90", align 4
store i32 %"55", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"56" = inttoptr i64 %"19" to ptr
%"92" = getelementptr inbounds i8, ptr %"56", i64 8
%"57" = load i32, ptr %"92", align 4
store i32 %"57", ptr addrspace(5) %"8", align 4
%"21" = load i64, ptr addrspace(5) %"4", align 8
%"58" = inttoptr i64 %"21" to ptr
%"94" = getelementptr inbounds i8, ptr %"58", i64 12
%"59" = load i32, ptr %"94", align 4
store i32 %"59", ptr addrspace(5) %"9", align 4
%"23" = load i32, ptr addrspace(5) %"6", align 4
%"60" = call float @__zluda_ptx_impl__cvt_rn_f32_s32(i32 %"23")
%"22" = bitcast float %"60" to i32
store i32 %"22", ptr addrspace(5) %"6", align 4
%"25" = load i32, ptr addrspace(5) %"7", align 4
%"62" = call float @__zluda_ptx_impl__cvt_rz_f32_s32(i32 %"25")
%"24" = bitcast float %"62" to i32
store i32 %"24", ptr addrspace(5) %"7", align 4
%"27" = load i32, ptr addrspace(5) %"8", align 4
%"64" = call float @__zluda_ptx_impl__cvt_rm_f32_s32(i32 %"27")
%"26" = bitcast float %"64" to i32
store i32 %"26", ptr addrspace(5) %"8", align 4
%"29" = load i32, ptr addrspace(5) %"9", align 4
%"66" = call float @__zluda_ptx_impl__cvt_rp_f32_s32(i32 %"29")
%"28" = bitcast float %"66" to i32
store i32 %"28", ptr addrspace(5) %"9", align 4
%"30" = load i64, ptr addrspace(5) %"5", align 8
%"31" = load i32, ptr addrspace(5) %"6", align 4
%"68" = inttoptr i64 %"30" to ptr addrspace(1)
%"69" = bitcast i32 %"31" to float
store float %"69", ptr addrspace(1) %"68", align 4
%"32" = load i64, ptr addrspace(5) %"5", align 8
%"33" = load i32, ptr addrspace(5) %"7", align 4
%"70" = inttoptr i64 %"32" to ptr addrspace(1)
%"96" = getelementptr inbounds i8, ptr addrspace(1) %"70", i64 4
%"71" = bitcast i32 %"33" to float
store float %"71", ptr addrspace(1) %"96", align 4
%"34" = load i64, ptr addrspace(5) %"5", align 8
%"35" = load i32, ptr addrspace(5) %"8", align 4
%"72" = inttoptr i64 %"34" to ptr addrspace(1)
%"98" = getelementptr inbounds i8, ptr addrspace(1) %"72", i64 8
%"73" = bitcast i32 %"35" to float
store float %"73", ptr addrspace(1) %"98", align 4
%"36" = load i64, ptr addrspace(5) %"5", align 8
%"37" = load i32, ptr addrspace(5) %"9", align 4
%"74" = inttoptr i64 %"36" to ptr addrspace(1)
%"100" = getelementptr inbounds i8, ptr addrspace(1) %"74", i64 12
%"75" = bitcast i32 %"37" to float
store float %"75", ptr addrspace(1) %"100", align 4
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"52" = inttoptr i64 %"14" to ptr
%"51" = load i32, ptr %"52", align 4
store i32 %"51", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"53" = inttoptr i64 %"16" to ptr
%"88" = getelementptr inbounds i8, ptr %"53", i64 4
%"54" = load i32, ptr %"88", align 4
store i32 %"54", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"55" = inttoptr i64 %"18" to ptr
%"90" = getelementptr inbounds i8, ptr %"55", i64 8
%"56" = load i32, ptr %"90", align 4
store i32 %"56", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"57" = inttoptr i64 %"20" to ptr
%"92" = getelementptr inbounds i8, ptr %"57", i64 12
%"58" = load i32, ptr %"92", align 4
store i32 %"58", ptr addrspace(5) %"9", align 4
%"22" = load i32, ptr addrspace(5) %"6", align 4
%"59" = call float @__zluda_ptx_impl__cvt_rn_f32_s32(i32 %"22")
%"21" = bitcast float %"59" to i32
store i32 %"21", ptr addrspace(5) %"6", align 4
%"24" = load i32, ptr addrspace(5) %"7", align 4
%"61" = call float @__zluda_ptx_impl__cvt_rz_f32_s32(i32 %"24")
%"23" = bitcast float %"61" to i32
store i32 %"23", ptr addrspace(5) %"7", align 4
%"26" = load i32, ptr addrspace(5) %"8", align 4
%"63" = call float @__zluda_ptx_impl__cvt_rm_f32_s32(i32 %"26")
%"25" = bitcast float %"63" to i32
store i32 %"25", ptr addrspace(5) %"8", align 4
%"28" = load i32, ptr addrspace(5) %"9", align 4
%"65" = call float @__zluda_ptx_impl__cvt_rp_f32_s32(i32 %"28")
%"27" = bitcast float %"65" to i32
store i32 %"27", ptr addrspace(5) %"9", align 4
%"29" = load i64, ptr addrspace(5) %"5", align 8
%"30" = load i32, ptr addrspace(5) %"6", align 4
%"67" = inttoptr i64 %"29" to ptr addrspace(1)
%"68" = bitcast i32 %"30" to float
store float %"68", ptr addrspace(1) %"67", align 4
%"31" = load i64, ptr addrspace(5) %"5", align 8
%"32" = load i32, ptr addrspace(5) %"7", align 4
%"69" = inttoptr i64 %"31" to ptr addrspace(1)
%"94" = getelementptr inbounds i8, ptr addrspace(1) %"69", i64 4
%"70" = bitcast i32 %"32" to float
store float %"70", ptr addrspace(1) %"94", align 4
%"33" = load i64, ptr addrspace(5) %"5", align 8
%"34" = load i32, ptr addrspace(5) %"8", align 4
%"71" = inttoptr i64 %"33" to ptr addrspace(1)
%"96" = getelementptr inbounds i8, ptr addrspace(1) %"71", i64 8
%"72" = bitcast i32 %"34" to float
store float %"72", ptr addrspace(1) %"96", align 4
%"35" = load i64, ptr addrspace(5) %"5", align 8
%"36" = load i32, ptr addrspace(5) %"9", align 4
%"73" = inttoptr i64 %"35" to ptr addrspace(1)
%"98" = getelementptr inbounds i8, ptr addrspace(1) %"73", i64 12
%"74" = bitcast i32 %"36" to float
store float %"74", ptr addrspace(1) %"98", align 4
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
define protected amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
%"7" = alloca double, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"13" to ptr addrspace(1)
%"12" = load float, ptr addrspace(1) %"20", align 4
store float %"12", ptr addrspace(5) %"6", align 4
%"15" = load float, ptr addrspace(5) %"6", align 4
%"14" = fpext float %"15" to double
store double %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load double, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"16" to ptr
store double %"17", ptr %"21", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr addrspace(1)
%"11" = load float, ptr addrspace(1) %"19", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load float, ptr addrspace(5) %"6", align 4
%"13" = fpext float %"14" to double
store double %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load double, ptr addrspace(5) %"7", align 8
%"20" = inttoptr i64 %"15" to ptr
store double %"16", ptr %"20", align 8
ret void
}

View file

@ -1,44 +1,44 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
"34":
define protected amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"30", align 4
store float %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"15" to ptr
%"36" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load float, ptr %"36", align 4
store float %"14", ptr addrspace(5) %"7", align 4
%"17" = load float, ptr addrspace(5) %"6", align 4
%"16" = call float @llvm.rint.f32(float %"17")
store float %"16", ptr addrspace(5) %"6", align 4
%"19" = load float, ptr addrspace(5) %"7", align 4
%"18" = call float @llvm.rint.f32(float %"19")
store float %"18", ptr addrspace(5) %"7", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load float, ptr addrspace(5) %"6", align 4
%"32" = inttoptr i64 %"20" to ptr
store float %"21", ptr %"32", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load float, ptr addrspace(5) %"7", align 4
%"33" = inttoptr i64 %"22" to ptr
%"38" = getelementptr inbounds i8, ptr %"33", i64 4
store float %"23", ptr %"38", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"29" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"29", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"14" to ptr
%"34" = getelementptr inbounds i8, ptr %"30", i64 4
%"13" = load float, ptr %"34", align 4
store float %"13", ptr addrspace(5) %"7", align 4
%"16" = load float, ptr addrspace(5) %"6", align 4
%"15" = call float @llvm.rint.f32(float %"16")
store float %"15", ptr addrspace(5) %"6", align 4
%"18" = load float, ptr addrspace(5) %"7", align 4
%"17" = call float @llvm.rint.f32(float %"18")
store float %"17", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load float, ptr addrspace(5) %"6", align 4
%"31" = inttoptr i64 %"19" to ptr
store float %"20", ptr %"31", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load float, ptr addrspace(5) %"7", align 4
%"32" = inttoptr i64 %"21" to ptr
%"36" = getelementptr inbounds i8, ptr %"32", i64 4
store float %"22", ptr %"36", align 4
ret void
}

View file

@ -1,44 +1,44 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
"34":
define protected amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"30", align 4
store float %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"15" to ptr
%"36" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load float, ptr %"36", align 4
store float %"14", ptr addrspace(5) %"7", align 4
%"17" = load float, ptr addrspace(5) %"6", align 4
%"16" = call float @llvm.trunc.f32(float %"17")
store float %"16", ptr addrspace(5) %"6", align 4
%"19" = load float, ptr addrspace(5) %"7", align 4
%"18" = call float @llvm.trunc.f32(float %"19")
store float %"18", ptr addrspace(5) %"7", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load float, ptr addrspace(5) %"6", align 4
%"32" = inttoptr i64 %"20" to ptr
store float %"21", ptr %"32", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load float, ptr addrspace(5) %"7", align 4
%"33" = inttoptr i64 %"22" to ptr
%"38" = getelementptr inbounds i8, ptr %"33", i64 4
store float %"23", ptr %"38", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"29" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"29", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"14" to ptr
%"34" = getelementptr inbounds i8, ptr %"30", i64 4
%"13" = load float, ptr %"34", align 4
store float %"13", ptr addrspace(5) %"7", align 4
%"16" = load float, ptr addrspace(5) %"6", align 4
%"15" = call float @llvm.trunc.f32(float %"16")
store float %"15", ptr addrspace(5) %"6", align 4
%"18" = load float, ptr addrspace(5) %"7", align 4
%"17" = call float @llvm.trunc.f32(float %"18")
store float %"17", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load float, ptr addrspace(5) %"6", align 4
%"31" = inttoptr i64 %"19" to ptr
store float %"20", ptr %"31", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load float, ptr addrspace(5) %"7", align 4
%"32" = inttoptr i64 %"21" to ptr
%"36" = getelementptr inbounds i8, ptr %"32", i64 4
store float %"22", ptr %"36", align 4
ret void
}

View file

@ -1,33 +1,33 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"24":
define protected amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"13" to ptr addrspace(1)
%"12" = load i32, ptr addrspace(1) %"20", align 4
store i32 %"12", ptr addrspace(5) %"7", align 4
%"15" = load i32, ptr addrspace(5) %"7", align 4
%"26" = trunc i32 %"15" to i8
%"21" = sext i8 %"26" to i16
%"14" = sext i16 %"21" to i32
store i32 %"14", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i32, ptr addrspace(5) %"6", align 4
%"23" = inttoptr i64 %"16" to ptr
store i32 %"17", ptr %"23", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr addrspace(1)
%"11" = load i32, ptr addrspace(1) %"19", align 4
store i32 %"11", ptr addrspace(5) %"7", align 4
%"14" = load i32, ptr addrspace(5) %"7", align 4
%"24" = trunc i32 %"14" to i8
%"20" = sext i8 %"24" to i16
%"13" = sext i16 %"20" to i32
store i32 %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"22" = inttoptr i64 %"15" to ptr
store i32 %"16", ptr %"22", align 4
ret void
}

View file

@ -3,48 +3,48 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float) #0
define protected amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
"42":
define protected amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #1 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"13" to ptr
%"30" = load float, ptr %"31", align 4
%"12" = bitcast float %"30" to i32
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"15" to ptr
%"47" = getelementptr inbounds i8, ptr %"32", i64 4
%"33" = load float, ptr %"47", align 4
%"14" = bitcast float %"33" to i32
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i32, ptr addrspace(5) %"6", align 4
%"35" = bitcast i32 %"17" to float
%"34" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"35")
store i32 %"34", ptr addrspace(5) %"6", align 4
%"19" = load i32, ptr addrspace(5) %"7", align 4
%"37" = bitcast i32 %"19" to float
%"36" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"37")
store i32 %"36", ptr addrspace(5) %"7", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i32, ptr addrspace(5) %"6", align 4
%"38" = inttoptr i64 %"20" to ptr addrspace(1)
store i32 %"21", ptr addrspace(1) %"38", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"7", align 4
%"40" = inttoptr i64 %"22" to ptr addrspace(1)
%"49" = getelementptr inbounds i8, ptr addrspace(1) %"40", i64 4
store i32 %"23", ptr addrspace(1) %"49", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"12" to ptr
%"29" = load float, ptr %"30", align 4
%"11" = bitcast float %"29" to i32
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"45" = getelementptr inbounds i8, ptr %"31", i64 4
%"32" = load float, ptr %"45", align 4
%"13" = bitcast float %"32" to i32
store i32 %"13", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"34" = bitcast i32 %"16" to float
%"33" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"34")
store i32 %"33", ptr addrspace(5) %"6", align 4
%"18" = load i32, ptr addrspace(5) %"7", align 4
%"36" = bitcast i32 %"18" to float
%"35" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"36")
store i32 %"35", ptr addrspace(5) %"7", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"37" = inttoptr i64 %"19" to ptr addrspace(1)
store i32 %"20", ptr addrspace(1) %"37", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"39" = inttoptr i64 %"21" to ptr addrspace(1)
%"47" = getelementptr inbounds i8, ptr addrspace(1) %"39", i64 4
store i32 %"22", ptr addrspace(1) %"47", align 4
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"24":
define protected amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr
%"20" = load i32, ptr %"21", align 4
store i32 %"20", ptr addrspace(5) %"6", align 4
%"15" = load i32, ptr addrspace(5) %"6", align 4
%"14" = sext i32 %"15" to i64
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"22", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"19" = load i32, ptr %"20", align 4
store i32 %"19", ptr addrspace(5) %"6", align 4
%"14" = load i32, ptr addrspace(5) %"6", align 4
%"13" = sext i32 %"14" to i64
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"15" to ptr
store i64 %"16", ptr %"21", align 8
ret void
}

View file

@ -1,50 +1,50 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
"35":
define protected amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"11" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"29" = inttoptr i64 %"14" to ptr
%"13" = load i32, ptr %"29", align 4
store i32 %"13", ptr addrspace(5) %"6", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%0 = call i32 @llvm.smax.i32(i32 %"16", i32 0)
%1 = alloca i32, align 4, addrspace(5)
store i32 %0, ptr addrspace(5) %1, align 4
%"15" = load i32, ptr addrspace(5) %1, align 4
store i32 %"15", ptr addrspace(5) %"7", align 4
%"18" = load i32, ptr addrspace(5) %"7", align 4
%2 = alloca i32, align 4, addrspace(5)
store i32 %"18", ptr addrspace(5) %2, align 4
%"30" = load i32, ptr addrspace(5) %2, align 4
store i32 %"30", ptr addrspace(5) %"7", align 4
%"20" = load i32, ptr addrspace(5) %"6", align 4
%3 = alloca i32, align 4, addrspace(5)
store i32 %"20", ptr addrspace(5) %3, align 4
%"31" = load i32, ptr addrspace(5) %3, align 4
store i32 %"31", ptr addrspace(5) %"8", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"32" = inttoptr i64 %"21" to ptr
store i32 %"22", ptr %"32", align 4
%"23" = load i64, ptr addrspace(5) %"5", align 8
%"24" = load i32, ptr addrspace(5) %"8", align 4
%"34" = inttoptr i64 %"23" to ptr
%"37" = getelementptr inbounds i8, ptr %"34", i64 4
store i32 %"24", ptr %"37", align 4
br label %4
4: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"26", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"28" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"28", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i32, ptr addrspace(5) %"6", align 4
%5 = call i32 @llvm.smax.i32(i32 %"15", i32 0)
store i32 %5, ptr addrspace(5) %1, align 4
%"14" = load i32, ptr addrspace(5) %1, align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i32, ptr addrspace(5) %"7", align 4
store i32 %"17", ptr addrspace(5) %2, align 4
%"29" = load i32, ptr addrspace(5) %2, align 4
store i32 %"29", ptr addrspace(5) %"7", align 4
%"19" = load i32, ptr addrspace(5) %"6", align 4
store i32 %"19", ptr addrspace(5) %3, align 4
%"30" = load i32, ptr addrspace(5) %3, align 4
store i32 %"30", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"31" = inttoptr i64 %"20" to ptr
store i32 %"21", ptr %"31", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"33" = inttoptr i64 %"22" to ptr
%"35" = getelementptr inbounds i8, ptr %"33", i64 4
store i32 %"23", ptr %"35", align 4
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"24":
define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i16, align 2, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"13" to ptr addrspace(1)
%"12" = load i16, ptr addrspace(1) %"20", align 2
store i16 %"12", ptr addrspace(5) %"6", align 2
%"15" = load i16, ptr addrspace(5) %"6", align 2
%"21" = sext i16 %"15" to i32
store i32 %"21", ptr addrspace(5) %"7", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i32, ptr addrspace(5) %"7", align 4
%"23" = inttoptr i64 %"16" to ptr
store i32 %"17", ptr %"23", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr addrspace(1)
%"11" = load i16, ptr addrspace(1) %"19", align 2
store i16 %"11", ptr addrspace(5) %"6", align 2
%"14" = load i16, ptr addrspace(5) %"6", align 2
%"20" = sext i16 %"14" to i32
store i32 %"20", ptr addrspace(5) %"7", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i32, ptr addrspace(5) %"7", align 4
%"22" = inttoptr i64 %"15" to ptr
store i32 %"16", ptr %"22", align 4
ret void
}

View file

@ -1,37 +1,37 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
"27":
define protected amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%0 = inttoptr i64 %"12" to ptr
%1 = addrspacecast ptr %0 to ptr addrspace(1)
%"21" = ptrtoint ptr addrspace(1) %1 to i64
store i64 %"21", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%2 = inttoptr i64 %"14" to ptr
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%2 = inttoptr i64 %"11" to ptr
%3 = addrspacecast ptr %2 to ptr addrspace(1)
%"23" = ptrtoint ptr addrspace(1) %3 to i64
store i64 %"23", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"20" = ptrtoint ptr addrspace(1) %3 to i64
store i64 %"20", ptr addrspace(5) %"4", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%4 = inttoptr i64 %"13" to ptr
%5 = addrspacecast ptr %4 to ptr addrspace(1)
%"22" = ptrtoint ptr addrspace(1) %5 to i64
store i64 %"22", ptr addrspace(5) %"5", align 8
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"15" to ptr addrspace(1)
%"14" = load float, ptr addrspace(1) %"24", align 4
store float %"14", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load float, ptr addrspace(5) %"6", align 4
%"25" = inttoptr i64 %"16" to ptr addrspace(1)
%"15" = load float, ptr addrspace(1) %"25", align 4
store float %"15", ptr addrspace(5) %"6", align 4
%"17" = load i64, ptr addrspace(5) %"5", align 8
%"18" = load float, ptr addrspace(5) %"6", align 4
%"26" = inttoptr i64 %"17" to ptr addrspace(1)
store float %"18", ptr addrspace(1) %"26", align 4
store float %"17", ptr addrspace(1) %"25", align 4
ret void
}

View file

@ -1,37 +1,37 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"28":
define protected amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"25", align 4
store float %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"15" to ptr
%"30" = getelementptr inbounds i8, ptr %"26", i64 4
%"14" = load float, ptr %"30", align 4
store float %"14", ptr addrspace(5) %"7", align 4
%"17" = load float, ptr addrspace(5) %"6", align 4
%"18" = load float, ptr addrspace(5) %"7", align 4
%"16" = fdiv arcp afn float %"17", %"18"
store float %"16", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load float, ptr addrspace(5) %"6", align 4
%"27" = inttoptr i64 %"19" to ptr
store float %"20", ptr %"27", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"24", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"14" to ptr
%"28" = getelementptr inbounds i8, ptr %"25", i64 4
%"13" = load float, ptr %"28", align 4
store float %"13", ptr addrspace(5) %"7", align 4
%"16" = load float, ptr addrspace(5) %"6", align 4
%"17" = load float, ptr addrspace(5) %"7", align 4
%"15" = fdiv arcp afn float %"16", %"17"
store float %"15", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load float, ptr addrspace(5) %"6", align 4
%"26" = inttoptr i64 %"18" to ptr
store float %"19", ptr %"26", align 4
ret void
}

View file

@ -3,44 +3,44 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__dp4a_s32_s32(i32, i32, i32) #0
define protected amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 {
"39":
define protected amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"13" = load i32, ptr %"31", align 4
store i32 %"13", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"16" to ptr
%"46" = getelementptr inbounds i8, ptr %"32", i64 4
%"15" = load i32, ptr %"46", align 4
store i32 %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"33" = inttoptr i64 %"18" to ptr
%"48" = getelementptr inbounds i8, ptr %"33", i64 8
%"17" = load i32, ptr %"48", align 4
store i32 %"17", ptr addrspace(5) %"8", align 4
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"21" = load i32, ptr addrspace(5) %"7", align 4
%"22" = load i32, ptr addrspace(5) %"8", align 4
%"34" = call i32 @__zluda_ptx_impl__dp4a_s32_s32(i32 %"20", i32 %"21", i32 %"22")
store i32 %"34", ptr addrspace(5) %"6", align 4
%"23" = load i64, ptr addrspace(5) %"5", align 8
%"24" = load i32, ptr addrspace(5) %"6", align 4
%"38" = inttoptr i64 %"23" to ptr
store i32 %"24", ptr %"38", align 4
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"30", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"15" to ptr
%"44" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load i32, ptr %"44", align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"17" to ptr
%"46" = getelementptr inbounds i8, ptr %"32", i64 8
%"16" = load i32, ptr %"46", align 4
store i32 %"16", ptr addrspace(5) %"8", align 4
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"20" = load i32, ptr addrspace(5) %"7", align 4
%"21" = load i32, ptr addrspace(5) %"8", align 4
%"33" = call i32 @__zluda_ptx_impl__dp4a_s32_s32(i32 %"19", i32 %"20", i32 %"21")
store i32 %"33", ptr addrspace(5) %"6", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load i32, ptr addrspace(5) %"6", align 4
%"37" = inttoptr i64 %"22" to ptr
store i32 %"23", ptr %"37", align 4
ret void
}

View file

@ -1,69 +1,69 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 {
"57":
define protected amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"46", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"47", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"48", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"49" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"49", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load float, ptr addrspace(5) %"6", align 4
%"13" = call afn float @llvm.exp2.f32(float %"14")
store float %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load float, ptr addrspace(5) %"6", align 4
%"50" = inttoptr i64 %"15" to ptr
store float %"16", ptr %"50", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"51" = inttoptr i64 %"18" to ptr
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"48" = inttoptr i64 %"11" to ptr
%"10" = load float, ptr %"48", align 4
store float %"10", ptr addrspace(5) %"6", align 4
%"13" = load float, ptr addrspace(5) %"6", align 4
%"12" = call afn float @llvm.exp2.f32(float %"13")
store float %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load float, ptr addrspace(5) %"6", align 4
%"49" = inttoptr i64 %"14" to ptr
store float %"15", ptr %"49", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"50" = inttoptr i64 %"17" to ptr
%"57" = getelementptr inbounds i8, ptr %"50", i64 4
%"16" = load float, ptr %"57", align 4
store float %"16", ptr addrspace(5) %"6", align 4
%"19" = load float, ptr addrspace(5) %"6", align 4
%"18" = call afn float @llvm.exp2.f32(float %"19")
store float %"18", ptr addrspace(5) %"6", align 4
%"20" = load i64, ptr addrspace(5) %"5", align 8
%"21" = load float, ptr addrspace(5) %"6", align 4
%"51" = inttoptr i64 %"20" to ptr
%"59" = getelementptr inbounds i8, ptr %"51", i64 4
%"17" = load float, ptr %"59", align 4
store float %"17", ptr addrspace(5) %"6", align 4
%"20" = load float, ptr addrspace(5) %"6", align 4
%"19" = call afn float @llvm.exp2.f32(float %"20")
store float %"19", ptr addrspace(5) %"6", align 4
%"21" = load i64, ptr addrspace(5) %"5", align 8
%"22" = load float, ptr addrspace(5) %"6", align 4
%"52" = inttoptr i64 %"21" to ptr
%"61" = getelementptr inbounds i8, ptr %"52", i64 4
store float %"22", ptr %"61", align 4
%"24" = load i64, ptr addrspace(5) %"4", align 8
%"53" = inttoptr i64 %"24" to ptr
store float %"21", ptr %"59", align 4
%"23" = load i64, ptr addrspace(5) %"4", align 8
%"52" = inttoptr i64 %"23" to ptr
%"61" = getelementptr inbounds i8, ptr %"52", i64 8
%"22" = load float, ptr %"61", align 4
store float %"22", ptr addrspace(5) %"6", align 4
%"25" = load float, ptr addrspace(5) %"6", align 4
%"24" = call afn float @llvm.exp2.f32(float %"25")
store float %"24", ptr addrspace(5) %"6", align 4
%"26" = load i64, ptr addrspace(5) %"5", align 8
%"27" = load float, ptr addrspace(5) %"6", align 4
%"53" = inttoptr i64 %"26" to ptr
%"63" = getelementptr inbounds i8, ptr %"53", i64 8
%"23" = load float, ptr %"63", align 4
store float %"23", ptr addrspace(5) %"6", align 4
%"26" = load float, ptr addrspace(5) %"6", align 4
%"25" = call afn float @llvm.exp2.f32(float %"26")
store float %"25", ptr addrspace(5) %"6", align 4
%"27" = load i64, ptr addrspace(5) %"5", align 8
%"28" = load float, ptr addrspace(5) %"6", align 4
%"54" = inttoptr i64 %"27" to ptr
%"65" = getelementptr inbounds i8, ptr %"54", i64 8
store float %"28", ptr %"65", align 4
%"30" = load i64, ptr addrspace(5) %"4", align 8
%"55" = inttoptr i64 %"30" to ptr
store float %"27", ptr %"63", align 4
%"29" = load i64, ptr addrspace(5) %"4", align 8
%"54" = inttoptr i64 %"29" to ptr
%"65" = getelementptr inbounds i8, ptr %"54", i64 12
%"28" = load float, ptr %"65", align 4
store float %"28", ptr addrspace(5) %"6", align 4
%"31" = load float, ptr addrspace(5) %"6", align 4
%"30" = call afn float @llvm.exp2.f32(float %"31")
store float %"30", ptr addrspace(5) %"6", align 4
%"32" = load i64, ptr addrspace(5) %"5", align 8
%"33" = load float, ptr addrspace(5) %"6", align 4
%"55" = inttoptr i64 %"32" to ptr
%"67" = getelementptr inbounds i8, ptr %"55", i64 12
%"29" = load float, ptr %"67", align 4
store float %"29", ptr addrspace(5) %"6", align 4
%"32" = load float, ptr addrspace(5) %"6", align 4
%"31" = call afn float @llvm.exp2.f32(float %"32")
store float %"31", ptr addrspace(5) %"6", align 4
%"33" = load i64, ptr addrspace(5) %"5", align 8
%"34" = load float, ptr addrspace(5) %"6", align 4
%"56" = inttoptr i64 %"33" to ptr
%"69" = getelementptr inbounds i8, ptr %"56", i64 12
store float %"34", ptr %"69", align 4
store float %"33", ptr %"67", align 4
ret void
}

View file

@ -3,31 +3,31 @@ target triple = "amdgcn-amd-amdhsa"
@shared_mem = external hidden addrspace(3) global [0 x i32]
define protected amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"24":
define protected amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"5", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"20" = inttoptr i64 %"13" to ptr addrspace(1)
%"12" = load i64, ptr addrspace(1) %"20", align 8
store i64 %"12", ptr addrspace(5) %"7", align 8
%"14" = load i64, ptr addrspace(5) %"7", align 8
store i64 %"14", ptr addrspace(3) @shared_mem, align 8
%"15" = load i64, ptr addrspace(3) @shared_mem, align 8
store i64 %"15", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"6", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"23" = inttoptr i64 %"16" to ptr addrspace(1)
store i64 %"17", ptr addrspace(1) %"23", align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"12" = load i64, ptr addrspace(5) %"5", align 8
%"19" = inttoptr i64 %"12" to ptr addrspace(1)
%"11" = load i64, ptr addrspace(1) %"19", align 8
store i64 %"11", ptr addrspace(5) %"7", align 8
%"13" = load i64, ptr addrspace(5) %"7", align 8
store i64 %"13", ptr addrspace(3) @shared_mem, align 8
%"14" = load i64, ptr addrspace(3) @shared_mem, align 8
store i64 %"14", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"15" to ptr addrspace(1)
store i64 %"16", ptr addrspace(1) %"22", align 8
ret void
}

View file

@ -3,49 +3,49 @@ target triple = "amdgcn-amd-amdhsa"
@shared_mem = external hidden addrspace(3) global [0 x i32], align 4
define private void @"2"(ptr addrspace(3) %"37") #0 {
"35":
define private void @"2"(ptr addrspace(3) %"33") #0 {
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"3" = alloca i64, align 8, addrspace(5)
%"14" = load i64, ptr addrspace(3) %"37", align 8
store i64 %"14", ptr addrspace(5) %"3", align 8
%"16" = load i64, ptr addrspace(5) %"3", align 8
%"15" = add i64 %"16", 2
store i64 %"15", ptr addrspace(5) %"3", align 8
%"17" = load i64, ptr addrspace(5) %"3", align 8
store i64 %"17", ptr addrspace(3) %"37", align 8
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"10", align 1
%"12" = load i64, ptr addrspace(3) %"33", align 8
store i64 %"12", ptr addrspace(5) %"3", align 8
%"14" = load i64, ptr addrspace(5) %"3", align 8
%"13" = add i64 %"14", 2
store i64 %"13", ptr addrspace(5) %"3", align 8
%"15" = load i64, ptr addrspace(5) %"3", align 8
store i64 %"15", ptr addrspace(3) %"33", align 8
ret void
}
define protected amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 {
"36":
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"13" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"13", align 1
define protected amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 {
%"11" = alloca i1, align 1, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
%"9" = alloca i64, align 8, addrspace(5)
%"18" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"18", ptr addrspace(5) %"7", align 8
%"19" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"19", ptr addrspace(5) %"8", align 8
%"21" = load i64, ptr addrspace(5) %"7", align 8
%"31" = inttoptr i64 %"21" to ptr addrspace(1)
%"20" = load i64, ptr addrspace(1) %"31", align 8
store i64 %"20", ptr addrspace(5) %"9", align 8
%"22" = load i64, ptr addrspace(5) %"9", align 8
store i64 %"22", ptr addrspace(3) @shared_mem, align 8
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"11", align 1
%"16" = load i64, ptr addrspace(4) %"25", align 8
store i64 %"16", ptr addrspace(5) %"7", align 8
%"17" = load i64, ptr addrspace(4) %"26", align 8
store i64 %"17", ptr addrspace(5) %"8", align 8
%"19" = load i64, ptr addrspace(5) %"7", align 8
%"29" = inttoptr i64 %"19" to ptr addrspace(1)
%"18" = load i64, ptr addrspace(1) %"29", align 8
store i64 %"18", ptr addrspace(5) %"9", align 8
%"20" = load i64, ptr addrspace(5) %"9", align 8
store i64 %"20", ptr addrspace(3) @shared_mem, align 8
call void @"2"(ptr addrspace(3) @shared_mem)
%"23" = load i64, ptr addrspace(3) @shared_mem, align 8
store i64 %"23", ptr addrspace(5) %"9", align 8
%"24" = load i64, ptr addrspace(5) %"8", align 8
%"25" = load i64, ptr addrspace(5) %"9", align 8
%"34" = inttoptr i64 %"24" to ptr addrspace(1)
store i64 %"25", ptr addrspace(1) %"34", align 8
%"21" = load i64, ptr addrspace(3) @shared_mem, align 8
store i64 %"21", ptr addrspace(5) %"9", align 8
%"22" = load i64, ptr addrspace(5) %"8", align 8
%"23" = load i64, ptr addrspace(5) %"9", align 8
%"32" = inttoptr i64 %"22" to ptr addrspace(1)
store i64 %"23", ptr addrspace(1) %"32", align 8
ret void
}

View file

@ -1,44 +1,44 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 {
"35":
define protected amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
%"8" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"11", ptr addrspace(5) %"4", align 8
%"12" = load i64, ptr addrspace(4) %"30", align 8
store i64 %"12", ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"14" to ptr
%"13" = load float, ptr %"31", align 4
store float %"13", ptr addrspace(5) %"6", align 4
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"16" to ptr
%"37" = getelementptr inbounds i8, ptr %"32", i64 4
%"15" = load float, ptr %"37", align 4
store float %"15", ptr addrspace(5) %"7", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"33" = inttoptr i64 %"18" to ptr
%"39" = getelementptr inbounds i8, ptr %"33", i64 8
%"17" = load float, ptr %"39", align 4
store float %"17", ptr addrspace(5) %"8", align 4
%"20" = load float, ptr addrspace(5) %"6", align 4
%"21" = load float, ptr addrspace(5) %"7", align 4
%"22" = load float, ptr addrspace(5) %"8", align 4
%"19" = call float @llvm.fma.f32(float %"20", float %"21", float %"22")
store float %"19", ptr addrspace(5) %"6", align 4
%"23" = load i64, ptr addrspace(5) %"5", align 8
%"24" = load float, ptr addrspace(5) %"6", align 4
%"34" = inttoptr i64 %"23" to ptr
store float %"24", ptr %"34", align 4
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"30", align 4
store float %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"15" to ptr
%"35" = getelementptr inbounds i8, ptr %"31", i64 4
%"14" = load float, ptr %"35", align 4
store float %"14", ptr addrspace(5) %"7", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"32" = inttoptr i64 %"17" to ptr
%"37" = getelementptr inbounds i8, ptr %"32", i64 8
%"16" = load float, ptr %"37", align 4
store float %"16", ptr addrspace(5) %"8", align 4
%"19" = load float, ptr addrspace(5) %"6", align 4
%"20" = load float, ptr addrspace(5) %"7", align 4
%"21" = load float, ptr addrspace(5) %"8", align 4
%"18" = call float @llvm.fma.f32(float %"19", float %"20", float %"21")
store float %"18", ptr addrspace(5) %"6", align 4
%"22" = load i64, ptr addrspace(5) %"5", align 8
%"23" = load float, ptr addrspace(5) %"6", align 4
%"33" = inttoptr i64 %"22" to ptr
store float %"23", ptr %"33", align 4
ret void
}

View file

@ -1,56 +1,56 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define private float @"1"(float %"17", float %"18") #0 {
"40":
define private float @"1"(float %"15", float %"16") #0 {
%"3" = alloca float, align 4, addrspace(5)
%"4" = alloca float, align 4, addrspace(5)
%"2" = alloca float, align 4, addrspace(5)
%"13" = alloca i1, align 1, addrspace(5)
br label %1
1: ; preds = %0
store float %"15", ptr addrspace(5) %"3", align 4
store float %"16", ptr addrspace(5) %"4", align 4
store i1 false, ptr addrspace(5) %"13", align 1
%"14" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"14", align 1
store float %"17", ptr addrspace(5) %"3", align 4
store float %"18", ptr addrspace(5) %"4", align 4
%"20" = load float, ptr addrspace(5) %"3", align 4
%"21" = load float, ptr addrspace(5) %"4", align 4
%"19" = fadd float %"20", %"21"
store float %"19", ptr addrspace(5) %"2", align 4
%"22" = load float, ptr addrspace(5) %"2", align 4
ret float %"22"
%"18" = load float, ptr addrspace(5) %"3", align 4
%"19" = load float, ptr addrspace(5) %"4", align 4
%"17" = fadd float %"18", %"19"
store float %"17", ptr addrspace(5) %"2", align 4
%"20" = load float, ptr addrspace(5) %"2", align 4
ret float %"20"
}
define protected amdgpu_kernel void @func_ptr(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 {
"41":
%"15" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"15", align 1
%"16" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"16", align 1
define protected amdgpu_kernel void @func_ptr(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"14" = alloca i1, align 1, addrspace(5)
%"8" = alloca i64, align 8, addrspace(5)
%"9" = alloca i64, align 8, addrspace(5)
%"10" = alloca i64, align 8, addrspace(5)
%"11" = alloca i64, align 8, addrspace(5)
%"12" = alloca i64, align 8, addrspace(5)
%"23" = load i64, ptr addrspace(4) %"36", align 8
store i64 %"23", ptr addrspace(5) %"8", align 8
%"24" = load i64, ptr addrspace(4) %"37", align 8
store i64 %"24", ptr addrspace(5) %"9", align 8
%"26" = load i64, ptr addrspace(5) %"8", align 8
%"38" = inttoptr i64 %"26" to ptr
%"25" = load i64, ptr %"38", align 8
store i64 %"25", ptr addrspace(5) %"10", align 8
%"28" = load i64, ptr addrspace(5) %"10", align 8
%"27" = add i64 %"28", 1
store i64 %"27", ptr addrspace(5) %"11", align 8
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"14", align 1
%"21" = load i64, ptr addrspace(4) %"34", align 8
store i64 %"21", ptr addrspace(5) %"8", align 8
%"22" = load i64, ptr addrspace(4) %"35", align 8
store i64 %"22", ptr addrspace(5) %"9", align 8
%"24" = load i64, ptr addrspace(5) %"8", align 8
%"36" = inttoptr i64 %"24" to ptr
%"23" = load i64, ptr %"36", align 8
store i64 %"23", ptr addrspace(5) %"10", align 8
%"26" = load i64, ptr addrspace(5) %"10", align 8
%"25" = add i64 %"26", 1
store i64 %"25", ptr addrspace(5) %"11", align 8
store i64 ptrtoint (ptr @"1" to i64), ptr addrspace(5) %"12", align 8
%"31" = load i64, ptr addrspace(5) %"11", align 8
%"32" = load i64, ptr addrspace(5) %"12", align 8
%"30" = add i64 %"31", %"32"
store i64 %"30", ptr addrspace(5) %"11", align 8
%"33" = load i64, ptr addrspace(5) %"9", align 8
%"34" = load i64, ptr addrspace(5) %"11", align 8
%"39" = inttoptr i64 %"33" to ptr
store i64 %"34", ptr %"39", align 8
%"29" = load i64, ptr addrspace(5) %"11", align 8
%"30" = load i64, ptr addrspace(5) %"12", align 8
%"28" = add i64 %"29", %"30"
store i64 %"28", ptr addrspace(5) %"11", align 8
%"31" = load i64, ptr addrspace(5) %"9", align 8
%"32" = load i64, ptr addrspace(5) %"11", align 8
%"37" = inttoptr i64 %"31" to ptr
store i64 %"32", ptr %"37", align 8
ret void
}

View file

@ -4,66 +4,66 @@ target triple = "amdgcn-amd-amdhsa"
@foo = protected addrspace(1) externally_initialized global [4 x i32] [i32 2, i32 3, i32 5, i32 7]
@bar = protected addrspace(1) externally_initialized global [4 x i64] [i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 4), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 8), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 12)]
define protected amdgpu_kernel void @generic(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 {
"58":
define protected amdgpu_kernel void @generic(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 {
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
%"12" = load i64, ptr addrspace(4) %"48", align 8
store i64 %"12", ptr addrspace(5) %"7", align 8
%0 = alloca i32, align 4, addrspace(5)
store i32 1, ptr addrspace(5) %0, align 4
%"13" = load i32, ptr addrspace(5) %0, align 4
store i32 %"13", ptr addrspace(5) %"8", align 4
%"14" = load i64, ptr addrspace(1) @bar, align 8
store i64 %"14", ptr addrspace(5) %"6", align 8
%"16" = load i64, ptr addrspace(5) %"6", align 8
%"50" = inttoptr i64 %"16" to ptr
%"15" = load i32, ptr %"50", align 4
store i32 %"15", ptr addrspace(5) %"9", align 4
%"18" = load i32, ptr addrspace(5) %"8", align 4
%"19" = load i32, ptr addrspace(5) %"9", align 4
%"17" = mul i32 %"18", %"19"
store i32 %"17", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 8), align 8
store i64 %"20", ptr addrspace(5) %"6", align 8
%"22" = load i64, ptr addrspace(5) %"6", align 8
%"52" = inttoptr i64 %"22" to ptr
%"21" = load i32, ptr %"52", align 4
store i32 %"21", ptr addrspace(5) %"9", align 4
%"24" = load i32, ptr addrspace(5) %"8", align 4
%"25" = load i32, ptr addrspace(5) %"9", align 4
%"23" = mul i32 %"24", %"25"
store i32 %"23", ptr addrspace(5) %"8", align 4
%"26" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 16), align 8
store i64 %"26", ptr addrspace(5) %"6", align 8
%"28" = load i64, ptr addrspace(5) %"6", align 8
%"54" = inttoptr i64 %"28" to ptr
%"27" = load i32, ptr %"54", align 4
store i32 %"27", ptr addrspace(5) %"9", align 4
%"30" = load i32, ptr addrspace(5) %"8", align 4
%"31" = load i32, ptr addrspace(5) %"9", align 4
%"29" = mul i32 %"30", %"31"
store i32 %"29", ptr addrspace(5) %"8", align 4
%"32" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 24), align 8
store i64 %"32", ptr addrspace(5) %"6", align 8
%"34" = load i64, ptr addrspace(5) %"6", align 8
%"56" = inttoptr i64 %"34" to ptr
%"33" = load i32, ptr %"56", align 4
store i32 %"33", ptr addrspace(5) %"9", align 4
%"36" = load i32, ptr addrspace(5) %"8", align 4
%"37" = load i32, ptr addrspace(5) %"9", align 4
%"35" = mul i32 %"36", %"37"
store i32 %"35", ptr addrspace(5) %"8", align 4
%"38" = load i64, ptr addrspace(5) %"7", align 8
%"39" = load i32, ptr addrspace(5) %"8", align 4
%"57" = inttoptr i64 %"38" to ptr
store i32 %"39", ptr %"57", align 4
%1 = alloca i32, align 4, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = load i64, ptr addrspace(4) %"47", align 8
store i64 %"11", ptr addrspace(5) %"7", align 8
store i32 1, ptr addrspace(5) %1, align 4
%"12" = load i32, ptr addrspace(5) %1, align 4
store i32 %"12", ptr addrspace(5) %"8", align 4
%"13" = load i64, ptr addrspace(1) @bar, align 8
store i64 %"13", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"49" = inttoptr i64 %"15" to ptr
%"14" = load i32, ptr %"49", align 4
store i32 %"14", ptr addrspace(5) %"9", align 4
%"17" = load i32, ptr addrspace(5) %"8", align 4
%"18" = load i32, ptr addrspace(5) %"9", align 4
%"16" = mul i32 %"17", %"18"
store i32 %"16", ptr addrspace(5) %"8", align 4
%"19" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 8), align 8
store i64 %"19", ptr addrspace(5) %"6", align 8
%"21" = load i64, ptr addrspace(5) %"6", align 8
%"51" = inttoptr i64 %"21" to ptr
%"20" = load i32, ptr %"51", align 4
store i32 %"20", ptr addrspace(5) %"9", align 4
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"24" = load i32, ptr addrspace(5) %"9", align 4
%"22" = mul i32 %"23", %"24"
store i32 %"22", ptr addrspace(5) %"8", align 4
%"25" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 16), align 8
store i64 %"25", ptr addrspace(5) %"6", align 8
%"27" = load i64, ptr addrspace(5) %"6", align 8
%"53" = inttoptr i64 %"27" to ptr
%"26" = load i32, ptr %"53", align 4
store i32 %"26", ptr addrspace(5) %"9", align 4
%"29" = load i32, ptr addrspace(5) %"8", align 4
%"30" = load i32, ptr addrspace(5) %"9", align 4
%"28" = mul i32 %"29", %"30"
store i32 %"28", ptr addrspace(5) %"8", align 4
%"31" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 24), align 8
store i64 %"31", ptr addrspace(5) %"6", align 8
%"33" = load i64, ptr addrspace(5) %"6", align 8
%"55" = inttoptr i64 %"33" to ptr
%"32" = load i32, ptr %"55", align 4
store i32 %"32", ptr addrspace(5) %"9", align 4
%"35" = load i32, ptr addrspace(5) %"8", align 4
%"36" = load i32, ptr addrspace(5) %"9", align 4
%"34" = mul i32 %"35", %"36"
store i32 %"34", ptr addrspace(5) %"8", align 4
%"37" = load i64, ptr addrspace(5) %"7", align 8
%"38" = load i32, ptr addrspace(5) %"8", align 4
%"56" = inttoptr i64 %"37" to ptr
store i32 %"38", ptr %"56", align 4
ret void
}

View file

@ -4,29 +4,29 @@ target triple = "amdgcn-amd-amdhsa"
@asdas = protected addrspace(1) externally_initialized global [4 x [2 x i32]] [[2 x i32] [i32 -1, i32 2], [2 x i32] [i32 3, i32 0], [2 x i32] zeroinitializer, [2 x i32] zeroinitializer]
@foobar = protected addrspace(1) externally_initialized global [4 x [2 x i64]] [[2 x i64] [i64 -1, i64 2], [2 x i64] [i64 3, i64 0], [2 x i64] [i64 ptrtoint (ptr addrspace(1) @asdas to i64), i64 0], [2 x i64] zeroinitializer]
define protected amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
"22":
define protected amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%0 = alloca i64, align 8, addrspace(5)
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %0, align 8
%"11" = load i64, ptr addrspace(5) %0, align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"12" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"12", ptr addrspace(5) %"7", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%1 = alloca i64, align 8, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"9", align 1
store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %1, align 8
%"10" = load i64, ptr addrspace(5) %1, align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"11" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"11", ptr addrspace(5) %"7", align 8
%"13" = load i64, ptr addrspace(5) %"6", align 8
%"19" = inttoptr i64 %"13" to ptr addrspace(1)
%"12" = load i32, ptr addrspace(1) %"19", align 4
store i32 %"12", ptr addrspace(5) %"8", align 4
%"14" = load i64, ptr addrspace(5) %"7", align 8
%"15" = load i32, ptr addrspace(5) %"8", align 4
%"20" = inttoptr i64 %"14" to ptr addrspace(1)
%"13" = load i32, ptr addrspace(1) %"20", align 4
store i32 %"13", ptr addrspace(5) %"8", align 4
%"15" = load i64, ptr addrspace(5) %"7", align 8
%"16" = load i32, ptr addrspace(5) %"8", align 4
%"21" = inttoptr i64 %"15" to ptr addrspace(1)
store i32 %"16", ptr addrspace(1) %"21", align 4
store i32 %"15", ptr addrspace(1) %"20", align 4
ret void
}

View file

@ -3,41 +3,41 @@ target triple = "amdgcn-amd-amdhsa"
declare i32 @__zluda_ptx_impl__sreg_lanemask_lt() #0
define protected amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 {
"40":
define protected amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #1 {
%"10" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"10", align 1
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%1 = alloca i32, align 4, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"10", align 1
%"14" = load i64, ptr addrspace(4) %"27", align 8
store i64 %"14", ptr addrspace(5) %"4", align 8
%"15" = load i64, ptr addrspace(4) %"28", align 8
store i64 %"15", ptr addrspace(5) %"4", align 8
%"16" = load i64, ptr addrspace(4) %"29", align 8
store i64 %"16", ptr addrspace(5) %"5", align 8
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"31" = inttoptr i64 %"18" to ptr
%"30" = load i32, ptr %"31", align 4
store i32 %"30", ptr addrspace(5) %"6", align 4
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"32" = add i32 %"20", 1
store i32 %"32", ptr addrspace(5) %"7", align 4
%"12" = call i32 @__zluda_ptx_impl__sreg_lanemask_lt()
%0 = alloca i32, align 4, addrspace(5)
store i32 %"12", ptr addrspace(5) %0, align 4
%"34" = load i32, ptr addrspace(5) %0, align 4
store i32 %"34", ptr addrspace(5) %"8", align 4
%"23" = load i32, ptr addrspace(5) %"7", align 4
%"24" = load i32, ptr addrspace(5) %"8", align 4
%"35" = add i32 %"23", %"24"
store i32 %"35", ptr addrspace(5) %"7", align 4
%"25" = load i64, ptr addrspace(5) %"5", align 8
%"26" = load i32, ptr addrspace(5) %"7", align 4
%"38" = inttoptr i64 %"25" to ptr
store i32 %"26", ptr %"38", align 4
store i64 %"15", ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"30" = inttoptr i64 %"17" to ptr
%"29" = load i32, ptr %"30", align 4
store i32 %"29", ptr addrspace(5) %"6", align 4
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"31" = add i32 %"19", 1
store i32 %"31", ptr addrspace(5) %"7", align 4
%"11" = call i32 @__zluda_ptx_impl__sreg_lanemask_lt()
store i32 %"11", ptr addrspace(5) %1, align 4
%"33" = load i32, ptr addrspace(5) %1, align 4
store i32 %"33", ptr addrspace(5) %"8", align 4
%"22" = load i32, ptr addrspace(5) %"7", align 4
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"34" = add i32 %"22", %"23"
store i32 %"34", ptr addrspace(5) %"7", align 4
%"24" = load i64, ptr addrspace(5) %"5", align 8
%"25" = load i32, ptr addrspace(5) %"7", align 4
%"37" = inttoptr i64 %"24" to ptr
store i32 %"25", ptr %"37", align 4
ret void
}

View file

@ -1,27 +1,27 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
"19":
define protected amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"14", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"15", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"16" = inttoptr i64 %"11" to ptr
%"10" = load i64, ptr %"16", align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"12" = load i64, ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"6", align 8
%"17" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"17", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"18" = inttoptr i64 %"13" to ptr
store i64 %"14", ptr %"18", align 8
store i64 %"13", ptr %"17", align 8
ret void
}

View file

@ -1,35 +1,35 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
"23":
define protected amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%1 = alloca i64, align 8, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%0 = alloca i64, align 8, addrspace(5)
store i64 81985529216486895, ptr addrspace(5) %0, align 8
%"11" = load i64, ptr addrspace(5) %0, align 8
store i64 %"9", ptr addrspace(5) %"5", align 8
store i64 81985529216486895, ptr addrspace(5) %1, align 8
%"10" = load i64, ptr addrspace(5) %1, align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr addrspace(1)
%"18" = load float, ptr addrspace(1) %"19", align 4
%"22" = bitcast float %"18" to i32
%"11" = zext i32 %"22" to i64
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"20" = inttoptr i64 %"13" to ptr addrspace(1)
%"19" = load float, ptr addrspace(1) %"20", align 4
%"24" = bitcast float %"19" to i32
%"12" = zext i32 %"24" to i64
store i64 %"12", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"21" = inttoptr i64 %"14" to ptr addrspace(1)
%"26" = trunc i64 %"15" to i32
%"22" = bitcast i32 %"26" to float
store float %"22", ptr addrspace(1) %"21", align 4
%"24" = trunc i64 %"14" to i32
%"21" = bitcast i32 %"24" to float
store float %"21", ptr addrspace(1) %"20", align 4
ret void
}

View file

@ -1,38 +1,38 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 {
"30":
define protected amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"25", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"26", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"25", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"14" to ptr
%"30" = getelementptr inbounds i8, ptr %"26", i64 4
%"13" = load i32, ptr %"30", align 4
store i32 %"13", ptr addrspace(5) %"7", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i32, ptr addrspace(5) %"7", align 4
%"27" = inttoptr i64 %"15" to ptr
%"32" = getelementptr inbounds i8, ptr %"27", i64 4
%"14" = load i32, ptr %"32", align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i32, ptr addrspace(5) %"7", align 4
%"28" = inttoptr i64 %"16" to ptr
store i32 %"17", ptr %"28", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"29" = inttoptr i64 %"18" to ptr
%"34" = getelementptr inbounds i8, ptr %"29", i64 4
store i32 %"19", ptr %"34", align 4
store i32 %"16", ptr %"27", align 4
%"17" = load i64, ptr addrspace(5) %"5", align 8
%"18" = load i32, ptr addrspace(5) %"6", align 4
%"28" = inttoptr i64 %"17" to ptr
%"32" = getelementptr inbounds i8, ptr %"28", i64 4
store i32 %"18", ptr %"32", align 4
ret void
}

View file

@ -1,30 +1,30 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
"21":
define protected amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"19", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load float, ptr addrspace(5) %"6", align 4
%"13" = call afn float @llvm.log2.f32(float %"14")
store float %"13", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load float, ptr addrspace(5) %"6", align 4
%"20" = inttoptr i64 %"15" to ptr
store float %"16", ptr %"20", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"18" = inttoptr i64 %"11" to ptr
%"10" = load float, ptr %"18", align 4
store float %"10", ptr addrspace(5) %"6", align 4
%"13" = load float, ptr addrspace(5) %"6", align 4
%"12" = call afn float @llvm.log2.f32(float %"13")
store float %"12", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"5", align 8
%"15" = load float, ptr addrspace(5) %"6", align 4
%"19" = inttoptr i64 %"14" to ptr
store float %"15", ptr %"19", align 4
ret void
}

View file

@ -1,28 +1,28 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 {
"20":
define protected amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca [8 x i8], align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"15", align 8
store i64 %"9", ptr addrspace(5) %"5", align 8
%"10" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"13" = load i64, ptr addrspace(5) %"5", align 8
store i64 %"10", ptr addrspace(5) %"6", align 8
%"12" = load i64, ptr addrspace(5) %"5", align 8
%"17" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"17", align 8
store i64 %"11", ptr addrspace(5) %"7", align 8
%"13" = load i64, ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"7", align 8
%"18" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"18", align 8
store i64 %"12", ptr addrspace(5) %"7", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"7", align 8
%"19" = inttoptr i64 %"14" to ptr
store i64 %"15", ptr %"19", align 8
store i64 %"14", ptr %"18", align 8
ret void
}

View file

@ -0,0 +1,90 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mad_hi_cc(ptr addrspace(4) byref(i64) %"60", ptr addrspace(4) byref(i64) %"61") #0 {
%"14" = alloca i1, align 1, addrspace(5)
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
%"10" = alloca i32, align 4, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
%"12" = alloca i32, align 4, addrspace(5)
%"13" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"14", align 1
%"15" = load i64, ptr addrspace(4) %"60", align 8
store i64 %"15", ptr addrspace(5) %"4", align 8
%"16" = load i64, ptr addrspace(4) %"61", align 8
store i64 %"16", ptr addrspace(5) %"5", align 8
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"63" = inttoptr i64 %"18" to ptr
%"62" = load i32, ptr %"63", align 4
store i32 %"62", ptr addrspace(5) %"8", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"64" = inttoptr i64 %"20" to ptr
%"78" = getelementptr inbounds i8, ptr %"64", i64 4
%"65" = load i32, ptr %"78", align 4
store i32 %"65", ptr addrspace(5) %"9", align 4
%"22" = load i64, ptr addrspace(5) %"4", align 8
%"66" = inttoptr i64 %"22" to ptr
%"80" = getelementptr inbounds i8, ptr %"66", i64 8
%"21" = load i32, ptr %"80", align 4
store i32 %"21", ptr addrspace(5) %"10", align 4
%"25" = load i32, ptr addrspace(5) %"8", align 4
%"26" = load i32, ptr addrspace(5) %"9", align 4
%"27" = load i32, ptr addrspace(5) %"10", align 4
%2 = sext i32 %"25" to i64
%3 = sext i32 %"26" to i64
%4 = mul nsw i64 %2, %3
%5 = lshr i64 %4, 32
%6 = trunc i64 %5 to i32
%7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %6, i32 %"27")
%"23" = extractvalue { i32, i1 } %7, 0
%"24" = extractvalue { i32, i1 } %7, 1
store i32 %"23", ptr addrspace(5) %"7", align 4
store i1 %"24", ptr addrspace(5) %"14", align 1
%8 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -2)
%"28" = extractvalue { i32, i1 } %8, 0
%"29" = extractvalue { i32, i1 } %8, 1
store i32 %"28", ptr addrspace(5) %"6", align 4
store i1 %"29", ptr addrspace(5) %"14", align 1
%"31" = load i1, ptr addrspace(5) %"14", align 1
%9 = zext i1 %"31" to i32
%"70" = add i32 0, %9
store i32 %"70", ptr addrspace(5) %"12", align 4
%10 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -1)
%"32" = extractvalue { i32, i1 } %10, 0
%"33" = extractvalue { i32, i1 } %10, 1
store i32 %"32", ptr addrspace(5) %"6", align 4
store i1 %"33", ptr addrspace(5) %"14", align 1
%"35" = load i1, ptr addrspace(5) %"14", align 1
%11 = zext i1 %"35" to i32
%"71" = add i32 0, %11
store i32 %"71", ptr addrspace(5) %"13", align 4
%"36" = load i64, ptr addrspace(5) %"5", align 8
%"37" = load i32, ptr addrspace(5) %"7", align 4
%"72" = inttoptr i64 %"36" to ptr
store i32 %"37", ptr %"72", align 4
%"38" = load i64, ptr addrspace(5) %"5", align 8
%"39" = load i32, ptr addrspace(5) %"12", align 4
%"73" = inttoptr i64 %"38" to ptr
%"82" = getelementptr inbounds i8, ptr %"73", i64 4
store i32 %"39", ptr %"82", align 4
%"40" = load i64, ptr addrspace(5) %"5", align 8
%"41" = load i32, ptr addrspace(5) %"13", align 4
%"75" = inttoptr i64 %"40" to ptr
%"84" = getelementptr inbounds i8, ptr %"75", i64 8
store i32 %"41", ptr %"84", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }

View file

@ -0,0 +1,41 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry mad_hi_cc(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u32 unused;
.reg .s32 dst1;
.reg .b32 src1;
.reg .b32 src2;
.reg .b32 src3;
.reg .b32 result_1;
.reg .b32 carry_out_1;
.reg .b32 carry_out_2;
ld.param.u64 in_addr, [input];
ld.param.u64 out_addr, [output];
// test valid computational results
ld.s32 src1, [in_addr];
ld.s32 src2, [in_addr+4];
ld.b32 src3, [in_addr+8];
mad.hi.cc.s32 dst1, src1, src2, src3;
mad.hi.cc.u32 unused, 65536, 65536, 4294967294; // non-overflowing
addc.u32 carry_out_1, 0, 0; // carry_out_1 should be 0
mad.hi.cc.u32 unused, 65536, 65536, 4294967295; // overflowing
addc.u32 carry_out_2, 0, 0; // carry_out_2 should be 1
st.s32 [out_addr], dst1;
st.s32 [out_addr+4], carry_out_1;
st.s32 [out_addr+8], carry_out_2;
ret;
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"53", ptr addrspace(4) byref(i64) %"54") #0 {
"76":
define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"52", ptr addrspace(4) byref(i64) %"53") #0 {
%"13" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"13", align 1
%"14" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"14", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
@ -16,67 +12,71 @@ define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"53",
%"10" = alloca i32, align 4, addrspace(5)
%"11" = alloca i32, align 4, addrspace(5)
%"12" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"13", align 1
%"14" = load i64, ptr addrspace(4) %"52", align 8
store i64 %"14", ptr addrspace(5) %"4", align 8
%"15" = load i64, ptr addrspace(4) %"53", align 8
store i64 %"15", ptr addrspace(5) %"4", align 8
%"16" = load i64, ptr addrspace(4) %"54", align 8
store i64 %"16", ptr addrspace(5) %"5", align 8
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"56" = inttoptr i64 %"18" to ptr
%"55" = load i32, ptr %"56", align 4
store i32 %"55", ptr addrspace(5) %"9", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"57" = inttoptr i64 %"20" to ptr
%"78" = getelementptr inbounds i8, ptr %"57", i64 4
%"58" = load i32, ptr %"78", align 4
store i32 %"58", ptr addrspace(5) %"10", align 4
%"22" = load i64, ptr addrspace(5) %"4", align 8
%"59" = inttoptr i64 %"22" to ptr
%"80" = getelementptr inbounds i8, ptr %"59", i64 8
%"21" = load i64, ptr %"80", align 8
store i64 %"21", ptr addrspace(5) %"12", align 8
%"24" = load i64, ptr addrspace(5) %"4", align 8
%"60" = inttoptr i64 %"24" to ptr
%"82" = getelementptr inbounds i8, ptr %"60", i64 16
%"61" = load i32, ptr %"82", align 4
store i32 %"61", ptr addrspace(5) %"11", align 4
%"26" = load i32, ptr addrspace(5) %"9", align 4
%"27" = load i32, ptr addrspace(5) %"10", align 4
%"28" = load i32, ptr addrspace(5) %"11", align 4
%0 = mul i32 %"26", %"27"
%"25" = add i32 %0, %"28"
store i32 %"25", ptr addrspace(5) %"6", align 4
%"30" = load i32, ptr addrspace(5) %"9", align 4
%"31" = load i32, ptr addrspace(5) %"10", align 4
%"32" = load i32, ptr addrspace(5) %"11", align 4
%1 = sext i32 %"30" to i64
%2 = sext i32 %"31" to i64
%3 = mul nsw i64 %1, %2
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%"29" = add i32 %5, %"32"
store i32 %"29", ptr addrspace(5) %"7", align 4
%"34" = load i32, ptr addrspace(5) %"9", align 4
%"35" = load i32, ptr addrspace(5) %"10", align 4
%"36" = load i64, ptr addrspace(5) %"12", align 8
%6 = sext i32 %"34" to i64
%7 = sext i32 %"35" to i64
%8 = mul nsw i64 %6, %7
%"68" = add i64 %8, %"36"
store i64 %"68", ptr addrspace(5) %"8", align 8
%"37" = load i64, ptr addrspace(5) %"5", align 8
%"38" = load i32, ptr addrspace(5) %"6", align 4
%"72" = inttoptr i64 %"37" to ptr
store i32 %"38", ptr %"72", align 4
%"39" = load i64, ptr addrspace(5) %"5", align 8
%"40" = load i32, ptr addrspace(5) %"7", align 4
%"73" = inttoptr i64 %"39" to ptr
%"84" = getelementptr inbounds i8, ptr %"73", i64 8
store i32 %"40", ptr %"84", align 4
%"41" = load i64, ptr addrspace(5) %"5", align 8
%"42" = load i64, ptr addrspace(5) %"8", align 8
%"74" = inttoptr i64 %"41" to ptr
%"86" = getelementptr inbounds i8, ptr %"74", i64 16
store i64 %"42", ptr %"86", align 8
store i64 %"15", ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"55" = inttoptr i64 %"17" to ptr
%"54" = load i32, ptr %"55", align 4
store i32 %"54", ptr addrspace(5) %"9", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"56" = inttoptr i64 %"19" to ptr
%"76" = getelementptr inbounds i8, ptr %"56", i64 4
%"57" = load i32, ptr %"76", align 4
store i32 %"57", ptr addrspace(5) %"10", align 4
%"21" = load i64, ptr addrspace(5) %"4", align 8
%"58" = inttoptr i64 %"21" to ptr
%"78" = getelementptr inbounds i8, ptr %"58", i64 8
%"20" = load i64, ptr %"78", align 8
store i64 %"20", ptr addrspace(5) %"12", align 8
%"23" = load i64, ptr addrspace(5) %"4", align 8
%"59" = inttoptr i64 %"23" to ptr
%"80" = getelementptr inbounds i8, ptr %"59", i64 16
%"60" = load i32, ptr %"80", align 4
store i32 %"60", ptr addrspace(5) %"11", align 4
%"25" = load i32, ptr addrspace(5) %"9", align 4
%"26" = load i32, ptr addrspace(5) %"10", align 4
%"27" = load i32, ptr addrspace(5) %"11", align 4
%2 = mul i32 %"25", %"26"
%"24" = add i32 %2, %"27"
store i32 %"24", ptr addrspace(5) %"6", align 4
%"29" = load i32, ptr addrspace(5) %"9", align 4
%"30" = load i32, ptr addrspace(5) %"10", align 4
%"31" = load i32, ptr addrspace(5) %"11", align 4
%3 = sext i32 %"29" to i64
%4 = sext i32 %"30" to i64
%5 = mul nsw i64 %3, %4
%6 = lshr i64 %5, 32
%7 = trunc i64 %6 to i32
%"28" = add i32 %7, %"31"
store i32 %"28", ptr addrspace(5) %"7", align 4
%"33" = load i32, ptr addrspace(5) %"9", align 4
%"34" = load i32, ptr addrspace(5) %"10", align 4
%"35" = load i64, ptr addrspace(5) %"12", align 8
%8 = sext i32 %"33" to i64
%9 = sext i32 %"34" to i64
%10 = mul nsw i64 %8, %9
%"67" = add i64 %10, %"35"
store i64 %"67", ptr addrspace(5) %"8", align 8
%"36" = load i64, ptr addrspace(5) %"5", align 8
%"37" = load i32, ptr addrspace(5) %"6", align 4
%"71" = inttoptr i64 %"36" to ptr
store i32 %"37", ptr %"71", align 4
%"38" = load i64, ptr addrspace(5) %"5", align 8
%"39" = load i32, ptr addrspace(5) %"7", align 4
%"72" = inttoptr i64 %"38" to ptr
%"82" = getelementptr inbounds i8, ptr %"72", i64 8
store i32 %"39", ptr %"82", align 4
%"40" = load i64, ptr addrspace(5) %"5", align 8
%"41" = load i64, ptr addrspace(5) %"8", align 8
%"73" = inttoptr i64 %"40" to ptr
%"84" = getelementptr inbounds i8, ptr %"73", i64 16
store i64 %"41", ptr %"84", align 8
ret void
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 {
"55":
define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 {
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
@ -14,54 +10,58 @@ define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"41",
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
%"10" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = load i64, ptr addrspace(4) %"40", align 8
store i64 %"12", ptr addrspace(5) %"4", align 8
%"13" = load i64, ptr addrspace(4) %"41", align 8
store i64 %"13", ptr addrspace(5) %"4", align 8
%"14" = load i64, ptr addrspace(4) %"42", align 8
store i64 %"14", ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"4", align 8
%"44" = inttoptr i64 %"16" to ptr
%"43" = load i32, ptr %"44", align 4
store i32 %"43", ptr addrspace(5) %"8", align 4
%"18" = load i64, ptr addrspace(5) %"4", align 8
%"45" = inttoptr i64 %"18" to ptr
%"57" = getelementptr inbounds i8, ptr %"45", i64 4
%"46" = load i32, ptr %"57", align 4
store i32 %"46", ptr addrspace(5) %"9", align 4
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"47" = inttoptr i64 %"20" to ptr
%"59" = getelementptr inbounds i8, ptr %"47", i64 8
%"19" = load i32, ptr %"59", align 4
store i32 %"19", ptr addrspace(5) %"10", align 4
%"23" = load i32, ptr addrspace(5) %"8", align 4
%"24" = load i32, ptr addrspace(5) %"9", align 4
%"25" = load i32, ptr addrspace(5) %"10", align 4
%0 = mul i32 %"23", %"24"
%1 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %0, i32 %"25")
%"21" = extractvalue { i32, i1 } %1, 0
%"22" = extractvalue { i32, i1 } %1, 1
store i32 %"21", ptr addrspace(5) %"6", align 4
store i1 %"22", ptr addrspace(5) %"11", align 1
%"27" = load i1, ptr addrspace(5) %"11", align 1
%"28" = load i32, ptr addrspace(5) %"8", align 4
%"29" = load i32, ptr addrspace(5) %"9", align 4
%2 = sext i32 %"28" to i64
%3 = sext i32 %"29" to i64
%4 = mul nsw i64 %2, %3
%5 = lshr i64 %4, 32
%6 = trunc i64 %5 to i32
%7 = zext i1 %"27" to i32
%8 = add i32 %6, 3
%"26" = add i32 %8, %7
store i32 %"26", ptr addrspace(5) %"7", align 4
%"30" = load i64, ptr addrspace(5) %"5", align 8
%"31" = load i32, ptr addrspace(5) %"6", align 4
%"53" = inttoptr i64 %"30" to ptr
store i32 %"31", ptr %"53", align 4
%"32" = load i64, ptr addrspace(5) %"5", align 8
%"33" = load i32, ptr addrspace(5) %"7", align 4
%"54" = inttoptr i64 %"32" to ptr
%"61" = getelementptr inbounds i8, ptr %"54", i64 4
store i32 %"33", ptr %"61", align 4
store i64 %"13", ptr addrspace(5) %"5", align 8
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"43" = inttoptr i64 %"15" to ptr
%"42" = load i32, ptr %"43", align 4
store i32 %"42", ptr addrspace(5) %"8", align 4
%"17" = load i64, ptr addrspace(5) %"4", align 8
%"44" = inttoptr i64 %"17" to ptr
%"55" = getelementptr inbounds i8, ptr %"44", i64 4
%"45" = load i32, ptr %"55", align 4
store i32 %"45", ptr addrspace(5) %"9", align 4
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"46" = inttoptr i64 %"19" to ptr
%"57" = getelementptr inbounds i8, ptr %"46", i64 8
%"18" = load i32, ptr %"57", align 4
store i32 %"18", ptr addrspace(5) %"10", align 4
%"22" = load i32, ptr addrspace(5) %"8", align 4
%"23" = load i32, ptr addrspace(5) %"9", align 4
%"24" = load i32, ptr addrspace(5) %"10", align 4
%2 = mul i32 %"22", %"23"
%3 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %2, i32 %"24")
%"20" = extractvalue { i32, i1 } %3, 0
%"21" = extractvalue { i32, i1 } %3, 1
store i32 %"20", ptr addrspace(5) %"6", align 4
store i1 %"21", ptr addrspace(5) %"11", align 1
%"26" = load i1, ptr addrspace(5) %"11", align 1
%"27" = load i32, ptr addrspace(5) %"8", align 4
%"28" = load i32, ptr addrspace(5) %"9", align 4
%4 = sext i32 %"27" to i64
%5 = sext i32 %"28" to i64
%6 = mul nsw i64 %4, %5
%7 = lshr i64 %6, 32
%8 = trunc i64 %7 to i32
%9 = zext i1 %"26" to i32
%10 = add i32 %8, 3
%"25" = add i32 %10, %9
store i32 %"25", ptr addrspace(5) %"7", align 4
%"29" = load i64, ptr addrspace(5) %"5", align 8
%"30" = load i32, ptr addrspace(5) %"6", align 4
%"52" = inttoptr i64 %"29" to ptr
store i32 %"30", ptr %"52", align 4
%"31" = load i64, ptr addrspace(5) %"5", align 8
%"32" = load i32, ptr addrspace(5) %"7", align 4
%"53" = inttoptr i64 %"31" to ptr
%"59" = getelementptr inbounds i8, ptr %"53", i64 4
store i32 %"32", ptr %"59", align 4
ret void
}

View file

@ -1,73 +0,0 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @madc_cc2(ptr addrspace(4) byref(i64) %"52", ptr addrspace(4) byref(i64) %"53") #0 {
"66":
%"11" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"11", align 1
%"12" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"12", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
%"8" = alloca i32, align 4, addrspace(5)
%"9" = alloca i32, align 4, addrspace(5)
%"10" = alloca i32, align 4, addrspace(5)
%"13" = load i64, ptr addrspace(4) %"53", align 8
store i64 %"13", ptr addrspace(5) %"5", align 8
%0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1)
%"14" = extractvalue { i32, i1 } %0, 0
%"15" = extractvalue { i32, i1 } %0, 1
store i32 %"14", ptr addrspace(5) %"6", align 4
store i1 %"15", ptr addrspace(5) %"11", align 1
%"18" = load i1, ptr addrspace(5) %"11", align 1
%1 = zext i1 %"18" to i32
%2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -1)
%3 = extractvalue { i32, i1 } %2, 0
%4 = extractvalue { i32, i1 } %2, 1
%5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1)
%"54" = extractvalue { i32, i1 } %5, 0
%6 = extractvalue { i32, i1 } %5, 1
%"17" = xor i1 %4, %6
store i32 %"54", ptr addrspace(5) %"7", align 4
store i1 %"17", ptr addrspace(5) %"11", align 1
%"20" = load i1, ptr addrspace(5) %"11", align 1
%7 = zext i1 %"20" to i32
%"55" = add i32 0, %7
store i32 %"55", ptr addrspace(5) %"8", align 4
%"22" = load i1, ptr addrspace(5) %"11", align 1
%8 = zext i1 %"22" to i32
%"56" = add i32 0, %8
store i32 %"56", ptr addrspace(5) %"9", align 4
%"24" = load i1, ptr addrspace(5) %"12", align 1
%9 = zext i1 %"24" to i32
%"57" = sub i32 2, %9
store i32 %"57", ptr addrspace(5) %"10", align 4
%"25" = load i64, ptr addrspace(5) %"5", align 8
%"26" = load i32, ptr addrspace(5) %"7", align 4
%"58" = inttoptr i64 %"25" to ptr
store i32 %"26", ptr %"58", align 4
%"27" = load i64, ptr addrspace(5) %"5", align 8
%"28" = load i32, ptr addrspace(5) %"8", align 4
%"60" = inttoptr i64 %"27" to ptr
%"68" = getelementptr inbounds i8, ptr %"60", i64 4
store i32 %"28", ptr %"68", align 4
%"29" = load i64, ptr addrspace(5) %"5", align 8
%"30" = load i32, ptr addrspace(5) %"9", align 4
%"62" = inttoptr i64 %"29" to ptr
%"70" = getelementptr inbounds i8, ptr %"62", i64 8
store i32 %"30", ptr %"70", align 4
%"31" = load i64, ptr addrspace(5) %"5", align 8
%"32" = load i32, ptr addrspace(5) %"10", align 4
%"64" = inttoptr i64 %"31" to ptr
%"72" = getelementptr inbounds i8, ptr %"64", i64 12
store i32 %"32", ptr %"72", align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }

View file

@ -1,38 +0,0 @@
.version 6.5
.target sm_30
.address_size 64
.visible .entry madc_cc2(
.param .u64 input,
.param .u64 output
)
{
.reg .u64 in_addr;
.reg .u64 out_addr;
.reg .u32 unused;
.reg .b32 result_1;
.reg .b32 carry_out_1_1;
.reg .b32 carry_out_1_2;
.reg .b32 carry_out_1_3;
ld.param.u64 out_addr, [output];
// set carry=1
mad.lo.cc.u32 unused, 0, 0, 4294967295;
// overflow addition
madc.lo.cc.u32 result_1, 1, 1, 4294967295;
// write carry
madc.lo.u32 carry_out_1_1, 0, 0, 0;
// overflow is also detected by addc
addc.u32 carry_out_1_2, 0, 0;
// but not subc
subc.u32 carry_out_1_3, 2, 0;
st.s32 [out_addr], result_1;
st.s32 [out_addr+4], carry_out_1_1;
st.s32 [out_addr+8], carry_out_1_2;
st.s32 [out_addr+12], carry_out_1_3;
ret;
}

View file

@ -1,37 +1,37 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"28":
define protected amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"25", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"15" to ptr
%"30" = getelementptr inbounds i8, ptr %"26", i64 4
%"14" = load i32, ptr %"30", align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i32, ptr addrspace(5) %"6", align 4
%"18" = load i32, ptr addrspace(5) %"7", align 4
%"16" = call i32 @llvm.smax.i32(i32 %"17", i32 %"18")
store i32 %"16", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"27" = inttoptr i64 %"19" to ptr
store i32 %"20", ptr %"27", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"24", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"14" to ptr
%"28" = getelementptr inbounds i8, ptr %"25", i64 4
%"13" = load i32, ptr %"28", align 4
store i32 %"13", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"17" = load i32, ptr addrspace(5) %"7", align 4
%"15" = call i32 @llvm.smax.i32(i32 %"16", i32 %"17")
store i32 %"15", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"26" = inttoptr i64 %"18" to ptr
store i32 %"19", ptr %"26", align 4
ret void
}

View file

@ -1,28 +1,28 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 {
"20":
define protected amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 {
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"7", align 1
%"8" = load i64, ptr addrspace(4) %"14", align 8
store i64 %"8", ptr addrspace(5) %"4", align 8
%"9" = load i64, ptr addrspace(4) %"15", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"16", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"18" = inttoptr i64 %"12" to ptr
%"17" = load i32, ptr %"18", align 4
store i32 %"17", ptr addrspace(5) %"6", align 4
store i64 %"9", ptr addrspace(5) %"5", align 8
%"11" = load i64, ptr addrspace(5) %"4", align 8
%"17" = inttoptr i64 %"11" to ptr
%"16" = load i32, ptr %"17", align 4
store i32 %"16", ptr addrspace(5) %"6", align 4
fence seq_cst
%"13" = load i64, ptr addrspace(5) %"5", align 8
%"14" = load i32, ptr addrspace(5) %"6", align 4
%"19" = inttoptr i64 %"13" to ptr
store i32 %"14", ptr %"19", align 4
%"12" = load i64, ptr addrspace(5) %"5", align 8
%"13" = load i32, ptr addrspace(5) %"6", align 4
%"18" = inttoptr i64 %"12" to ptr
store i32 %"13", ptr %"18", align 4
ret void
}

View file

@ -1,37 +1,37 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"28":
define protected amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i32, align 4, addrspace(5)
%"7" = alloca i32, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"13" to ptr
%"12" = load i32, ptr %"25", align 4
store i32 %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"15" to ptr
%"30" = getelementptr inbounds i8, ptr %"26", i64 4
%"14" = load i32, ptr %"30", align 4
store i32 %"14", ptr addrspace(5) %"7", align 4
%"17" = load i32, ptr addrspace(5) %"6", align 4
%"18" = load i32, ptr addrspace(5) %"7", align 4
%"16" = call i32 @llvm.smin.i32(i32 %"17", i32 %"18")
store i32 %"16", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load i32, ptr addrspace(5) %"6", align 4
%"27" = inttoptr i64 %"19" to ptr
store i32 %"20", ptr %"27", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"12" to ptr
%"11" = load i32, ptr %"24", align 4
store i32 %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"14" to ptr
%"28" = getelementptr inbounds i8, ptr %"25", i64 4
%"13" = load i32, ptr %"28", align 4
store i32 %"13", ptr addrspace(5) %"7", align 4
%"16" = load i32, ptr addrspace(5) %"6", align 4
%"17" = load i32, ptr addrspace(5) %"7", align 4
%"15" = call i32 @llvm.smin.i32(i32 %"16", i32 %"17")
store i32 %"15", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load i32, ptr addrspace(5) %"6", align 4
%"26" = inttoptr i64 %"18" to ptr
store i32 %"19", ptr %"26", align 4
ret void
}

View file

@ -294,7 +294,11 @@ test_ptx!(
[65521u32, 2147549199, 0x1000],
[2147487519u32, 4294934539]
);
test_ptx!(madc_cc2, [0xDEADu32], [0u32, 1, 1, 2]);
test_ptx!(
mad_hi_cc,
[0x26223377u32, 0x70777766u32, 0x60666633u32],
[0x71272866u32, 0u32, 1u32]
); // Multi-tap :)
test_ptx!(mov_vector_cast, [0x200000001u64], [2u32, 1u32]);
test_ptx!(
cvt_clamp,
@ -327,11 +331,13 @@ test_ptx!(
],
[4294967295u32, 0, 2]
);
test_ptx!(carry_mixed, [0xDEADu32], [1u32, 1u32]);
test_ptx!(
subc_cc2,
carry_set_all,
[0xDEADu32],
[0u32, 1, 0, 4294967295, 1, 4294967295, 1]
[
1u32, 0, 0, 1, 0, 1, 0, 0, 0u32, 4294967295, 4294967295, 0, 4294967295, 0, 4294967295,
4294967295
]
);
test_ptx!(vshr, [0x6f3650f4u32, 22, 0xc62d4586], [0xC62D4742u32]);
test_ptx!(bfind, [0u32, 1u32, 0x64eb0414], [u32::MAX, 0, 30]);
@ -360,6 +366,7 @@ test_ptx!(
[1923569713u64, 1923569712]
);
test_ptx!(isspacep, [0xDEADu32], [1u32, 0]);
test_ptx!(sad, [2147483648u32, 2, 13], [2147483659u32, 2147483663]);
test_ptx_warp!(
shfl,

View file

@ -1,33 +1,33 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
"22":
define protected amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
%1 = alloca i64, align 8, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"17", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"20", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%0 = alloca i64, align 8, addrspace(5)
store i64 %"15", ptr addrspace(5) %0, align 8
%"14" = load i64, ptr addrspace(5) %0, align 8
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"21", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"19" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"19", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
store i64 %"14", ptr addrspace(5) %1, align 8
%"13" = load i64, ptr addrspace(5) %1, align 8
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"20" = inttoptr i64 %"15" to ptr
store i64 %"16", ptr %"20", align 8
ret void
}

View file

@ -1,19 +1,19 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"9", ptr addrspace(4) byref(i64) %"10") #0 {
"12":
define protected amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"8", ptr addrspace(4) byref(i64) %"9") #0 {
%"6" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"6", align 1
%"7" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"7", align 1
%"4" = alloca [8 x i8], align 1, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"11" = ptrtoint ptr addrspace(5) %"4" to i64
%0 = alloca i64, align 8, addrspace(5)
store i64 %"11", ptr addrspace(5) %0, align 8
%"8" = load i64, ptr addrspace(5) %0, align 8
store i64 %"8", ptr addrspace(5) %"5", align 8
%1 = alloca i64, align 8, addrspace(5)
br label %2
2: ; preds = %0
store i1 false, ptr addrspace(5) %"6", align 1
%"10" = ptrtoint ptr addrspace(5) %"4" to i64
store i64 %"10", ptr addrspace(5) %1, align 8
%"7" = load i64, ptr addrspace(5) %1, align 8
store i64 %"7", ptr addrspace(5) %"5", align 8
ret void
}

View file

@ -1,12 +1,8 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 {
"50":
define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 {
%"15" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"15", align 1
%"16" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"16", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
@ -16,51 +12,55 @@ define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64)
%"10" = alloca half, align 2, addrspace(5)
%"11" = alloca half, align 2, addrspace(5)
%"12" = alloca half, align 2, addrspace(5)
%"17" = load i64, ptr addrspace(4) %"35", align 8
store i64 %"17", ptr addrspace(5) %"4", align 8
%"18" = load i64, ptr addrspace(4) %"36", align 8
store i64 %"18", ptr addrspace(5) %"5", align 8
%"20" = load i64, ptr addrspace(5) %"4", align 8
%"37" = inttoptr i64 %"20" to ptr
%"19" = load i64, ptr %"37", align 8
store i64 %"19", ptr addrspace(5) %"6", align 8
%"21" = load i64, ptr addrspace(5) %"6", align 8
%0 = alloca i64, align 8, addrspace(5)
store i64 %"21", ptr addrspace(5) %0, align 8
%"13" = load i64, ptr addrspace(5) %0, align 8
%"39" = bitcast i64 %"13" to <2 x i32>
%"40" = extractelement <2 x i32> %"39", i32 0
%"41" = extractelement <2 x i32> %"39", i32 1
%"22" = bitcast i32 %"40" to float
%"23" = bitcast i32 %"41" to float
store float %"22", ptr addrspace(5) %"7", align 4
store float %"23", ptr addrspace(5) %"8", align 4
%"24" = load i64, ptr addrspace(5) %"6", align 8
%1 = alloca i64, align 8, addrspace(5)
store i64 %"24", ptr addrspace(5) %1, align 8
%"14" = load i64, ptr addrspace(5) %1, align 8
%"43" = bitcast i64 %"14" to <4 x i16>
%"44" = extractelement <4 x i16> %"43", i32 0
%"45" = extractelement <4 x i16> %"43", i32 1
%"46" = extractelement <4 x i16> %"43", i32 2
%"47" = extractelement <4 x i16> %"43", i32 3
%2 = alloca i64, align 8, addrspace(5)
br label %3
3: ; preds = %0
store i1 false, ptr addrspace(5) %"15", align 1
%"16" = load i64, ptr addrspace(4) %"34", align 8
store i64 %"16", ptr addrspace(5) %"4", align 8
%"17" = load i64, ptr addrspace(4) %"35", align 8
store i64 %"17", ptr addrspace(5) %"5", align 8
%"19" = load i64, ptr addrspace(5) %"4", align 8
%"36" = inttoptr i64 %"19" to ptr
%"18" = load i64, ptr %"36", align 8
store i64 %"18", ptr addrspace(5) %"6", align 8
%"20" = load i64, ptr addrspace(5) %"6", align 8
store i64 %"20", ptr addrspace(5) %1, align 8
%"13" = load i64, ptr addrspace(5) %1, align 8
%"38" = bitcast i64 %"13" to <2 x i32>
%"39" = extractelement <2 x i32> %"38", i32 0
%"40" = extractelement <2 x i32> %"38", i32 1
%"21" = bitcast i32 %"39" to float
%"22" = bitcast i32 %"40" to float
store float %"21", ptr addrspace(5) %"7", align 4
store float %"22", ptr addrspace(5) %"8", align 4
%"23" = load i64, ptr addrspace(5) %"6", align 8
store i64 %"23", ptr addrspace(5) %2, align 8
%"14" = load i64, ptr addrspace(5) %2, align 8
%"42" = bitcast i64 %"14" to <4 x i16>
%"43" = extractelement <4 x i16> %"42", i32 0
%"44" = extractelement <4 x i16> %"42", i32 1
%"45" = extractelement <4 x i16> %"42", i32 2
%"46" = extractelement <4 x i16> %"42", i32 3
%"24" = bitcast i16 %"43" to half
%"25" = bitcast i16 %"44" to half
%"26" = bitcast i16 %"45" to half
%"27" = bitcast i16 %"46" to half
%"28" = bitcast i16 %"47" to half
store half %"25", ptr addrspace(5) %"9", align 2
store half %"26", ptr addrspace(5) %"10", align 2
store half %"27", ptr addrspace(5) %"11", align 2
store half %"28", ptr addrspace(5) %"12", align 2
%"29" = load i64, ptr addrspace(5) %"5", align 8
%"30" = load float, ptr addrspace(5) %"8", align 4
%"48" = inttoptr i64 %"29" to ptr
store float %"30", ptr %"48", align 4
%"31" = load i64, ptr addrspace(5) %"5", align 8
%"32" = load float, ptr addrspace(5) %"7", align 4
%"49" = inttoptr i64 %"31" to ptr
%"52" = getelementptr inbounds i8, ptr %"49", i64 4
store float %"32", ptr %"52", align 4
store half %"24", ptr addrspace(5) %"9", align 2
store half %"25", ptr addrspace(5) %"10", align 2
store half %"26", ptr addrspace(5) %"11", align 2
store half %"27", ptr addrspace(5) %"12", align 2
%"28" = load i64, ptr addrspace(5) %"5", align 8
%"29" = load float, ptr addrspace(5) %"8", align 4
%"47" = inttoptr i64 %"28" to ptr
store float %"29", ptr %"47", align 4
%"30" = load i64, ptr addrspace(5) %"5", align 8
%"31" = load float, ptr addrspace(5) %"7", align 4
%"48" = inttoptr i64 %"30" to ptr
%"50" = getelementptr inbounds i8, ptr %"48", i64 4
store float %"31", ptr %"50", align 4
ret void
}

View file

@ -1,37 +1,37 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 {
"28":
define protected amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca float, align 4, addrspace(5)
%"7" = alloca float, align 4, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"22", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"23", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"24", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"13" to ptr
%"12" = load float, ptr %"25", align 4
store float %"12", ptr addrspace(5) %"6", align 4
%"15" = load i64, ptr addrspace(5) %"4", align 8
%"26" = inttoptr i64 %"15" to ptr
%"30" = getelementptr inbounds i8, ptr %"26", i64 4
%"14" = load float, ptr %"30", align 4
store float %"14", ptr addrspace(5) %"7", align 4
%"17" = load float, ptr addrspace(5) %"6", align 4
%"18" = load float, ptr addrspace(5) %"7", align 4
%"16" = fmul float %"17", %"18"
store float %"16", ptr addrspace(5) %"6", align 4
%"19" = load i64, ptr addrspace(5) %"5", align 8
%"20" = load float, ptr addrspace(5) %"6", align 4
%"27" = inttoptr i64 %"19" to ptr
store float %"20", ptr %"27", align 4
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"24" = inttoptr i64 %"12" to ptr
%"11" = load float, ptr %"24", align 4
store float %"11", ptr addrspace(5) %"6", align 4
%"14" = load i64, ptr addrspace(5) %"4", align 8
%"25" = inttoptr i64 %"14" to ptr
%"28" = getelementptr inbounds i8, ptr %"25", i64 4
%"13" = load float, ptr %"28", align 4
store float %"13", ptr addrspace(5) %"7", align 4
%"16" = load float, ptr addrspace(5) %"6", align 4
%"17" = load float, ptr addrspace(5) %"7", align 4
%"15" = fmul float %"16", %"17"
store float %"15", ptr addrspace(5) %"6", align 4
%"18" = load i64, ptr addrspace(5) %"5", align 8
%"19" = load float, ptr addrspace(5) %"6", align 4
%"26" = inttoptr i64 %"18" to ptr
store float %"19", ptr %"26", align 4
ret void
}

View file

@ -3,31 +3,31 @@ target triple = "amdgcn-amd-amdhsa"
declare i64 @__zluda_ptx_impl__mul_hi_u64(i64, i64) #0
define protected amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #1 {
"23":
define protected amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #1 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"21", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"14" = call i64 @__zluda_ptx_impl__mul_hi_u64(i64 %"15", i64 2)
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"22", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"20", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"13" = call i64 @__zluda_ptx_impl__mul_hi_u64(i64 %"14", i64 2)
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"15" to ptr
store i64 %"16", ptr %"21", align 8
ret void
}

View file

@ -1,31 +1,31 @@
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7"
target triple = "amdgcn-amd-amdhsa"
define protected amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 {
"23":
define protected amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 {
%"8" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = alloca i1, align 1, addrspace(5)
store i1 false, ptr addrspace(5) %"9", align 1
%"4" = alloca i64, align 8, addrspace(5)
%"5" = alloca i64, align 8, addrspace(5)
%"6" = alloca i64, align 8, addrspace(5)
%"7" = alloca i64, align 8, addrspace(5)
br label %1
1: ; preds = %0
store i1 false, ptr addrspace(5) %"8", align 1
%"9" = load i64, ptr addrspace(4) %"18", align 8
store i64 %"9", ptr addrspace(5) %"4", align 8
%"10" = load i64, ptr addrspace(4) %"19", align 8
store i64 %"10", ptr addrspace(5) %"4", align 8
%"11" = load i64, ptr addrspace(4) %"20", align 8
store i64 %"11", ptr addrspace(5) %"5", align 8
%"13" = load i64, ptr addrspace(5) %"4", align 8
%"21" = inttoptr i64 %"13" to ptr
%"12" = load i64, ptr %"21", align 8
store i64 %"12", ptr addrspace(5) %"6", align 8
%"15" = load i64, ptr addrspace(5) %"6", align 8
%"14" = mul i64 %"15", 2
store i64 %"14", ptr addrspace(5) %"7", align 8
%"16" = load i64, ptr addrspace(5) %"5", align 8
%"17" = load i64, ptr addrspace(5) %"7", align 8
%"22" = inttoptr i64 %"16" to ptr
store i64 %"17", ptr %"22", align 8
store i64 %"10", ptr addrspace(5) %"5", align 8
%"12" = load i64, ptr addrspace(5) %"4", align 8
%"20" = inttoptr i64 %"12" to ptr
%"11" = load i64, ptr %"20", align 8
store i64 %"11", ptr addrspace(5) %"6", align 8
%"14" = load i64, ptr addrspace(5) %"6", align 8
%"13" = mul i64 %"14", 2
store i64 %"13", ptr addrspace(5) %"7", align 8
%"15" = load i64, ptr addrspace(5) %"5", align 8
%"16" = load i64, ptr addrspace(5) %"7", align 8
%"21" = inttoptr i64 %"15" to ptr
store i64 %"16", ptr %"21", align 8
ret void
}

Some files were not shown because too many files have changed in this diff Show more