diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..a9037fd --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,67 @@ +name: Rust +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +env: + CARGO_TERM_COLOR: always + ROCM_VERSION: "5.7.3" + +jobs: + build_lin: + name: Build and publish (Linux) + runs-on: ubuntu-20.04 + steps: + - uses: jlumbroso/free-disk-space@main + - name: Install ROCm + run: | + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + sudo sh -c 'wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null' + sudo sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ env.ROCM_VERSION }} focal main > /etc/apt/sources.list.d/rocm.list' + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends rocm-smi-lib hip-runtime-amd comgr hipblaslt-dev hipfft-dev rocblas-dev rocsolver-dev rocsparse-dev miopen-hip-dev rocm-device-libs + echo 'export PATH="$PATH:/opt/rocm/bin"' | sudo tee /etc/profile.d/rocm.sh + echo '/opt/rocm/lib' | sudo tee /etc/ld.so.conf.d/rocm.conf + sudo ldconfig + - uses: actions/checkout@v4 + with: + submodules: true + - uses: Swatinem/rust-cache@v2 + - name: Build + # We use tar to unpack .tar.gz we've created because Github actions/upload-artifact + # is broken and will _always_ zip your artifact (even if it is a single file). 
+ # See here: https://github.com/actions/upload-artifact/issues/39 + # and here: https://github.com/actions/upload-artifact/issues/109 + run: | + cargo xtask zip -r + tar -xzf target/release/zluda.tar.gz -C target/release + # https://stackoverflow.com/a/64195658 + - name: Set revision hash + run: echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: zluda-linux-${{ env.SHORT_SHA }} + path: target/release/zluda + build_win: + name: Build and publish (Windows) + runs-on: windows-2019 + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - uses: Swatinem/rust-cache@v2 + - name: Build + run: | + cargo xtask zip -r + Expand-Archive -Path target/release/zluda.zip -DestinationPath target/release + # https://stackoverflow.com/a/74033027 + - name: Set revision hash + run: echo "SHORT_SHA=$("${{ github.sha }}".SubString(0, 7))" >> $env:GITHUB_ENV + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: zluda-windows-${{ env.SHORT_SHA }} + path: target/release/zluda diff --git a/.gitignore b/.gitignore index 76550e8..7fd074b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ -target/ -Cargo.lock - -.vscode/ +target/ + +.vscode/ .idea/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..ddeb7af --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,2561 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +dependencies = [ + "cfg-if", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + +[[package]] +name = "argh" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af5ba06967ff7214ce4c7419c7d185be7ecd6cc4965a8f6e1d8ce0398aad219" +dependencies = [ + "argh_derive", + "argh_shared", +] + +[[package]] +name = "argh_derive" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"56df0aeedf6b7a2fc67d06db35b09684c3e8da0c95f8f27685cb17e08413d87a" +dependencies = [ + "argh_shared", + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "argh_shared" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5693f39141bda5760ecc4111ab08da40565d1771038c4a0250f03457ec707531" +dependencies = [ + "serde", +] + +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + +[[package]] +name = "ascii-canvas" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8824ecca2e851cec16968d54a01dd372ef8f95b244fb84b84e70128be347c3c6" +dependencies = [ + "term", +] + +[[package]] +name = "atiadlxx-sys" +version = "0.0.0" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" + +[[package]] +name = "blake3" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "digest", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + +[[package]] +name = "capnp" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95e65021d89250bbfe7c2791789ced2c4bdc21b0e8bb59c64f3fd6145a5fd678" + +[[package]] +name = "capnpc" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbbc3763fb3e6635188e9cc51ee11a26f8777c553ca377430818dbebaaf6042b" +dependencies = [ + "capnp", +] + +[[package]] +name = "cargo-platform" +version = "0.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e34637b3140142bdf929fb439e8aa4ebad7651ebf7b1080b3930aa16ac1459ff" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7daec1a2a2129eeba1644b220b4647ec537b0b5d4bfd6876fcc5a540056b592" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-targets 0.48.5", +] + +[[package]] +name = "cloudflare-zlib" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40fa160a8670a2607111b0d6474261ad2992f3b4651982e14f902859086ecb91" +dependencies = [ + "cloudflare-zlib-sys", +] + +[[package]] +name = "cloudflare-zlib-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3185ff8c69c53ab346d5ac89f418e194b997d48393cae321cb611dd05f83c90" +dependencies = [ + "cc", +] + +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + +[[package]] +name = 
"comgr" +version = "0.0.0" +dependencies = [ + "hip_common", + "itertools", + "libloading", +] + +[[package]] +name = "const_format" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a214c7af3d04997541b18d432afaff4c455e79e2029079647e72fc2bd27673" +dependencies = [ + "const_format_proc_macros", +] + +[[package]] +name = "const_format_proc_macros" +version = "0.2.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f6ff08fd20f4f299298a28e2dfa8a8ba1036e6cd2460ac1de7b425d76f2500" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "constant_time_eq" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" + +[[package]] +name = "convert_case" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb4a24b1aaf0fd0ce8b45161144d6f42cd91677fd5940fd431183eb023b3a2b8" + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "cpp_demangle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce420fe07aecd3e67c5f910618fe65e94158f6dcc0adf44e00d69ce2bdfe0fd0" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version 
= "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c3242926edf34aec4ac3a77108ad4854bffaa2e4ddc1824124ce59231302d5" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d96137f14f244c37f989d9fff8f95e6c18b918e71f36638f8c49112e4c78f" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "cuda_base" +version = "0.0.0" +dependencies = [ + "proc-macro2", + "quote", + "rustc-hash", + "syn 1.0.109", +] + +[[package]] +name = "cuda_types" +version = "0.0.0" +dependencies = [ + "cuda_base", +] + +[[package]] +name = "darling" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc5d6b04b3fd0ba9926f945895de7d806260a2d7431ba82e7edaecb043c4c6b8" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04e48a959bcd5c761246f5d090ebc2fbf7b9cd527a492b07a67510c108f1e7e3" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.41", +] + +[[package]] +name = "darling_macro" +version = "0.20.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1545d67a2149e1d93b7e5c7752dce5a7426eb5d1357ddcfd89336b94444f77" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "data-encoding" 
+version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5" + +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "detours-sys" +version = "0.1.2" +dependencies = [ + "cc", + "winapi", +] + +[[package]] +name = "diff" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + 
+[[package]] +name = "dynasm" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "add9a102807b524ec050363f09e06f1504214b0e1c7797f64261c891022dce8b" +dependencies = [ + "bitflags 1.3.2", + "byteorder", + "lazy_static", + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "dynasmrt" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64fba5a42bd76a17cad4bfa00de168ee1cbfa06a5e8ce992ae880218c05641a9" +dependencies = [ + "byteorder", + "dynasm", + "memmap2", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "embed-manifest" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41cd446c890d6bed1d8b53acef5f240069ebef91d6fae7c5f52efe61fe8b5eae" + +[[package]] +name = "ena" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c533630cf40e9caa44bd91aadc88a75d75a4c3a12b4cfde353cbed41daa1e1f1" +dependencies = [ + "log", +] + +[[package]] +name = "enum-iterator" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fd242f399be1da0a5354aa462d57b4ab2b4ee0683cc552f7c007d2d12d36e94" +dependencies = [ + "enum-iterator-derive", +] + +[[package]] +name = "enum-iterator-derive" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03cdc46ec28bd728e67540c528013c6a10eb69a02eb31078a1bda695438cbfb8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.8" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "fallible-iterator" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "filedescriptor" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7199d965852c3bac31f779ef99cbb4537f80e952e2d6aa0ffeb30cce00f4f46e" +dependencies = [ + "libc", + "thiserror", + "winapi", +] + +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "windows-sys 0.52.0", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flate2" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4556222738635b7a3417ae6130d8f52201e45a0c4d1a907f0826383adb5f85e7" +dependencies = [ + "cloudflare-zlib-sys", + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float-cmp" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" +dependencies = [ + "num-traits", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "gag" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972" +dependencies = [ + "filedescriptor", + "tempfile", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getset" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "git2" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf7f68c2995f392c49fffb4f95ae2c873297830eb25c6bc4c114ce8f4562acc" +dependencies = [ + "bitflags 1.3.2", + "libc", + "libgit2-sys", + "log", + "url", +] + +[[package]] +name = "glam" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"12f597d56c1bd55a811a1be189459e8fad2bbc272616375602443bdfb37fa774" + +[[package]] +name = "goblin" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7666983ed0dd8d21a6f6576ee00053ca0926fb281a5522577a4dbd0f1b54143" +dependencies = [ + "log", + "plain", + "scroll", +] + +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown 0.14.3", +] + +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hip_common" +version = "0.0.0" +dependencies = [ + "capnp", + "capnpc", + "const_format", + "cuda_types", + "goblin", + "hip_runtime-sys", + "itertools", + "libloading", + "memchr", + "rusqlite", + "rustc-hash", + "sha2", +] + +[[package]] +name = "hip_runtime-sys" +version = "0.0.0" +dependencies = [ + "rustc-hash", +] + +[[package]] +name 
= "hipblaslt-sys" +version = "0.0.0" + +[[package]] +name = "hipfft-sys" +version = "0.0.0" + +[[package]] +name = "hiprt-sys" +version = "0.0.0" +dependencies = [ + "libloading", + "widestring 1.0.2", + "winapi", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8326b86b6cff230b97d0d312a6c40a60726df3332e721f72a1b035f451663b20" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", +] + +[[package]] +name = "is-terminal" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" 
+dependencies = [ + "hermit-abi", + "rustix", + "windows-sys 0.48.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lalrpop" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da4081d44f4611b66c6dd725e6de3169f9f63905421e8626fcb86b6a898998b8" +dependencies = [ + "ascii-canvas", + "bit-set", + "diff", + "ena", + "is-terminal", + "itertools", + "lalrpop-util", + "petgraph", + "pico-args", + "regex", + "regex-syntax 0.7.5", + "string_cache", + "term", + "tiny-keccak", + "unicode-xid", +] + +[[package]] +name = "lalrpop-util" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f35c735096c0293d313e8f2a641627472b83d01b937177fe76e5e2708d31e0d" +dependencies = [ + "regex", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" + +[[package]] +name = "libgit2-sys" +version = "0.14.2+1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f3d95f6b51075fe9810a7ae22c7095f12b98005ab364d8544797a825ce946a4" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + +[[package]] +name = "libloading" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c571b676ddfc9a8c12f1f3d3085a7b163966a8fd8098a90640953ce5f6170161" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "libredox" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85c833ca1e66078851dba29046874e38f08b2c883700aa29a03ddd3b23814ee8" +dependencies = [ + "bitflags 2.4.1", + "libc", + "redox_syscall", +] + +[[package]] +name = "libsqlite3-sys" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29f835d03d717946d28b1d1ed632eb6f0e24a299388ee623d0c23118d3e8a7fa" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = "1.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037731f5d3aaa87a5675e895b63ddff1a87624bc29f77004ea829809654e48f6" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" + +[[package]] +name = "llvm-sys" +version = "150.1.2" +dependencies = [ + "cmake", + "convert_case", + "libc", +] + +[[package]] +name = "lock_api" +version = "0.4.11" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "lz4-sys" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "macro_rules_attribute" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" + +[[package]] +name = "memchr" +version = "2.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" + +[[package]] +name = "memmap2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +dependencies = [ + "libc", +] + +[[package]] +name = "memoffset" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +dependencies = [ + "autocfg", +] + +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + 
"adler", +] + +[[package]] +name = "miopen-sys" +version = "0.0.0" + +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "nougat" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b57b9ced431322f054fc673f1d3c7fa52d80efd9df74ad2fc759f044742510" +dependencies = [ + "macro_rules_attribute", + "nougat-proc_macros", +] + +[[package]] +name = "nougat-proc_macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c84f77a45e99a2f9b492695d99e1c23844619caa5f3e57647cffacad773ca257" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_enum" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca565a7df06f3d4b485494f25ba05da1435950f4dc263440eda7a6fa9b8e36e4" +dependencies = [ + "derivative", + "num_enum_derive", +] + +[[package]] +name = "num_enum_derive" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffa5a33ddddfee04c0283a7653987d634e880347e96b5b2ed64de07efb59db9d" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + +[[package]] +name = "offline_compiler" +version = "0.0.0" +dependencies = [ + "argh", + "comgr", + "hip_common", + "hip_runtime-sys", + "hiprt-sys", + 
"libloading", + "ptx", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + +[[package]] +name = "optix_base" +version = "0.0.0" +dependencies = [ + "proc-macro2", + "quote", + "rustc-hash", + "syn 1.0.109", +] + +[[package]] +name = "optix_dump" +version = "0.0.0" +dependencies = [ + "cuda_types", + "generic-array", + "lazy_static", + "optix_base", + "paste", + "sha2", + "typenum", + "winapi", + "wmi", +] + +[[package]] +name = "optix_types" +version = "0.0.0" +dependencies = [ + "cuda_types", + "optix_base", +] + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap 2.1.0", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pico-args" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" + +[[package]] +name = "pkg-config" +version = "0.3.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro-crate" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d6ea3c4595b96363c13943497db34af4460fb474a95c43f4446ad341b8c9785" +dependencies = [ + "toml", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + 
"version_check", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + +[[package]] +name = "proc-macro2" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "process_address_table" +version = "0.0.0" +dependencies = [ + "detours-sys", + "libloading", + "windows", +] + +[[package]] +name = "ptx" +version = "0.0.0" +dependencies = [ + "bit-vec", + "comgr", + "cpp_demangle", + "cuda_base", + "cuda_types", + "either", + "half", + "hip_common", + "hip_runtime-sys", + "hiprt-sys", + "lalrpop", + "lalrpop-util", + "lazy_static", + "libloading", + "num-traits", + "paste", + "regex", + "rustc-hash", + "tempfile", + "thiserror", + "zluda_llvm", +] + +[[package]] +name = "quote" +version = "1.0.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = 
"redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_users" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a18479200779601e498ada4e8c1e1f50e3ee19deb0259c25825a98b5603b2cb4" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-syntax" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "rocblas-sys" +version = "0.0.0" + +[[package]] +name = "rocm_smi-sys" +version = "0.0.0" + +[[package]] +name = "rocsolver-sys" +version = "0.0.0" + +[[package]] +name = "rocsparse-sys" +version = "0.0.0" + +[[package]] +name = "rusqlite" +version = "0.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01e213bc3ecb39ac32e81e51ebe31fd888a940515173e3a18a35f8c6e896422a" +dependencies = [ + "bitflags 1.3.2", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + 
"libsqlite3-sys", + "serde_json", + "smallvec", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "ryu" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scroll" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04c565b551bafbef4157586fa379538366e4385d42082f255bfd96e4fe8519da" +dependencies = [ + "scroll_derive", +] + +[[package]] +name = "scroll_derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1db149f81d46d2deba7cd3c50772474707729550221e69588478ebf9ada425ae" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "semver" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836fa6a3e1e547f9a2c4040802ec865b5d85f4014efe00555d7090a3dcaa1090" +dependencies = [ + "serde", +] + +[[package]] +name = "serde" +version = "1.0.193" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.193" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "serde_json" +version = "1.0.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" +dependencies = [ + "base64", + "chrono", + "hex", + "indexmap 1.9.3", + "serde", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] 
+ +[[package]] +name = "smallvec" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tar" +version = "0.4.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" +dependencies = [ + "filetime", + "libc", + "xattr", +] + +[[package]] +name = "tempfile" +version = "3.8.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.48.0", +] + +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + +[[package]] +name = "thiserror" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f11c217e1416d6f036b870f14e0413d480dbf28edbee1f877abaf0206af43bb7" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01742297787513b79cf8e29d1056ede1313e2420b7b3b15d0a768b4921f549df" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "thread-id" +version = "4.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0ec81c46e9eb50deaa257be2f148adf052d1fb7701cfd55ccfab2525280b70b" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59e399c068f43a5d116fedaf73b203fa4f9c519f17e2b34f63221d3792f81446" +dependencies = [ + "itoa", + "libc", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" + +[[package]] +name = "time-macros" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96ba15a897f3c86766b757e5ac7221554c6750054d74d5b28844fce5fb36a6c4" +dependencies = [ + "time-core", +] + +[[package]] 
+name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + +[[package]] +name = "url" +version = "2.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vergen" +version = "7.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21b881cd6636ece9735721cf03c1fe1e774fe258683d084bb2812ab67435749" +dependencies = [ + "anyhow", + "cfg-if", + "enum-iterator", + "getset", + "git2", + "rustversion", + "thiserror", + "time", +] + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.41", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +dependencies = [ + "quote", + 
"wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" + +[[package]] +name = "wchar" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c74d010bf16569f942b0b7d3c777dd674f8ee539b48d809dc548b3453039c2df" +dependencies = [ + "proc-macro-hack", + "wchar-impl", +] + +[[package]] +name = "wchar-impl" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f135922b9303f899bfa446fce1eb149f43462f1e9ac7f50e24ea6b913416dd84" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "widestring" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17882f045410753661207383517a6f62ec3dbeb6a4ed2acce01f0728238d1983" + +[[package]] +name = "widestring" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "653f141f39ec16bba3c5abe400a0c60da7468261cc2cbf36805022876bc721a8" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-core" +version = "0.51.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1f8cf84f35d2db49a46868f947758c7a1138116f7fac3bc844f43ade1292e64" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.5", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" +dependencies = [ + "windows_aarch64_gnullvm 0.52.5", 
+ "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + +[[package]] +name = "wmi" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "757a458f9bfab0542c11feed99bd492cbe23add50515bd8eecf8c6973673d32d" +dependencies = [ + "chrono", + "log", + "serde", + "thiserror", + "widestring 0.5.1", + "winapi", +] + +[[package]] +name = "xattr" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "914566e6413e7fa959cc394fb30e563ba80f3541fbd40816d4c05a0fc3f2a0f1" +dependencies = [ + "libc", + "linux-raw-sys", + "rustix", +] + +[[package]] +name = "xtask" +version = 
"0.0.0" +dependencies = [ + "argh", + "cargo-platform", + "cargo_metadata", + "flate2", + "serde", + "serde_json", + "tar", + "time", + "zip", +] + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.41", +] + +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", + "time", +] + +[[package]] +name = "zluda" +version = "0.0.0" +dependencies = [ + "blake3", + "comgr", + "cuda_base", + "cuda_types", + "dirs", + "gag", + "goblin", + "half", + "hip_common", + "hip_runtime-sys", + "lazy_static", + "libc", + "lz4-sys", + "memchr", + "memoffset", + "num-traits", + "num_enum", + "paste", + "ptx", + "rand", + "rand_chacha", + "rusqlite", + "rustc-hash", + "static_assertions", + "tempfile", + "time", + "vergen", + "winapi", + "zluda_dark_api", +] + +[[package]] +name = "zluda_api" +version = "0.0.0" +dependencies = [ + "cuda_types", + "libloading", + "once_cell", + "windows", +] + +[[package]] +name = "zluda_blas" +version = "0.0.0" +dependencies = [ + "cuda_types", + "hip_common", + "rocblas-sys", + "rocsolver-sys", + "zluda_dark_api", +] + +[[package]] +name = "zluda_blaslt" +version = "0.0.0" +dependencies = [ + "cuda_types", + "hip_common", + "hipblaslt-sys", + "zluda_dark_api", +] + +[[package]] +name = "zluda_ccl" +version = "0.0.0" + +[[package]] +name = "zluda_dark_api" +version = "0.0.0" +dependencies = [ + 
"bit-vec", + "bitflags 2.4.1", + "cloudflare-zlib", + "cuda_types", + "either", + "goblin", + "hip_common", + "lz4-sys", + "paste", + "thread-id", +] + +[[package]] +name = "zluda_dnn" +version = "0.0.0" +dependencies = [ + "hip_runtime-sys", + "miopen-sys", +] + +[[package]] +name = "zluda_dump" +version = "0.0.0" +dependencies = [ + "crossbeam-channel", + "cuda_base", + "cuda_types", + "detours-sys", + "dynasm", + "dynasmrt", + "goblin", + "hip_common", + "lazy_static", + "libc", + "lz4-sys", + "paste", + "ptx", + "rand", + "rand_chacha", + "regex", + "rustc-hash", + "serde", + "serde_derive", + "serde_json", + "thread-id", + "wchar", + "winapi", + "zluda_dark_api", +] + +[[package]] +name = "zluda_fft" +version = "0.0.0" +dependencies = [ + "cuda_types", + "hip_common", + "hipfft-sys", + "lazy_static", + "slab", + "zluda_dark_api", +] + +[[package]] +name = "zluda_inject" +version = "0.0.0" +dependencies = [ + "argh", + "detours-sys", + "embed-manifest", + "tempfile", + "winapi", + "zluda_dump", + "zluda_ml", + "zluda_redirect", +] + +[[package]] +name = "zluda_lib" +version = "0.0.0" +dependencies = [ + "zluda", +] + +[[package]] +name = "zluda_llvm" +version = "0.0.0" +dependencies = [ + "bitflags 2.4.1", + "cc", + "llvm-sys", +] + +[[package]] +name = "zluda_ml" +version = "0.0.0" +dependencies = [ + "atiadlxx-sys", + "rocm_smi-sys", +] + +[[package]] +name = "zluda_redirect" +version = "0.0.0" +dependencies = [ + "detours-sys", + "memchr", + "wchar", + "winapi", +] + +[[package]] +name = "zluda_rt" +version = "0.0.0" +dependencies = [ + "comgr", + "data-encoding", + "dirs", + "float-cmp", + "generic-array", + "glam", + "hip_common", + "hip_runtime-sys", + "hiprt-sys", + "libloading", + "nougat", + "optix_base", + "optix_types", + "paste", + "ptx", + "rusqlite", + "rustc-hash", + "serde", + "serde_json", + "serde_with", + "sha2", + "static_assertions", + "time", + "typenum", + "vergen", + "winapi", +] + +[[package]] +name = "zluda_sparse" +version = "0.0.0" 
+dependencies = [ + "cuda_types", + "hip_common", + "hip_runtime-sys", + "rocsparse-sys", + "zluda_dark_api", +] diff --git a/Cargo.toml b/Cargo.toml index 2e9a6ed..c20fa2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,4 +58,4 @@ opt-level = 3 opt-level = 3 [profile.dev.package.xtask] -opt-level = 3 +opt-level = 2 diff --git a/Makefile.toml b/Makefile.toml deleted file mode 100644 index adab2b9..0000000 --- a/Makefile.toml +++ /dev/null @@ -1,57 +0,0 @@ -[config] -default_to_workspace = false -skip_core_tasks = true - -[tasks.build] -run_task = [ - { name = "build-windows", condition = { platforms = ["windows"] } }, - { name = "build-linux", condition = { platforms = ["linux"] } }, -] - -[tasks.build-windows] -command = "cargo" -args = [ - "build", - "-p", "offline_compiler", - "-p", "zluda_dump", - "-p", "zluda_inject", - "-p", "zluda_lib", - "-p", "zluda_ml", - "-p", "zluda_redirect", -] - -[tasks.build-linux] -command = "cargo" -args = [ - "build", - "-p", "offline_compiler", - "-p", "zluda_blas", - "-p", "zluda_blaslt", - "-p", "zluda_ccl", - "-p", "zluda_dnn", - "-p", "zluda_dump", - "-p", "zluda_fft", - "-p", "zluda_lib", - "-p", "zluda_ml", - "-p", "zluda_sparse", -] - -[tasks.build-release] -command = "cargo" -args = [ - "build", - "--release", - "-p", "offline_compiler", - "-p", "zluda_blas", - "-p", "zluda_blaslt", - "-p", "zluda_ccl", - "-p", "zluda_dnn", - "-p", "zluda_dump", - "-p", "zluda_fft", - "-p", "zluda_lib", - "-p", "zluda_ml", - "-p", "zluda_sparse", -] - -[tasks.default] -alias = "build" diff --git a/README.md b/README.md index c4b4780..af045ad 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ If an application fails to start under ZLUDA or crashes please check [Known Issu - If both integrated AMD GPU and dedicated AMD GPU are present in the system, ZLUDA uses the integrated GPU. - This is a bug in underying ROCm/HIP runtime. You can work around it by disabling the integrated GPU. + This is a bug in underlying ROCm/HIP runtime. 
You can work around it by disabling the integrated GPU. On Windows we recommend you use environment variable `HIP_VISIBLE_DEVICES=1` environment variable (more [here](https://rocmdocs.amd.com/en/latest/conceptual/gpu-isolation.html#hip-visible-devices)) or disable it system-wide in Device Manager. @@ -235,10 +235,6 @@ Performance is currently much lower than the native HIP backend, see the discuss This is a ROCm/HIP bug. Currently, CompuBench tests have to be run one at a time. -- Some tests output black screen. - - This is due to a bug (or an unintended hardware feature) in CompuBench that just happens to work on NVIDIA GPUs. - #### V-Ray Benchmark - Currently, ZLUDA crashes when running V-Ray benchmark. Nonetheless, certain "lucky" older combinations of ZLUDA and ROCm/HIP are known to run V-Ray Benchmark successfully. diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 18ec079..3679f6c 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -92,13 +92,16 @@ If you are dumping original CUDA use: ### Linux -Known bug: when dumping from original CUDA you should remove (or rename) all the files in `/dump` except `libcuda.so` and `libcuda.so.1`. 
- -Use it like this: +If dumping from ZLUDA use it like this: ``` LD_LIBRARY_PATH="/dump:$LD_LIBRARY_PATH" ``` +If dumping from NVIDIA CUDA use it like this: +``` +LD_LIBRARY_PATH="/dump_nvidia:$LD_LIBRARY_PATH" +``` + ### Result If all went well you should see lines like this in the console output and in the log file specified by `ZLUDA_DUMP_DIR`: diff --git a/ext/llvm-sys.rs/build.rs b/ext/llvm-sys.rs/build.rs index a7363a9..9b43c8b 100644 --- a/ext/llvm-sys.rs/build.rs +++ b/ext/llvm-sys.rs/build.rs @@ -3,7 +3,7 @@ extern crate convert_case; use convert_case::{Case, Casing, StateConverter}; use std::{ - env, + env, io, path::PathBuf, process::{Command, Stdio}, }; @@ -17,8 +17,9 @@ fn main() { .map(|comp| comp.from_case(Case::Snake)); let msvc = is_msvc(); let (llvm_dir, additonal_cmake_file) = get_llvm_dir(); - let out_dir = build_cmake_targets(llvm_components.clone(), llvm_dir, additonal_cmake_file); - emit_compile_and_linking_information(llvm_components, out_dir, msvc) + let (cmake_profile, out_dir) = + build_cmake_targets(llvm_components.clone(), llvm_dir, additonal_cmake_file); + emit_compile_and_linking_information(llvm_components, cmake_profile, out_dir, msvc) } fn is_msvc() -> bool { @@ -41,11 +42,20 @@ fn build_cmake_targets<'a>( components: impl Iterator>, llvm_dir: PathBuf, additional_cmake_file: PathBuf, -) -> PathBuf { +) -> (String, PathBuf) { let mut cmake = Config::new(llvm_dir); use_ninja(&mut cmake); cmake .always_configure(true) + // Should be detected automatically, but we have reports of + // LLVM finding ZLIB on Windows and then failing to link it.
+ // Out of caution we explicitly disable all autodetectable components + .define("LLVM_ENABLE_LIBXML2", "OFF") + .define("LLVM_ENABLE_ZLIB", "OFF") + .define("LLVM_ENABLE_ZSTD", "OFF") + .define("LLVM_ENABLE_CURL", "OFF") + .define("LLVM_ENABLE_HTTPLIB", "OFF") + .define("LLVM_ENABLE_LIBEDIT", "OFF") .define("LLVM_ENABLE_TERMINFO", "OFF") .define("LLVM_BUILD_TOOLS", "OFF") .define("LLVM_TARGETS_TO_BUILD", "") @@ -57,7 +67,10 @@ fn build_cmake_targets<'a>( .build_target(&format!("LLVM{}", component.to_case(Case::Pascal))) .build(); } - cmake.build_target("llvm-config").build() + ( + cmake.get_profile().to_string(), + cmake.build_target("llvm-config").build(), + ) } fn use_ninja(cmake: &mut Config) { @@ -76,31 +89,27 @@ fn use_ninja(cmake: &mut Config) { } fn emit_compile_and_linking_information<'a>( - llvm_components: impl Iterator>, + llvm_components: impl Iterator> + Clone, + cmake_profile: String, out_dir: PathBuf, is_msvc: bool, ) { - let mut llvm_config_path = out_dir.clone(); - llvm_config_path.push("build"); - llvm_config_path.push("bin"); - llvm_config_path.push("llvm-config"); - let mut llvm_config_cmd = Command::new(&llvm_config_path); - llvm_config_cmd.args([ - "--cxxflags", - "--ldflags", - "--libdir", - "--libnames", - "--system-libs", - "--link-static", - ]); - for component in llvm_components { - llvm_config_cmd.arg(&component.to_case(Case::Flat)); - } - let llvm_config_output = llvm_config_cmd - .stdin(Stdio::null()) - .stderr(Stdio::null()) - .output() - .unwrap(); + // MSBuild uses a different output path from ninja or Makefile.
+ // Not sure how to query CMake about it, so we just try once with + // ninja/Makefile path and then once with MSBuild path + let llvm_config_output = execute_llvm_config( + &out_dir, + &["build", "bin", "llvm-config"], + llvm_components.clone(), + ) + .or_else(|_| { + execute_llvm_config( + &out_dir, + &["build", &*cmake_profile, "bin", "llvm-config"], + llvm_components, + ) + }) + .unwrap(); if !llvm_config_output.status.success() { panic!() } @@ -138,3 +147,28 @@ fn emit_compile_and_linking_information<'a>( println!("cargo:rustc-link-lib=stdc++"); } } + +fn execute_llvm_config<'a>( + out_dir: &PathBuf, + llvm_config_exe_relative: &[&str], + llvm_components: impl Iterator>, +) -> io::Result { + let mut llvm_config_path = out_dir.clone(); + llvm_config_path.extend(llvm_config_exe_relative); + let mut llvm_config_cmd = Command::new(&llvm_config_path); + llvm_config_cmd.args([ + "--cxxflags", + "--ldflags", + "--libdir", + "--libnames", + "--system-libs", + "--link-static", + ]); + for component in llvm_components { + llvm_config_cmd.arg(&component.to_case(Case::Flat)); + } + llvm_config_cmd + .stdin(Stdio::null()) + .stderr(Stdio::null()) + .output() +} diff --git a/process_address_table/Cargo.toml b/process_address_table/Cargo.toml index 2de38f1..97d6083 100644 --- a/process_address_table/Cargo.toml +++ b/process_address_table/Cargo.toml @@ -18,3 +18,4 @@ features = [ [package.metadata.zluda] debug_only = true +skip_zip = true diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index e8d5915..5c073f5 100644 Binary files a/ptx/lib/zluda_ptx_impl.bc and b/ptx/lib/zluda_ptx_impl.bc differ diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 5d224bd..9a1b29a 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -1,5 +1,5 @@ // Compile and disassemble: -// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip 
zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc +// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc // Compile to binary: // /opt/rocm/llvm/bin/clang -x ir -target amdgcn-amd-amdhsa -Xlinker --no-undefined zluda_ptx_impl.bc -mno-wavefrontsize64 -mcpu=gfx1030 // Decompile: @@ -11,6 +11,7 @@ // https://llvm.org/docs/AMDGPUUsage.html #include +#include #include #define HIP_NO_HALF #include @@ -155,6 +156,399 @@ static __device__ float4::Native_vec_ __pack_to_float4(const T &t) return result; } +typedef uint32_t uint8 
__attribute__((ext_vector_type(8))); +typedef uint32_t zluda_uint3 __attribute__((ext_vector_type(3))); +typedef uint8 CONSTANT_SPACE *surface_ptr; + +template +static __device__ To transmute(From f) +{ + if constexpr (sizeof(To) == sizeof(From)) + { + return std::bit_cast(f); + } + else if constexpr (sizeof(To) > sizeof(From)) + { + union + { + To t; + From f; + } u = {To{0}}; + u.f = f; + return u.t; + } + else if constexpr (sizeof(To) < sizeof(From)) + { + union + { + From f; + To t; + } u = {From{f}}; + return u.t; + } + else + { + static_assert(sizeof(To) == 0); + } +} + +enum class ImageGeometry +{ + _1D, + _2D, + _3D, + A1D, + A2D +}; + +// clang-format off +template struct Coordinates; +template <> struct Coordinates { using type = uint1::Native_vec_; }; +template <> struct Coordinates { using type = uint2::Native_vec_; }; +template <> struct Coordinates { using type = uint4::Native_vec_; }; +template <> struct Coordinates +{ + using type = uint2::Native_vec_; using arg_type = uint1::Native_vec_; + static __device__ type pack_layer(uint32_t layer, arg_type coord) + { + return type { coord.x, layer }; + } +}; +template <> struct Coordinates +{ + using type = zluda_uint3; using arg_type = uint2::Native_vec_; + static __device__ type pack_layer(uint32_t layer, arg_type coord) + { + return type { coord.x, coord.y, layer }; + } +}; +// clang-format on + +template +static __device__ void image_store_pck(T value, typename Coordinates::type coord, surface_ptr surface) +{ + if constexpr (sizeof(T) <= sizeof(uint)) + { + uint value_dword = transmute(value); + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D unorm" : : "v"(value_dword), "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { 
+ asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:3D unorm" : : "v"(value_dword), "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm" : : "v"(value_dword), "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(T) == 0, "Invalid geometry"); + } + } + else if constexpr (sizeof(T) == sizeof(uint2::Native_vec_)) + { + uint2::Native_vec_ value_dword2 = transmute(value); + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D unorm" : : "v"(value_dword2), "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:3D unorm" : : "v"(value_dword2), "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm" : : "v"(value_dword2), "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(T) == 0, "Invalid geometry"); + } + } + else if constexpr (sizeof(T) == sizeof(uint4::Native_vec_)) + { + uint4::Native_vec_ value_dword4 = transmute(value); + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D unorm" : : "v"(value_dword4), 
"v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:3D unorm" : : "v"(value_dword4), "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_store_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm" : : "v"(value_dword4), "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(T) == 0, "Invalid geometry"); + } + } + else + { + static_assert(sizeof(T) == 0, "Invalid vector size"); + } +} + +template +static __device__ T image_load_pck(typename Coordinates::type coord, surface_ptr surface) +{ + if constexpr (sizeof(T) <= sizeof(uint)) + { + uint data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm 
volatile("image_load_pck %0, %1, %2 dmask:0x1 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return transmute(data); + } + else if constexpr (sizeof(T) == sizeof(uint2::Native_vec_)) + { + uint2::Native_vec_ data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0x3 dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return transmute(data); + } + else if constexpr (sizeof(T) == sizeof(uint4::Native_vec_)) + { + uint4::Native_vec_ data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm 
volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return transmute(data); + } + else + { + static_assert(sizeof(T) == 0, "Invalid vector size"); + } +} + +template +static __device__ uint4::Native_vec_ image_load_pck_full(typename Coordinates::type coord, surface_ptr surface) +{ + uint4::Native_vec_ data; + if constexpr (geo == ImageGeometry::_1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord.x), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::_3D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:3D unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(transmute(coord)), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A1D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:1D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else if constexpr (geo == ImageGeometry::A2D) + { + asm volatile("image_load_pck %0, %1, %2 dmask:0xf dim:2D_ARRAY unorm\ns_waitcnt vmcnt(0)" : "=v"(data) : "v"(coord), "s"(*surface) : "memory"); + } + else + { + static_assert(sizeof(ImageGeometry) == 0, "Invalid geometry"); + } + return data; +} + 
+template +static __device__ void image_store_pck_full_with(uint4::Native_vec_ data, T value, typename Coordinates::type coord, surface_ptr surface) +{ + // We avoid unions for types smaller than sizeof(uint32_t), + // because in those cases we get this garbage: + // ds_write_b128 v2, v[5:8] + // ds_write_b16 v2, v9 + // ds_read_b128 v[5:8], v2 + // tested with ROCm 5.7.1 on gfx1030 + if constexpr (sizeof(T) == sizeof(uint8_t)) + { + uint32_t x = uint32_t(std::bit_cast(value)); + uint32_t data_0 = ((data[0]) >> 8) << 8; + data[0] = data_0 | x; + } + else if constexpr (sizeof(T) == sizeof(uint16_t)) + { + uint32_t x = uint32_t(std::bit_cast(value)); + uint32_t data_0 = ((data[0]) >> 16) << 16; + data[0] = data_0 | x; + } + else + { + union + { + uint4::Native_vec_ full_vec; + T value; + } u = {0}; + u.full_vec = data; + u.value = value; + data = u.full_vec; + } + image_store_pck(data, coord, surface); +} + +constexpr auto IMAGE_RESERVED_TOP_BITS = 3; + +static __device__ surface_ptr get_surface_pointer(uint64_t s) +{ + return (surface_ptr)((s << IMAGE_RESERVED_TOP_BITS) >> IMAGE_RESERVED_TOP_BITS); +} + +static __device__ surface_ptr get_surface_pointer(struct textureReference GLOBAL_SPACE *surf_ref) +{ + return (surface_ptr)(surf_ref->textureObject); +} + +static __device__ uint32_t x_coordinate_shift(uint64_t s) +{ + return uint32_t(s >> (64 - IMAGE_RESERVED_TOP_BITS)); +} + +static __device__ uint32_t x_coordinate_shift(struct textureReference GLOBAL_SPACE *ptr) +{ + uint32_t channels = uint32_t(ptr->numChannels); + uint32_t format_width = 0; + hipArray_Format format = ptr->format; + switch (format) + { + case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8: + case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8: + format_width = 1; + break; + case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16: + case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16: + case hipArray_Format::HIP_AD_FORMAT_HALF: + format_width = 2; + break; + case hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32: 
+ case hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32: + case hipArray_Format::HIP_AD_FORMAT_FLOAT: + format_width = 4; + break; + default: + __builtin_unreachable(); + } + return uint32_t(__builtin_ctz(format_width * channels)); +} + +template +static __device__ T suld_b_zero(Surface surf_arg, typename Coordinates::type coord) +{ + surface_ptr surface = get_surface_pointer(surf_arg); + uint32_t shift_x = x_coordinate_shift(surf_arg); + coord.x = coord.x >> shift_x; + return image_load_pck(coord, surface); +} + +template +static __device__ void sust_b_zero(Surface surf_arg, typename Coordinates::type coord, T data) +{ + surface_ptr surface = get_surface_pointer(surf_arg); + uint32_t shift_x = x_coordinate_shift(surf_arg); + coord.x = coord.x >> shift_x; + if (shift_x <= __builtin_ctz(sizeof(T))) [[likely]] + { + image_store_pck(data, coord, surface); + } + else + { + uint4::Native_vec_ pixel = image_load_pck_full(coord, surface); + image_store_pck_full_with(pixel, data, coord, surface); + } +} + extern "C" { #define atomic_inc(NAME, SUCCESS, FAILURE, SCOPE, SPACE) \ @@ -660,179 +1054,101 @@ extern "C" suld_b_a2d_vec(_v4, b32, uint4); // suld_b_a2d_vec(_v4, b64, ulong4); -#define sust_b_1d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_1D(i, byte_coord, tmp); \ - } \ - void FUNC(sust_b_indirect_1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int1::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ 
- surf1Dwrite(hip_data, surfObj, coord.x); \ +#define SUST_B_ZERO(TYPE, GEOMETRY, HIP_TYPE) \ + HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates::type coord) \ + { \ + return suld_b_zero(surf_arg, coord); \ + } \ + void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, typename Coordinates::type coord, HIP_TYPE::Native_vec_ data) \ + { \ + sust_b_zero(surf_arg, coord, data); \ + } \ + HIP_TYPE::Native_vec_ FUNC(suld_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates::type coord) \ + { \ + return suld_b_zero(ptr, coord); \ + } \ + void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, typename Coordinates::type coord, HIP_TYPE::Native_vec_ data) \ + { \ + sust_b_zero(ptr, coord, data); \ } - sust_b_1d_vec(, b8, uchar1); - sust_b_1d_vec(, b16, ushort1); - sust_b_1d_vec(, b32, uint1); - // sust_b_1d_vec(, b64, ulong1); - sust_b_1d_vec(_v2, b8, uchar2); - sust_b_1d_vec(_v2, b16, ushort2); - sust_b_1d_vec(_v2, b32, uint2); - // sust_b_1d_vec(_v2, b64, ulong2); - sust_b_1d_vec(_v4, b8, uchar4); - sust_b_1d_vec(_v4, b16, ushort4); - sust_b_1d_vec(_v4, b32, uint4); - // sust_b_1d_vec(_v4, b64, ulong4); - -#define sust_b_2d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_2d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_2D(i), __ockl_image_channel_order_2D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_2D(i, int2(byte_coord, coord.y).data, tmp); \ - } \ - void FUNC(sust_b_indirect_2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int2::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = 
(hipSurfaceObject_t)serf_arg; \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - surf2Dwrite(hip_data, surfObj, coord.x, coord.y); \ +#define SUST_B_ZERO_ARRAY(TYPE, GEOMETRY, HIP_TYPE) \ + HIP_TYPE::Native_vec_ FUNC(suld_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates::arg_type coord) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + return suld_b_zero(surf_arg, coord_array); \ + } \ + void FUNC(sust_b_indirect_##TYPE##_zero)(uint64_t surf_arg, uint32_t layer, typename Coordinates::arg_type coord, HIP_TYPE::Native_vec_ data) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + sust_b_zero(surf_arg, coord_array, data); \ + } \ + HIP_TYPE::Native_vec_ FUNC(suld_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates::arg_type coord) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + return suld_b_zero(ptr, coord_array); \ + } \ + void FUNC(sust_b_##TYPE##_zero)(struct textureReference GLOBAL_SPACE * ptr, uint32_t layer, typename Coordinates::arg_type coord, HIP_TYPE::Native_vec_ data) \ + { \ + auto coord_array = Coordinates::pack_layer(layer, coord); \ + sust_b_zero(ptr, coord_array, data); \ } - sust_b_2d_vec(, b8, uchar1); - sust_b_2d_vec(, b16, ushort1); - sust_b_2d_vec(, b32, uint1); - // sust_b_2d_vec(, b64, ulong1); - sust_b_2d_vec(_v2, b8, uchar2); - sust_b_2d_vec(_v2, b16, ushort2); - sust_b_2d_vec(_v2, b32, uint2); - // sust_b_2d_vec(_v2, b64, ulong2); - sust_b_2d_vec(_v4, b8, uchar4); - sust_b_2d_vec(_v4, b16, ushort4); - sust_b_2d_vec(_v4, b32, uint4); - // sust_b_2d_vec(_v4, b64, ulong4); - -#define sust_b_3d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_3d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, int4::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = 
__hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_3D(i, int4(byte_coord, coord.y, coord.z, 0).data, tmp); \ - } \ - void FUNC(sust_b_indirect_3d##VEC##_##TYPE##_trap)(uint64_t serf_arg, int4::Native_vec_ coord, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - __HIP_SURFACE_OBJECT_PARAMETERS_INIT; \ - int byte_coord = __hipGetPixelAddr(coord.x, __ockl_image_channel_data_type_3D(i), __ockl_image_channel_order_3D(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_3D(i, int4(byte_coord, coord.y, coord.z, 0).data, tmp); \ - } - - sust_b_3d_vec(, b8, uchar1); - sust_b_3d_vec(, b16, ushort1); - sust_b_3d_vec(, b32, uint1); - // sust_b_3d_vec(, b64, ulong1); - sust_b_3d_vec(_v2, b8, uchar2); - sust_b_3d_vec(_v2, b16, ushort2); - sust_b_3d_vec(_v2, b32, uint2); - // sust_b_3d_vec(_v2, b64, ulong2); - sust_b_3d_vec(_v4, b8, uchar4); - sust_b_3d_vec(_v4, b16, ushort4); - sust_b_3d_vec(_v4, b32, uint4); - // sust_b_3d_vec(_v4, b64, ulong4); - -#define sust_b_a1d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_a1d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, uint layer, int x, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1Da(i), __ockl_image_channel_order_1Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_1Da(i, int2(byte_coord, int(layer)).data, tmp); \ - } \ - void FUNC(sust_b_indirect_a1d##VEC##_##TYPE##_trap)(uint64_t serf_arg, uint layer, int x, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = 
(hipSurfaceObject_t)serf_arg; \ - __HIP_SURFACE_OBJECT_PARAMETERS_INIT; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1Da(i), __ockl_image_channel_order_1Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_1Da(i, int2(byte_coord, int(layer)).data, tmp); \ - } - - sust_b_a1d_vec(, b8, uchar1); - sust_b_a1d_vec(, b16, ushort1); - sust_b_a1d_vec(, b32, uint1); - // sust_b_a1d_vec(, b64, ulong1); - sust_b_a1d_vec(_v2, b8, uchar2); - sust_b_a1d_vec(_v2, b16, ushort2); - sust_b_a1d_vec(_v2, b32, uint2); - // sust_b_a1d_vec(_v2, b64, ulong2); - sust_b_a1d_vec(_v4, b8, uchar4); - sust_b_a1d_vec(_v4, b16, ushort4); - sust_b_a1d_vec(_v4, b32, uint4); - // sust_b_a1d_vec(_v4, b64, ulong4); - -#define sust_b_a2d_vec(VEC, TYPE, HIP_TYPE) \ - void FUNC(sust_b_a2d##VEC##_##TYPE##_trap)(struct textureReference GLOBAL_SPACE * ptr, uint layer, int x, int y, HIP_TYPE::Native_vec_ data) \ - { \ - hipTextureObject_t textureObject = ptr->textureObject; \ - TEXTURE_OBJECT_PARAMETERS_INIT; \ - (void)s; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2Da(i), __ockl_image_channel_order_2Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_2Da(i, int4(byte_coord, y, int(layer), 0).data, tmp); \ - } \ - void FUNC(sust_b_indirect_a2d##VEC##_##TYPE##_trap)(uint64_t serf_arg, uint layer, int x, int y, HIP_TYPE::Native_vec_ data) \ - { \ - hipSurfaceObject_t surfObj = (hipSurfaceObject_t)serf_arg; \ - __HIP_SURFACE_OBJECT_PARAMETERS_INIT; \ - int byte_coord = __hipGetPixelAddr(x, __ockl_image_channel_data_type_2Da(i), __ockl_image_channel_order_2Da(i)); \ - HIP_TYPE hip_data; \ - hip_data.data = data; \ - auto tmp = __pack_to_float4(hip_data); \ - __ockl_image_store_2Da(i, int4(byte_coord, y, int(layer), 0).data, tmp); \ - } - - sust_b_a2d_vec(, b8, uchar1); - sust_b_a2d_vec(, b16, ushort1); - 
sust_b_a2d_vec(, b32, uint1); - // sust_b_a2d_vec(, b64, ulong1); - sust_b_a2d_vec(_v2, b8, uchar2); - sust_b_a2d_vec(_v2, b16, ushort2); - sust_b_a2d_vec(_v2, b32, uint2); - // sust_b_a2d_vec(_v2, b64, ulong2); - sust_b_a2d_vec(_v4, b8, uchar4); - sust_b_a2d_vec(_v4, b16, ushort4); - sust_b_a2d_vec(_v4, b32, uint4); - // sust_b_a2d_vec(_v4, b64, ulong4); + SUST_B_ZERO(1d_b8, ImageGeometry::_1D, uchar1); + SUST_B_ZERO(1d_b16, ImageGeometry::_1D, ushort1); + SUST_B_ZERO(1d_b32, ImageGeometry::_1D, uint1); + SUST_B_ZERO(1d_b64, ImageGeometry::_1D, ulong1); + SUST_B_ZERO(1d_v2_b8, ImageGeometry::_1D, uchar2); + SUST_B_ZERO(1d_v2_b16, ImageGeometry::_1D, ushort2); + SUST_B_ZERO(1d_v2_b32, ImageGeometry::_1D, uint2); + SUST_B_ZERO(1d_v2_b64, ImageGeometry::_1D, ulong2); + SUST_B_ZERO(1d_v4_b8, ImageGeometry::_1D, uchar4); + SUST_B_ZERO(1d_v4_b16, ImageGeometry::_1D, ushort4); + SUST_B_ZERO(1d_v4_b32, ImageGeometry::_1D, uint4); + SUST_B_ZERO(2d_b8, ImageGeometry::_2D, uchar1); + SUST_B_ZERO(2d_b16, ImageGeometry::_2D, ushort1); + SUST_B_ZERO(2d_b32, ImageGeometry::_2D, uint1); + SUST_B_ZERO(2d_b64, ImageGeometry::_2D, ulong1); + SUST_B_ZERO(2d_v2_b8, ImageGeometry::_2D, uchar2); + SUST_B_ZERO(2d_v2_b16, ImageGeometry::_2D, ushort2); + SUST_B_ZERO(2d_v2_b32, ImageGeometry::_2D, uint2); + SUST_B_ZERO(2d_v2_b64, ImageGeometry::_2D, ulong2); + SUST_B_ZERO(2d_v4_b8, ImageGeometry::_2D, uchar4); + SUST_B_ZERO(2d_v4_b16, ImageGeometry::_2D, ushort4); + SUST_B_ZERO(2d_v4_b32, ImageGeometry::_2D, uint4); + SUST_B_ZERO(3d_b8, ImageGeometry::_3D, uchar1); + SUST_B_ZERO(3d_b16, ImageGeometry::_3D, ushort1); + SUST_B_ZERO(3d_b32, ImageGeometry::_3D, uint1); + SUST_B_ZERO(3d_b64, ImageGeometry::_3D, ulong1); + SUST_B_ZERO(3d_v2_b8, ImageGeometry::_3D, uchar2); + SUST_B_ZERO(3d_v2_b16, ImageGeometry::_3D, ushort2); + SUST_B_ZERO(3d_v2_b32, ImageGeometry::_3D, uint2); + SUST_B_ZERO(3d_v2_b64, ImageGeometry::_3D, ulong2); + SUST_B_ZERO(3d_v4_b8, ImageGeometry::_3D, uchar4); + 
SUST_B_ZERO(3d_v4_b16, ImageGeometry::_3D, ushort4); + SUST_B_ZERO(3d_v4_b32, ImageGeometry::_3D, uint4); + SUST_B_ZERO_ARRAY(a1d_b8, ImageGeometry::A1D, uchar1); + SUST_B_ZERO_ARRAY(a1d_b16, ImageGeometry::A1D, ushort1); + SUST_B_ZERO_ARRAY(a1d_b32, ImageGeometry::A1D, uint1); + SUST_B_ZERO_ARRAY(a1d_b64, ImageGeometry::A1D, ulong1); + SUST_B_ZERO_ARRAY(a1d_v2_b8, ImageGeometry::A1D, uchar2); + SUST_B_ZERO_ARRAY(a1d_v2_b16, ImageGeometry::A1D, ushort2); + SUST_B_ZERO_ARRAY(a1d_v2_b32, ImageGeometry::A1D, uint2); + SUST_B_ZERO_ARRAY(a1d_v2_b64, ImageGeometry::A1D, ulong2); + SUST_B_ZERO_ARRAY(a1d_v4_b8, ImageGeometry::A1D, uchar4); + SUST_B_ZERO_ARRAY(a1d_v4_b16, ImageGeometry::A1D, ushort4); + SUST_B_ZERO_ARRAY(a1d_v4_b32, ImageGeometry::A1D, uint4); + SUST_B_ZERO_ARRAY(a2d_b8, ImageGeometry::A2D, uchar1); + SUST_B_ZERO_ARRAY(a2d_b16, ImageGeometry::A2D, ushort1); + SUST_B_ZERO_ARRAY(a2d_b32, ImageGeometry::A2D, uint1); + SUST_B_ZERO_ARRAY(a2d_b64, ImageGeometry::A2D, ulong1); + SUST_B_ZERO_ARRAY(a2d_v2_b8, ImageGeometry::A2D, uchar2); + SUST_B_ZERO_ARRAY(a2d_v2_b16, ImageGeometry::A2D, ushort2); + SUST_B_ZERO_ARRAY(a2d_v2_b32, ImageGeometry::A2D, uint2); + SUST_B_ZERO_ARRAY(a2d_v2_b64, ImageGeometry::A2D, ulong2); + SUST_B_ZERO_ARRAY(a2d_v4_b8, ImageGeometry::A2D, uchar4); + SUST_B_ZERO_ARRAY(a2d_v4_b16, ImageGeometry::A2D, ushort4); + SUST_B_ZERO_ARRAY(a2d_v4_b32, ImageGeometry::A2D, uint4); __device__ static inline bool is_upper_warp() { @@ -984,6 +1300,7 @@ extern "C" default: return 0; } + return 2; case 'l': switch (s[1]) { @@ -1013,17 +1330,18 @@ extern "C" case 'X': case 'n': len = 8; - return 2; + break; default: return 0; } + return 3; default: return 0; } + return 2; default: return 0; } - return 1; } __device__ static bool parse_printf_specifier(const char *s, uint8_t &len) @@ -1117,8 +1435,36 @@ extern "C" char c = *(s++); if (c == 0) break; - if (c == '%') + if (c != '%') + continue; + + // %% requires no additional handling + if (*s == '%') { + s++; 
+ continue; + } + + // %s uses __ockl_printf_append_string_n + // https://github.com/ROCm/ROCm-Device-Libs/blob/rocm-5.7.x/ockl/src/services.cl#L343 + if (*s == 's') + { + s++; + const char *value = (const char *)read_valist(valist_ptr, valist_offset, 8); + handle = __ockl_printf_append_string_n(handle, value, strlen_plus_one(value), 0); + continue; + } + + // Keep scanning until we figure out the length of this specifier or if we reach the end of the string + while (*s != 0) { + // "The width is not specified in the format string, but as an additional integer value argument preceding the argument that has to be formatted." + if (*s == '*') { + s++; + uint64_t value = read_valist(valist_ptr, valist_offset, 4); + handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0); + continue; + } + uint8_t len = 0; if (parse_printf_specifier(s, len)) { @@ -1130,16 +1476,22 @@ extern "C" if (specifier_with_length) { s += specifier_with_length; - } - if (len > 0) - { - uint64_t value = read_valist(valist_ptr, valist_offset, len); - handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0); + } else { + // Assume the unknown character is a sub-specifier and move on + s++; + continue; } } + + if (len > 0) + { + uint64_t value = read_valist(valist_ptr, valist_offset, len); + handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0); + } + break; } } - return (uint32_t)__ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1); + __ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1); return 1; } diff --git a/ptx/src/ast.rs b/ptx/src/ast.rs index d3b9403..5568773 100644 --- a/ptx/src/ast.rs +++ b/ptx/src/ast.rs @@ -380,6 +380,7 @@ pub enum Instruction { }, MadCC { type_: ScalarType, + is_hi: bool, arg: Arg4

<P>, }, Fma(ArithFloat, Arg4<P>), @@ -476,6 +477,7 @@ pub enum Instruction { Red(AtomDetails, Arg2St<P>), Nanosleep(Arg1<P>), Isspacep(StateSpace, Arg2<P>), + Sad(ScalarType, Arg4<P>

), } #[derive(Copy, Clone)] diff --git a/ptx/src/emit.rs b/ptx/src/emit.rs index 346cc64..1bb10b1 100644 --- a/ptx/src/emit.rs +++ b/ptx/src/emit.rs @@ -7,12 +7,13 @@ use std::ffi::CStr; use std::fmt::Display; use std::io::Write; use std::ptr::null_mut; -use std::{convert, iter, mem, ptr}; +use std::{iter, mem, ptr}; use zluda_llvm::core::*; use zluda_llvm::prelude::*; use zluda_llvm::zluda::*; use zluda_llvm::*; +use crate::ast::SetpData; use crate::translate::{ self, Arg4CarryOut, ConstType, ConversionKind, DenormSummary, ExpandedArgParams, FPDenormMode, MadCCDetails, MadCDetails, TranslationModule, TypeKind, TypeParts, @@ -156,7 +157,7 @@ impl NamedIdGenerator { if let Some(id) = id { self.register_result(id, func) } else { - func(b"\0".as_ptr() as _) + func(LLVM_UNNAMED) } } @@ -497,10 +498,12 @@ fn emit_function_variable( ) -> Result<(), TranslateError> { let builder = ctx.builder.get(); let llvm_type = get_llvm_type(ctx, &variable.type_)?; - let addr_space = get_llvm_address_space(&ctx.constants, variable.state_space)?; - let value = ctx.names.register_result(variable.name, |name| unsafe { - LLVMZludaBuildAlloca(builder, llvm_type, addr_space, name) - }); + let value = emit_alloca( + ctx, + llvm_type, + get_llvm_address_space(&ctx.constants, variable.state_space)?, + Some(variable.name), + ); match variable.initializer { None => {} Some(init) => { @@ -523,12 +526,27 @@ fn emit_method<'a, 'input>( let llvm_method = emit_method_declaration(ctx, &method)?; emit_linkage_for_method(&method, is_kernel, llvm_method); emit_tuning(ctx, llvm_method, &method.tuning); - for statement in method.body.iter().flat_map(convert::identity) { + let statements = match method.body { + Some(statements) => statements, + None => return Ok(()), + }; + // Initial BB that holds all the variable declarations + let bb_with_variables = + unsafe { LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) }; + // Rest of the code + let starting_bb = + unsafe { 
LLVMAppendBasicBlockInContext(ctx.context.get(), llvm_method, LLVM_UNNAMED) }; + unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), starting_bb) }; + for statement in statements.iter() { register_basic_blocks(ctx, llvm_method, statement); } - for statement in method.body.into_iter().flatten() { + for statement in statements.into_iter() { emit_statement(ctx, is_kernel, statement)?; } + // happens if there is a post-ret trailing label + terminate_current_block_if_needed(ctx, None); + unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), bb_with_variables) }; + unsafe { LLVMBuildBr(ctx.builder.get(), starting_bb) }; Ok(()) } @@ -607,7 +625,6 @@ fn emit_statement( is_kernel: bool, statement: crate::translate::ExpandedStatement, ) -> Result<(), TranslateError> { - start_synthetic_basic_block_if_needed(ctx, &statement); Ok(match statement { crate::translate::Statement::Label(label) => emit_label(ctx, label)?, crate::translate::Statement::Variable(var) => emit_function_variable(ctx, var)?, @@ -625,8 +642,8 @@ fn emit_statement( crate::translate::Statement::MadC(MadCDetails { type_, is_hi, arg }) => { emit_inst_madc(ctx, type_, is_hi, &arg)? } - crate::translate::Statement::MadCC(MadCCDetails { type_, arg }) => { - emit_inst_madcc(ctx, type_, &arg)? + crate::translate::Statement::MadCC(MadCCDetails { type_, is_hi, arg }) => { + emit_inst_madcc(ctx, type_, is_hi, &arg)? } crate::translate::Statement::AddC(type_, arg) => emit_inst_add_c(ctx, type_, &arg)?, crate::translate::Statement::AddCC(type_, arg) => { @@ -752,27 +769,6 @@ fn emit_ret_value( Ok(()) } -fn start_synthetic_basic_block_if_needed( - ctx: &mut EmitContext, - statement: &crate::translate::ExpandedStatement, -) { - let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) }; - if current_block == ptr::null_mut() { - return; - } - let terminator = unsafe { LLVMGetBasicBlockTerminator(current_block) }; - if terminator == ptr::null_mut() { - return; - } - if let crate::translate::Statement::Label(..) 
= statement { - return; - } - let new_block = - unsafe { LLVMCreateBasicBlockInContext(ctx.context.get(), b"\0".as_ptr() as _) }; - unsafe { LLVMInsertExistingBasicBlockAfterInsertBlock(ctx.builder.get(), new_block) }; - unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) }; -} - fn emit_ptr_access( ctx: &mut EmitContext, ptr_access: &crate::translate::PtrAccess, @@ -1076,7 +1072,7 @@ fn emit_value_copy( ) -> Result<(), TranslateError> { let builder = ctx.builder.get(); let type_ = get_llvm_type(ctx, type_)?; - let temp_value = unsafe { LLVMBuildAlloca(builder, type_, LLVM_UNNAMED) }; + let temp_value = emit_alloca(ctx, type_, ctx.constants.private_space, None); unsafe { LLVMBuildStore(builder, src, temp_value) }; ctx.names.register_result(dst, |dst| unsafe { LLVMBuildLoad2(builder, type_, temp_value, dst) @@ -1084,6 +1080,28 @@ fn emit_value_copy( Ok(()) } +// From "Performance Tips for Frontend Authors" (https://llvm.org/docs/Frontend/PerformanceTips.html): +// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca +// instructions that are in the entry basic block. Given SSA is the canonical form expected by much +// of the optimizer; if allocas can not be eliminated by Mem2Reg or SROA, the optimizer is likely to +// be less effective than it could be." 
+fn emit_alloca( + ctx: &mut EmitContext, + type_: LLVMTypeRef, + addr_space: u32, + name: Option, +) -> LLVMValueRef { + let builder = ctx.builder.get(); + let current_bb = unsafe { LLVMGetInsertBlock(builder) }; + let variables_bb = unsafe { LLVMGetFirstBasicBlock(LLVMGetBasicBlockParent(current_bb)) }; + unsafe { LLVMPositionBuilderAtEnd(builder, variables_bb) }; + let result = ctx.names.register_result_option(name, |name| unsafe { + LLVMZludaBuildAlloca(builder, type_, addr_space, name) + }); + unsafe { LLVMPositionBuilderAtEnd(builder, current_bb) }; + result +} + fn emit_instruction( ctx: &mut EmitContext, is_kernel: bool, @@ -1142,6 +1160,7 @@ fn emit_instruction( ast::Instruction::Set(details, arg) => emit_inst_set(ctx, details, arg)?, ast::Instruction::Red(details, arg) => emit_inst_red(ctx, details, arg)?, ast::Instruction::Isspacep(space, arg) => emit_inst_isspacep(ctx, *space, arg)?, + ast::Instruction::Sad(type_, arg) => emit_inst_sad(ctx, *type_, arg)?, // replaced by function calls or Statement variants ast::Instruction::Activemask { .. } | ast::Instruction::Bar(..) 
@@ -1230,6 +1249,36 @@ fn emit_inst_isspacep_impl( ) } + +fn emit_inst_sad( + ctx: &mut EmitContext, + type_: ast::ScalarType, + arg: &ast::Arg4, +) -> Result<(), TranslateError> { + let builder = ctx.builder.get(); + let less_than = emit_inst_setp_int( + ctx, + &SetpData { + typ: type_, + flush_to_zero: None, + cmp_op: ast::SetpCompareOp::Greater, + }, + None, + arg.src1, + arg.src2, + )?; + let a = ctx.names.value(arg.src1)?; + let b = ctx.names.value(arg.src2)?; + let a_minus_b = unsafe { LLVMBuildSub(builder, a, b, LLVM_UNNAMED) }; + let b_minus_a = unsafe { LLVMBuildSub(builder, b, a, LLVM_UNNAMED) }; + let a_or_b = unsafe { LLVMBuildSelect(builder, less_than, a_minus_b, b_minus_a, LLVM_UNNAMED) }; + let src3 = ctx.names.value(arg.src3)?; + ctx.names.register_result(arg.dst, |dst_name| unsafe { + LLVMBuildAdd(builder, src3, a_or_b, dst_name) + }); + Ok(()) +} + fn emit_inst_red( ctx: &mut EmitContext, details: &ast::AtomDetails, @@ -1359,9 +1408,6 @@ fn emit_int_trap(ctx: &mut EmitContext) -> Result<(), TranslateError> { 0, LLVM_UNNAMED, ); - // llvm.trap is not a terminator, - // LLVM might fail with an unterminated basic block if we don't insert unreachable - LLVMBuildUnreachable(builder); } Ok(()) } @@ -2149,16 +2195,17 @@ fn emit_inst_mad_lo( ) } -// TODO: support mad.hi.cc fn emit_inst_madcc( ctx: &mut EmitContext, type_: ast::ScalarType, + is_hi: bool, arg: &Arg4CarryOut, ) -> Result<(), TranslateError> { - let builder = ctx.builder.get(); - let src1 = ctx.names.value(arg.src1)?; - let src2 = ctx.names.value(arg.src2)?; - let mul_result = unsafe { LLVMBuildMul(builder, src1, src2, LLVM_UNNAMED) }; + let mul_result = if is_hi { + emit_inst_mul_hi_impl(ctx, type_, None, arg.src1, arg.src2)? + } else { + emit_inst_mul_low_impl(ctx, None, arg.src1, arg.src2, LLVMBuildMul)? 
+ }; emit_inst_addsub_cc_impl( ctx, "add", @@ -2246,29 +2293,6 @@ fn emit_inst_madc( mul_result, args.src3, ) - /* - let src3 = ctx.names.value(args.src3)?; - let add_no_carry = unsafe { LLVMBuildAdd(builder, mul_result, src3, LLVM_UNNAMED) }; - let carry_flag = ctx.names.value(args.carry_in)?; - let llvm_type = get_llvm_type(ctx, &ast::Type::Scalar(type_))?; - let carry_flag = unsafe { LLVMBuildZExt(builder, carry_flag, llvm_type, LLVM_UNNAMED) }; - if let Some(carry_out) = args.carry_out { - emit_inst_addsub_cc_impl( - ctx, - "add", - type_, - args.dst, - carry_out, - add_no_carry, - carry_flag, - )?; - } else { - ctx.names.register_result(args.dst, |dst| unsafe { - LLVMBuildAdd(builder, add_no_carry, carry_flag, dst) - }); - } - Ok(()) - */ } fn emit_inst_add_c( @@ -3559,12 +3583,12 @@ fn emit_store_var( fn emit_label(ctx: &mut EmitContext, label: Id) -> Result<(), TranslateError> { let new_block = unsafe { LLVMValueAsBasicBlock(ctx.names.value(label)?) }; - terminate_current_block_if_needed(ctx, new_block); + terminate_current_block_if_needed(ctx, Some(new_block)); unsafe { LLVMPositionBuilderAtEnd(ctx.builder.get(), new_block) }; Ok(()) } -fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasicBlockRef) { +fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: Option) { let current_block = unsafe { LLVMGetInsertBlock(ctx.builder.get()) }; if current_block == ptr::null_mut() { return; @@ -3573,7 +3597,10 @@ fn terminate_current_block_if_needed(ctx: &mut EmitContext, new_block: LLVMBasic if terminator != ptr::null_mut() { return; } - unsafe { LLVMBuildBr(ctx.builder.get(), new_block) }; + match new_block { + Some(new_block) => unsafe { LLVMBuildBr(ctx.builder.get(), new_block) }, + None => unsafe { LLVMBuildUnreachable(ctx.builder.get()) }, + }; } fn emit_method_declaration<'input>( diff --git a/ptx/src/ptx.lalrpop b/ptx/src/ptx.lalrpop index daad23d..612d3bd 100644 --- a/ptx/src/ptx.lalrpop +++ b/ptx/src/ptx.lalrpop @@ 
-227,6 +227,7 @@ match { "rem", "ret", "rsqrt", + "sad", "selp", "set", "setp", @@ -309,6 +310,7 @@ ExtendedID : &'input str = { "rem", "ret", "rsqrt", + "sad", "selp", "set", "setp", @@ -846,6 +848,7 @@ Instruction: ast::Instruction> = { InstRed, InstNanosleep, InstIsspacep, + InstSad }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld @@ -1523,7 +1526,12 @@ InstMad: ast::Instruction> = { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-mad-cc InstMadCC: ast::Instruction> = { - "mad" ".lo" ".cc" => ast::Instruction::MadCC{<>}, + "mad" ".lo" ".cc" => { + ast::Instruction::MadCC { type_, arg, is_hi: false } + }, + "mad" ".hi" ".cc" => { + ast::Instruction::MadCC { type_, arg, is_hi: true } + }, }; // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-arithmetic-instructions-madc @@ -2435,6 +2443,15 @@ InstIsspacep: ast::Instruction> = { } } + +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-sad +InstSad: ast::Instruction> = { + "sad" => { + ast::Instruction::Sad(type_, a) + } +} + + NegTypeFtz: ast::ScalarType = { ".f16" => ast::ScalarType::F16, ".f16x2" => ast::ScalarType::F16x2, diff --git a/ptx/src/test/spirv_run/abs.ll b/ptx/src/test/spirv_run/abs.ll index c698e66..4300790 100644 --- a/ptx/src/test/spirv_run/abs.ll +++ b/ptx/src/test/spirv_run/abs.ll @@ -1,44 +1,44 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { -"38": +define protected amdgpu_kernel void @abs(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { %"8" = alloca 
i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"13" to ptr - %"30" = load i32, ptr %"31", align 4 - store i32 %"30", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"32" = inttoptr i64 %"15" to ptr - %"40" = getelementptr inbounds i8, ptr %"32", i64 4 - %"33" = load i32, ptr %"40", align 4 - store i32 %"33", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"16" = call i32 @llvm.abs.i32(i32 %"17", i1 false) - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i32, ptr addrspace(5) %"7", align 4 - %"18" = call i32 @llvm.abs.i32(i32 %"19", i1 false) - store i32 %"18", ptr addrspace(5) %"7", align 4 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load i32, ptr addrspace(5) %"6", align 4 - %"34" = inttoptr i64 %"20" to ptr - store i32 %"21", ptr %"34", align 4 - %"22" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = load i32, ptr addrspace(5) %"7", align 4 - %"36" = inttoptr i64 %"22" to ptr - %"42" = getelementptr inbounds i8, ptr %"36", i64 4 - store i32 %"23", ptr %"42", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"12" to ptr + %"29" = load i32, ptr 
%"30", align 4 + store i32 %"29", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"38" = getelementptr inbounds i8, ptr %"31", i64 4 + %"32" = load i32, ptr %"38", align 4 + store i32 %"32", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"15" = call i32 @llvm.abs.i32(i32 %"16", i1 false) + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"17" = call i32 @llvm.abs.i32(i32 %"18", i1 false) + store i32 %"17", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"33" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"33", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"35" = inttoptr i64 %"21" to ptr + %"40" = getelementptr inbounds i8, ptr %"35", i64 4 + store i32 %"22", ptr %"40", align 4 ret void } diff --git a/ptx/src/test/spirv_run/activemask.ll b/ptx/src/test/spirv_run/activemask.ll index 4e53429..684f89a 100644 --- a/ptx/src/test/spirv_run/activemask.ll +++ b/ptx/src/test/spirv_run/activemask.ll @@ -3,22 +3,22 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__activemask() #0 -define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"12", ptr addrspace(4) byref(i64) %"13") #1 { -"16": +define protected amdgpu_kernel void @activemask(ptr addrspace(4) byref(i64) %"11", ptr addrspace(4) byref(i64) %"12") #1 { %"6" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"6", align 1 - %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i32, align 4, addrspace(5) - %"8" = load i64, ptr addrspace(4) %"13", align 8 - store i64 %"8", ptr addrspace(5) %"4", align 8 - %"9" = call i32 @__zluda_ptx_impl__activemask() - 
store i32 %"9", ptr addrspace(5) %"5", align 4 - %"10" = load i64, ptr addrspace(5) %"4", align 8 - %"11" = load i32, ptr addrspace(5) %"5", align 4 - %"14" = inttoptr i64 %"10" to ptr - store i32 %"11", ptr %"14", align 4 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"6", align 1 + %"7" = load i64, ptr addrspace(4) %"12", align 8 + store i64 %"7", ptr addrspace(5) %"4", align 8 + %"8" = call i32 @__zluda_ptx_impl__activemask() + store i32 %"8", ptr addrspace(5) %"5", align 4 + %"9" = load i64, ptr addrspace(5) %"4", align 8 + %"10" = load i32, ptr addrspace(5) %"5", align 4 + %"13" = inttoptr i64 %"9" to ptr + store i32 %"10", ptr %"13", align 4 ret void } diff --git a/ptx/src/test/spirv_run/add.ll b/ptx/src/test/spirv_run/add.ll index 3b11a73..babe5bb 100644 --- a/ptx/src/test/spirv_run/add.ll +++ b/ptx/src/test/spirv_run/add.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"23": +define protected amdgpu_kernel void @add(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = 
load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = add i64 %"15", 1 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = add i64 %"14", 1 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/add_global.ll b/ptx/src/test/spirv_run/add_global.ll index 14ae1f9..7034857 100644 --- a/ptx/src/test/spirv_run/add_global.ll +++ b/ptx/src/test/spirv_run/add_global.ll @@ -3,34 +3,34 @@ target triple = "amdgcn-amd-amdhsa" @PI = protected addrspace(1) externally_initialized global float 0x400921FB60000000, align 4 -define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"21", ptr addrspace(4) byref(i64) %"22") #0 { -"25": +define protected amdgpu_kernel void @add_global(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca 
float, align 4, addrspace(5) %"8" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"20", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 %"11" = load i64, ptr addrspace(4) %"21", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(4) %"22", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = inttoptr i64 %"14" to ptr - %"13" = load float, ptr %"23", align 4 - store float %"13", ptr addrspace(5) %"7", align 4 - %"15" = load float, ptr addrspace(1) @PI, align 4 - store float %"15", ptr addrspace(5) %"8", align 4 - %"17" = load float, ptr addrspace(5) %"7", align 4 - %"18" = load float, ptr addrspace(5) %"8", align 4 - %"16" = fadd float %"17", %"18" - store float %"16", ptr addrspace(5) %"7", align 4 - %"19" = load i64, ptr addrspace(5) %"6", align 8 - %"20" = load float, ptr addrspace(5) %"7", align 4 - %"24" = inttoptr i64 %"19" to ptr - store float %"20", ptr %"24", align 4 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"22", align 4 + store float %"12", ptr addrspace(5) %"7", align 4 + %"14" = load float, ptr addrspace(1) @PI, align 4 + store float %"14", ptr addrspace(5) %"8", align 4 + %"16" = load float, ptr addrspace(5) %"7", align 4 + %"17" = load float, ptr addrspace(5) %"8", align 4 + %"15" = fadd float %"16", %"17" + store float %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"6", align 8 + %"19" = load float, ptr addrspace(5) %"7", align 4 + %"23" = inttoptr i64 %"18" to ptr + store float %"19", ptr %"23", align 4 ret void } diff --git a/ptx/src/test/spirv_run/add_non_coherent.ll b/ptx/src/test/spirv_run/add_non_coherent.ll index 7cf364c..4d97dad 100644 --- 
a/ptx/src/test/spirv_run/add_non_coherent.ll +++ b/ptx/src/test/spirv_run/add_non_coherent.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"23": +define protected amdgpu_kernel void @add_non_coherent(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr addrspace(1) - %"12" = load i64, ptr addrspace(1) %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = add i64 %"15", 1 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr addrspace(1) - store i64 %"17", ptr addrspace(1) %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 
%"12" to ptr addrspace(1) + %"11" = load i64, ptr addrspace(1) %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = add i64 %"14", 1 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr addrspace(1) + store i64 %"16", ptr addrspace(1) %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/add_param_ptr.ll b/ptx/src/test/spirv_run/add_param_ptr.ll index 9d90b23..9553fa5 100644 --- a/ptx/src/test/spirv_run/add_param_ptr.ll +++ b/ptx/src/test/spirv_run/add_param_ptr.ll @@ -1,47 +1,47 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { -"39": +define protected amdgpu_kernel void @add_param_ptr(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) - %"32" = ptrtoint ptr addrspace(4) %"27" to i64 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"32", ptr addrspace(5) %0, align 8 - %"31" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"31", ptr addrspace(5) %"4", align 8 - %"34" = ptrtoint ptr addrspace(4) %"28" to i64 %1 = alloca i64, align 8, addrspace(5) - store i64 %"34", ptr addrspace(5) %1, align 8 - %"33" = load i64, ptr addrspace(5) %1, align 8 - store i64 %"33", ptr 
addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"35" = inttoptr i64 %"13" to ptr addrspace(4) + %2 = alloca i64, align 8, addrspace(5) + br label %3 + +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"31" = ptrtoint ptr addrspace(4) %"26" to i64 + store i64 %"31", ptr addrspace(5) %1, align 8 + %"30" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"30", ptr addrspace(5) %"4", align 8 + %"33" = ptrtoint ptr addrspace(4) %"27" to i64 + store i64 %"33", ptr addrspace(5) %2, align 8 + %"32" = load i64, ptr addrspace(5) %2, align 8 + store i64 %"32", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"34" = inttoptr i64 %"12" to ptr addrspace(4) + %"39" = getelementptr inbounds i8, ptr addrspace(4) %"34", i64 0 + %"11" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"35" = inttoptr i64 %"14" to ptr addrspace(4) %"41" = getelementptr inbounds i8, ptr addrspace(4) %"35", i64 0 - %"12" = load i64, ptr addrspace(4) %"41", align 8 - store i64 %"12", ptr addrspace(5) %"4", align 8 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"36" = inttoptr i64 %"15" to ptr addrspace(4) - %"43" = getelementptr inbounds i8, ptr addrspace(4) %"36", i64 0 - %"14" = load i64, ptr addrspace(4) %"43", align 8 - store i64 %"14", ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"37" = inttoptr i64 %"17" to ptr - %"16" = load i64, ptr %"37", align 8 - store i64 %"16", ptr addrspace(5) %"6", align 8 - %"19" = load i64, ptr addrspace(5) %"6", align 8 - %"18" = add i64 %"19", 1 - store i64 %"18", ptr addrspace(5) %"7", align 8 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load i64, ptr addrspace(5) %"7", align 8 - %"38" = inttoptr i64 %"20" to ptr - store i64 %"21", ptr %"38", align 8 + %"13" = load i64, ptr addrspace(4) %"41", align 8 + store i64 %"13", 
ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"16" to ptr + %"15" = load i64, ptr %"36", align 8 + store i64 %"15", ptr addrspace(5) %"6", align 8 + %"18" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = add i64 %"18", 1 + store i64 %"17", ptr addrspace(5) %"7", align 8 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"7", align 8 + %"37" = inttoptr i64 %"19" to ptr + store i64 %"20", ptr %"37", align 8 ret void } diff --git a/ptx/src/test/spirv_run/add_tuning.ll b/ptx/src/test/spirv_run/add_tuning.ll index 1f36397..ac2972c 100644 --- a/ptx/src/test/spirv_run/add_tuning.ll +++ b/ptx/src/test/spirv_run/add_tuning.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"23": +define protected amdgpu_kernel void @add_tuning(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr 
addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = add i64 %"15", 1 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = add i64 %"14", 1 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/addc_cc.ll b/ptx/src/test/spirv_run/addc_cc.ll index 9015a80..d781744 100644 --- a/ptx/src/test/spirv_run/addc_cc.ll +++ b/ptx/src/test/spirv_run/addc_cc.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { -"69": +define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"53", ptr addrspace(4) byref(i64) %"54") #0 { %"13" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"13", align 1 - %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) 
@@ -16,70 +12,74 @@ define protected amdgpu_kernel void @addc_cc(ptr addrspace(4) byref(i64) %"54", %"10" = alloca i32, align 4, addrspace(5) %"11" = alloca i32, align 4, addrspace(5) %"12" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"13", align 1 + %"14" = load i64, ptr addrspace(4) %"53", align 8 + store i64 %"14", ptr addrspace(5) %"4", align 8 %"15" = load i64, ptr addrspace(4) %"54", align 8 - store i64 %"15", ptr addrspace(5) %"4", align 8 - %"16" = load i64, ptr addrspace(4) %"55", align 8 - store i64 %"16", ptr addrspace(5) %"5", align 8 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"57" = inttoptr i64 %"18" to ptr - %"56" = load i32, ptr %"57", align 4 - store i32 %"56", ptr addrspace(5) %"9", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"58" = inttoptr i64 %"20" to ptr - %"71" = getelementptr inbounds i8, ptr %"58", i64 4 - %"59" = load i32, ptr %"71", align 4 - store i32 %"59", ptr addrspace(5) %"10", align 4 - %"22" = load i64, ptr addrspace(5) %"4", align 8 - %"60" = inttoptr i64 %"22" to ptr - %"73" = getelementptr inbounds i8, ptr %"60", i64 8 - %"21" = load i32, ptr %"73", align 4 - store i32 %"21", ptr addrspace(5) %"11", align 4 - %"24" = load i64, ptr addrspace(5) %"4", align 8 - %"61" = inttoptr i64 %"24" to ptr - %"75" = getelementptr inbounds i8, ptr %"61", i64 12 - %"23" = load i32, ptr %"75", align 4 - store i32 %"23", ptr addrspace(5) %"12", align 4 - %"27" = load i32, ptr addrspace(5) %"9", align 4 - %"28" = load i32, ptr addrspace(5) %"10", align 4 - %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"27", i32 %"28") - %"25" = extractvalue { i32, i1 } %0, 0 - %"26" = extractvalue { i32, i1 } %0, 1 - store i32 %"25", ptr addrspace(5) %"6", align 4 - store i1 %"26", ptr addrspace(5) %"13", align 1 - %"31" = load i1, ptr addrspace(5) %"13", align 1 - %"32" = load i32, ptr addrspace(5) %"6", align 4 - %"33" = load i32, ptr addrspace(5) %"11", align 4 - 
%1 = zext i1 %"31" to i32 - %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"32", i32 %"33") - %3 = extractvalue { i32, i1 } %2, 0 - %4 = extractvalue { i32, i1 } %2, 1 - %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1) - %"29" = extractvalue { i32, i1 } %5, 0 - %6 = extractvalue { i32, i1 } %5, 1 - %"30" = xor i1 %4, %6 - store i32 %"29", ptr addrspace(5) %"7", align 4 - store i1 %"30", ptr addrspace(5) %"13", align 1 - %"35" = load i1, ptr addrspace(5) %"13", align 1 - %"36" = load i32, ptr addrspace(5) %"7", align 4 - %"37" = load i32, ptr addrspace(5) %"12", align 4 - %7 = zext i1 %"35" to i32 - %8 = add i32 %"36", %"37" - %"34" = add i32 %8, %7 - store i32 %"34", ptr addrspace(5) %"8", align 4 - %"38" = load i64, ptr addrspace(5) %"5", align 8 - %"39" = load i32, ptr addrspace(5) %"6", align 4 - %"66" = inttoptr i64 %"38" to ptr - store i32 %"39", ptr %"66", align 4 - %"40" = load i64, ptr addrspace(5) %"5", align 8 - %"41" = load i32, ptr addrspace(5) %"7", align 4 - %"67" = inttoptr i64 %"40" to ptr - %"77" = getelementptr inbounds i8, ptr %"67", i64 4 - store i32 %"41", ptr %"77", align 4 - %"42" = load i64, ptr addrspace(5) %"5", align 8 - %"43" = load i32, ptr addrspace(5) %"8", align 4 - %"68" = inttoptr i64 %"42" to ptr - %"79" = getelementptr inbounds i8, ptr %"68", i64 8 - store i32 %"43", ptr %"79", align 4 + store i64 %"15", ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"56" = inttoptr i64 %"17" to ptr + %"55" = load i32, ptr %"56", align 4 + store i32 %"55", ptr addrspace(5) %"9", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"57" = inttoptr i64 %"19" to ptr + %"69" = getelementptr inbounds i8, ptr %"57", i64 4 + %"58" = load i32, ptr %"69", align 4 + store i32 %"58", ptr addrspace(5) %"10", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"59" = inttoptr i64 %"21" to ptr + %"71" = getelementptr inbounds i8, ptr %"59", i64 8 + %"20" = load i32, ptr %"71", 
align 4 + store i32 %"20", ptr addrspace(5) %"11", align 4 + %"23" = load i64, ptr addrspace(5) %"4", align 8 + %"60" = inttoptr i64 %"23" to ptr + %"73" = getelementptr inbounds i8, ptr %"60", i64 12 + %"22" = load i32, ptr %"73", align 4 + store i32 %"22", ptr addrspace(5) %"12", align 4 + %"26" = load i32, ptr addrspace(5) %"9", align 4 + %"27" = load i32, ptr addrspace(5) %"10", align 4 + %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"26", i32 %"27") + %"24" = extractvalue { i32, i1 } %2, 0 + %"25" = extractvalue { i32, i1 } %2, 1 + store i32 %"24", ptr addrspace(5) %"6", align 4 + store i1 %"25", ptr addrspace(5) %"13", align 1 + %"30" = load i1, ptr addrspace(5) %"13", align 1 + %"31" = load i32, ptr addrspace(5) %"6", align 4 + %"32" = load i32, ptr addrspace(5) %"11", align 4 + %3 = zext i1 %"30" to i32 + %4 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %"31", i32 %"32") + %5 = extractvalue { i32, i1 } %4, 0 + %6 = extractvalue { i32, i1 } %4, 1 + %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %5, i32 %3) + %"28" = extractvalue { i32, i1 } %7, 0 + %8 = extractvalue { i32, i1 } %7, 1 + %"29" = xor i1 %6, %8 + store i32 %"28", ptr addrspace(5) %"7", align 4 + store i1 %"29", ptr addrspace(5) %"13", align 1 + %"34" = load i1, ptr addrspace(5) %"13", align 1 + %"35" = load i32, ptr addrspace(5) %"7", align 4 + %"36" = load i32, ptr addrspace(5) %"12", align 4 + %9 = zext i1 %"34" to i32 + %10 = add i32 %"35", %"36" + %"33" = add i32 %10, %9 + store i32 %"33", ptr addrspace(5) %"8", align 4 + %"37" = load i64, ptr addrspace(5) %"5", align 8 + %"38" = load i32, ptr addrspace(5) %"6", align 4 + %"65" = inttoptr i64 %"37" to ptr + store i32 %"38", ptr %"65", align 4 + %"39" = load i64, ptr addrspace(5) %"5", align 8 + %"40" = load i32, ptr addrspace(5) %"7", align 4 + %"66" = inttoptr i64 %"39" to ptr + %"75" = getelementptr inbounds i8, ptr %"66", i64 4 + store i32 %"40", ptr %"75", align 4 + %"41" = load i64, ptr addrspace(5) %"5", align 8 
+ %"42" = load i32, ptr addrspace(5) %"8", align 4 + %"67" = inttoptr i64 %"41" to ptr + %"77" = getelementptr inbounds i8, ptr %"67", i64 8 + store i32 %"42", ptr %"77", align 4 ret void } diff --git a/ptx/src/test/spirv_run/addc_cc2.ll b/ptx/src/test/spirv_run/addc_cc2.ll index 982be96..cd06ea2 100644 --- a/ptx/src/test/spirv_run/addc_cc2.ll +++ b/ptx/src/test/spirv_run/addc_cc2.ll @@ -1,63 +1,63 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @addc_cc2(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { -"51": +define protected amdgpu_kernel void @addc_cc2(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) - %"11" = load i64, ptr addrspace(4) %"41", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1) - %"42" = extractvalue { i32, i1 } %0, 0 - %"13" = extractvalue { i32, i1 } %0, 1 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1) + %"41" = extractvalue { i32, i1 } %2, 0 + %"12" = extractvalue { i32, i1 } %2, 1 + store i32 %"41", ptr addrspace(5) %"6", align 4 + store i1 %"12", ptr addrspace(5) %"9", align 1 + %"15" = load i1, ptr 
addrspace(5) %"9", align 1 + %3 = zext i1 %"15" to i32 + %4 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -4, i32 -4) + %5 = extractvalue { i32, i1 } %4, 0 + %6 = extractvalue { i32, i1 } %4, 1 + %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %5, i32 %3) + %"42" = extractvalue { i32, i1 } %7, 0 + %8 = extractvalue { i32, i1 } %7, 1 + %"14" = xor i1 %6, %8 store i32 %"42", ptr addrspace(5) %"6", align 4 - store i1 %"13", ptr addrspace(5) %"9", align 1 - %"16" = load i1, ptr addrspace(5) %"9", align 1 - %1 = zext i1 %"16" to i32 - %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -4, i32 -4) - %3 = extractvalue { i32, i1 } %2, 0 - %4 = extractvalue { i32, i1 } %2, 1 - %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1) - %"43" = extractvalue { i32, i1 } %5, 0 - %6 = extractvalue { i32, i1 } %5, 1 - %"15" = xor i1 %4, %6 - store i32 %"43", ptr addrspace(5) %"6", align 4 - store i1 %"15", ptr addrspace(5) %"9", align 1 - %"18" = load i1, ptr addrspace(5) %"9", align 1 - %7 = zext i1 %"18" to i32 - %"44" = add i32 0, %7 - store i32 %"44", ptr addrspace(5) %"7", align 4 - %"21" = load i1, ptr addrspace(5) %"9", align 1 - %8 = zext i1 %"21" to i32 - %9 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1) - %10 = extractvalue { i32, i1 } %9, 0 - %11 = extractvalue { i32, i1 } %9, 1 - %12 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %10, i32 %8) - %"45" = extractvalue { i32, i1 } %12, 0 - %13 = extractvalue { i32, i1 } %12, 1 - %"20" = xor i1 %11, %13 - store i32 %"45", ptr addrspace(5) %"6", align 4 - store i1 %"20", ptr addrspace(5) %"9", align 1 - %"23" = load i1, ptr addrspace(5) %"9", align 1 - %14 = zext i1 %"23" to i32 - %"46" = add i32 0, %14 - store i32 %"46", ptr addrspace(5) %"8", align 4 - %"24" = load i64, ptr addrspace(5) %"5", align 8 - %"25" = load i32, ptr addrspace(5) %"7", align 4 - %"47" = inttoptr i64 %"24" to ptr - store i32 %"25", ptr %"47", align 4 - %"26" = load i64, ptr addrspace(5) %"5", align 8 - 
%"27" = load i32, ptr addrspace(5) %"8", align 4 - %"49" = inttoptr i64 %"26" to ptr - %"53" = getelementptr inbounds i8, ptr %"49", i64 4 - store i32 %"27", ptr %"53", align 4 + store i1 %"14", ptr addrspace(5) %"9", align 1 + %"17" = load i1, ptr addrspace(5) %"9", align 1 + %9 = zext i1 %"17" to i32 + %"43" = add i32 0, %9 + store i32 %"43", ptr addrspace(5) %"7", align 4 + %"20" = load i1, ptr addrspace(5) %"9", align 1 + %10 = zext i1 %"20" to i32 + %11 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1) + %12 = extractvalue { i32, i1 } %11, 0 + %13 = extractvalue { i32, i1 } %11, 1 + %14 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %12, i32 %10) + %"44" = extractvalue { i32, i1 } %14, 0 + %15 = extractvalue { i32, i1 } %14, 1 + %"19" = xor i1 %13, %15 + store i32 %"44", ptr addrspace(5) %"6", align 4 + store i1 %"19", ptr addrspace(5) %"9", align 1 + %"22" = load i1, ptr addrspace(5) %"9", align 1 + %16 = zext i1 %"22" to i32 + %"45" = add i32 0, %16 + store i32 %"45", ptr addrspace(5) %"8", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"7", align 4 + %"46" = inttoptr i64 %"23" to ptr + store i32 %"24", ptr %"46", align 4 + %"25" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = load i32, ptr addrspace(5) %"8", align 4 + %"48" = inttoptr i64 %"25" to ptr + %"51" = getelementptr inbounds i8, ptr %"48", i64 4 + store i32 %"26", ptr %"51", align 4 ret void } diff --git a/ptx/src/test/spirv_run/alloca_call.ll b/ptx/src/test/spirv_run/alloca_call.ll index 1ae760b..aae7a91 100644 --- a/ptx/src/test/spirv_run/alloca_call.ll +++ b/ptx/src/test/spirv_run/alloca_call.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr 
addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { -"59": +define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 - %"23" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"23", align 1 %"7" = alloca i1, align 1, addrspace(5) %"8" = alloca double, align 8, addrspace(5) %"9" = alloca double, align 8, addrspace(5) @@ -14,47 +10,51 @@ define protected amdgpu_kernel void @_Z13callback_onlyIdEvPvS0_10callback_tx(ptr %"11" = alloca i64, align 8, addrspace(5) %"12" = alloca i64, align 8, addrspace(5) %"13" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) - %"49" = alloca [4 x i32], align 16, addrspace(5) - %"51" = load i64, ptr addrspace(4) %"43", align 8 - store i64 %"51", ptr addrspace(5) %"10", align 8 - %"52" = load i64, ptr addrspace(4) %"44", align 8 - store i64 %"52", ptr addrspace(5) %"11", align 8 - %"53" = load i64, ptr addrspace(4) %"45", align 8 - store i64 %"53", ptr addrspace(5) %"12", align 8 - %"54" = load i64, ptr addrspace(4) %"46", align 8 - store i64 %"54", ptr addrspace(5) %"13", align 8 - %"29" = load i64, ptr addrspace(5) %"12", align 8 - %"30" = load i64, ptr addrspace(5) %"13", align 8 - %"28" = icmp sge i64 %"29", %"30" - store i1 %"28", ptr addrspace(5) %"7", align 1 - %"31" = load i1, ptr addrspace(5) %"7", align 1 - br i1 %"31", label %"6", label %"18" + %"46" = alloca i64, align 8, addrspace(5) + %"48" = alloca [4 x i32], align 16, addrspace(5) + br label %1 -"18": ; preds = %"59" +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"22", align 1 + %"50" = load i64, ptr addrspace(4) %"42", align 8 + store i64 %"50", ptr addrspace(5) %"10", align 8 + 
%"51" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"51", ptr addrspace(5) %"11", align 8 + %"52" = load i64, ptr addrspace(4) %"44", align 8 + store i64 %"52", ptr addrspace(5) %"12", align 8 + %"53" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"53", ptr addrspace(5) %"13", align 8 + %"28" = load i64, ptr addrspace(5) %"12", align 8 + %"29" = load i64, ptr addrspace(5) %"13", align 8 + %"27" = icmp sge i64 %"28", %"29" + store i1 %"27", ptr addrspace(5) %"7", align 1 + %"30" = load i1, ptr addrspace(5) %"7", align 1 + br i1 %"30", label %"6", label %"18" + +"18": ; preds = %1 + %"31" = load i64, ptr addrspace(5) %"11", align 8 + %"59" = getelementptr inbounds i8, ptr addrspace(5) %"46", i64 0 + store i64 %"31", ptr addrspace(5) %"59", align 8 %"32" = load i64, ptr addrspace(5) %"11", align 8 - %"61" = getelementptr inbounds i8, ptr addrspace(5) %"47", i64 0 - store i64 %"32", ptr addrspace(5) %"61", align 8 - %"33" = load i64, ptr addrspace(5) %"11", align 8 - %0 = inttoptr i64 %"33" to ptr - %"21" = call [4 x i32] %0() - store [4 x i32] %"21", ptr addrspace(5) %"49", align 4 - %"63" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0 - %"19" = load <2 x double>, ptr addrspace(5) %"63", align 16 - %"34" = extractelement <2 x double> %"19", i32 0 - %"35" = extractelement <2 x double> %"19", i32 1 - store double %"34", ptr addrspace(5) %"8", align 8 - store double %"35", ptr addrspace(5) %"9", align 8 - %"36" = load double, ptr addrspace(5) %"8", align 8 - %"37" = load double, ptr addrspace(5) %"9", align 8 - %1 = insertelement <2 x double> undef, double %"36", i32 0 - %"20" = insertelement <2 x double> %1, double %"37", i32 1 - %"38" = load i64, ptr addrspace(5) %"10", align 8 - %"58" = inttoptr i64 %"38" to ptr addrspace(1) - store <2 x double> %"20", ptr addrspace(1) %"58", align 16 + %2 = inttoptr i64 %"32" to ptr + %"21" = call [4 x i32] %2() + store [4 x i32] %"21", ptr addrspace(5) %"48", align 4 + %"61" = getelementptr inbounds 
i8, ptr addrspace(5) %"48", i64 0 + %"19" = load <2 x double>, ptr addrspace(5) %"61", align 16 + %"33" = extractelement <2 x double> %"19", i32 0 + %"34" = extractelement <2 x double> %"19", i32 1 + store double %"33", ptr addrspace(5) %"8", align 8 + store double %"34", ptr addrspace(5) %"9", align 8 + %"35" = load double, ptr addrspace(5) %"8", align 8 + %"36" = load double, ptr addrspace(5) %"9", align 8 + %3 = insertelement <2 x double> undef, double %"35", i32 0 + %"20" = insertelement <2 x double> %3, double %"36", i32 1 + %"37" = load i64, ptr addrspace(5) %"10", align 8 + %"57" = inttoptr i64 %"37" to ptr addrspace(1) + store <2 x double> %"20", ptr addrspace(1) %"57", align 16 br label %"6" -"6": ; preds = %"18", %"59" +"6": ; preds = %"18", %1 ret void } diff --git a/ptx/src/test/spirv_run/amdgpu_unnamed.ll b/ptx/src/test/spirv_run/amdgpu_unnamed.ll index b08350b..1a1ce58 100644 --- a/ptx/src/test/spirv_run/amdgpu_unnamed.ll +++ b/ptx/src/test/spirv_run/amdgpu_unnamed.ll @@ -7,12 +7,8 @@ target triple = "amdgcn-amd-amdhsa" declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0 -define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"58", ptr addrspace(4) byref(i64) %"59") #1 { -"74": +define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #1 { %"33" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"33", align 1 - %"34" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"34", align 1 %"14" = alloca i64, align 8, addrspace(5) %"15" = alloca i64, align 8, addrspace(5) %"16" = alloca i64, align 8, addrspace(5) @@ -20,63 +16,67 @@ define protected amdgpu_kernel void @amdgpu_unnamed(ptr addrspace(4) byref(i64) %"18" = alloca i1, align 1, addrspace(5) %"19" = alloca i64, align 8, addrspace(5) %"20" = alloca i32, align 4, addrspace(5) - %"60" = alloca i64, align 8, addrspace(5) - %"61" = alloca i64, align 8, 
addrspace(5) - %"62" = alloca i32, align 4, addrspace(5) - %"63" = alloca i64, align 8, addrspace(5) - %"64" = alloca i64, align 8, addrspace(5) - %"35" = load i64, ptr addrspace(4) %"58", align 8 - store i64 %"35", ptr addrspace(5) %"14", align 8 - %"36" = load i64, ptr addrspace(4) %"59", align 8 - store i64 %"36", ptr addrspace(5) %"15", align 8 - %"38" = load i64, ptr addrspace(5) %"14", align 8 - %"66" = inttoptr i64 %"38" to ptr - %"37" = load i64, ptr %"66", align 8 - store i64 %"37", ptr addrspace(5) %"16", align 8 - %"40" = load i64, ptr addrspace(5) %"16", align 8 - %"39" = icmp uge i64 %"40", 1 - store i1 %"39", ptr addrspace(5) %"18", align 1 - %"41" = load i1, ptr addrspace(5) %"18", align 1 - br i1 %"41", label %"13", label %"27" - -"27": ; preds = %"74" - %0 = alloca i64, align 8, addrspace(5) - store i64 ptrtoint (ptr addrspace(1) @0 to i64), ptr addrspace(5) %0, align 8 - %"67" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"67", ptr addrspace(5) %"19", align 8 - %"43" = load i64, ptr addrspace(5) %"19", align 8 - store i64 %"43", ptr addrspace(5) %"60", align 8 + %"59" = alloca i64, align 8, addrspace(5) %1 = alloca i64, align 8, addrspace(5) - store i64 ptrtoint (ptr addrspace(1) @1 to i64), ptr addrspace(5) %1, align 8 - %"69" = load i64, ptr addrspace(5) %1, align 8 - store i64 %"69", ptr addrspace(5) %"19", align 8 - %"45" = load i64, ptr addrspace(5) %"19", align 8 - store i64 %"45", ptr addrspace(5) %"61", align 8 - store i32 1, ptr addrspace(5) %"62", align 4 + %"60" = alloca i64, align 8, addrspace(5) %2 = alloca i64, align 8, addrspace(5) - store i64 ptrtoint (ptr addrspace(1) @2 to i64), ptr addrspace(5) %2, align 8 - %"71" = load i64, ptr addrspace(5) %2, align 8 - store i64 %"71", ptr addrspace(5) %"19", align 8 - %"47" = load i64, ptr addrspace(5) %"19", align 8 - store i64 %"47", ptr addrspace(5) %"63", align 8 - %"76" = getelementptr inbounds i8, ptr addrspace(5) %"64", i64 0 - store i64 1, ptr addrspace(5) %"76", align 8 - 
%"28" = load i64, ptr addrspace(5) %"60", align 8 - %"29" = load i64, ptr addrspace(5) %"61", align 8 - %"30" = load i32, ptr addrspace(5) %"62", align 4 - %"31" = load i64, ptr addrspace(5) %"63", align 8 - %"32" = load i64, ptr addrspace(5) %"64", align 8 + %"61" = alloca i32, align 4, addrspace(5) + %"62" = alloca i64, align 8, addrspace(5) + %3 = alloca i64, align 8, addrspace(5) + %"63" = alloca i64, align 8, addrspace(5) + br label %4 + +4: ; preds = %0 + store i1 false, ptr addrspace(5) %"33", align 1 + %"34" = load i64, ptr addrspace(4) %"57", align 8 + store i64 %"34", ptr addrspace(5) %"14", align 8 + %"35" = load i64, ptr addrspace(4) %"58", align 8 + store i64 %"35", ptr addrspace(5) %"15", align 8 + %"37" = load i64, ptr addrspace(5) %"14", align 8 + %"65" = inttoptr i64 %"37" to ptr + %"36" = load i64, ptr %"65", align 8 + store i64 %"36", ptr addrspace(5) %"16", align 8 + %"39" = load i64, ptr addrspace(5) %"16", align 8 + %"38" = icmp uge i64 %"39", 1 + store i1 %"38", ptr addrspace(5) %"18", align 1 + %"40" = load i1, ptr addrspace(5) %"18", align 1 + br i1 %"40", label %"13", label %"27" + +"27": ; preds = %4 + store i64 ptrtoint (ptr addrspace(1) @0 to i64), ptr addrspace(5) %1, align 8 + %"66" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"66", ptr addrspace(5) %"19", align 8 + %"42" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"42", ptr addrspace(5) %"59", align 8 + store i64 ptrtoint (ptr addrspace(1) @1 to i64), ptr addrspace(5) %2, align 8 + %"68" = load i64, ptr addrspace(5) %2, align 8 + store i64 %"68", ptr addrspace(5) %"19", align 8 + %"44" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"44", ptr addrspace(5) %"60", align 8 + store i32 1, ptr addrspace(5) %"61", align 4 + store i64 ptrtoint (ptr addrspace(1) @2 to i64), ptr addrspace(5) %3, align 8 + %"70" = load i64, ptr addrspace(5) %3, align 8 + store i64 %"70", ptr addrspace(5) %"19", align 8 + %"46" = load i64, ptr addrspace(5) %"19", align 8 + store 
i64 %"46", ptr addrspace(5) %"62", align 8 + %"74" = getelementptr inbounds i8, ptr addrspace(5) %"63", i64 0 + store i64 1, ptr addrspace(5) %"74", align 8 + %"28" = load i64, ptr addrspace(5) %"59", align 8 + %"29" = load i64, ptr addrspace(5) %"60", align 8 + %"30" = load i32, ptr addrspace(5) %"61", align 4 + %"31" = load i64, ptr addrspace(5) %"62", align 8 + %"32" = load i64, ptr addrspace(5) %"63", align 8 call void @__zluda_ptx_impl____assertfail(i64 %"28", i64 %"29", i32 %"30", i64 %"31", i64 %"32") br label %"13" -"13": ; preds = %"27", %"74" - %"49" = load i64, ptr addrspace(5) %"16", align 8 - %"48" = add i64 %"49", 1 - store i64 %"48", ptr addrspace(5) %"17", align 8 - %"50" = load i64, ptr addrspace(5) %"15", align 8 - %"51" = load i64, ptr addrspace(5) %"17", align 8 - %"73" = inttoptr i64 %"50" to ptr - store i64 %"51", ptr %"73", align 8 +"13": ; preds = %"27", %4 + %"48" = load i64, ptr addrspace(5) %"16", align 8 + %"47" = add i64 %"48", 1 + store i64 %"47", ptr addrspace(5) %"17", align 8 + %"49" = load i64, ptr addrspace(5) %"15", align 8 + %"50" = load i64, ptr addrspace(5) %"17", align 8 + %"72" = inttoptr i64 %"49" to ptr + store i64 %"50", ptr %"72", align 8 ret void } diff --git a/ptx/src/test/spirv_run/and.ll b/ptx/src/test/spirv_run/and.ll index 2862bcc..7bb262d 100644 --- a/ptx/src/test/spirv_run/and.ll +++ b/ptx/src/test/spirv_run/and.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"31": +define protected amdgpu_kernel void @and(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 
1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"33" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load i32, ptr %"33", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %"27" = and i32 %"17", %"18" - store i32 %"27", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"30" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"30", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"31" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load i32, ptr %"31", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"26" = and i32 %"16", %"17" + store i32 %"26", ptr 
addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"29" = inttoptr i64 %"18" to ptr + store i32 %"19", ptr %"29", align 4 ret void } diff --git a/ptx/src/test/spirv_run/assertfail.ll b/ptx/src/test/spirv_run/assertfail.ll index 0fb51f7..9334859 100644 --- a/ptx/src/test/spirv_run/assertfail.ll +++ b/ptx/src/test/spirv_run/assertfail.ll @@ -3,62 +3,62 @@ target triple = "amdgcn-amd-amdhsa" declare void @__zluda_ptx_impl____assertfail(i64, i64, i32, i64, i64) #0 -define protected amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"63", ptr addrspace(4) byref(i64) %"64") #1 { -"82": +define protected amdgpu_kernel void @assertfail(ptr addrspace(4) byref(i64) %"62", ptr addrspace(4) byref(i64) %"63") #1 { %"35" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"35", align 1 - %"36" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"36", align 1 %"15" = alloca i64, align 8, addrspace(5) %"16" = alloca i64, align 8, addrspace(5) %"17" = alloca i64, align 8, addrspace(5) %"18" = alloca i64, align 8, addrspace(5) %"19" = alloca i32, align 4, addrspace(5) - %"65" = alloca i64, align 8, addrspace(5) - %"67" = alloca i64, align 8, addrspace(5) - %"69" = alloca i32, align 4, addrspace(5) - %"71" = alloca i64, align 8, addrspace(5) - %"73" = alloca i64, align 8, addrspace(5) + %1 = alloca i32, align 4, addrspace(5) + %"64" = alloca i64, align 8, addrspace(5) + %"66" = alloca i64, align 8, addrspace(5) + %"68" = alloca i32, align 4, addrspace(5) + %"70" = alloca i64, align 8, addrspace(5) + %"72" = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"35", align 1 + %"36" = load i64, ptr addrspace(4) %"62", align 8 + store i64 %"36", ptr addrspace(5) %"15", align 8 %"37" = load i64, ptr addrspace(4) %"63", align 8 - store i64 %"37", ptr addrspace(5) %"15", align 8 - %"38" = load i64, ptr addrspace(4) 
%"64", align 8 - store i64 %"38", ptr addrspace(5) %"16", align 8 - %0 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %0, align 4 - %"75" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"75", ptr addrspace(5) %"19", align 4 + store i64 %"37", ptr addrspace(5) %"16", align 8 + store i32 0, ptr addrspace(5) %1, align 4 + %"74" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"74", ptr addrspace(5) %"19", align 4 + %"39" = load i64, ptr addrspace(5) %"15", align 8 + %"82" = getelementptr inbounds i8, ptr addrspace(5) %"64", i64 0 + store i64 %"39", ptr addrspace(5) %"82", align 8 %"40" = load i64, ptr addrspace(5) %"15", align 8 - %"84" = getelementptr inbounds i8, ptr addrspace(5) %"65", i64 0 + %"84" = getelementptr inbounds i8, ptr addrspace(5) %"66", i64 0 store i64 %"40", ptr addrspace(5) %"84", align 8 - %"41" = load i64, ptr addrspace(5) %"15", align 8 - %"86" = getelementptr inbounds i8, ptr addrspace(5) %"67", i64 0 - store i64 %"41", ptr addrspace(5) %"86", align 8 - %"42" = load i32, ptr addrspace(5) %"19", align 4 - %"88" = getelementptr inbounds i8, ptr addrspace(5) %"69", i64 0 - store i32 %"42", ptr addrspace(5) %"88", align 4 + %"41" = load i32, ptr addrspace(5) %"19", align 4 + %"86" = getelementptr inbounds i8, ptr addrspace(5) %"68", i64 0 + store i32 %"41", ptr addrspace(5) %"86", align 4 + %"42" = load i64, ptr addrspace(5) %"15", align 8 + %"88" = getelementptr inbounds i8, ptr addrspace(5) %"70", i64 0 + store i64 %"42", ptr addrspace(5) %"88", align 8 %"43" = load i64, ptr addrspace(5) %"15", align 8 - %"90" = getelementptr inbounds i8, ptr addrspace(5) %"71", i64 0 + %"90" = getelementptr inbounds i8, ptr addrspace(5) %"72", i64 0 store i64 %"43", ptr addrspace(5) %"90", align 8 - %"44" = load i64, ptr addrspace(5) %"15", align 8 - %"92" = getelementptr inbounds i8, ptr addrspace(5) %"73", i64 0 - store i64 %"44", ptr addrspace(5) %"92", align 8 - %"30" = load i64, ptr addrspace(5) %"65", align 8 - %"31" = load 
i64, ptr addrspace(5) %"67", align 8 - %"32" = load i32, ptr addrspace(5) %"69", align 4 - %"33" = load i64, ptr addrspace(5) %"71", align 8 - %"34" = load i64, ptr addrspace(5) %"73", align 8 + %"30" = load i64, ptr addrspace(5) %"64", align 8 + %"31" = load i64, ptr addrspace(5) %"66", align 8 + %"32" = load i32, ptr addrspace(5) %"68", align 4 + %"33" = load i64, ptr addrspace(5) %"70", align 8 + %"34" = load i64, ptr addrspace(5) %"72", align 8 call void @__zluda_ptx_impl____assertfail(i64 %"30", i64 %"31", i32 %"32", i64 %"33", i64 %"34") - %"46" = load i64, ptr addrspace(5) %"15", align 8 - %"80" = inttoptr i64 %"46" to ptr - %"45" = load i64, ptr %"80", align 8 - store i64 %"45", ptr addrspace(5) %"17", align 8 - %"48" = load i64, ptr addrspace(5) %"17", align 8 - %"47" = add i64 %"48", 1 - store i64 %"47", ptr addrspace(5) %"18", align 8 - %"49" = load i64, ptr addrspace(5) %"16", align 8 - %"50" = load i64, ptr addrspace(5) %"18", align 8 - %"81" = inttoptr i64 %"49" to ptr - store i64 %"50", ptr %"81", align 8 + %"45" = load i64, ptr addrspace(5) %"15", align 8 + %"79" = inttoptr i64 %"45" to ptr + %"44" = load i64, ptr %"79", align 8 + store i64 %"44", ptr addrspace(5) %"17", align 8 + %"47" = load i64, ptr addrspace(5) %"17", align 8 + %"46" = add i64 %"47", 1 + store i64 %"46", ptr addrspace(5) %"18", align 8 + %"48" = load i64, ptr addrspace(5) %"16", align 8 + %"49" = load i64, ptr addrspace(5) %"18", align 8 + %"80" = inttoptr i64 %"48" to ptr + store i64 %"49", ptr %"80", align 8 ret void } diff --git a/ptx/src/test/spirv_run/atom_add.ll b/ptx/src/test/spirv_run/atom_add.ll index 88ccc57..6dd159f 100644 --- a/ptx/src/test/spirv_run/atom_add.ll +++ b/ptx/src/test/spirv_run/atom_add.ll @@ -3,45 +3,45 @@ target triple = "amdgcn-amd-amdhsa" @"4" = private addrspace(3) global [1024 x i8] undef, align 4 -define protected amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { -"38": +define protected 
amdgpu_kernel void @atom_add(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %"31" = inttoptr i64 %"14" to ptr - %"13" = load i32, ptr %"31", align 4 - store i32 %"13", ptr addrspace(5) %"7", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = inttoptr i64 %"16" to ptr - %"40" = getelementptr inbounds i8, ptr %"32", i64 4 - %"15" = load i32, ptr %"40", align 4 - store i32 %"15", ptr addrspace(5) %"8", align 4 - %"17" = load i32, ptr addrspace(5) %"7", align 4 - store i32 %"17", ptr addrspace(3) @"4", align 4 - %"19" = load i32, ptr addrspace(5) %"8", align 4 - %"18" = atomicrmw add ptr addrspace(3) @"4", i32 %"19" syncscope("agent-one-as") monotonic, align 4 - store i32 %"18", ptr addrspace(5) %"7", align 4 - %"20" = load i32, ptr addrspace(3) @"4", align 4 - store i32 %"20", ptr addrspace(5) %"8", align 4 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %"22" = load i32, ptr addrspace(5) %"7", align 4 - %"36" = inttoptr i64 %"21" to ptr - store i32 %"22", ptr %"36", align 4 - %"23" = load i64, ptr addrspace(5) %"6", align 8 - %"24" = load i32, ptr addrspace(5) %"8", align 4 - %"37" = inttoptr i64 %"23" to ptr - %"42" = getelementptr inbounds i8, 
ptr %"37", i64 4 - store i32 %"24", ptr %"42", align 4 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"30", align 4 + store i32 %"12", ptr addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = inttoptr i64 %"15" to ptr + %"38" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load i32, ptr %"38", align 4 + store i32 %"14", ptr addrspace(5) %"8", align 4 + %"16" = load i32, ptr addrspace(5) %"7", align 4 + store i32 %"16", ptr addrspace(3) @"4", align 4 + %"18" = load i32, ptr addrspace(5) %"8", align 4 + %"17" = atomicrmw add ptr addrspace(3) @"4", i32 %"18" syncscope("agent-one-as") monotonic, align 4 + store i32 %"17", ptr addrspace(5) %"7", align 4 + %"19" = load i32, ptr addrspace(3) @"4", align 4 + store i32 %"19", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"35" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"35", align 4 + %"22" = load i64, ptr addrspace(5) %"6", align 8 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"36" = inttoptr i64 %"22" to ptr + %"40" = getelementptr inbounds i8, ptr %"36", i64 4 + store i32 %"23", ptr %"40", align 4 ret void } diff --git a/ptx/src/test/spirv_run/atom_add_f16.ll b/ptx/src/test/spirv_run/atom_add_f16.ll index 10a22a0..a8fa430 100644 --- a/ptx/src/test/spirv_run/atom_add_f16.ll +++ b/ptx/src/test/spirv_run/atom_add_f16.ll @@ -3,46 +3,46 @@ target triple = "amdgcn-amd-amdhsa" @"4" = private addrspace(3) global [1024 x i8] undef, align 4 -define protected amdgpu_kernel void @atom_add_f16(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { -"38": +define protected amdgpu_kernel void @atom_add_f16(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) 
%"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca half, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 %"10" = load i64, ptr addrspace(4) %"27", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"11" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"29" = inttoptr i64 %"13" to ptr - %"40" = getelementptr inbounds i8, ptr %"29", i64 2 - %"30" = load i16, ptr %"40", align 2 - %"12" = bitcast i16 %"30" to half - store half %"12", ptr addrspace(5) %"7", align 2 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load half, ptr addrspace(5) %"7", align 2 - %"31" = inttoptr i64 %"15" to ptr - %"14" = atomicrmw fadd ptr %"31", half %"16" syncscope("agent-one-as") monotonic, align 2 - store half %"14", ptr addrspace(5) %"7", align 2 - %"17" = load i64, ptr addrspace(5) %"6", align 8 - %"18" = load half, ptr addrspace(5) %"7", align 2 - %"32" = inttoptr i64 %"17" to ptr - %"33" = bitcast half %"18" to i16 - store i16 %"33", ptr %"32", align 2 - %"20" = load i64, ptr addrspace(5) %"5", align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = inttoptr i64 %"12" to ptr + %"38" = getelementptr inbounds i8, ptr %"28", i64 2 + %"29" = load i16, ptr %"38", align 2 + %"11" = bitcast i16 %"29" to half + store half %"11", ptr addrspace(5) %"7", align 2 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load half, ptr addrspace(5) %"7", align 2 + %"30" = inttoptr i64 %"14" to ptr + %"13" = atomicrmw fadd ptr %"30", half %"15" syncscope("agent-one-as") monotonic, align 2 + store half %"13", ptr 
addrspace(5) %"7", align 2 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load half, ptr addrspace(5) %"7", align 2 + %"31" = inttoptr i64 %"16" to ptr + %"32" = bitcast half %"17" to i16 + store i16 %"32", ptr %"31", align 2 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"34" = inttoptr i64 %"19" to ptr + %"33" = load i16, ptr %"34", align 2 + %"18" = bitcast i16 %"33" to half + store half %"18", ptr addrspace(5) %"7", align 2 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = load half, ptr addrspace(5) %"7", align 2 %"35" = inttoptr i64 %"20" to ptr - %"34" = load i16, ptr %"35", align 2 - %"19" = bitcast i16 %"34" to half - store half %"19", ptr addrspace(5) %"7", align 2 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %"22" = load half, ptr addrspace(5) %"7", align 2 - %"36" = inttoptr i64 %"21" to ptr - %"42" = getelementptr inbounds i8, ptr %"36", i64 2 - %"37" = bitcast half %"22" to i16 - store i16 %"37", ptr %"42", align 2 + %"40" = getelementptr inbounds i8, ptr %"35", i64 2 + %"36" = bitcast half %"21" to i16 + store i16 %"36", ptr %"40", align 2 ret void } diff --git a/ptx/src/test/spirv_run/atom_add_float.ll b/ptx/src/test/spirv_run/atom_add_float.ll index efce26c..d0e3c14 100644 --- a/ptx/src/test/spirv_run/atom_add_float.ll +++ b/ptx/src/test/spirv_run/atom_add_float.ll @@ -3,45 +3,45 @@ target triple = "amdgcn-amd-amdhsa" @"4" = private addrspace(3) global [1024 x i8] undef, align 4 -define protected amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { -"38": +define protected amdgpu_kernel void @atom_add_float(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca 
float, align 4, addrspace(5) %"8" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %"31" = inttoptr i64 %"14" to ptr - %"13" = load float, ptr %"31", align 4 - store float %"13", ptr addrspace(5) %"7", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = inttoptr i64 %"16" to ptr - %"40" = getelementptr inbounds i8, ptr %"32", i64 4 - %"15" = load float, ptr %"40", align 4 - store float %"15", ptr addrspace(5) %"8", align 4 - %"17" = load float, ptr addrspace(5) %"7", align 4 - store float %"17", ptr addrspace(3) @"4", align 4 - %"19" = load float, ptr addrspace(5) %"8", align 4 - %"18" = atomicrmw fadd ptr addrspace(3) @"4", float %"19" syncscope("agent-one-as") monotonic, align 4 - store float %"18", ptr addrspace(5) %"7", align 4 - %"20" = load float, ptr addrspace(3) @"4", align 4 - store float %"20", ptr addrspace(5) %"8", align 4 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %"22" = load float, ptr addrspace(5) %"7", align 4 - %"36" = inttoptr i64 %"21" to ptr - store float %"22", ptr %"36", align 4 - %"23" = load i64, ptr addrspace(5) %"6", align 8 - %"24" = load float, ptr addrspace(5) %"8", align 4 - %"37" = inttoptr i64 %"23" to ptr - %"42" = getelementptr inbounds i8, ptr %"37", i64 4 - store float %"24", ptr %"42", align 4 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"30", align 4 + store float %"12", ptr addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = 
inttoptr i64 %"15" to ptr + %"38" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load float, ptr %"38", align 4 + store float %"14", ptr addrspace(5) %"8", align 4 + %"16" = load float, ptr addrspace(5) %"7", align 4 + store float %"16", ptr addrspace(3) @"4", align 4 + %"18" = load float, ptr addrspace(5) %"8", align 4 + %"17" = atomicrmw fadd ptr addrspace(3) @"4", float %"18" syncscope("agent-one-as") monotonic, align 4 + store float %"17", ptr addrspace(5) %"7", align 4 + %"19" = load float, ptr addrspace(3) @"4", align 4 + store float %"19", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = load float, ptr addrspace(5) %"7", align 4 + %"35" = inttoptr i64 %"20" to ptr + store float %"21", ptr %"35", align 4 + %"22" = load i64, ptr addrspace(5) %"6", align 8 + %"23" = load float, ptr addrspace(5) %"8", align 4 + %"36" = inttoptr i64 %"22" to ptr + %"40" = getelementptr inbounds i8, ptr %"36", i64 4 + store float %"23", ptr %"40", align 4 ret void } diff --git a/ptx/src/test/spirv_run/atom_cas.ll b/ptx/src/test/spirv_run/atom_cas.ll index fb83ed4..a9af2c4 100644 --- a/ptx/src/test/spirv_run/atom_cas.ll +++ b/ptx/src/test/spirv_run/atom_cas.ll @@ -1,45 +1,45 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { -"39": +define protected amdgpu_kernel void @atom_cas(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca 
i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"31", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"32" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"32", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %"33" = inttoptr i64 %"15" to ptr - %"41" = getelementptr inbounds i8, ptr %"33", i64 4 - %0 = cmpxchg ptr %"41", i32 %"16", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 - %"34" = extractvalue { i32, i1 } %0, 0 - store i32 %"34", ptr addrspace(5) %"6", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"31", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"32" = inttoptr i64 %"14" to ptr + %"39" = getelementptr inbounds i8, ptr %"32", i64 4 + %2 = cmpxchg ptr %"39", i32 %"15", i32 100 syncscope("agent-one-as") monotonic monotonic, align 4 + %"33" = extractvalue { i32, i1 } %2, 0 + store i32 %"33", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"35" = inttoptr i64 %"17" to ptr + %"41" = getelementptr inbounds i8, ptr %"35", i64 4 + %"16" = load i32, ptr %"41", align 4 + store i32 %"16", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", 
align 4 %"36" = inttoptr i64 %"18" to ptr - %"43" = getelementptr inbounds i8, ptr %"36", i64 4 - %"17" = load i32, ptr %"43", align 4 - store i32 %"17", ptr addrspace(5) %"7", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"37" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"37", align 4 - %"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load i32, ptr addrspace(5) %"7", align 4 - %"38" = inttoptr i64 %"21" to ptr - %"45" = getelementptr inbounds i8, ptr %"38", i64 4 - store i32 %"22", ptr %"45", align 4 + store i32 %"19", ptr %"36", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"37" = inttoptr i64 %"20" to ptr + %"43" = getelementptr inbounds i8, ptr %"37", i64 4 + store i32 %"21", ptr %"43", align 4 ret void } diff --git a/ptx/src/test/spirv_run/atom_inc.ll b/ptx/src/test/spirv_run/atom_inc.ll index 26b7b70..212c592 100644 --- a/ptx/src/test/spirv_run/atom_inc.ll +++ b/ptx/src/test/spirv_run/atom_inc.ll @@ -5,47 +5,47 @@ declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0 declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1), i32) #0 -define protected amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #1 { -"39": +define protected amdgpu_kernel void @atom_inc(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #1 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) 
%"30", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"31", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"32", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"33" = inttoptr i64 %"14" to ptr - %"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"33", i32 101) - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"34" = inttoptr i64 %"16" to ptr addrspace(1) - %"15" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1) %"34", i32 101) - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"13" to ptr + %"12" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"32", i32 101) + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"15" to ptr addrspace(1) + %"14" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_global_inc(ptr addrspace(1) %"33", i32 101) + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"34" = inttoptr i64 %"17" to ptr + %"16" = load i32, ptr %"34", align 4 + store i32 %"16", ptr addrspace(5) %"8", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 %"35" = inttoptr i64 %"18" to ptr - %"17" = load i32, ptr %"35", align 4 - store i32 %"17", ptr addrspace(5) %"8", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"36" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"36", align 4 - %"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load i32, ptr addrspace(5) %"7", align 4 - 
%"37" = inttoptr i64 %"21" to ptr - %"49" = getelementptr inbounds i8, ptr %"37", i64 4 - store i32 %"22", ptr %"49", align 4 - %"23" = load i64, ptr addrspace(5) %"5", align 8 - %"24" = load i32, ptr addrspace(5) %"8", align 4 - %"38" = inttoptr i64 %"23" to ptr - %"51" = getelementptr inbounds i8, ptr %"38", i64 8 - store i32 %"24", ptr %"51", align 4 + store i32 %"19", ptr %"35", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"36" = inttoptr i64 %"20" to ptr + %"47" = getelementptr inbounds i8, ptr %"36", i64 4 + store i32 %"21", ptr %"47", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"37" = inttoptr i64 %"22" to ptr + %"49" = getelementptr inbounds i8, ptr %"37", i64 8 + store i32 %"23", ptr %"49", align 4 ret void } diff --git a/ptx/src/test/spirv_run/atom_ld_st.ll b/ptx/src/test/spirv_run/atom_ld_st.ll index 31f39c8..eb59d31 100644 --- a/ptx/src/test/spirv_run/atom_ld_st.ll +++ b/ptx/src/test/spirv_run/atom_ld_st.ll @@ -1,27 +1,27 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @atom_ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { -"19": +define protected amdgpu_kernel void @atom_ld_st(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) 
%"14", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"15", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"16", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"16" = inttoptr i64 %"11" to ptr + %"10" = load atomic i32, ptr %"16" syncscope("agent-one-as") acquire, align 4 + store i32 %"10", ptr addrspace(5) %"6", align 4 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"13" = load i32, ptr addrspace(5) %"6", align 4 %"17" = inttoptr i64 %"12" to ptr - %"11" = load atomic i32, ptr %"17" syncscope("agent-one-as") acquire, align 4 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = inttoptr i64 %"13" to ptr - store atomic i32 %"14", ptr %"18" syncscope("agent-one-as") release, align 4 + store atomic i32 %"13", ptr %"17" syncscope("agent-one-as") release, align 4 ret void } diff --git a/ptx/src/test/spirv_run/atom_ld_st_vec.ll b/ptx/src/test/spirv_run/atom_ld_st_vec.ll index 95ff710..5fa2409 100644 --- a/ptx/src/test/spirv_run/atom_ld_st_vec.ll +++ b/ptx/src/test/spirv_run/atom_ld_st_vec.ll @@ -1,36 +1,36 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @atom_ld_st_vec(ptr addrspace(4) byref(i64) %"20", ptr addrspace(4) byref(i64) %"21") #0 { -"24": +define protected amdgpu_kernel void @atom_ld_st_vec(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca 
i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 %"12" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"12", ptr addrspace(5) %"4", align 8 - %"13" = load i64, ptr addrspace(4) %"21", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"22" = inttoptr i64 %"14" to ptr - %0 = load atomic i128, ptr %"22" syncscope("agent-one-as") acquire, align 16 - %"8" = bitcast i128 %0 to <2 x i64> - %"15" = extractelement <2 x i64> %"8", i32 0 - %"16" = extractelement <2 x i64> %"8", i32 1 - store i64 %"15", ptr addrspace(5) %"6", align 8 - store i64 %"16", ptr addrspace(5) %"7", align 8 - %"17" = load i64, ptr addrspace(5) %"6", align 8 - %"18" = load i64, ptr addrspace(5) %"7", align 8 - %1 = insertelement <2 x i64> undef, i64 %"17", i32 0 - %"9" = insertelement <2 x i64> %1, i64 %"18", i32 1 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = inttoptr i64 %"19" to ptr - %2 = bitcast <2 x i64> %"9" to i128 - store atomic i128 %2, ptr %"23" syncscope("agent-one-as") release, align 16 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"21" = inttoptr i64 %"13" to ptr + %2 = load atomic i128, ptr %"21" syncscope("agent-one-as") acquire, align 16 + %"8" = bitcast i128 %2 to <2 x i64> + %"14" = extractelement <2 x i64> %"8", i32 0 + %"15" = extractelement <2 x i64> %"8", i32 1 + store i64 %"14", ptr addrspace(5) %"6", align 8 + store i64 %"15", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %3 = 
insertelement <2 x i64> undef, i64 %"16", i32 0 + %"9" = insertelement <2 x i64> %3, i64 %"17", i32 1 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = inttoptr i64 %"18" to ptr + %4 = bitcast <2 x i64> %"9" to i128 + store atomic i128 %4, ptr %"22" syncscope("agent-one-as") release, align 16 ret void } diff --git a/ptx/src/test/spirv_run/atom_max_u32.ll b/ptx/src/test/spirv_run/atom_max_u32.ll index 7a89a13..8135e3d 100644 --- a/ptx/src/test/spirv_run/atom_max_u32.ll +++ b/ptx/src/test/spirv_run/atom_max_u32.ll @@ -1,38 +1,38 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @atom_max_u32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"31": +define protected amdgpu_kernel void @atom_max_u32(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + 
store i32 %"11", ptr addrspace(5) %"6", align 4 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = load i32, ptr addrspace(5) %"6", align 4 %"25" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %"15" = load i32, ptr addrspace(5) %"6", align 4 - %"26" = inttoptr i64 %"14" to ptr - store i32 %"15", ptr %"26", align 4 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"27" = inttoptr i64 %"17" to ptr - %"33" = getelementptr inbounds i8, ptr %"27", i64 4 - %"16" = load i32, ptr %"33", align 4 - store i32 %"16", ptr addrspace(5) %"7", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"7", align 4 - %"29" = inttoptr i64 %"19" to ptr - %"28" = atomicrmw umax ptr %"29", i32 %"20" syncscope("agent-one-as") monotonic, align 4 - store i32 %"28", ptr addrspace(5) %"6", align 4 + store i32 %"14", ptr %"25", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"16" to ptr + %"31" = getelementptr inbounds i8, ptr %"26", i64 4 + %"15" = load i32, ptr %"31", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"28" = inttoptr i64 %"18" to ptr + %"27" = atomicrmw umax ptr %"28", i32 %"19" syncscope("agent-one-as") monotonic, align 4 + store i32 %"27", ptr addrspace(5) %"6", align 4 ret void } diff --git a/ptx/src/test/spirv_run/b64tof64.ll b/ptx/src/test/spirv_run/b64tof64.ll index 2c2b674..4a8d9b3 100644 --- a/ptx/src/test/spirv_run/b64tof64.ll +++ b/ptx/src/test/spirv_run/b64tof64.ll @@ -1,34 +1,34 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel 
void @b64tof64(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @b64tof64(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca double, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) - %"10" = load double, ptr addrspace(4) %"18", align 8 - store double %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load double, ptr addrspace(5) %"4", align 8 - %"21" = bitcast double %"13" to i64 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"21", ptr addrspace(5) %0, align 8 - %"12" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"15" = load i64, ptr addrspace(5) %"5", align 8 + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load double, ptr addrspace(4) %"17", align 8 + store double %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load double, ptr addrspace(5) %"4", align 8 + %"20" = bitcast double %"12" to i64 + store i64 %"20", ptr addrspace(5) %1, align 8 + %"11" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = inttoptr i64 %"14" to ptr + %"13" = load i64, ptr %"21", align 8 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 %"22" = inttoptr i64 %"15" to ptr - %"14" = 
load i64, ptr %"22", align 8 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"6", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"23" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"23", align 8 + store i64 %"16", ptr %"22", align 8 ret void } diff --git a/ptx/src/test/spirv_run/barrier.ll b/ptx/src/test/spirv_run/barrier.ll index c247e32..55d0c93 100644 --- a/ptx/src/test/spirv_run/barrier.ll +++ b/ptx/src/test/spirv_run/barrier.ll @@ -4,11 +4,11 @@ target triple = "amdgcn-amd-amdhsa" declare void @__zluda_ptx_impl__barrier_sync(i32) #0 define protected amdgpu_kernel void @barrier() #1 { -"5": %"2" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 store i1 false, ptr addrspace(5) %"2", align 1 - %"3" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"3", align 1 call void @__zluda_ptx_impl__barrier_sync(i32 0) ret void } diff --git a/ptx/src/test/spirv_run/bfe.ll b/ptx/src/test/spirv_run/bfe.ll index c67513a..6644c20 100644 --- a/ptx/src/test/spirv_run/bfe.ll +++ b/ptx/src/test/spirv_run/bfe.ll @@ -3,44 +3,44 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__bfe_u32(i32, i32, i32) #0 -define protected amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 { -"35": +define protected amdgpu_kernel void @bfe(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 
%"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"14" to ptr - %"13" = load i32, ptr %"31", align 4 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"32" = inttoptr i64 %"16" to ptr - %"42" = getelementptr inbounds i8, ptr %"32", i64 4 - %"15" = load i32, ptr %"42", align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"33" = inttoptr i64 %"18" to ptr - %"44" = getelementptr inbounds i8, ptr %"33", i64 8 - %"17" = load i32, ptr %"44", align 4 - store i32 %"17", ptr addrspace(5) %"8", align 4 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"21" = load i32, ptr addrspace(5) %"7", align 4 - %"22" = load i32, ptr addrspace(5) %"8", align 4 - %"19" = call i32 @__zluda_ptx_impl__bfe_u32(i32 %"20", i32 %"21", i32 %"22") - store i32 %"19", ptr addrspace(5) %"6", align 4 - %"23" = load i64, ptr addrspace(5) %"5", align 8 - %"24" = load i32, ptr addrspace(5) %"6", align 4 - %"34" = inttoptr i64 %"23" to ptr - store i32 %"24", ptr %"34", align 4 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"30", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"15" to ptr + %"40" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load i32, ptr %"40", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"17" to ptr + %"42" = getelementptr inbounds i8, ptr %"32", i64 8 + %"16" = load i32, ptr %"42", align 4 + store i32 %"16", ptr 
addrspace(5) %"8", align 4 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"21" = load i32, ptr addrspace(5) %"8", align 4 + %"18" = call i32 @__zluda_ptx_impl__bfe_u32(i32 %"19", i32 %"20", i32 %"21") + store i32 %"18", ptr addrspace(5) %"6", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %"33" = inttoptr i64 %"22" to ptr + store i32 %"23", ptr %"33", align 4 ret void } diff --git a/ptx/src/test/spirv_run/bfi.ll b/ptx/src/test/spirv_run/bfi.ll index 2fc4191..3c6a377 100644 --- a/ptx/src/test/spirv_run/bfi.ll +++ b/ptx/src/test/spirv_run/bfi.ll @@ -3,51 +3,51 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__bfi_b32(i32, i32, i32, i32) #0 -define protected amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #1 { -"45": +define protected amdgpu_kernel void @bfi(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #1 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"34", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 %"12" = load i64, ptr addrspace(4) %"35", align 8 - store i64 %"12", ptr addrspace(5) %"4", align 8 - %"13" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"37" = inttoptr i64 %"15" to ptr - %"14" = load i32, ptr %"37", align 4 - store i32 %"14", ptr 
addrspace(5) %"6", align 4 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"38" = inttoptr i64 %"17" to ptr - %"53" = getelementptr inbounds i8, ptr %"38", i64 4 - %"16" = load i32, ptr %"53", align 4 - store i32 %"16", ptr addrspace(5) %"7", align 4 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"39" = inttoptr i64 %"19" to ptr - %"55" = getelementptr inbounds i8, ptr %"39", i64 8 - %"18" = load i32, ptr %"55", align 4 - store i32 %"18", ptr addrspace(5) %"8", align 4 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"40" = inttoptr i64 %"21" to ptr - %"57" = getelementptr inbounds i8, ptr %"40", i64 12 - %"20" = load i32, ptr %"57", align 4 - store i32 %"20", ptr addrspace(5) %"9", align 4 - %"23" = load i32, ptr addrspace(5) %"6", align 4 - %"24" = load i32, ptr addrspace(5) %"7", align 4 - %"25" = load i32, ptr addrspace(5) %"8", align 4 - %"26" = load i32, ptr addrspace(5) %"9", align 4 - %"41" = call i32 @__zluda_ptx_impl__bfi_b32(i32 %"23", i32 %"24", i32 %"25", i32 %"26") - store i32 %"41", ptr addrspace(5) %"6", align 4 - %"27" = load i64, ptr addrspace(5) %"5", align 8 - %"28" = load i32, ptr addrspace(5) %"6", align 4 - %"44" = inttoptr i64 %"27" to ptr - store i32 %"28", ptr %"44", align 4 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"36", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"16" to ptr + %"51" = getelementptr inbounds i8, ptr %"37", i64 4 + %"15" = load i32, ptr %"51", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"18" to ptr + %"53" = getelementptr inbounds i8, ptr %"38", i64 8 + %"17" = load i32, ptr %"53", align 4 + store i32 %"17", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 
%"20" to ptr + %"55" = getelementptr inbounds i8, ptr %"39", i64 12 + %"19" = load i32, ptr %"55", align 4 + store i32 %"19", ptr addrspace(5) %"9", align 4 + %"22" = load i32, ptr addrspace(5) %"6", align 4 + %"23" = load i32, ptr addrspace(5) %"7", align 4 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"25" = load i32, ptr addrspace(5) %"9", align 4 + %"40" = call i32 @__zluda_ptx_impl__bfi_b32(i32 %"22", i32 %"23", i32 %"24", i32 %"25") + store i32 %"40", ptr addrspace(5) %"6", align 4 + %"26" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = load i32, ptr addrspace(5) %"6", align 4 + %"43" = inttoptr i64 %"26" to ptr + store i32 %"27", ptr %"43", align 4 ret void } diff --git a/ptx/src/test/spirv_run/bfind.ll b/ptx/src/test/spirv_run/bfind.ll index 4b7dc1b..a427332 100644 --- a/ptx/src/test/spirv_run/bfind.ll +++ b/ptx/src/test/spirv_run/bfind.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { -"53": +define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 - %"13" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"13", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -15,56 +11,60 @@ define protected amdgpu_kernel void @bfind(ptr addrspace(4) byref(i64) %"42", pt %"9" = alloca i32, align 4, addrspace(5) %"10" = alloca i32, align 4, addrspace(5) %"11" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"12", align 1 + %"13" = 
load i64, ptr addrspace(4) %"41", align 8 + store i64 %"13", ptr addrspace(5) %"4", align 8 %"14" = load i64, ptr addrspace(4) %"42", align 8 - store i64 %"14", ptr addrspace(5) %"4", align 8 - %"15" = load i64, ptr addrspace(4) %"43", align 8 - store i64 %"15", ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"44" = inttoptr i64 %"17" to ptr - %"16" = load i32, ptr %"44", align 4 - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"45" = inttoptr i64 %"19" to ptr - %"55" = getelementptr inbounds i8, ptr %"45", i64 4 - %"18" = load i32, ptr %"55", align 4 - store i32 %"18", ptr addrspace(5) %"7", align 4 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"46" = inttoptr i64 %"21" to ptr - %"57" = getelementptr inbounds i8, ptr %"46", i64 8 - %"20" = load i32, ptr %"57", align 4 - store i32 %"20", ptr addrspace(5) %"8", align 4 - %"23" = load i32, ptr addrspace(5) %"6", align 4 - %0 = icmp eq i32 %"23", 0 - %1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true) - %2 = sub i32 31, %1 - %"47" = select i1 %0, i32 -1, i32 %2 - store i32 %"47", ptr addrspace(5) %"9", align 4 - %"25" = load i32, ptr addrspace(5) %"7", align 4 - %3 = icmp eq i32 %"25", 0 - %4 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true) - %5 = sub i32 31, %4 - %"48" = select i1 %3, i32 -1, i32 %5 - store i32 %"48", ptr addrspace(5) %"10", align 4 - %"27" = load i32, ptr addrspace(5) %"8", align 4 - %6 = icmp eq i32 %"27", 0 - %7 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true) - %8 = sub i32 31, %7 - %"49" = select i1 %6, i32 -1, i32 %8 - store i32 %"49", ptr addrspace(5) %"11", align 4 - %"28" = load i64, ptr addrspace(5) %"5", align 8 - %"29" = load i32, ptr addrspace(5) %"9", align 4 - %"50" = inttoptr i64 %"28" to ptr - store i32 %"29", ptr %"50", align 4 - %"30" = load i64, ptr addrspace(5) %"5", align 8 - %"31" = load i32, ptr addrspace(5) %"10", align 4 - %"51" = inttoptr i64 %"30" to ptr - %"59" = getelementptr 
inbounds i8, ptr %"51", i64 4 - store i32 %"31", ptr %"59", align 4 - %"32" = load i64, ptr addrspace(5) %"5", align 8 - %"33" = load i32, ptr addrspace(5) %"11", align 4 - %"52" = inttoptr i64 %"32" to ptr - %"61" = getelementptr inbounds i8, ptr %"52", i64 8 - store i32 %"33", ptr %"61", align 4 + store i64 %"14", ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"43" = inttoptr i64 %"16" to ptr + %"15" = load i32, ptr %"43", align 4 + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"18" to ptr + %"53" = getelementptr inbounds i8, ptr %"44", i64 4 + %"17" = load i32, ptr %"53", align 4 + store i32 %"17", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"45" = inttoptr i64 %"20" to ptr + %"55" = getelementptr inbounds i8, ptr %"45", i64 8 + %"19" = load i32, ptr %"55", align 4 + store i32 %"19", ptr addrspace(5) %"8", align 4 + %"22" = load i32, ptr addrspace(5) %"6", align 4 + %2 = icmp eq i32 %"22", 0 + %3 = call i32 @llvm.ctlz.i32(i32 %"22", i1 true) + %4 = sub i32 31, %3 + %"46" = select i1 %2, i32 -1, i32 %4 + store i32 %"46", ptr addrspace(5) %"9", align 4 + %"24" = load i32, ptr addrspace(5) %"7", align 4 + %5 = icmp eq i32 %"24", 0 + %6 = call i32 @llvm.ctlz.i32(i32 %"24", i1 true) + %7 = sub i32 31, %6 + %"47" = select i1 %5, i32 -1, i32 %7 + store i32 %"47", ptr addrspace(5) %"10", align 4 + %"26" = load i32, ptr addrspace(5) %"8", align 4 + %8 = icmp eq i32 %"26", 0 + %9 = call i32 @llvm.ctlz.i32(i32 %"26", i1 true) + %10 = sub i32 31, %9 + %"48" = select i1 %8, i32 -1, i32 %10 + store i32 %"48", ptr addrspace(5) %"11", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load i32, ptr addrspace(5) %"9", align 4 + %"49" = inttoptr i64 %"27" to ptr + store i32 %"28", ptr %"49", align 4 + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i32, ptr addrspace(5) %"10", align 4 + %"50" 
= inttoptr i64 %"29" to ptr + %"57" = getelementptr inbounds i8, ptr %"50", i64 4 + store i32 %"30", ptr %"57", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i32, ptr addrspace(5) %"11", align 4 + %"51" = inttoptr i64 %"31" to ptr + %"59" = getelementptr inbounds i8, ptr %"51", i64 8 + store i32 %"32", ptr %"59", align 4 ret void } diff --git a/ptx/src/test/spirv_run/bfind_shiftamt.ll b/ptx/src/test/spirv_run/bfind_shiftamt.ll index 6a3ca72..9968d85 100644 --- a/ptx/src/test/spirv_run/bfind_shiftamt.ll +++ b/ptx/src/test/spirv_run/bfind_shiftamt.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { -"53": +define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 - %"13" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"13", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -15,53 +11,57 @@ define protected amdgpu_kernel void @bfind_shiftamt(ptr addrspace(4) byref(i64) %"9" = alloca i32, align 4, addrspace(5) %"10" = alloca i32, align 4, addrspace(5) %"11" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"12", align 1 + %"13" = load i64, ptr addrspace(4) %"41", align 8 + store i64 %"13", ptr addrspace(5) %"4", align 8 %"14" = load i64, ptr addrspace(4) %"42", align 8 - store i64 %"14", ptr addrspace(5) %"4", align 8 - %"15" = load i64, ptr addrspace(4) %"43", align 8 - store i64 %"15", ptr 
addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"44" = inttoptr i64 %"17" to ptr - %"16" = load i32, ptr %"44", align 4 - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"45" = inttoptr i64 %"19" to ptr - %"55" = getelementptr inbounds i8, ptr %"45", i64 4 - %"18" = load i32, ptr %"55", align 4 - store i32 %"18", ptr addrspace(5) %"7", align 4 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"46" = inttoptr i64 %"21" to ptr - %"57" = getelementptr inbounds i8, ptr %"46", i64 8 - %"20" = load i32, ptr %"57", align 4 - store i32 %"20", ptr addrspace(5) %"8", align 4 - %"23" = load i32, ptr addrspace(5) %"6", align 4 - %0 = icmp eq i32 %"23", 0 - %1 = call i32 @llvm.ctlz.i32(i32 %"23", i1 true) - %"47" = select i1 %0, i32 -1, i32 %1 - store i32 %"47", ptr addrspace(5) %"9", align 4 - %"25" = load i32, ptr addrspace(5) %"7", align 4 - %2 = icmp eq i32 %"25", 0 - %3 = call i32 @llvm.ctlz.i32(i32 %"25", i1 true) - %"48" = select i1 %2, i32 -1, i32 %3 - store i32 %"48", ptr addrspace(5) %"10", align 4 - %"27" = load i32, ptr addrspace(5) %"8", align 4 - %4 = icmp eq i32 %"27", 0 - %5 = call i32 @llvm.ctlz.i32(i32 %"27", i1 true) - %"49" = select i1 %4, i32 -1, i32 %5 - store i32 %"49", ptr addrspace(5) %"11", align 4 - %"28" = load i64, ptr addrspace(5) %"5", align 8 - %"29" = load i32, ptr addrspace(5) %"9", align 4 - %"50" = inttoptr i64 %"28" to ptr - store i32 %"29", ptr %"50", align 4 - %"30" = load i64, ptr addrspace(5) %"5", align 8 - %"31" = load i32, ptr addrspace(5) %"10", align 4 - %"51" = inttoptr i64 %"30" to ptr - %"59" = getelementptr inbounds i8, ptr %"51", i64 4 - store i32 %"31", ptr %"59", align 4 - %"32" = load i64, ptr addrspace(5) %"5", align 8 - %"33" = load i32, ptr addrspace(5) %"11", align 4 - %"52" = inttoptr i64 %"32" to ptr - %"61" = getelementptr inbounds i8, ptr %"52", i64 8 - store i32 %"33", ptr %"61", align 4 + store i64 %"14", ptr addrspace(5) 
%"5", align 8 + %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"43" = inttoptr i64 %"16" to ptr + %"15" = load i32, ptr %"43", align 4 + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"18" to ptr + %"53" = getelementptr inbounds i8, ptr %"44", i64 4 + %"17" = load i32, ptr %"53", align 4 + store i32 %"17", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"45" = inttoptr i64 %"20" to ptr + %"55" = getelementptr inbounds i8, ptr %"45", i64 8 + %"19" = load i32, ptr %"55", align 4 + store i32 %"19", ptr addrspace(5) %"8", align 4 + %"22" = load i32, ptr addrspace(5) %"6", align 4 + %2 = icmp eq i32 %"22", 0 + %3 = call i32 @llvm.ctlz.i32(i32 %"22", i1 true) + %"46" = select i1 %2, i32 -1, i32 %3 + store i32 %"46", ptr addrspace(5) %"9", align 4 + %"24" = load i32, ptr addrspace(5) %"7", align 4 + %4 = icmp eq i32 %"24", 0 + %5 = call i32 @llvm.ctlz.i32(i32 %"24", i1 true) + %"47" = select i1 %4, i32 -1, i32 %5 + store i32 %"47", ptr addrspace(5) %"10", align 4 + %"26" = load i32, ptr addrspace(5) %"8", align 4 + %6 = icmp eq i32 %"26", 0 + %7 = call i32 @llvm.ctlz.i32(i32 %"26", i1 true) + %"48" = select i1 %6, i32 -1, i32 %7 + store i32 %"48", ptr addrspace(5) %"11", align 4 + %"27" = load i64, ptr addrspace(5) %"5", align 8 + %"28" = load i32, ptr addrspace(5) %"9", align 4 + %"49" = inttoptr i64 %"27" to ptr + store i32 %"28", ptr %"49", align 4 + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i32, ptr addrspace(5) %"10", align 4 + %"50" = inttoptr i64 %"29" to ptr + %"57" = getelementptr inbounds i8, ptr %"50", i64 4 + store i32 %"30", ptr %"57", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i32, ptr addrspace(5) %"11", align 4 + %"51" = inttoptr i64 %"31" to ptr + %"59" = getelementptr inbounds i8, ptr %"51", i64 8 + store i32 %"32", ptr %"59", align 4 ret void } diff --git 
a/ptx/src/test/spirv_run/block.ll b/ptx/src/test/spirv_run/block.ll index 87c9374..b482fe2 100644 --- a/ptx/src/test/spirv_run/block.ll +++ b/ptx/src/test/spirv_run/block.ll @@ -1,35 +1,35 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"27": +define protected amdgpu_kernel void @block(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"14" to ptr - %"13" = load i64, ptr %"25", align 8 - store i64 %"13", ptr addrspace(5) %"6", align 8 - %"16" = load i64, ptr addrspace(5) %"6", align 8 - %"15" = add i64 %"16", 1 - store i64 %"15", ptr addrspace(5) %"7", align 8 - %"18" = load i64, ptr addrspace(5) %"8", align 8 - %"17" = add i64 %"18", 1 - store i64 %"17", ptr addrspace(5) %"8", align 8 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i64, ptr addrspace(5) %"7", align 8 - %"26" = inttoptr 
i64 %"19" to ptr - store i64 %"20", ptr %"26", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"13" to ptr + %"12" = load i64, ptr %"24", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = add i64 %"15", 1 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"17" = load i64, ptr addrspace(5) %"8", align 8 + %"16" = add i64 %"17", 1 + store i64 %"16", ptr addrspace(5) %"8", align 8 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"7", align 8 + %"25" = inttoptr i64 %"18" to ptr + store i64 %"19", ptr %"25", align 8 ret void } diff --git a/ptx/src/test/spirv_run/bra.ll b/ptx/src/test/spirv_run/bra.ll index 6188dc7..4173392 100644 --- a/ptx/src/test/spirv_run/bra.ll +++ b/ptx/src/test/spirv_run/bra.ll @@ -1,43 +1,43 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { -"29": +define protected amdgpu_kernel void @bra(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"12", ptr addrspace(5) %"7", align 8 %"13" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"13", 
ptr addrspace(5) %"7", align 8 - %"14" = load i64, ptr addrspace(4) %"26", align 8 - store i64 %"14", ptr addrspace(5) %"8", align 8 - %"16" = load i64, ptr addrspace(5) %"7", align 8 - %"27" = inttoptr i64 %"16" to ptr - %"15" = load i64, ptr %"27", align 8 - store i64 %"15", ptr addrspace(5) %"9", align 8 + store i64 %"13", ptr addrspace(5) %"8", align 8 + %"15" = load i64, ptr addrspace(5) %"7", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"14" = load i64, ptr %"26", align 8 + store i64 %"14", ptr addrspace(5) %"9", align 8 br label %"4" -"4": ; preds = %"29" - %"18" = load i64, ptr addrspace(5) %"9", align 8 - %"17" = add i64 %"18", 1 - store i64 %"17", ptr addrspace(5) %"10", align 8 +"4": ; preds = %1 + %"17" = load i64, ptr addrspace(5) %"9", align 8 + %"16" = add i64 %"17", 1 + store i64 %"16", ptr addrspace(5) %"10", align 8 br label %"6" -0: ; No predecessors! - %"20" = load i64, ptr addrspace(5) %"9", align 8 - %"19" = add i64 %"20", 2 - store i64 %"19", ptr addrspace(5) %"10", align 8 +"5": ; No predecessors! 
+ %"19" = load i64, ptr addrspace(5) %"9", align 8 + %"18" = add i64 %"19", 2 + store i64 %"18", ptr addrspace(5) %"10", align 8 br label %"6" -"6": ; preds = %0, %"4" - %"21" = load i64, ptr addrspace(5) %"8", align 8 - %"22" = load i64, ptr addrspace(5) %"10", align 8 - %"28" = inttoptr i64 %"21" to ptr - store i64 %"22", ptr %"28", align 8 +"6": ; preds = %"5", %"4" + %"20" = load i64, ptr addrspace(5) %"8", align 8 + %"21" = load i64, ptr addrspace(5) %"10", align 8 + %"27" = inttoptr i64 %"20" to ptr + store i64 %"21", ptr %"27", align 8 ret void } diff --git a/ptx/src/test/spirv_run/brev.ll b/ptx/src/test/spirv_run/brev.ll index e43d1c6..d838750 100644 --- a/ptx/src/test/spirv_run/brev.ll +++ b/ptx/src/test/spirv_run/brev.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @brev(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 
%"12" to ptr - %"11" = load i32, ptr %"19", align 4 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"13" = call i32 @llvm.bitreverse.i32(i32 %"14") - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store i32 %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load i32, ptr %"18", align 4 + store i32 %"10", ptr addrspace(5) %"6", align 4 + %"13" = load i32, ptr addrspace(5) %"6", align 4 + %"12" = call i32 @llvm.bitreverse.i32(i32 %"13") + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store i32 %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/call.ll b/ptx/src/test/spirv_run/call.ll index af26549..684bb0c 100644 --- a/ptx/src/test/spirv_run/call.ll +++ b/ptx/src/test/spirv_run/call.ll @@ -1,63 +1,63 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define private i64 @incr(i64 %"31") #0 { -"51": +define private i64 @incr(i64 %"29") #0 { %"18" = alloca i64, align 8, addrspace(5) %"17" = alloca i64, align 8, addrspace(5) - %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 - %"44" = alloca i64, align 8, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) + %"20" = alloca i1, align 1, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i64, align 8, 
addrspace(5) %"14" = alloca i64, align 8, addrspace(5) - store i64 %"31", ptr addrspace(5) %"18", align 8 - %"32" = load i64, ptr addrspace(5) %"18", align 8 - store i64 %"32", ptr addrspace(5) %"45", align 8 - %"33" = load i64, ptr addrspace(5) %"45", align 8 - store i64 %"33", ptr addrspace(5) %"14", align 8 - %"35" = load i64, ptr addrspace(5) %"14", align 8 - %"34" = add i64 %"35", 1 - store i64 %"34", ptr addrspace(5) %"14", align 8 - %"36" = load i64, ptr addrspace(5) %"14", align 8 - store i64 %"36", ptr addrspace(5) %"44", align 8 - %"37" = load i64, ptr addrspace(5) %"44", align 8 - store i64 %"37", ptr addrspace(5) %"17", align 8 - %"38" = load i64, ptr addrspace(5) %"17", align 8 - ret i64 %"38" + br label %1 + +1: ; preds = %0 + store i64 %"29", ptr addrspace(5) %"18", align 8 + store i1 false, ptr addrspace(5) %"20", align 1 + %"30" = load i64, ptr addrspace(5) %"18", align 8 + store i64 %"30", ptr addrspace(5) %"43", align 8 + %"31" = load i64, ptr addrspace(5) %"43", align 8 + store i64 %"31", ptr addrspace(5) %"14", align 8 + %"33" = load i64, ptr addrspace(5) %"14", align 8 + %"32" = add i64 %"33", 1 + store i64 %"32", ptr addrspace(5) %"14", align 8 + %"34" = load i64, ptr addrspace(5) %"14", align 8 + store i64 %"34", ptr addrspace(5) %"42", align 8 + %"35" = load i64, ptr addrspace(5) %"42", align 8 + store i64 %"35", ptr addrspace(5) %"17", align 8 + %"36" = load i64, ptr addrspace(5) %"17", align 8 + ret i64 %"36" } -define protected amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { -"50": +define protected amdgpu_kernel void @call(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { %"19" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"19", align 1 - %"20" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"20", align 1 %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, 
addrspace(5) - %"42" = alloca i64, align 8, addrspace(5) - %"43" = alloca i64, align 8, addrspace(5) - %"23" = load i64, ptr addrspace(4) %"40", align 8 - store i64 %"23", ptr addrspace(5) %"7", align 8 - %"24" = load i64, ptr addrspace(4) %"41", align 8 - store i64 %"24", ptr addrspace(5) %"8", align 8 - %"26" = load i64, ptr addrspace(5) %"7", align 8 - %"46" = inttoptr i64 %"26" to ptr addrspace(1) - %"25" = load i64, ptr addrspace(1) %"46", align 8 - store i64 %"25", ptr addrspace(5) %"9", align 8 - %"27" = load i64, ptr addrspace(5) %"9", align 8 - store i64 %"27", ptr addrspace(5) %"42", align 8 - %"15" = load i64, ptr addrspace(5) %"42", align 8 + %"40" = alloca i64, align 8, addrspace(5) + %"41" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"19", align 1 + %"21" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"21", ptr addrspace(5) %"7", align 8 + %"22" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"22", ptr addrspace(5) %"8", align 8 + %"24" = load i64, ptr addrspace(5) %"7", align 8 + %"44" = inttoptr i64 %"24" to ptr addrspace(1) + %"23" = load i64, ptr addrspace(1) %"44", align 8 + store i64 %"23", ptr addrspace(5) %"9", align 8 + %"25" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"25", ptr addrspace(5) %"40", align 8 + %"15" = load i64, ptr addrspace(5) %"40", align 8 %"16" = call i64 @incr(i64 %"15") - store i64 %"16", ptr addrspace(5) %"43", align 8 - %"28" = load i64, ptr addrspace(5) %"43", align 8 - store i64 %"28", ptr addrspace(5) %"9", align 8 - %"29" = load i64, ptr addrspace(5) %"8", align 8 - %"30" = load i64, ptr addrspace(5) %"9", align 8 - %"49" = inttoptr i64 %"29" to ptr addrspace(1) - store i64 %"30", ptr addrspace(1) %"49", align 8 + store i64 %"16", ptr addrspace(5) %"41", align 8 + %"26" = load i64, ptr addrspace(5) %"41", align 8 + store i64 %"26", ptr addrspace(5) %"9", align 8 + %"27" = load i64, ptr addrspace(5) %"8", align 8 + %"28" = load 
i64, ptr addrspace(5) %"9", align 8 + %"47" = inttoptr i64 %"27" to ptr addrspace(1) + store i64 %"28", ptr addrspace(1) %"47", align 8 ret void } diff --git a/ptx/src/test/spirv_run/call_bug.ll b/ptx/src/test/spirv_run/call_bug.ll index 749b2b6..12c8e2c 100644 --- a/ptx/src/test/spirv_run/call_bug.ll +++ b/ptx/src/test/spirv_run/call_bug.ll @@ -1,68 +1,68 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define private [2 x i32] @incr(i64 %"23") #0 { -"58": +define private [2 x i32] @incr(i64 %"21") #0 { %"16" = alloca i64, align 8, addrspace(5) %"15" = alloca [2 x i32], align 4, addrspace(5) %"19" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"19", align 1 - %"20" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"20", align 1 - %"44" = alloca [2 x i32], align 4, addrspace(5) - %"45" = alloca i64, align 8, addrspace(5) + %"42" = alloca [2 x i32], align 4, addrspace(5) + %"43" = alloca i64, align 8, addrspace(5) %"4" = alloca i64, align 8, addrspace(5) - store i64 %"23", ptr addrspace(5) %"16", align 8 - %"24" = load i64, ptr addrspace(5) %"16", align 8 - store i64 %"24", ptr addrspace(5) %"45", align 8 - %"25" = load i64, ptr addrspace(5) %"45", align 8 - store i64 %"25", ptr addrspace(5) %"4", align 8 - %"27" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = add i64 %"27", 1 - store i64 %"26", ptr addrspace(5) %"4", align 8 - %"28" = load i64, ptr addrspace(5) %"4", align 8 - store i64 %"28", ptr addrspace(5) %"44", align 8 - %"29" = load [2 x i32], ptr addrspace(5) %"44", align 4 - store [2 x i32] %"29", ptr addrspace(5) %"15", align 4 - %"30" = load [2 x i32], ptr addrspace(5) %"15", align 4 - ret [2 x i32] %"30" + br label %1 + +1: ; preds = %0 + store i64 %"21", ptr addrspace(5) %"16", align 8 + store i1 false, ptr 
addrspace(5) %"19", align 1 + %"22" = load i64, ptr addrspace(5) %"16", align 8 + store i64 %"22", ptr addrspace(5) %"43", align 8 + %"23" = load i64, ptr addrspace(5) %"43", align 8 + store i64 %"23", ptr addrspace(5) %"4", align 8 + %"25" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = add i64 %"25", 1 + store i64 %"24", ptr addrspace(5) %"4", align 8 + %"26" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"26", ptr addrspace(5) %"42", align 8 + %"27" = load [2 x i32], ptr addrspace(5) %"42", align 4 + store [2 x i32] %"27", ptr addrspace(5) %"15", align 4 + %"28" = load [2 x i32], ptr addrspace(5) %"15", align 4 + ret [2 x i32] %"28" } -define protected amdgpu_kernel void @call_bug(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { -"59": - %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 +define protected amdgpu_kernel void @call_bug(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { + %"20" = alloca i1, align 1, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) %"11" = alloca i64, align 8, addrspace(5) - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca [2 x i32], align 4, addrspace(5) - %"31" = load i64, ptr addrspace(4) %"46", align 8 - store i64 %"31", ptr addrspace(5) %"8", align 8 - %"32" = load i64, ptr addrspace(4) %"47", align 8 - store i64 %"32", ptr addrspace(5) %"9", align 8 - %"34" = load i64, ptr addrspace(5) %"8", align 8 - %"52" = inttoptr i64 %"34" to ptr addrspace(1) - %"33" = load i64, ptr addrspace(1) %"52", align 8 - store i64 %"33", ptr addrspace(5) %"10", align 8 - %"35" = load i64, ptr addrspace(5) %"10", align 8 - store i64 %"35", ptr addrspace(5) %"48", align 8 + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca [2 x i32], align 4, 
addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"20", align 1 + %"29" = load i64, ptr addrspace(4) %"44", align 8 + store i64 %"29", ptr addrspace(5) %"8", align 8 + %"30" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"30", ptr addrspace(5) %"9", align 8 + %"32" = load i64, ptr addrspace(5) %"8", align 8 + %"50" = inttoptr i64 %"32" to ptr addrspace(1) + %"31" = load i64, ptr addrspace(1) %"50", align 8 + store i64 %"31", ptr addrspace(5) %"10", align 8 + %"33" = load i64, ptr addrspace(5) %"10", align 8 + store i64 %"33", ptr addrspace(5) %"46", align 8 store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"11", align 8 - %"17" = load i64, ptr addrspace(5) %"48", align 8 - %"37" = load i64, ptr addrspace(5) %"11", align 8 - %0 = inttoptr i64 %"37" to ptr - %"18" = call [2 x i32] %0(i64 %"17") - store [2 x i32] %"18", ptr addrspace(5) %"49", align 4 - %"61" = getelementptr inbounds i8, ptr addrspace(5) %"49", i64 0 - %"38" = load i64, ptr addrspace(5) %"61", align 8 - store i64 %"38", ptr addrspace(5) %"10", align 8 - %"39" = load i64, ptr addrspace(5) %"9", align 8 - %"40" = load i64, ptr addrspace(5) %"10", align 8 - %"57" = inttoptr i64 %"39" to ptr addrspace(1) - store i64 %"40", ptr addrspace(1) %"57", align 8 + %"17" = load i64, ptr addrspace(5) %"46", align 8 + %"35" = load i64, ptr addrspace(5) %"11", align 8 + %2 = inttoptr i64 %"35" to ptr + %"18" = call [2 x i32] %2(i64 %"17") + store [2 x i32] %"18", ptr addrspace(5) %"47", align 4 + %"57" = getelementptr inbounds i8, ptr addrspace(5) %"47", i64 0 + %"36" = load i64, ptr addrspace(5) %"57", align 8 + store i64 %"36", ptr addrspace(5) %"10", align 8 + %"37" = load i64, ptr addrspace(5) %"9", align 8 + %"38" = load i64, ptr addrspace(5) %"10", align 8 + %"55" = inttoptr i64 %"37" to ptr addrspace(1) + store i64 %"38", ptr addrspace(1) %"55", align 8 ret void } diff --git a/ptx/src/test/spirv_run/call_multi_return.ll 
b/ptx/src/test/spirv_run/call_multi_return.ll index a6cb883..5cf701b 100644 --- a/ptx/src/test/spirv_run/call_multi_return.ll +++ b/ptx/src/test/spirv_run/call_multi_return.ll @@ -3,43 +3,39 @@ target triple = "amdgcn-amd-amdhsa" %struct.i64i32 = type { i64, i32 } -define private %struct.i64i32 @"1"(i32 %"41", i32 %"42") #0 { -"64": +define private %struct.i64i32 @"1"(i32 %"39", i32 %"40") #0 { %"18" = alloca i32, align 4, addrspace(5) %"19" = alloca i32, align 4, addrspace(5) %"16" = alloca i64, align 8, addrspace(5) %"17" = alloca i32, align 4, addrspace(5) - %"23" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"23", align 1 - %"24" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"24", align 1 + %"22" = alloca i1, align 1, addrspace(5) %"20" = alloca i32, align 4, addrspace(5) - store i32 %"41", ptr addrspace(5) %"18", align 4 - store i32 %"42", ptr addrspace(5) %"19", align 4 - %"44" = load i32, ptr addrspace(5) %"18", align 4 - %"45" = load i32, ptr addrspace(5) %"19", align 4 - %"43" = add i32 %"44", %"45" - store i32 %"43", ptr addrspace(5) %"20", align 4 - %"47" = load i32, ptr addrspace(5) %"20", align 4 - %"46" = zext i32 %"47" to i64 - store i64 %"46", ptr addrspace(5) %"16", align 8 - %"49" = load i32, ptr addrspace(5) %"18", align 4 - %"50" = load i32, ptr addrspace(5) %"19", align 4 - %"48" = mul i32 %"49", %"50" - store i32 %"48", ptr addrspace(5) %"17", align 4 - %"51" = load i64, ptr addrspace(5) %"16", align 8 - %"52" = load i32, ptr addrspace(5) %"17", align 4 - %0 = insertvalue %struct.i64i32 undef, i64 %"51", 0 - %1 = insertvalue %struct.i64i32 %0, i32 %"52", 1 - ret %struct.i64i32 %1 + br label %1 + +1: ; preds = %0 + store i32 %"39", ptr addrspace(5) %"18", align 4 + store i32 %"40", ptr addrspace(5) %"19", align 4 + store i1 false, ptr addrspace(5) %"22", align 1 + %"42" = load i32, ptr addrspace(5) %"18", align 4 + %"43" = load i32, ptr addrspace(5) %"19", align 4 + %"41" = add i32 %"42", %"43" 
+ store i32 %"41", ptr addrspace(5) %"20", align 4 + %"45" = load i32, ptr addrspace(5) %"20", align 4 + %"44" = zext i32 %"45" to i64 + store i64 %"44", ptr addrspace(5) %"16", align 8 + %"47" = load i32, ptr addrspace(5) %"18", align 4 + %"48" = load i32, ptr addrspace(5) %"19", align 4 + %"46" = mul i32 %"47", %"48" + store i32 %"46", ptr addrspace(5) %"17", align 4 + %"49" = load i64, ptr addrspace(5) %"16", align 8 + %"50" = load i32, ptr addrspace(5) %"17", align 4 + %2 = insertvalue %struct.i64i32 undef, i64 %"49", 0 + %3 = insertvalue %struct.i64i32 %2, i32 %"50", 1 + ret %struct.i64i32 %3 } -define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #0 { -"63": +define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i64) %"55", ptr addrspace(4) byref(i64) %"56") #0 { %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) %"11" = alloca i32, align 4, addrspace(5) @@ -47,38 +43,42 @@ define protected amdgpu_kernel void @call_multi_return(ptr addrspace(4) byref(i6 %"13" = alloca i64, align 8, addrspace(5) %"14" = alloca i64, align 8, addrspace(5) %"15" = alloca i32, align 4, addrspace(5) - %"25" = load i64, ptr addrspace(4) %"57", align 8 - store i64 %"25", ptr addrspace(5) %"9", align 8 - %"26" = load i64, ptr addrspace(4) %"58", align 8 - store i64 %"26", ptr addrspace(5) %"10", align 8 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"21", align 1 + %"23" = load i64, ptr addrspace(4) %"55", align 8 + store i64 %"23", ptr addrspace(5) %"9", align 8 + %"24" = load i64, ptr addrspace(4) %"56", align 8 + store i64 %"24", ptr addrspace(5) %"10", align 8 + %"26" = load i64, ptr addrspace(5) %"9", align 8 + %"57" = inttoptr i64 %"26" to ptr 
addrspace(1) + %"25" = load i32, ptr addrspace(1) %"57", align 4 + store i32 %"25", ptr addrspace(5) %"11", align 4 %"28" = load i64, ptr addrspace(5) %"9", align 8 - %"59" = inttoptr i64 %"28" to ptr addrspace(1) - %"27" = load i32, ptr addrspace(1) %"59", align 4 - store i32 %"27", ptr addrspace(5) %"11", align 4 - %"30" = load i64, ptr addrspace(5) %"9", align 8 - %"60" = inttoptr i64 %"30" to ptr addrspace(1) - %"66" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 4 - %"29" = load i32, ptr addrspace(1) %"66", align 4 - store i32 %"29", ptr addrspace(5) %"12", align 4 - %"33" = load i32, ptr addrspace(5) %"11", align 4 - %"34" = load i32, ptr addrspace(5) %"12", align 4 - %0 = call %struct.i64i32 @"1"(i32 %"33", i32 %"34") - %"31" = extractvalue %struct.i64i32 %0, 0 - %"32" = extractvalue %struct.i64i32 %0, 1 - store i64 %"31", ptr addrspace(5) %"13", align 8 - store i32 %"32", ptr addrspace(5) %"15", align 4 - %"36" = load i32, ptr addrspace(5) %"15", align 4 - %"35" = zext i32 %"36" to i64 - store i64 %"35", ptr addrspace(5) %"14", align 8 + %"58" = inttoptr i64 %"28" to ptr addrspace(1) + %"62" = getelementptr inbounds i8, ptr addrspace(1) %"58", i64 4 + %"27" = load i32, ptr addrspace(1) %"62", align 4 + store i32 %"27", ptr addrspace(5) %"12", align 4 + %"31" = load i32, ptr addrspace(5) %"11", align 4 + %"32" = load i32, ptr addrspace(5) %"12", align 4 + %2 = call %struct.i64i32 @"1"(i32 %"31", i32 %"32") + %"29" = extractvalue %struct.i64i32 %2, 0 + %"30" = extractvalue %struct.i64i32 %2, 1 + store i64 %"29", ptr addrspace(5) %"13", align 8 + store i32 %"30", ptr addrspace(5) %"15", align 4 + %"34" = load i32, ptr addrspace(5) %"15", align 4 + %"33" = zext i32 %"34" to i64 + store i64 %"33", ptr addrspace(5) %"14", align 8 + %"35" = load i64, ptr addrspace(5) %"10", align 8 + %"36" = load i64, ptr addrspace(5) %"13", align 8 + %"59" = inttoptr i64 %"35" to ptr addrspace(1) + store i64 %"36", ptr addrspace(1) %"59", align 8 %"37" = load i64, ptr 
addrspace(5) %"10", align 8 - %"38" = load i64, ptr addrspace(5) %"13", align 8 - %"61" = inttoptr i64 %"37" to ptr addrspace(1) - store i64 %"38", ptr addrspace(1) %"61", align 8 - %"39" = load i64, ptr addrspace(5) %"10", align 8 - %"40" = load i64, ptr addrspace(5) %"14", align 8 - %"62" = inttoptr i64 %"39" to ptr addrspace(1) - %"68" = getelementptr inbounds i8, ptr addrspace(1) %"62", i64 8 - store i64 %"40", ptr addrspace(1) %"68", align 8 + %"38" = load i64, ptr addrspace(5) %"14", align 8 + %"60" = inttoptr i64 %"37" to ptr addrspace(1) + %"64" = getelementptr inbounds i8, ptr addrspace(1) %"60", i64 8 + store i64 %"38", ptr addrspace(1) %"64", align 8 ret void } diff --git a/ptx/src/test/spirv_run/callprototype.ll b/ptx/src/test/spirv_run/callprototype.ll index 84e5987..9cba37c 100644 --- a/ptx/src/test/spirv_run/callprototype.ll +++ b/ptx/src/test/spirv_run/callprototype.ll @@ -1,67 +1,67 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define private i64 @incr(i64 %"35") #0 { -"56": +define private i64 @incr(i64 %"33") #0 { %"20" = alloca i64, align 8, addrspace(5) %"19" = alloca i64, align 8, addrspace(5) - %"23" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"23", align 1 - %"24" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"24", align 1 - %"48" = alloca i64, align 8, addrspace(5) - %"49" = alloca i64, align 8, addrspace(5) + %"22" = alloca i1, align 1, addrspace(5) + %"46" = alloca i64, align 8, addrspace(5) + %"47" = alloca i64, align 8, addrspace(5) %"16" = alloca i64, align 8, addrspace(5) - store i64 %"35", ptr addrspace(5) %"20", align 8 - %"36" = load i64, ptr addrspace(5) %"20", align 8 - store i64 %"36", ptr addrspace(5) %"49", align 8 - %"37" = load i64, ptr addrspace(5) %"49", align 8 - store i64 %"37", 
ptr addrspace(5) %"16", align 8 - %"39" = load i64, ptr addrspace(5) %"16", align 8 - %"38" = add i64 %"39", 1 - store i64 %"38", ptr addrspace(5) %"16", align 8 - %"40" = load i64, ptr addrspace(5) %"16", align 8 - store i64 %"40", ptr addrspace(5) %"48", align 8 - %"41" = load i64, ptr addrspace(5) %"48", align 8 - store i64 %"41", ptr addrspace(5) %"19", align 8 - %"42" = load i64, ptr addrspace(5) %"19", align 8 - ret i64 %"42" + br label %1 + +1: ; preds = %0 + store i64 %"33", ptr addrspace(5) %"20", align 8 + store i1 false, ptr addrspace(5) %"22", align 1 + %"34" = load i64, ptr addrspace(5) %"20", align 8 + store i64 %"34", ptr addrspace(5) %"47", align 8 + %"35" = load i64, ptr addrspace(5) %"47", align 8 + store i64 %"35", ptr addrspace(5) %"16", align 8 + %"37" = load i64, ptr addrspace(5) %"16", align 8 + %"36" = add i64 %"37", 1 + store i64 %"36", ptr addrspace(5) %"16", align 8 + %"38" = load i64, ptr addrspace(5) %"16", align 8 + store i64 %"38", ptr addrspace(5) %"46", align 8 + %"39" = load i64, ptr addrspace(5) %"46", align 8 + store i64 %"39", ptr addrspace(5) %"19", align 8 + %"40" = load i64, ptr addrspace(5) %"19", align 8 + ret i64 %"40" } -define protected amdgpu_kernel void @callprototype(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { -"55": +define protected amdgpu_kernel void @callprototype(ptr addrspace(4) byref(i64) %"42", ptr addrspace(4) byref(i64) %"43") #0 { %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) - %"46" = alloca i64, align 8, addrspace(5) - %"47" = alloca i64, align 8, addrspace(5) - %"25" = load i64, ptr addrspace(4) %"44", align 8 - store i64 %"25", ptr addrspace(5) %"7", align 8 - %"26" = load 
i64, ptr addrspace(4) %"45", align 8 - store i64 %"26", ptr addrspace(5) %"8", align 8 - %"28" = load i64, ptr addrspace(5) %"7", align 8 - %"50" = inttoptr i64 %"28" to ptr addrspace(1) - %"27" = load i64, ptr addrspace(1) %"50", align 8 - store i64 %"27", ptr addrspace(5) %"9", align 8 - %"29" = load i64, ptr addrspace(5) %"9", align 8 - store i64 %"29", ptr addrspace(5) %"46", align 8 + %"44" = alloca i64, align 8, addrspace(5) + %"45" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"21", align 1 + %"23" = load i64, ptr addrspace(4) %"42", align 8 + store i64 %"23", ptr addrspace(5) %"7", align 8 + %"24" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"24", ptr addrspace(5) %"8", align 8 + %"26" = load i64, ptr addrspace(5) %"7", align 8 + %"48" = inttoptr i64 %"26" to ptr addrspace(1) + %"25" = load i64, ptr addrspace(1) %"48", align 8 + store i64 %"25", ptr addrspace(5) %"9", align 8 + %"27" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"27", ptr addrspace(5) %"44", align 8 store i64 ptrtoint (ptr @incr to i64), ptr addrspace(5) %"10", align 8 - %"17" = load i64, ptr addrspace(5) %"46", align 8 - %"31" = load i64, ptr addrspace(5) %"10", align 8 - %0 = inttoptr i64 %"31" to ptr - %"18" = call i64 %0(i64 %"17") - store i64 %"18", ptr addrspace(5) %"47", align 8 - %"32" = load i64, ptr addrspace(5) %"47", align 8 - store i64 %"32", ptr addrspace(5) %"9", align 8 - %"33" = load i64, ptr addrspace(5) %"8", align 8 - %"34" = load i64, ptr addrspace(5) %"9", align 8 - %"54" = inttoptr i64 %"33" to ptr addrspace(1) - store i64 %"34", ptr addrspace(1) %"54", align 8 + %"17" = load i64, ptr addrspace(5) %"44", align 8 + %"29" = load i64, ptr addrspace(5) %"10", align 8 + %2 = inttoptr i64 %"29" to ptr + %"18" = call i64 %2(i64 %"17") + store i64 %"18", ptr addrspace(5) %"45", align 8 + %"30" = load i64, ptr addrspace(5) %"45", align 8 + store i64 %"30", ptr addrspace(5) %"9", align 8 + %"31" = 
load i64, ptr addrspace(5) %"8", align 8 + %"32" = load i64, ptr addrspace(5) %"9", align 8 + %"52" = inttoptr i64 %"31" to ptr addrspace(1) + store i64 %"32", ptr addrspace(1) %"52", align 8 ret void } diff --git a/ptx/src/test/spirv_run/carry_mixed.ll b/ptx/src/test/spirv_run/carry_mixed.ll deleted file mode 100644 index c33cc5e..0000000 --- a/ptx/src/test/spirv_run/carry_mixed.ll +++ /dev/null @@ -1,51 +0,0 @@ -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" -target triple = "amdgcn-amd-amdhsa" - -define protected amdgpu_kernel void @carry_mixed(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { -"44": - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"4" = alloca i64, align 8, addrspace(5) - %"5" = alloca i64, align 8, addrspace(5) - %"6" = alloca i32, align 4, addrspace(5) - %"7" = alloca i32, align 4, addrspace(5) - %"8" = alloca i32, align 4, addrspace(5) - %"11" = load i64, ptr addrspace(4) %"35", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) - %"36" = extractvalue { i32, i1 } %0, 0 - %"13" = extractvalue { i32, i1 } %0, 1 - store i32 %"36", ptr addrspace(5) %"6", align 4 - store i1 %"13", ptr addrspace(5) %"10", align 1 - %"15" = load i1, ptr addrspace(5) %"10", align 1 - %1 = zext i1 %"15" to i32 - %"37" = sub i32 2, %1 - store i32 %"37", ptr addrspace(5) %"7", align 4 - %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) - %"38" = extractvalue { i32, i1 } %2, 0 - %"17" = extractvalue { i32, i1 } %2, 1 - store i32 %"38", ptr addrspace(5) %"6", align 4 - store i1 %"17", ptr addrspace(5) %"10", align 1 - %"19" = load i1, ptr addrspace(5) %"9", align 1 - %3 = 
zext i1 %"19" to i32 - %"39" = add i32 1, %3 - store i32 %"39", ptr addrspace(5) %"8", align 4 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load i32, ptr addrspace(5) %"7", align 4 - %"40" = inttoptr i64 %"20" to ptr - store i32 %"21", ptr %"40", align 4 - %"22" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = load i32, ptr addrspace(5) %"8", align 4 - %"42" = inttoptr i64 %"22" to ptr - %"46" = getelementptr inbounds i8, ptr %"42", i64 4 - store i32 %"23", ptr %"46", align 4 - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn -declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 - -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/carry_mixed.ptx b/ptx/src/test/spirv_run/carry_mixed.ptx deleted file mode 100644 index b4f2caa..0000000 --- a/ptx/src/test/spirv_run/carry_mixed.ptx +++ /dev/null @@ -1,32 +0,0 @@ -.version 6.5 -.target sm_30 -.address_size 64 - -.visible .entry carry_mixed( - .param .u64 input, - .param .u64 output -) -{ - .reg .u64 in_addr; - .reg .u64 out_addr; - .reg .b32 unused; - - .reg .b32 carry_out_1; - .reg .b32 carry_out_2; - - ld.param.u64 out_addr, [output]; - - // set carry with sub - sub.cc.s32 unused, 0, 1; - // write carry with sub - subc.s32 carry_out_1, 2, 0; - - // set carry with sub - sub.cc.s32 unused, 0, 1; - // fail writing carry with add - addc.s32 carry_out_2, 1, 0; - - st.s32 [out_addr], carry_out_1; - st.s32 [out_addr+4], carry_out_2; - ret; -} diff --git a/ptx/src/test/spirv_run/carry_set_all.ll b/ptx/src/test/spirv_run/carry_set_all.ll new file mode 100644 index 0000000..8983b70 --- /dev/null +++ b/ptx/src/test/spirv_run/carry_set_all.ll @@ -0,0 +1,259 @@ +target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @carry_set_all(ptr addrspace(4) byref(i64) %"208", ptr addrspace(4) byref(i64) %"209") #0 { + %"22" = alloca i1, align 1, addrspace(5) + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i32, align 4, addrspace(5) + %"13" = alloca i32, align 4, addrspace(5) + %"14" = alloca i32, align 4, addrspace(5) + %"15" = alloca i32, align 4, addrspace(5) + %"16" = alloca i32, align 4, addrspace(5) + %"17" = alloca i32, align 4, addrspace(5) + %"18" = alloca i32, align 4, addrspace(5) + %"19" = alloca i32, align 4, addrspace(5) + %"20" = alloca i32, align 4, addrspace(5) + %"21" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"22", align 1 + %"37" = load i64, ptr addrspace(4) %"209", align 8 + store i64 %"37", ptr addrspace(5) %"5", align 8 + %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0) + %"210" = extractvalue { i32, i1 } %2, 0 + %"23" = extractvalue { i32, i1 } %2, 1 + store i32 %"210", ptr addrspace(5) %"6", align 4 + %"39" = xor i1 %"23", true + store i1 %"39", ptr addrspace(5) %"22", align 1 + %"41" = load i1, ptr addrspace(5) %"22", align 1 + %3 = zext i1 %"41" to i32 + %"211" = add i32 0, %3 + store i32 %"211", ptr addrspace(5) %"6", align 4 + %"42" = load i1, ptr addrspace(5) %"22", align 1 + %"24" = xor i1 %"42", true + %4 = zext i1 %"24" to i32 + %"212" = sub i32 0, %4 + store i32 %"212", ptr addrspace(5) %"7", align 4 + %5 = call { 
i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %"213" = extractvalue { i32, i1 } %5, 0 + %"25" = extractvalue { i32, i1 } %5, 1 + store i32 %"213", ptr addrspace(5) %"8", align 4 + %"45" = xor i1 %"25", true + store i1 %"45", ptr addrspace(5) %"22", align 1 + %"47" = load i1, ptr addrspace(5) %"22", align 1 + %6 = zext i1 %"47" to i32 + %"214" = add i32 0, %6 + store i32 %"214", ptr addrspace(5) %"8", align 4 + %"48" = load i1, ptr addrspace(5) %"22", align 1 + %"26" = xor i1 %"48", true + %7 = zext i1 %"26" to i32 + %"215" = sub i32 0, %7 + store i32 %"215", ptr addrspace(5) %"9", align 4 + %8 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0) + %"216" = extractvalue { i32, i1 } %8, 0 + %"51" = extractvalue { i32, i1 } %8, 1 + store i32 %"216", ptr addrspace(5) %"10", align 4 + store i1 %"51", ptr addrspace(5) %"22", align 1 + %"53" = load i1, ptr addrspace(5) %"22", align 1 + %9 = zext i1 %"53" to i32 + %"217" = add i32 0, %9 + store i32 %"217", ptr addrspace(5) %"10", align 4 + %"54" = load i1, ptr addrspace(5) %"22", align 1 + %"27" = xor i1 %"54", true + %10 = zext i1 %"27" to i32 + %"218" = sub i32 0, %10 + store i32 %"218", ptr addrspace(5) %"11", align 4 + %11 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1) + %"219" = extractvalue { i32, i1 } %11, 0 + %"57" = extractvalue { i32, i1 } %11, 1 + store i32 %"219", ptr addrspace(5) %"12", align 4 + store i1 %"57", ptr addrspace(5) %"22", align 1 + %"59" = load i1, ptr addrspace(5) %"22", align 1 + %12 = zext i1 %"59" to i32 + %"220" = add i32 0, %12 + store i32 %"220", ptr addrspace(5) %"12", align 4 + %"60" = load i1, ptr addrspace(5) %"22", align 1 + %"28" = xor i1 %"60", true + %13 = zext i1 %"28" to i32 + %"221" = sub i32 0, %13 + store i32 %"221", ptr addrspace(5) %"13", align 4 + %14 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0) + %"222" = extractvalue { i32, i1 } %14, 0 + %"63" = extractvalue { i32, i1 } %14, 1 + store i32 %"222", ptr addrspace(5) %"14", 
align 4 + store i1 %"63", ptr addrspace(5) %"22", align 1 + %"65" = load i1, ptr addrspace(5) %"22", align 1 + %15 = zext i1 %"65" to i32 + %"223" = add i32 0, %15 + store i32 %"223", ptr addrspace(5) %"14", align 4 + %"66" = load i1, ptr addrspace(5) %"22", align 1 + %"29" = xor i1 %"66", true + %16 = zext i1 %"29" to i32 + %"224" = sub i32 0, %16 + store i32 %"224", ptr addrspace(5) %"15", align 4 + %17 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 -1, i32 -1) + %"225" = extractvalue { i32, i1 } %17, 0 + %"69" = extractvalue { i32, i1 } %17, 1 + store i32 %"225", ptr addrspace(5) %"16", align 4 + store i1 %"69", ptr addrspace(5) %"22", align 1 + %"71" = load i1, ptr addrspace(5) %"22", align 1 + %18 = zext i1 %"71" to i32 + %"226" = add i32 0, %18 + store i32 %"226", ptr addrspace(5) %"16", align 4 + %"72" = load i1, ptr addrspace(5) %"22", align 1 + %"30" = xor i1 %"72", true + %19 = zext i1 %"30" to i32 + %"227" = sub i32 0, %19 + store i32 %"227", ptr addrspace(5) %"17", align 4 + %20 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0) + %"228" = extractvalue { i32, i1 } %20, 0 + %"75" = extractvalue { i32, i1 } %20, 1 + store i32 %"228", ptr addrspace(5) %"18", align 4 + store i1 %"75", ptr addrspace(5) %"22", align 1 + %"76" = load i1, ptr addrspace(5) %"22", align 1 + %"31" = xor i1 %"76", true + %21 = zext i1 %"31" to i32 + %22 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0) + %23 = extractvalue { i32, i1 } %22, 0 + %24 = extractvalue { i32, i1 } %22, 1 + %25 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %23, i32 %21) + %"229" = extractvalue { i32, i1 } %25, 0 + %26 = extractvalue { i32, i1 } %25, 1 + %"32" = xor i1 %24, %26 + store i32 %"229", ptr addrspace(5) %"18", align 4 + %"78" = xor i1 %"32", true + store i1 %"78", ptr addrspace(5) %"22", align 1 + %"80" = load i1, ptr addrspace(5) %"22", align 1 + %27 = zext i1 %"80" to i32 + %"230" = add i32 0, %27 + store i32 %"230", ptr addrspace(5) %"18", align 4 + %"81" = 
load i1, ptr addrspace(5) %"22", align 1 + %"33" = xor i1 %"81", true + %28 = zext i1 %"33" to i32 + %"231" = sub i32 0, %28 + store i32 %"231", ptr addrspace(5) %"19", align 4 + %29 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 0) + %"232" = extractvalue { i32, i1 } %29, 0 + %"84" = extractvalue { i32, i1 } %29, 1 + store i32 %"232", ptr addrspace(5) %"20", align 4 + store i1 %"84", ptr addrspace(5) %"22", align 1 + %"85" = load i1, ptr addrspace(5) %"22", align 1 + %"34" = xor i1 %"85", true + %30 = zext i1 %"34" to i32 + %31 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) + %32 = extractvalue { i32, i1 } %31, 0 + %33 = extractvalue { i32, i1 } %31, 1 + %34 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %32, i32 %30) + %"233" = extractvalue { i32, i1 } %34, 0 + %35 = extractvalue { i32, i1 } %34, 1 + %"35" = xor i1 %33, %35 + store i32 %"233", ptr addrspace(5) %"20", align 4 + %"87" = xor i1 %"35", true + store i1 %"87", ptr addrspace(5) %"22", align 1 + %"89" = load i1, ptr addrspace(5) %"22", align 1 + %36 = zext i1 %"89" to i32 + %"234" = add i32 0, %36 + store i32 %"234", ptr addrspace(5) %"20", align 4 + %"90" = load i1, ptr addrspace(5) %"22", align 1 + %"36" = xor i1 %"90", true + %37 = zext i1 %"36" to i32 + %"235" = sub i32 0, %37 + store i32 %"235", ptr addrspace(5) %"21", align 4 + %"92" = load i64, ptr addrspace(5) %"5", align 8 + %"93" = load i32, ptr addrspace(5) %"6", align 4 + %"236" = inttoptr i64 %"92" to ptr + store i32 %"93", ptr %"236", align 4 + %"94" = load i64, ptr addrspace(5) %"5", align 8 + %"95" = load i32, ptr addrspace(5) %"8", align 4 + %"238" = inttoptr i64 %"94" to ptr + %"269" = getelementptr inbounds i8, ptr %"238", i64 4 + store i32 %"95", ptr %"269", align 4 + %"96" = load i64, ptr addrspace(5) %"5", align 8 + %"97" = load i32, ptr addrspace(5) %"10", align 4 + %"240" = inttoptr i64 %"96" to ptr + %"271" = getelementptr inbounds i8, ptr %"240", i64 8 + store i32 %"97", ptr %"271", align 4 + 
%"98" = load i64, ptr addrspace(5) %"5", align 8 + %"99" = load i32, ptr addrspace(5) %"12", align 4 + %"242" = inttoptr i64 %"98" to ptr + %"273" = getelementptr inbounds i8, ptr %"242", i64 12 + store i32 %"99", ptr %"273", align 4 + %"100" = load i64, ptr addrspace(5) %"5", align 8 + %"101" = load i32, ptr addrspace(5) %"14", align 4 + %"244" = inttoptr i64 %"100" to ptr + %"275" = getelementptr inbounds i8, ptr %"244", i64 16 + store i32 %"101", ptr %"275", align 4 + %"102" = load i64, ptr addrspace(5) %"5", align 8 + %"103" = load i32, ptr addrspace(5) %"16", align 4 + %"246" = inttoptr i64 %"102" to ptr + %"277" = getelementptr inbounds i8, ptr %"246", i64 20 + store i32 %"103", ptr %"277", align 4 + %"104" = load i64, ptr addrspace(5) %"5", align 8 + %"105" = load i32, ptr addrspace(5) %"18", align 4 + %"248" = inttoptr i64 %"104" to ptr + %"279" = getelementptr inbounds i8, ptr %"248", i64 24 + store i32 %"105", ptr %"279", align 4 + %"106" = load i64, ptr addrspace(5) %"5", align 8 + %"107" = load i32, ptr addrspace(5) %"20", align 4 + %"250" = inttoptr i64 %"106" to ptr + %"281" = getelementptr inbounds i8, ptr %"250", i64 28 + store i32 %"107", ptr %"281", align 4 + %"108" = load i64, ptr addrspace(5) %"5", align 8 + %"109" = load i32, ptr addrspace(5) %"7", align 4 + %"252" = inttoptr i64 %"108" to ptr + %"283" = getelementptr inbounds i8, ptr %"252", i64 32 + store i32 %"109", ptr %"283", align 4 + %"110" = load i64, ptr addrspace(5) %"5", align 8 + %"111" = load i32, ptr addrspace(5) %"9", align 4 + %"254" = inttoptr i64 %"110" to ptr + %"285" = getelementptr inbounds i8, ptr %"254", i64 36 + store i32 %"111", ptr %"285", align 4 + %"112" = load i64, ptr addrspace(5) %"5", align 8 + %"113" = load i32, ptr addrspace(5) %"11", align 4 + %"256" = inttoptr i64 %"112" to ptr + %"287" = getelementptr inbounds i8, ptr %"256", i64 40 + store i32 %"113", ptr %"287", align 4 + %"114" = load i64, ptr addrspace(5) %"5", align 8 + %"115" = load i32, ptr 
addrspace(5) %"13", align 4 + %"258" = inttoptr i64 %"114" to ptr + %"289" = getelementptr inbounds i8, ptr %"258", i64 44 + store i32 %"115", ptr %"289", align 4 + %"116" = load i64, ptr addrspace(5) %"5", align 8 + %"117" = load i32, ptr addrspace(5) %"15", align 4 + %"260" = inttoptr i64 %"116" to ptr + %"291" = getelementptr inbounds i8, ptr %"260", i64 48 + store i32 %"117", ptr %"291", align 4 + %"118" = load i64, ptr addrspace(5) %"5", align 8 + %"119" = load i32, ptr addrspace(5) %"17", align 4 + %"262" = inttoptr i64 %"118" to ptr + %"293" = getelementptr inbounds i8, ptr %"262", i64 52 + store i32 %"119", ptr %"293", align 4 + %"120" = load i64, ptr addrspace(5) %"5", align 8 + %"121" = load i32, ptr addrspace(5) %"19", align 4 + %"264" = inttoptr i64 %"120" to ptr + %"295" = getelementptr inbounds i8, ptr %"264", i64 56 + store i32 %"121", ptr %"295", align 4 + %"122" = load i64, ptr addrspace(5) %"5", align 8 + %"123" = load i32, ptr addrspace(5) %"21", align 4 + %"266" = inttoptr i64 %"122" to ptr + %"297" = getelementptr inbounds i8, ptr %"266", i64 60 + store i32 %"123", ptr %"297", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/carry_set_all.ptx b/ptx/src/test/spirv_run/carry_set_all.ptx new file mode 100644 index 0000000..ace6e33 --- /dev/null +++ b/ptx/src/test/spirv_run/carry_set_all.ptx @@ -0,0 +1,84 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry 
carry_set_all( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + + .reg .b32 carry1_add; + .reg .b32 carry1_sub; + .reg .b32 carry2_add; + .reg .b32 carry2_sub; + .reg .b32 carry3_add; + .reg .b32 carry3_sub; + .reg .b32 carry4_add; + .reg .b32 carry4_sub; + .reg .b32 carry5_add; + .reg .b32 carry5_sub; + .reg .b32 carry6_add; + .reg .b32 carry6_sub; + .reg .b32 carry7_add; + .reg .b32 carry7_sub; + .reg .b32 carry8_add; + .reg .b32 carry8_sub; + + ld.param.u64 out_addr, [output]; + + sub.cc.u32 carry1_add, 0, 0; + addc.u32 carry1_add, 0, 0; + subc.u32 carry1_sub, 0, 0; + + sub.cc.u32 carry2_add, 0, 1; + addc.u32 carry2_add, 0, 0; + subc.u32 carry2_sub, 0, 0; + + add.cc.u32 carry3_add, 0, 0; + addc.u32 carry3_add, 0, 0; + subc.u32 carry3_sub, 0, 0; + + add.cc.u32 carry4_add, 4294967295, 4294967295; + addc.u32 carry4_add, 0, 0; + subc.u32 carry4_sub, 0, 0; + + mad.lo.cc.u32 carry5_add, 0, 0, 0; + addc.u32 carry5_add, 0, 0; + subc.u32 carry5_sub, 0, 0; + + mad.lo.cc.u32 carry6_add, 1, 4294967295, 4294967295; + addc.u32 carry6_add, 0, 0; + subc.u32 carry6_sub, 0, 0; + + add.cc.u32 carry7_add, 0, 0; + subc.cc.u32 carry7_add, 0, 0; + addc.u32 carry7_add, 0, 0; + subc.u32 carry7_sub, 0, 0; + + add.cc.u32 carry8_add, 0, 0; + subc.cc.u32 carry8_add, 0, 1; + addc.u32 carry8_add, 0, 0; + subc.u32 carry8_sub, 0, 0; + + st.u32 [out_addr], carry1_add; + st.u32 [out_addr+4], carry2_add; + st.u32 [out_addr+8], carry3_add; + st.u32 [out_addr+12], carry4_add; + st.u32 [out_addr+16], carry5_add; + st.u32 [out_addr+20], carry6_add; + st.u32 [out_addr+24], carry7_add; + st.u32 [out_addr+28], carry8_add; + + st.u32 [out_addr+32], carry1_sub; + st.u32 [out_addr+36], carry2_sub; + st.u32 [out_addr+40], carry3_sub; + st.u32 [out_addr+44], carry4_sub; + st.u32 [out_addr+48], carry5_sub; + st.u32 [out_addr+52], carry6_sub; + st.u32 [out_addr+56], carry7_sub; + st.u32 [out_addr+60], carry8_sub; + ret; +} diff --git a/ptx/src/test/spirv_run/clz.ll 
b/ptx/src/test/spirv_run/clz.ll index 356ee7d..5a93145 100644 --- a/ptx/src/test/spirv_run/clz.ll +++ b/ptx/src/test/spirv_run/clz.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @clz(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load i32, ptr %"19", align 4 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %0 = call i32 @llvm.ctlz.i32(i32 %"14", i1 false) - store i32 %0, ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store i32 %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load i32, ptr %"18", align 4 + store 
i32 %"10", ptr addrspace(5) %"6", align 4 + %"13" = load i32, ptr addrspace(5) %"6", align 4 + %2 = call i32 @llvm.ctlz.i32(i32 %"13", i1 false) + store i32 %2, ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store i32 %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/const.ll b/ptx/src/test/spirv_run/const.ll index 472421d..df0de94 100644 --- a/ptx/src/test/spirv_run/const.ll +++ b/ptx/src/test/spirv_run/const.ll @@ -3,49 +3,49 @@ target triple = "amdgcn-amd-amdhsa" @constparams = protected addrspace(4) externally_initialized global [4 x i16] [i16 10, i16 20, i16 30, i16 40], align 8 -define protected amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #0 { -"53": +define protected amdgpu_kernel void @const(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i16, align 2, addrspace(5) %"8" = alloca i16, align 2, addrspace(5) %"9" = alloca i16, align 2, addrspace(5) %"10" = alloca i16, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"12", ptr addrspace(5) %"5", align 8 %"13" = load i64, ptr addrspace(4) %"39", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(4) %"40", align 8 - store i64 %"14", ptr addrspace(5) %"6", align 8 - %"15" = load i16, ptr addrspace(4) @constparams, align 2 - store i16 %"15", ptr addrspace(5) %"7", align 2 - %"16" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) 
@constparams, i64 2), align 2 - store i16 %"16", ptr addrspace(5) %"8", align 2 - %"17" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 - store i16 %"17", ptr addrspace(5) %"9", align 2 - %"18" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 - store i16 %"18", ptr addrspace(5) %"10", align 2 - %"19" = load i64, ptr addrspace(5) %"6", align 8 - %"20" = load i16, ptr addrspace(5) %"7", align 2 - %"45" = inttoptr i64 %"19" to ptr - store i16 %"20", ptr %"45", align 2 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %"22" = load i16, ptr addrspace(5) %"8", align 2 - %"47" = inttoptr i64 %"21" to ptr - %"61" = getelementptr inbounds i8, ptr %"47", i64 2 - store i16 %"22", ptr %"61", align 2 - %"23" = load i64, ptr addrspace(5) %"6", align 8 - %"24" = load i16, ptr addrspace(5) %"9", align 2 - %"49" = inttoptr i64 %"23" to ptr - %"63" = getelementptr inbounds i8, ptr %"49", i64 4 - store i16 %"24", ptr %"63", align 2 - %"25" = load i64, ptr addrspace(5) %"6", align 8 - %"26" = load i16, ptr addrspace(5) %"10", align 2 - %"51" = inttoptr i64 %"25" to ptr - %"65" = getelementptr inbounds i8, ptr %"51", i64 6 - store i16 %"26", ptr %"65", align 2 + store i64 %"13", ptr addrspace(5) %"6", align 8 + %"14" = load i16, ptr addrspace(4) @constparams, align 2 + store i16 %"14", ptr addrspace(5) %"7", align 2 + %"15" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 2), align 2 + store i16 %"15", ptr addrspace(5) %"8", align 2 + %"16" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 4), align 2 + store i16 %"16", ptr addrspace(5) %"9", align 2 + %"17" = load i16, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) @constparams, i64 6), align 2 + store i16 %"17", ptr addrspace(5) %"10", align 2 + %"18" = load i64, ptr addrspace(5) %"6", align 8 + %"19" = load i16, ptr 
addrspace(5) %"7", align 2 + %"44" = inttoptr i64 %"18" to ptr + store i16 %"19", ptr %"44", align 2 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = load i16, ptr addrspace(5) %"8", align 2 + %"46" = inttoptr i64 %"20" to ptr + %"59" = getelementptr inbounds i8, ptr %"46", i64 2 + store i16 %"21", ptr %"59", align 2 + %"22" = load i64, ptr addrspace(5) %"6", align 8 + %"23" = load i16, ptr addrspace(5) %"9", align 2 + %"48" = inttoptr i64 %"22" to ptr + %"61" = getelementptr inbounds i8, ptr %"48", i64 4 + store i16 %"23", ptr %"61", align 2 + %"24" = load i64, ptr addrspace(5) %"6", align 8 + %"25" = load i16, ptr addrspace(5) %"10", align 2 + %"50" = inttoptr i64 %"24" to ptr + %"63" = getelementptr inbounds i8, ptr %"50", i64 6 + store i16 %"25", ptr %"63", align 2 ret void } diff --git a/ptx/src/test/spirv_run/constant_f32.ll b/ptx/src/test/spirv_run/constant_f32.ll index e918c89..a6558c9 100644 --- a/ptx/src/test/spirv_run/constant_f32.ll +++ b/ptx/src/test/spirv_run/constant_f32.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"22": +define protected amdgpu_kernel void @constant_f32(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"8", ptr 
addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"20", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = fmul float %"14", 5.000000e-01 - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"21" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"21", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"19", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = fmul float %"13", 5.000000e-01 + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"20", align 4 ret void } diff --git a/ptx/src/test/spirv_run/constant_negative.ll b/ptx/src/test/spirv_run/constant_negative.ll index 09478b6..c3e7e86 100644 --- a/ptx/src/test/spirv_run/constant_negative.ll +++ b/ptx/src/test/spirv_run/constant_negative.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @constant_negative(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"22": +define protected amdgpu_kernel void @constant_negative(ptr addrspace(4) 
byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"12" to ptr - %"11" = load i32, ptr %"20", align 4 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"13" = mul i32 %"14", -1 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %"21" = inttoptr i64 %"15" to ptr - store i32 %"16", ptr %"21", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"11" to ptr + %"10" = load i32, ptr %"19", align 4 + store i32 %"10", ptr addrspace(5) %"6", align 4 + %"13" = load i32, ptr addrspace(5) %"6", align 4 + %"12" = mul i32 %"13", -1 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = inttoptr i64 %"14" to ptr + store i32 %"15", ptr %"20", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cos.ll b/ptx/src/test/spirv_run/cos.ll index 0cf9c30..da48297 100644 --- a/ptx/src/test/spirv_run/cos.ll +++ b/ptx/src/test/spirv_run/cos.ll @@ -1,30 +1,30 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @cos(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"19", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = call afn float @llvm.cos.f32(float %"14") - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"18", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = call afn float @llvm.cos.f32(float %"13") + 
store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_clamp.ll b/ptx/src/test/spirv_run/cvt_clamp.ll index 29de682..b610ca9 100644 --- a/ptx/src/test/spirv_run/cvt_clamp.ll +++ b/ptx/src/test/spirv_run/cvt_clamp.ll @@ -3,69 +3,69 @@ target triple = "amdgcn-amd-amdhsa" declare float @__zluda_ptx_impl__cvt_sat_f32_f32(float) #0 -define protected amdgpu_kernel void @cvt_clamp(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #1 { -"57": +define protected amdgpu_kernel void @cvt_clamp(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #1 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"47", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"48", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"49" = inttoptr i64 %"12" to ptr addrspace(1) - %"11" = load float, ptr addrspace(1) %"49", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"14") - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"50" = inttoptr 
i64 %"15" to ptr addrspace(1) - store float %"16", ptr addrspace(1) %"50", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"51" = inttoptr i64 %"18" to ptr addrspace(1) + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"48" = inttoptr i64 %"11" to ptr addrspace(1) + %"10" = load float, ptr addrspace(1) %"48", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"13") + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"49" = inttoptr i64 %"14" to ptr addrspace(1) + store float %"15", ptr addrspace(1) %"49", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"50" = inttoptr i64 %"17" to ptr addrspace(1) + %"60" = getelementptr inbounds i8, ptr addrspace(1) %"50", i64 4 + %"16" = load float, ptr addrspace(1) %"60", align 4 + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load float, ptr addrspace(5) %"6", align 4 + %"18" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"19") + store float %"18", ptr addrspace(5) %"6", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load float, ptr addrspace(5) %"6", align 4 + %"51" = inttoptr i64 %"20" to ptr addrspace(1) %"62" = getelementptr inbounds i8, ptr addrspace(1) %"51", i64 4 - %"17" = load float, ptr addrspace(1) %"62", align 4 - store float %"17", ptr addrspace(5) %"6", align 4 - %"20" = load float, ptr addrspace(5) %"6", align 4 - %"19" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"20") - store float %"19", ptr addrspace(5) %"6", align 4 - %"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load float, ptr addrspace(5) %"6", align 4 - %"52" = inttoptr i64 %"21" to ptr addrspace(1) - %"64" = getelementptr inbounds i8, ptr addrspace(1) %"52", i64 4 - store float 
%"22", ptr addrspace(1) %"64", align 4 - %"24" = load i64, ptr addrspace(5) %"4", align 8 - %"53" = inttoptr i64 %"24" to ptr addrspace(1) + store float %"21", ptr addrspace(1) %"62", align 4 + %"23" = load i64, ptr addrspace(5) %"4", align 8 + %"52" = inttoptr i64 %"23" to ptr addrspace(1) + %"64" = getelementptr inbounds i8, ptr addrspace(1) %"52", i64 8 + %"22" = load float, ptr addrspace(1) %"64", align 4 + store float %"22", ptr addrspace(5) %"6", align 4 + %"25" = load float, ptr addrspace(5) %"6", align 4 + %"24" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"25") + store float %"24", ptr addrspace(5) %"6", align 4 + %"26" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = load float, ptr addrspace(5) %"6", align 4 + %"53" = inttoptr i64 %"26" to ptr addrspace(1) %"66" = getelementptr inbounds i8, ptr addrspace(1) %"53", i64 8 - %"23" = load float, ptr addrspace(1) %"66", align 4 - store float %"23", ptr addrspace(5) %"6", align 4 - %"26" = load float, ptr addrspace(5) %"6", align 4 - %"25" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"26") - store float %"25", ptr addrspace(5) %"6", align 4 - %"27" = load i64, ptr addrspace(5) %"5", align 8 - %"28" = load float, ptr addrspace(5) %"6", align 4 - %"54" = inttoptr i64 %"27" to ptr addrspace(1) - %"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 - store float %"28", ptr addrspace(1) %"68", align 4 - %"30" = load i64, ptr addrspace(5) %"4", align 8 - %"55" = inttoptr i64 %"30" to ptr addrspace(1) + store float %"27", ptr addrspace(1) %"66", align 4 + %"29" = load i64, ptr addrspace(5) %"4", align 8 + %"54" = inttoptr i64 %"29" to ptr addrspace(1) + %"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 12 + %"28" = load float, ptr addrspace(1) %"68", align 4 + store float %"28", ptr addrspace(5) %"6", align 4 + %"31" = load float, ptr addrspace(5) %"6", align 4 + %"30" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"31") + store float %"30", ptr addrspace(5) 
%"6", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load float, ptr addrspace(5) %"6", align 4 + %"55" = inttoptr i64 %"32" to ptr addrspace(1) %"70" = getelementptr inbounds i8, ptr addrspace(1) %"55", i64 12 - %"29" = load float, ptr addrspace(1) %"70", align 4 - store float %"29", ptr addrspace(5) %"6", align 4 - %"32" = load float, ptr addrspace(5) %"6", align 4 - %"31" = call float @__zluda_ptx_impl__cvt_sat_f32_f32(float %"32") - store float %"31", ptr addrspace(5) %"6", align 4 - %"33" = load i64, ptr addrspace(5) %"5", align 8 - %"34" = load float, ptr addrspace(5) %"6", align 4 - %"56" = inttoptr i64 %"33" to ptr addrspace(1) - %"72" = getelementptr inbounds i8, ptr addrspace(1) %"56", i64 12 - store float %"34", ptr addrspace(1) %"72", align 4 + store float %"33", ptr addrspace(1) %"70", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_f32_f16.ll b/ptx/src/test/spirv_run/cvt_f32_f16.ll index 169eb59..7379876 100644 --- a/ptx/src/test/spirv_run/cvt_f32_f16.ll +++ b/ptx/src/test/spirv_run/cvt_f32_f16.ll @@ -1,32 +1,32 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_f32_f16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"23": +define protected amdgpu_kernel void @cvt_f32_f16(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca half, align 2, addrspace(5) %"7" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 
+ %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr addrspace(1) - %"20" = load i16, ptr addrspace(1) %"21", align 2 - %"12" = bitcast i16 %"20" to half - store half %"12", ptr addrspace(5) %"6", align 2 - %"15" = load half, ptr addrspace(5) %"6", align 2 - %"14" = fpext half %"15" to float - store float %"14", ptr addrspace(5) %"7", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load float, ptr addrspace(5) %"7", align 4 - %"22" = inttoptr i64 %"16" to ptr - store float %"17", ptr %"22", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr addrspace(1) + %"19" = load i16, ptr addrspace(1) %"20", align 2 + %"11" = bitcast i16 %"19" to half + store half %"11", ptr addrspace(5) %"6", align 2 + %"14" = load half, ptr addrspace(5) %"6", align 2 + %"13" = fpext half %"14" to float + store float %"13", ptr addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load float, ptr addrspace(5) %"7", align 4 + %"21" = inttoptr i64 %"15" to ptr + store float %"16", ptr %"21", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_f32_s32.ll b/ptx/src/test/spirv_run/cvt_f32_s32.ll index 119d052..90b0e4a 100644 --- a/ptx/src/test/spirv_run/cvt_f32_s32.ll +++ b/ptx/src/test/spirv_run/cvt_f32_s32.ll @@ -9,80 +9,80 @@ declare float @__zluda_ptx_impl__cvt_rp_f32_s32(i32) #0 declare float @__zluda_ptx_impl__cvt_rz_f32_s32(i32) #0 -define protected amdgpu_kernel void @cvt_f32_s32(ptr addrspace(4) byref(i64) %"50", ptr addrspace(4) byref(i64) %"51") #1 { -"76": +define protected amdgpu_kernel void 
@cvt_f32_s32(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #1 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"49", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 %"12" = load i64, ptr addrspace(4) %"50", align 8 - store i64 %"12", ptr addrspace(5) %"4", align 8 - %"13" = load i64, ptr addrspace(4) %"51", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"53" = inttoptr i64 %"15" to ptr - %"52" = load i32, ptr %"53", align 4 - store i32 %"52", ptr addrspace(5) %"6", align 4 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"54" = inttoptr i64 %"17" to ptr - %"90" = getelementptr inbounds i8, ptr %"54", i64 4 - %"55" = load i32, ptr %"90", align 4 - store i32 %"55", ptr addrspace(5) %"7", align 4 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"56" = inttoptr i64 %"19" to ptr - %"92" = getelementptr inbounds i8, ptr %"56", i64 8 - %"57" = load i32, ptr %"92", align 4 - store i32 %"57", ptr addrspace(5) %"8", align 4 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"58" = inttoptr i64 %"21" to ptr - %"94" = getelementptr inbounds i8, ptr %"58", i64 12 - %"59" = load i32, ptr %"94", align 4 - store i32 %"59", ptr addrspace(5) %"9", align 4 - %"23" = load i32, ptr addrspace(5) %"6", align 4 - %"60" = call float @__zluda_ptx_impl__cvt_rn_f32_s32(i32 %"23") - %"22" = bitcast float %"60" to i32 - store i32 %"22", ptr addrspace(5) %"6", align 4 - %"25" = load i32, ptr 
addrspace(5) %"7", align 4 - %"62" = call float @__zluda_ptx_impl__cvt_rz_f32_s32(i32 %"25") - %"24" = bitcast float %"62" to i32 - store i32 %"24", ptr addrspace(5) %"7", align 4 - %"27" = load i32, ptr addrspace(5) %"8", align 4 - %"64" = call float @__zluda_ptx_impl__cvt_rm_f32_s32(i32 %"27") - %"26" = bitcast float %"64" to i32 - store i32 %"26", ptr addrspace(5) %"8", align 4 - %"29" = load i32, ptr addrspace(5) %"9", align 4 - %"66" = call float @__zluda_ptx_impl__cvt_rp_f32_s32(i32 %"29") - %"28" = bitcast float %"66" to i32 - store i32 %"28", ptr addrspace(5) %"9", align 4 - %"30" = load i64, ptr addrspace(5) %"5", align 8 - %"31" = load i32, ptr addrspace(5) %"6", align 4 - %"68" = inttoptr i64 %"30" to ptr addrspace(1) - %"69" = bitcast i32 %"31" to float - store float %"69", ptr addrspace(1) %"68", align 4 - %"32" = load i64, ptr addrspace(5) %"5", align 8 - %"33" = load i32, ptr addrspace(5) %"7", align 4 - %"70" = inttoptr i64 %"32" to ptr addrspace(1) - %"96" = getelementptr inbounds i8, ptr addrspace(1) %"70", i64 4 - %"71" = bitcast i32 %"33" to float - store float %"71", ptr addrspace(1) %"96", align 4 - %"34" = load i64, ptr addrspace(5) %"5", align 8 - %"35" = load i32, ptr addrspace(5) %"8", align 4 - %"72" = inttoptr i64 %"34" to ptr addrspace(1) - %"98" = getelementptr inbounds i8, ptr addrspace(1) %"72", i64 8 - %"73" = bitcast i32 %"35" to float - store float %"73", ptr addrspace(1) %"98", align 4 - %"36" = load i64, ptr addrspace(5) %"5", align 8 - %"37" = load i32, ptr addrspace(5) %"9", align 4 - %"74" = inttoptr i64 %"36" to ptr addrspace(1) - %"100" = getelementptr inbounds i8, ptr addrspace(1) %"74", i64 12 - %"75" = bitcast i32 %"37" to float - store float %"75", ptr addrspace(1) %"100", align 4 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"52" = inttoptr i64 %"14" to ptr + %"51" = load i32, ptr %"52", align 4 + store i32 %"51", ptr addrspace(5) %"6", align 4 + %"16" = load 
i64, ptr addrspace(5) %"4", align 8 + %"53" = inttoptr i64 %"16" to ptr + %"88" = getelementptr inbounds i8, ptr %"53", i64 4 + %"54" = load i32, ptr %"88", align 4 + store i32 %"54", ptr addrspace(5) %"7", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"55" = inttoptr i64 %"18" to ptr + %"90" = getelementptr inbounds i8, ptr %"55", i64 8 + %"56" = load i32, ptr %"90", align 4 + store i32 %"56", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"57" = inttoptr i64 %"20" to ptr + %"92" = getelementptr inbounds i8, ptr %"57", i64 12 + %"58" = load i32, ptr %"92", align 4 + store i32 %"58", ptr addrspace(5) %"9", align 4 + %"22" = load i32, ptr addrspace(5) %"6", align 4 + %"59" = call float @__zluda_ptx_impl__cvt_rn_f32_s32(i32 %"22") + %"21" = bitcast float %"59" to i32 + store i32 %"21", ptr addrspace(5) %"6", align 4 + %"24" = load i32, ptr addrspace(5) %"7", align 4 + %"61" = call float @__zluda_ptx_impl__cvt_rz_f32_s32(i32 %"24") + %"23" = bitcast float %"61" to i32 + store i32 %"23", ptr addrspace(5) %"7", align 4 + %"26" = load i32, ptr addrspace(5) %"8", align 4 + %"63" = call float @__zluda_ptx_impl__cvt_rm_f32_s32(i32 %"26") + %"25" = bitcast float %"63" to i32 + store i32 %"25", ptr addrspace(5) %"8", align 4 + %"28" = load i32, ptr addrspace(5) %"9", align 4 + %"65" = call float @__zluda_ptx_impl__cvt_rp_f32_s32(i32 %"28") + %"27" = bitcast float %"65" to i32 + store i32 %"27", ptr addrspace(5) %"9", align 4 + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i32, ptr addrspace(5) %"6", align 4 + %"67" = inttoptr i64 %"29" to ptr addrspace(1) + %"68" = bitcast i32 %"30" to float + store float %"68", ptr addrspace(1) %"67", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i32, ptr addrspace(5) %"7", align 4 + %"69" = inttoptr i64 %"31" to ptr addrspace(1) + %"94" = getelementptr inbounds i8, ptr addrspace(1) %"69", i64 4 + %"70" = bitcast i32 %"32" to float + store 
float %"70", ptr addrspace(1) %"94", align 4 + %"33" = load i64, ptr addrspace(5) %"5", align 8 + %"34" = load i32, ptr addrspace(5) %"8", align 4 + %"71" = inttoptr i64 %"33" to ptr addrspace(1) + %"96" = getelementptr inbounds i8, ptr addrspace(1) %"71", i64 8 + %"72" = bitcast i32 %"34" to float + store float %"72", ptr addrspace(1) %"96", align 4 + %"35" = load i64, ptr addrspace(5) %"5", align 8 + %"36" = load i32, ptr addrspace(5) %"9", align 4 + %"73" = inttoptr i64 %"35" to ptr addrspace(1) + %"98" = getelementptr inbounds i8, ptr addrspace(1) %"73", i64 12 + %"74" = bitcast i32 %"36" to float + store float %"74", ptr addrspace(1) %"98", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_f64_f32.ll b/ptx/src/test/spirv_run/cvt_f64_f32.ll index f608ed1..64b4bb8 100644 --- a/ptx/src/test/spirv_run/cvt_f64_f32.ll +++ b/ptx/src/test/spirv_run/cvt_f64_f32.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"22": +define protected amdgpu_kernel void @cvt_f64_f32(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca double, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr 
addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"13" to ptr addrspace(1) - %"12" = load float, ptr addrspace(1) %"20", align 4 - store float %"12", ptr addrspace(5) %"6", align 4 - %"15" = load float, ptr addrspace(5) %"6", align 4 - %"14" = fpext float %"15" to double - store double %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load double, ptr addrspace(5) %"7", align 8 - %"21" = inttoptr i64 %"16" to ptr - store double %"17", ptr %"21", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr addrspace(1) + %"11" = load float, ptr addrspace(1) %"19", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load float, ptr addrspace(5) %"6", align 4 + %"13" = fpext float %"14" to double + store double %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load double, ptr addrspace(5) %"7", align 8 + %"20" = inttoptr i64 %"15" to ptr + store double %"16", ptr %"20", align 8 ret void } diff --git a/ptx/src/test/spirv_run/cvt_rni.ll b/ptx/src/test/spirv_run/cvt_rni.ll index fa56dfa..77d2999 100644 --- a/ptx/src/test/spirv_run/cvt_rni.ll +++ b/ptx/src/test/spirv_run/cvt_rni.ll @@ -1,44 +1,44 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { -"34": +define protected amdgpu_kernel void @cvt_rni(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { %"8" = alloca i1, align 1, addrspace(5) - store 
i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"30" = inttoptr i64 %"13" to ptr - %"12" = load float, ptr %"30", align 4 - store float %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"15" to ptr - %"36" = getelementptr inbounds i8, ptr %"31", i64 4 - %"14" = load float, ptr %"36", align 4 - store float %"14", ptr addrspace(5) %"7", align 4 - %"17" = load float, ptr addrspace(5) %"6", align 4 - %"16" = call float @llvm.rint.f32(float %"17") - store float %"16", ptr addrspace(5) %"6", align 4 - %"19" = load float, ptr addrspace(5) %"7", align 4 - %"18" = call float @llvm.rint.f32(float %"19") - store float %"18", ptr addrspace(5) %"7", align 4 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load float, ptr addrspace(5) %"6", align 4 - %"32" = inttoptr i64 %"20" to ptr - store float %"21", ptr %"32", align 4 - %"22" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = load float, ptr addrspace(5) %"7", align 4 - %"33" = inttoptr i64 %"22" to ptr - %"38" = getelementptr inbounds i8, ptr %"33", i64 4 - store float %"23", ptr %"38", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"29" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"29", align 4 + 
store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"14" to ptr + %"34" = getelementptr inbounds i8, ptr %"30", i64 4 + %"13" = load float, ptr %"34", align 4 + store float %"13", ptr addrspace(5) %"7", align 4 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"15" = call float @llvm.rint.f32(float %"16") + store float %"15", ptr addrspace(5) %"6", align 4 + %"18" = load float, ptr addrspace(5) %"7", align 4 + %"17" = call float @llvm.rint.f32(float %"18") + store float %"17", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"31" = inttoptr i64 %"19" to ptr + store float %"20", ptr %"31", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load float, ptr addrspace(5) %"7", align 4 + %"32" = inttoptr i64 %"21" to ptr + %"36" = getelementptr inbounds i8, ptr %"32", i64 4 + store float %"22", ptr %"36", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_rzi.ll b/ptx/src/test/spirv_run/cvt_rzi.ll index ad4a305..e651db5 100644 --- a/ptx/src/test/spirv_run/cvt_rzi.ll +++ b/ptx/src/test/spirv_run/cvt_rzi.ll @@ -1,44 +1,44 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { -"34": +define protected amdgpu_kernel void @cvt_rzi(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, 
align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"30" = inttoptr i64 %"13" to ptr - %"12" = load float, ptr %"30", align 4 - store float %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"15" to ptr - %"36" = getelementptr inbounds i8, ptr %"31", i64 4 - %"14" = load float, ptr %"36", align 4 - store float %"14", ptr addrspace(5) %"7", align 4 - %"17" = load float, ptr addrspace(5) %"6", align 4 - %"16" = call float @llvm.trunc.f32(float %"17") - store float %"16", ptr addrspace(5) %"6", align 4 - %"19" = load float, ptr addrspace(5) %"7", align 4 - %"18" = call float @llvm.trunc.f32(float %"19") - store float %"18", ptr addrspace(5) %"7", align 4 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load float, ptr addrspace(5) %"6", align 4 - %"32" = inttoptr i64 %"20" to ptr - store float %"21", ptr %"32", align 4 - %"22" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = load float, ptr addrspace(5) %"7", align 4 - %"33" = inttoptr i64 %"22" to ptr - %"38" = getelementptr inbounds i8, ptr %"33", i64 4 - store float %"23", ptr %"38", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"29" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"29", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"14" to ptr + %"34" = getelementptr inbounds i8, ptr %"30", i64 4 + %"13" = load float, ptr %"34", align 4 
+ store float %"13", ptr addrspace(5) %"7", align 4 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"15" = call float @llvm.trunc.f32(float %"16") + store float %"15", ptr addrspace(5) %"6", align 4 + %"18" = load float, ptr addrspace(5) %"7", align 4 + %"17" = call float @llvm.trunc.f32(float %"18") + store float %"17", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load float, ptr addrspace(5) %"6", align 4 + %"31" = inttoptr i64 %"19" to ptr + store float %"20", ptr %"31", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load float, ptr addrspace(5) %"7", align 4 + %"32" = inttoptr i64 %"21" to ptr + %"36" = getelementptr inbounds i8, ptr %"32", i64 4 + store float %"22", ptr %"36", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_s16_s8.ll b/ptx/src/test/spirv_run/cvt_s16_s8.ll index dcf4555..6f49cea 100644 --- a/ptx/src/test/spirv_run/cvt_s16_s8.ll +++ b/ptx/src/test/spirv_run/cvt_s16_s8.ll @@ -1,33 +1,33 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @cvt_s16_s8(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr 
addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"13" to ptr addrspace(1) - %"12" = load i32, ptr addrspace(1) %"20", align 4 - store i32 %"12", ptr addrspace(5) %"7", align 4 - %"15" = load i32, ptr addrspace(5) %"7", align 4 - %"26" = trunc i32 %"15" to i8 - %"21" = sext i8 %"26" to i16 - %"14" = sext i16 %"21" to i32 - store i32 %"14", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"23" = inttoptr i64 %"16" to ptr - store i32 %"17", ptr %"23", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr addrspace(1) + %"11" = load i32, ptr addrspace(1) %"19", align 4 + store i32 %"11", ptr addrspace(5) %"7", align 4 + %"14" = load i32, ptr addrspace(5) %"7", align 4 + %"24" = trunc i32 %"14" to i8 + %"20" = sext i8 %"24" to i16 + %"13" = sext i16 %"20" to i32 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"22" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"22", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_s32_f32.ll b/ptx/src/test/spirv_run/cvt_s32_f32.ll index b8f8b2b..e8b8bc1 100644 --- a/ptx/src/test/spirv_run/cvt_s32_f32.ll +++ b/ptx/src/test/spirv_run/cvt_s32_f32.ll @@ -3,48 +3,48 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float) #0 -define protected amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { -"42": +define protected amdgpu_kernel void @cvt_s32_f32(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) 
byref(i64) %"28") #1 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"13" to ptr - %"30" = load float, ptr %"31", align 4 - %"12" = bitcast float %"30" to i32 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"32" = inttoptr i64 %"15" to ptr - %"47" = getelementptr inbounds i8, ptr %"32", i64 4 - %"33" = load float, ptr %"47", align 4 - %"14" = bitcast float %"33" to i32 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"35" = bitcast i32 %"17" to float - %"34" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"35") - store i32 %"34", ptr addrspace(5) %"6", align 4 - %"19" = load i32, ptr addrspace(5) %"7", align 4 - %"37" = bitcast i32 %"19" to float - %"36" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"37") - store i32 %"36", ptr addrspace(5) %"7", align 4 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load i32, ptr addrspace(5) %"6", align 4 - %"38" = inttoptr i64 %"20" to ptr addrspace(1) - store i32 %"21", ptr addrspace(1) %"38", align 4 - %"22" = load i64, ptr addrspace(5) %"5", align 8 - %"23" = load i32, ptr addrspace(5) %"7", align 4 - %"40" = inttoptr i64 %"22" to ptr addrspace(1) - 
%"49" = getelementptr inbounds i8, ptr addrspace(1) %"40", i64 4 - store i32 %"23", ptr addrspace(1) %"49", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"12" to ptr + %"29" = load float, ptr %"30", align 4 + %"11" = bitcast float %"29" to i32 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"14" to ptr + %"45" = getelementptr inbounds i8, ptr %"31", i64 4 + %"32" = load float, ptr %"45", align 4 + %"13" = bitcast float %"32" to i32 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"34" = bitcast i32 %"16" to float + %"33" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"34") + store i32 %"33", ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"36" = bitcast i32 %"18" to float + %"35" = call i32 @__zluda_ptx_impl__cvt_rp_s32_f32(float %"36") + store i32 %"35", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"6", align 4 + %"37" = inttoptr i64 %"19" to ptr addrspace(1) + store i32 %"20", ptr addrspace(1) %"37", align 4 + %"21" = load i64, ptr addrspace(5) %"5", align 8 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"39" = inttoptr i64 %"21" to ptr addrspace(1) + %"47" = getelementptr inbounds i8, ptr addrspace(1) %"39", i64 4 + store i32 %"22", ptr addrspace(1) %"47", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_s64_s32.ll b/ptx/src/test/spirv_run/cvt_s64_s32.ll index a272a4c..799b90a 100644 --- a/ptx/src/test/spirv_run/cvt_s64_s32.ll +++ b/ptx/src/test/spirv_run/cvt_s64_s32.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = 
"amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @cvt_s64_s32(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"20" = load i32, ptr %"21", align 4 - store i32 %"20", ptr addrspace(5) %"6", align 4 - %"15" = load i32, ptr addrspace(5) %"6", align 4 - %"14" = sext i32 %"15" to i64 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"19" = load i32, ptr %"20", align 4 + store i32 %"19", ptr addrspace(5) %"6", align 4 + %"14" = load i32, ptr addrspace(5) %"6", align 4 + %"13" = sext i32 %"14" to i64 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 
%"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/cvt_sat_s_u.ll b/ptx/src/test/spirv_run/cvt_sat_s_u.ll index 946ece1..5e8d015 100644 --- a/ptx/src/test/spirv_run/cvt_sat_s_u.ll +++ b/ptx/src/test/spirv_run/cvt_sat_s_u.ll @@ -1,50 +1,50 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { -"35": +define protected amdgpu_kernel void @cvt_sat_s_u(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) - %"11" = load i64, ptr addrspace(4) %"27", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"29" = inttoptr i64 %"14" to ptr - %"13" = load i32, ptr %"29", align 4 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %0 = call i32 @llvm.smax.i32(i32 %"16", i32 0) %1 = alloca i32, align 4, addrspace(5) - store i32 %0, ptr addrspace(5) %1, align 4 - %"15" = load i32, ptr addrspace(5) %1, align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 %2 = alloca i32, align 4, addrspace(5) - store i32 %"18", ptr addrspace(5) %2, align 4 - %"30" = load i32, ptr addrspace(5) %2, align 4 - 
store i32 %"30", ptr addrspace(5) %"7", align 4 - %"20" = load i32, ptr addrspace(5) %"6", align 4 %3 = alloca i32, align 4, addrspace(5) - store i32 %"20", ptr addrspace(5) %3, align 4 - %"31" = load i32, ptr addrspace(5) %3, align 4 - store i32 %"31", ptr addrspace(5) %"8", align 4 - %"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load i32, ptr addrspace(5) %"7", align 4 - %"32" = inttoptr i64 %"21" to ptr - store i32 %"22", ptr %"32", align 4 - %"23" = load i64, ptr addrspace(5) %"5", align 8 - %"24" = load i32, ptr addrspace(5) %"8", align 4 - %"34" = inttoptr i64 %"23" to ptr - %"37" = getelementptr inbounds i8, ptr %"34", i64 4 - store i32 %"24", ptr %"37", align 4 + br label %4 + +4: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"26", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 + %"11" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"28" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"28", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %5 = call i32 @llvm.smax.i32(i32 %"15", i32 0) + store i32 %5, ptr addrspace(5) %1, align 4 + %"14" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + store i32 %"17", ptr addrspace(5) %2, align 4 + %"29" = load i32, ptr addrspace(5) %2, align 4 + store i32 %"29", ptr addrspace(5) %"7", align 4 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + store i32 %"19", ptr addrspace(5) %3, align 4 + %"30" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"30", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"31" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"31", align 4 + %"22" = load i64, ptr 
addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"33" = inttoptr i64 %"22" to ptr + %"35" = getelementptr inbounds i8, ptr %"33", i64 4 + store i32 %"23", ptr %"35", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvt_u32_s16.ll b/ptx/src/test/spirv_run/cvt_u32_s16.ll index 7ab8366..1b868a5 100644 --- a/ptx/src/test/spirv_run/cvt_u32_s16.ll +++ b/ptx/src/test/spirv_run/cvt_u32_s16.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @cvt_u32_s16(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i16, align 2, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"13" to ptr addrspace(1) - %"12" = load i16, ptr addrspace(1) %"20", align 2 - store i16 %"12", ptr addrspace(5) %"6", align 2 - %"15" = load i16, ptr addrspace(5) %"6", align 2 - %"21" = sext i16 %"15" to i32 - store i32 %"21", ptr addrspace(5) %"7", align 4 
- %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i32, ptr addrspace(5) %"7", align 4 - %"23" = inttoptr i64 %"16" to ptr - store i32 %"17", ptr %"23", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr addrspace(1) + %"11" = load i16, ptr addrspace(1) %"19", align 2 + store i16 %"11", ptr addrspace(5) %"6", align 2 + %"14" = load i16, ptr addrspace(5) %"6", align 2 + %"20" = sext i16 %"14" to i32 + store i32 %"20", ptr addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"7", align 4 + %"22" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"22", align 4 ret void } diff --git a/ptx/src/test/spirv_run/cvta.ll b/ptx/src/test/spirv_run/cvta.ll index 8cba990..7b73f8c 100644 --- a/ptx/src/test/spirv_run/cvta.ll +++ b/ptx/src/test/spirv_run/cvta.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"27": +define protected amdgpu_kernel void @cvta(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"9", ptr addrspace(5) 
%"4", align 8 - %"10" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %0 = inttoptr i64 %"12" to ptr - %1 = addrspacecast ptr %0 to ptr addrspace(1) - %"21" = ptrtoint ptr addrspace(1) %1 to i64 - store i64 %"21", ptr addrspace(5) %"4", align 8 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %2 = inttoptr i64 %"14" to ptr + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %2 = inttoptr i64 %"11" to ptr %3 = addrspacecast ptr %2 to ptr addrspace(1) - %"23" = ptrtoint ptr addrspace(1) %3 to i64 - store i64 %"23", ptr addrspace(5) %"5", align 8 - %"16" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = ptrtoint ptr addrspace(1) %3 to i64 + store i64 %"20", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %4 = inttoptr i64 %"13" to ptr + %5 = addrspacecast ptr %4 to ptr addrspace(1) + %"22" = ptrtoint ptr addrspace(1) %5 to i64 + store i64 %"22", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"15" to ptr addrspace(1) + %"14" = load float, ptr addrspace(1) %"24", align 4 + store float %"14", ptr addrspace(5) %"6", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load float, ptr addrspace(5) %"6", align 4 %"25" = inttoptr i64 %"16" to ptr addrspace(1) - %"15" = load float, ptr addrspace(1) %"25", align 4 - store float %"15", ptr addrspace(5) %"6", align 4 - %"17" = load i64, ptr addrspace(5) %"5", align 8 - %"18" = load float, ptr addrspace(5) %"6", align 4 - %"26" = inttoptr i64 %"17" to ptr addrspace(1) - store float %"18", ptr addrspace(1) %"26", align 4 + store float %"17", ptr addrspace(1) %"25", align 4 ret void } diff --git a/ptx/src/test/spirv_run/div_approx.ll b/ptx/src/test/spirv_run/div_approx.ll index 91b3fb7..d4b889f 100644 --- a/ptx/src/test/spirv_run/div_approx.ll +++ 
b/ptx/src/test/spirv_run/div_approx.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @div_approx(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load float, ptr %"25", align 4 - store float %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load float, ptr %"30", align 4 - store float %"14", ptr addrspace(5) %"7", align 4 - %"17" = load float, ptr addrspace(5) %"6", align 4 - %"18" = load float, ptr addrspace(5) %"7", align 4 - %"16" = fdiv arcp afn float %"17", %"18" - store float %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load float, ptr addrspace(5) %"6", align 4 
- %"27" = inttoptr i64 %"19" to ptr - store float %"20", ptr %"27", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"24", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load float, ptr %"28", align 4 + store float %"13", ptr addrspace(5) %"7", align 4 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"17" = load float, ptr addrspace(5) %"7", align 4 + %"15" = fdiv arcp afn float %"16", %"17" + store float %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load float, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store float %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/test/spirv_run/dp4a.ll b/ptx/src/test/spirv_run/dp4a.ll index f55aa62..97f4098 100644 --- a/ptx/src/test/spirv_run/dp4a.ll +++ b/ptx/src/test/spirv_run/dp4a.ll @@ -3,44 +3,44 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__dp4a_s32_s32(i32, i32, i32) #0 -define protected amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #1 { -"39": +define protected amdgpu_kernel void @dp4a(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"28", align 8 + 
store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"14" to ptr - %"13" = load i32, ptr %"31", align 4 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"32" = inttoptr i64 %"16" to ptr - %"46" = getelementptr inbounds i8, ptr %"32", i64 4 - %"15" = load i32, ptr %"46", align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"33" = inttoptr i64 %"18" to ptr - %"48" = getelementptr inbounds i8, ptr %"33", i64 8 - %"17" = load i32, ptr %"48", align 4 - store i32 %"17", ptr addrspace(5) %"8", align 4 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"21" = load i32, ptr addrspace(5) %"7", align 4 - %"22" = load i32, ptr addrspace(5) %"8", align 4 - %"34" = call i32 @__zluda_ptx_impl__dp4a_s32_s32(i32 %"20", i32 %"21", i32 %"22") - store i32 %"34", ptr addrspace(5) %"6", align 4 - %"23" = load i64, ptr addrspace(5) %"5", align 8 - %"24" = load i32, ptr addrspace(5) %"6", align 4 - %"38" = inttoptr i64 %"23" to ptr - store i32 %"24", ptr %"38", align 4 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"30", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"15" to ptr + %"44" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load i32, ptr %"44", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"17" to ptr + %"46" = getelementptr inbounds i8, ptr %"32", i64 8 + %"16" = load i32, ptr %"46", align 4 + store i32 
%"16", ptr addrspace(5) %"8", align 4 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"21" = load i32, ptr addrspace(5) %"8", align 4 + %"33" = call i32 @__zluda_ptx_impl__dp4a_s32_s32(i32 %"19", i32 %"20", i32 %"21") + store i32 %"33", ptr addrspace(5) %"6", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %"37" = inttoptr i64 %"22" to ptr + store i32 %"23", ptr %"37", align 4 ret void } diff --git a/ptx/src/test/spirv_run/ex2.ll b/ptx/src/test/spirv_run/ex2.ll index 8e13d43..aa0c1d5 100644 --- a/ptx/src/test/spirv_run/ex2.ll +++ b/ptx/src/test/spirv_run/ex2.ll @@ -1,69 +1,69 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { -"57": +define protected amdgpu_kernel void @ex2(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"47", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"48", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"49" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"49", align 4 
- store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = call afn float @llvm.exp2.f32(float %"14") - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"50" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"50", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"51" = inttoptr i64 %"18" to ptr + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"48" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"48", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = call afn float @llvm.exp2.f32(float %"13") + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"49" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"49", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"50" = inttoptr i64 %"17" to ptr + %"57" = getelementptr inbounds i8, ptr %"50", i64 4 + %"16" = load float, ptr %"57", align 4 + store float %"16", ptr addrspace(5) %"6", align 4 + %"19" = load float, ptr addrspace(5) %"6", align 4 + %"18" = call afn float @llvm.exp2.f32(float %"19") + store float %"18", ptr addrspace(5) %"6", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load float, ptr addrspace(5) %"6", align 4 + %"51" = inttoptr i64 %"20" to ptr %"59" = getelementptr inbounds i8, ptr %"51", i64 4 - %"17" = load float, ptr %"59", align 4 - store float %"17", ptr addrspace(5) %"6", align 4 - %"20" = load float, ptr addrspace(5) %"6", align 4 - %"19" = call afn float @llvm.exp2.f32(float %"20") - store float %"19", ptr addrspace(5) %"6", align 4 - %"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load float, ptr addrspace(5) %"6", align 4 - %"52" = 
inttoptr i64 %"21" to ptr - %"61" = getelementptr inbounds i8, ptr %"52", i64 4 - store float %"22", ptr %"61", align 4 - %"24" = load i64, ptr addrspace(5) %"4", align 8 - %"53" = inttoptr i64 %"24" to ptr + store float %"21", ptr %"59", align 4 + %"23" = load i64, ptr addrspace(5) %"4", align 8 + %"52" = inttoptr i64 %"23" to ptr + %"61" = getelementptr inbounds i8, ptr %"52", i64 8 + %"22" = load float, ptr %"61", align 4 + store float %"22", ptr addrspace(5) %"6", align 4 + %"25" = load float, ptr addrspace(5) %"6", align 4 + %"24" = call afn float @llvm.exp2.f32(float %"25") + store float %"24", ptr addrspace(5) %"6", align 4 + %"26" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = load float, ptr addrspace(5) %"6", align 4 + %"53" = inttoptr i64 %"26" to ptr %"63" = getelementptr inbounds i8, ptr %"53", i64 8 - %"23" = load float, ptr %"63", align 4 - store float %"23", ptr addrspace(5) %"6", align 4 - %"26" = load float, ptr addrspace(5) %"6", align 4 - %"25" = call afn float @llvm.exp2.f32(float %"26") - store float %"25", ptr addrspace(5) %"6", align 4 - %"27" = load i64, ptr addrspace(5) %"5", align 8 - %"28" = load float, ptr addrspace(5) %"6", align 4 - %"54" = inttoptr i64 %"27" to ptr - %"65" = getelementptr inbounds i8, ptr %"54", i64 8 - store float %"28", ptr %"65", align 4 - %"30" = load i64, ptr addrspace(5) %"4", align 8 - %"55" = inttoptr i64 %"30" to ptr + store float %"27", ptr %"63", align 4 + %"29" = load i64, ptr addrspace(5) %"4", align 8 + %"54" = inttoptr i64 %"29" to ptr + %"65" = getelementptr inbounds i8, ptr %"54", i64 12 + %"28" = load float, ptr %"65", align 4 + store float %"28", ptr addrspace(5) %"6", align 4 + %"31" = load float, ptr addrspace(5) %"6", align 4 + %"30" = call afn float @llvm.exp2.f32(float %"31") + store float %"30", ptr addrspace(5) %"6", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load float, ptr addrspace(5) %"6", align 4 + %"55" = inttoptr i64 %"32" to ptr %"67" = getelementptr 
inbounds i8, ptr %"55", i64 12 - %"29" = load float, ptr %"67", align 4 - store float %"29", ptr addrspace(5) %"6", align 4 - %"32" = load float, ptr addrspace(5) %"6", align 4 - %"31" = call afn float @llvm.exp2.f32(float %"32") - store float %"31", ptr addrspace(5) %"6", align 4 - %"33" = load i64, ptr addrspace(5) %"5", align 8 - %"34" = load float, ptr addrspace(5) %"6", align 4 - %"56" = inttoptr i64 %"33" to ptr - %"69" = getelementptr inbounds i8, ptr %"56", i64 12 - store float %"34", ptr %"69", align 4 + store float %"33", ptr %"67", align 4 ret void } diff --git a/ptx/src/test/spirv_run/extern_shared.ll b/ptx/src/test/spirv_run/extern_shared.ll index 34f1d33..e7d0a21 100644 --- a/ptx/src/test/spirv_run/extern_shared.ll +++ b/ptx/src/test/spirv_run/extern_shared.ll @@ -3,31 +3,31 @@ target triple = "amdgcn-amd-amdhsa" @shared_mem = external hidden addrspace(3) global [0 x i32] -define protected amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @extern_shared(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = inttoptr i64 %"13" to ptr addrspace(1) - %"12" = load i64, ptr addrspace(1) %"20", align 8 
- store i64 %"12", ptr addrspace(5) %"7", align 8 - %"14" = load i64, ptr addrspace(5) %"7", align 8 - store i64 %"14", ptr addrspace(3) @shared_mem, align 8 - %"15" = load i64, ptr addrspace(3) @shared_mem, align 8 - store i64 %"15", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"6", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"23" = inttoptr i64 %"16" to ptr addrspace(1) - store i64 %"17", ptr addrspace(1) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = inttoptr i64 %"12" to ptr addrspace(1) + %"11" = load i64, ptr addrspace(1) %"19", align 8 + store i64 %"11", ptr addrspace(5) %"7", align 8 + %"13" = load i64, ptr addrspace(5) %"7", align 8 + store i64 %"13", ptr addrspace(3) @shared_mem, align 8 + %"14" = load i64, ptr addrspace(3) @shared_mem, align 8 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"15" to ptr addrspace(1) + store i64 %"16", ptr addrspace(1) %"22", align 8 ret void } diff --git a/ptx/src/test/spirv_run/extern_shared_call.ll b/ptx/src/test/spirv_run/extern_shared_call.ll index 241053f..a2b6c10 100644 --- a/ptx/src/test/spirv_run/extern_shared_call.ll +++ b/ptx/src/test/spirv_run/extern_shared_call.ll @@ -3,49 +3,49 @@ target triple = "amdgcn-amd-amdhsa" @shared_mem = external hidden addrspace(3) global [0 x i32], align 4 -define private void @"2"(ptr addrspace(3) %"37") #0 { -"35": +define private void @"2"(ptr addrspace(3) %"33") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"3" = alloca i64, align 8, addrspace(5) - %"14" = load i64, ptr addrspace(3) %"37", align 8 - store i64 %"14", ptr addrspace(5) %"3", align 8 - %"16" = load i64, ptr addrspace(5) %"3", 
align 8 - %"15" = add i64 %"16", 2 - store i64 %"15", ptr addrspace(5) %"3", align 8 - %"17" = load i64, ptr addrspace(5) %"3", align 8 - store i64 %"17", ptr addrspace(3) %"37", align 8 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"12" = load i64, ptr addrspace(3) %"33", align 8 + store i64 %"12", ptr addrspace(5) %"3", align 8 + %"14" = load i64, ptr addrspace(5) %"3", align 8 + %"13" = add i64 %"14", 2 + store i64 %"13", ptr addrspace(5) %"3", align 8 + %"15" = load i64, ptr addrspace(5) %"3", align 8 + store i64 %"15", ptr addrspace(3) %"33", align 8 ret void } -define protected amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #0 { -"36": - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 - %"13" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"13", align 1 +define protected amdgpu_kernel void @extern_shared_call(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { + %"11" = alloca i1, align 1, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) - %"18" = load i64, ptr addrspace(4) %"27", align 8 - store i64 %"18", ptr addrspace(5) %"7", align 8 - %"19" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"19", ptr addrspace(5) %"8", align 8 - %"21" = load i64, ptr addrspace(5) %"7", align 8 - %"31" = inttoptr i64 %"21" to ptr addrspace(1) - %"20" = load i64, ptr addrspace(1) %"31", align 8 - store i64 %"20", ptr addrspace(5) %"9", align 8 - %"22" = load i64, ptr addrspace(5) %"9", align 8 - store i64 %"22", ptr addrspace(3) @shared_mem, align 8 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"16" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"16", ptr addrspace(5) %"7", align 8 + %"17" = load i64, ptr addrspace(4) %"26", align 8 + store i64 
%"17", ptr addrspace(5) %"8", align 8 + %"19" = load i64, ptr addrspace(5) %"7", align 8 + %"29" = inttoptr i64 %"19" to ptr addrspace(1) + %"18" = load i64, ptr addrspace(1) %"29", align 8 + store i64 %"18", ptr addrspace(5) %"9", align 8 + %"20" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"20", ptr addrspace(3) @shared_mem, align 8 call void @"2"(ptr addrspace(3) @shared_mem) - %"23" = load i64, ptr addrspace(3) @shared_mem, align 8 - store i64 %"23", ptr addrspace(5) %"9", align 8 - %"24" = load i64, ptr addrspace(5) %"8", align 8 - %"25" = load i64, ptr addrspace(5) %"9", align 8 - %"34" = inttoptr i64 %"24" to ptr addrspace(1) - store i64 %"25", ptr addrspace(1) %"34", align 8 + %"21" = load i64, ptr addrspace(3) @shared_mem, align 8 + store i64 %"21", ptr addrspace(5) %"9", align 8 + %"22" = load i64, ptr addrspace(5) %"8", align 8 + %"23" = load i64, ptr addrspace(5) %"9", align 8 + %"32" = inttoptr i64 %"22" to ptr addrspace(1) + store i64 %"23", ptr addrspace(1) %"32", align 8 ret void } diff --git a/ptx/src/test/spirv_run/fma.ll b/ptx/src/test/spirv_run/fma.ll index d518432..61ef775 100644 --- a/ptx/src/test/spirv_run/fma.ll +++ b/ptx/src/test/spirv_run/fma.ll @@ -1,44 +1,44 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { -"35": +define protected amdgpu_kernel void @fma(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, 
addrspace(5) %"7" = alloca float, align 4, addrspace(5) %"8" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"28", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"14" to ptr - %"13" = load float, ptr %"31", align 4 - store float %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"32" = inttoptr i64 %"16" to ptr - %"37" = getelementptr inbounds i8, ptr %"32", i64 4 - %"15" = load float, ptr %"37", align 4 - store float %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"33" = inttoptr i64 %"18" to ptr - %"39" = getelementptr inbounds i8, ptr %"33", i64 8 - %"17" = load float, ptr %"39", align 4 - store float %"17", ptr addrspace(5) %"8", align 4 - %"20" = load float, ptr addrspace(5) %"6", align 4 - %"21" = load float, ptr addrspace(5) %"7", align 4 - %"22" = load float, ptr addrspace(5) %"8", align 4 - %"19" = call float @llvm.fma.f32(float %"20", float %"21", float %"22") - store float %"19", ptr addrspace(5) %"6", align 4 - %"23" = load i64, ptr addrspace(5) %"5", align 8 - %"24" = load float, ptr addrspace(5) %"6", align 4 - %"34" = inttoptr i64 %"23" to ptr - store float %"24", ptr %"34", align 4 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"13" to ptr + %"12" = load float, ptr %"30", align 4 + store float %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"31" = inttoptr i64 %"15" to ptr + %"35" = getelementptr inbounds i8, ptr %"31", i64 4 + %"14" = load float, ptr 
%"35", align 4 + store float %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"17" to ptr + %"37" = getelementptr inbounds i8, ptr %"32", i64 8 + %"16" = load float, ptr %"37", align 4 + store float %"16", ptr addrspace(5) %"8", align 4 + %"19" = load float, ptr addrspace(5) %"6", align 4 + %"20" = load float, ptr addrspace(5) %"7", align 4 + %"21" = load float, ptr addrspace(5) %"8", align 4 + %"18" = call float @llvm.fma.f32(float %"19", float %"20", float %"21") + store float %"18", ptr addrspace(5) %"6", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load float, ptr addrspace(5) %"6", align 4 + %"33" = inttoptr i64 %"22" to ptr + store float %"23", ptr %"33", align 4 ret void } diff --git a/ptx/src/test/spirv_run/func_ptr.ll b/ptx/src/test/spirv_run/func_ptr.ll index b7c0603..ad4392b 100644 --- a/ptx/src/test/spirv_run/func_ptr.ll +++ b/ptx/src/test/spirv_run/func_ptr.ll @@ -1,56 +1,56 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define private float @"1"(float %"17", float %"18") #0 { -"40": +define private float @"1"(float %"15", float %"16") #0 { %"3" = alloca float, align 4, addrspace(5) %"4" = alloca float, align 4, addrspace(5) %"2" = alloca float, align 4, addrspace(5) %"13" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + store float %"15", ptr addrspace(5) %"3", align 4 + store float %"16", ptr addrspace(5) %"4", align 4 store i1 false, ptr addrspace(5) %"13", align 1 - %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 - store float %"17", ptr addrspace(5) %"3", align 4 - store float %"18", ptr addrspace(5) %"4", align 4 - %"20" = load float, ptr addrspace(5) %"3", align 4 - %"21" = load float, ptr addrspace(5) 
%"4", align 4 - %"19" = fadd float %"20", %"21" - store float %"19", ptr addrspace(5) %"2", align 4 - %"22" = load float, ptr addrspace(5) %"2", align 4 - ret float %"22" + %"18" = load float, ptr addrspace(5) %"3", align 4 + %"19" = load float, ptr addrspace(5) %"4", align 4 + %"17" = fadd float %"18", %"19" + store float %"17", ptr addrspace(5) %"2", align 4 + %"20" = load float, ptr addrspace(5) %"2", align 4 + ret float %"20" } -define protected amdgpu_kernel void @func_ptr(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { -"41": - %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 - %"16" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"16", align 1 +define protected amdgpu_kernel void @func_ptr(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { + %"14" = alloca i1, align 1, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) %"11" = alloca i64, align 8, addrspace(5) %"12" = alloca i64, align 8, addrspace(5) - %"23" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"23", ptr addrspace(5) %"8", align 8 - %"24" = load i64, ptr addrspace(4) %"37", align 8 - store i64 %"24", ptr addrspace(5) %"9", align 8 - %"26" = load i64, ptr addrspace(5) %"8", align 8 - %"38" = inttoptr i64 %"26" to ptr - %"25" = load i64, ptr %"38", align 8 - store i64 %"25", ptr addrspace(5) %"10", align 8 - %"28" = load i64, ptr addrspace(5) %"10", align 8 - %"27" = add i64 %"28", 1 - store i64 %"27", ptr addrspace(5) %"11", align 8 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"14", align 1 + %"21" = load i64, ptr addrspace(4) %"34", align 8 + store i64 %"21", ptr addrspace(5) %"8", align 8 + %"22" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"22", ptr addrspace(5) %"9", align 8 + %"24" = load i64, ptr addrspace(5) %"8", align 8 + %"36" = inttoptr i64 
%"24" to ptr + %"23" = load i64, ptr %"36", align 8 + store i64 %"23", ptr addrspace(5) %"10", align 8 + %"26" = load i64, ptr addrspace(5) %"10", align 8 + %"25" = add i64 %"26", 1 + store i64 %"25", ptr addrspace(5) %"11", align 8 store i64 ptrtoint (ptr @"1" to i64), ptr addrspace(5) %"12", align 8 - %"31" = load i64, ptr addrspace(5) %"11", align 8 - %"32" = load i64, ptr addrspace(5) %"12", align 8 - %"30" = add i64 %"31", %"32" - store i64 %"30", ptr addrspace(5) %"11", align 8 - %"33" = load i64, ptr addrspace(5) %"9", align 8 - %"34" = load i64, ptr addrspace(5) %"11", align 8 - %"39" = inttoptr i64 %"33" to ptr - store i64 %"34", ptr %"39", align 8 + %"29" = load i64, ptr addrspace(5) %"11", align 8 + %"30" = load i64, ptr addrspace(5) %"12", align 8 + %"28" = add i64 %"29", %"30" + store i64 %"28", ptr addrspace(5) %"11", align 8 + %"31" = load i64, ptr addrspace(5) %"9", align 8 + %"32" = load i64, ptr addrspace(5) %"11", align 8 + %"37" = inttoptr i64 %"31" to ptr + store i64 %"32", ptr %"37", align 8 ret void } diff --git a/ptx/src/test/spirv_run/generic.ll b/ptx/src/test/spirv_run/generic.ll index d746a22..44b4ef9 100644 --- a/ptx/src/test/spirv_run/generic.ll +++ b/ptx/src/test/spirv_run/generic.ll @@ -4,66 +4,66 @@ target triple = "amdgcn-amd-amdhsa" @foo = protected addrspace(1) externally_initialized global [4 x i32] [i32 2, i32 3, i32 5, i32 7] @bar = protected addrspace(1) externally_initialized global [4 x i64] [i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 4), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 8), i64 add (i64 ptrtoint (ptr addrspacecast (ptr addrspace(1) @foo to ptr) to i64), i64 12)] -define protected amdgpu_kernel void @generic(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { -"58": +define protected amdgpu_kernel void @generic(ptr addrspace(4) byref(i64) 
%"46", ptr addrspace(4) byref(i64) %"47") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) - %"12" = load i64, ptr addrspace(4) %"48", align 8 - store i64 %"12", ptr addrspace(5) %"7", align 8 - %0 = alloca i32, align 4, addrspace(5) - store i32 1, ptr addrspace(5) %0, align 4 - %"13" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"13", ptr addrspace(5) %"8", align 4 - %"14" = load i64, ptr addrspace(1) @bar, align 8 - store i64 %"14", ptr addrspace(5) %"6", align 8 - %"16" = load i64, ptr addrspace(5) %"6", align 8 - %"50" = inttoptr i64 %"16" to ptr - %"15" = load i32, ptr %"50", align 4 - store i32 %"15", ptr addrspace(5) %"9", align 4 - %"18" = load i32, ptr addrspace(5) %"8", align 4 - %"19" = load i32, ptr addrspace(5) %"9", align 4 - %"17" = mul i32 %"18", %"19" - store i32 %"17", ptr addrspace(5) %"8", align 4 - %"20" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 8), align 8 - store i64 %"20", ptr addrspace(5) %"6", align 8 - %"22" = load i64, ptr addrspace(5) %"6", align 8 - %"52" = inttoptr i64 %"22" to ptr - %"21" = load i32, ptr %"52", align 4 - store i32 %"21", ptr addrspace(5) %"9", align 4 - %"24" = load i32, ptr addrspace(5) %"8", align 4 - %"25" = load i32, ptr addrspace(5) %"9", align 4 - %"23" = mul i32 %"24", %"25" - store i32 %"23", ptr addrspace(5) %"8", align 4 - %"26" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 16), align 8 - store i64 %"26", ptr addrspace(5) %"6", align 8 - %"28" = load i64, ptr addrspace(5) %"6", align 8 - %"54" = inttoptr i64 %"28" to ptr - %"27" = load i32, ptr %"54", align 4 - store i32 %"27", ptr addrspace(5) %"9", align 4 - %"30" = load 
i32, ptr addrspace(5) %"8", align 4 - %"31" = load i32, ptr addrspace(5) %"9", align 4 - %"29" = mul i32 %"30", %"31" - store i32 %"29", ptr addrspace(5) %"8", align 4 - %"32" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 24), align 8 - store i64 %"32", ptr addrspace(5) %"6", align 8 - %"34" = load i64, ptr addrspace(5) %"6", align 8 - %"56" = inttoptr i64 %"34" to ptr - %"33" = load i32, ptr %"56", align 4 - store i32 %"33", ptr addrspace(5) %"9", align 4 - %"36" = load i32, ptr addrspace(5) %"8", align 4 - %"37" = load i32, ptr addrspace(5) %"9", align 4 - %"35" = mul i32 %"36", %"37" - store i32 %"35", ptr addrspace(5) %"8", align 4 - %"38" = load i64, ptr addrspace(5) %"7", align 8 - %"39" = load i32, ptr addrspace(5) %"8", align 4 - %"57" = inttoptr i64 %"38" to ptr - store i32 %"39", ptr %"57", align 4 + %1 = alloca i32, align 4, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"11", ptr addrspace(5) %"7", align 8 + store i32 1, ptr addrspace(5) %1, align 4 + %"12" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"12", ptr addrspace(5) %"8", align 4 + %"13" = load i64, ptr addrspace(1) @bar, align 8 + store i64 %"13", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"49" = inttoptr i64 %"15" to ptr + %"14" = load i32, ptr %"49", align 4 + store i32 %"14", ptr addrspace(5) %"9", align 4 + %"17" = load i32, ptr addrspace(5) %"8", align 4 + %"18" = load i32, ptr addrspace(5) %"9", align 4 + %"16" = mul i32 %"17", %"18" + store i32 %"16", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 8), align 8 + store i64 %"19", ptr addrspace(5) %"6", align 8 + %"21" = load i64, ptr addrspace(5) %"6", align 8 + %"51" = inttoptr i64 %"21" to ptr + %"20" = load i32, ptr %"51", align 4 + store i32 %"20", ptr 
addrspace(5) %"9", align 4 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"24" = load i32, ptr addrspace(5) %"9", align 4 + %"22" = mul i32 %"23", %"24" + store i32 %"22", ptr addrspace(5) %"8", align 4 + %"25" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 16), align 8 + store i64 %"25", ptr addrspace(5) %"6", align 8 + %"27" = load i64, ptr addrspace(5) %"6", align 8 + %"53" = inttoptr i64 %"27" to ptr + %"26" = load i32, ptr %"53", align 4 + store i32 %"26", ptr addrspace(5) %"9", align 4 + %"29" = load i32, ptr addrspace(5) %"8", align 4 + %"30" = load i32, ptr addrspace(5) %"9", align 4 + %"28" = mul i32 %"29", %"30" + store i32 %"28", ptr addrspace(5) %"8", align 4 + %"31" = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @bar, i64 24), align 8 + store i64 %"31", ptr addrspace(5) %"6", align 8 + %"33" = load i64, ptr addrspace(5) %"6", align 8 + %"55" = inttoptr i64 %"33" to ptr + %"32" = load i32, ptr %"55", align 4 + store i32 %"32", ptr addrspace(5) %"9", align 4 + %"35" = load i32, ptr addrspace(5) %"8", align 4 + %"36" = load i32, ptr addrspace(5) %"9", align 4 + %"34" = mul i32 %"35", %"36" + store i32 %"34", ptr addrspace(5) %"8", align 4 + %"37" = load i64, ptr addrspace(5) %"7", align 8 + %"38" = load i32, ptr addrspace(5) %"8", align 4 + %"56" = inttoptr i64 %"37" to ptr + store i32 %"38", ptr %"56", align 4 ret void } diff --git a/ptx/src/test/spirv_run/global_array.ll b/ptx/src/test/spirv_run/global_array.ll index 3a8da01..59a66ea 100644 --- a/ptx/src/test/spirv_run/global_array.ll +++ b/ptx/src/test/spirv_run/global_array.ll @@ -4,29 +4,29 @@ target triple = "amdgcn-amd-amdhsa" @asdas = protected addrspace(1) externally_initialized global [4 x [2 x i32]] [[2 x i32] [i32 -1, i32 2], [2 x i32] [i32 3, i32 0], [2 x i32] zeroinitializer, [2 x i32] zeroinitializer] @foobar = protected addrspace(1) externally_initialized global [4 x [2 x i64]] [[2 x i64] [i64 -1, i64 2], [2 x i64] [i64 
3, i64 0], [2 x i64] [i64 ptrtoint (ptr addrspace(1) @asdas to i64), i64 0], [2 x i64] zeroinitializer] -define protected amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"22": +define protected amdgpu_kernel void @global_array(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) - %0 = alloca i64, align 8, addrspace(5) - store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %0, align 8 - %"11" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"12" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"12", ptr addrspace(5) %"7", align 8 - %"14" = load i64, ptr addrspace(5) %"6", align 8 + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + store i64 ptrtoint (ptr addrspace(1) @foobar to i64), ptr addrspace(5) %1, align 8 + %"10" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"11" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"11", ptr addrspace(5) %"7", align 8 + %"13" = load i64, ptr addrspace(5) %"6", align 8 + %"19" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i32, ptr addrspace(1) %"19", align 4 + store i32 %"12", ptr addrspace(5) %"8", align 4 + %"14" = load i64, ptr addrspace(5) %"7", align 8 + %"15" = load i32, ptr addrspace(5) %"8", align 4 %"20" = inttoptr i64 %"14" to ptr addrspace(1) - %"13" = load i32, ptr addrspace(1) %"20", align 4 - store i32 %"13", ptr addrspace(5) %"8", align 4 - %"15" = load i64, ptr addrspace(5) %"7", align 8 - %"16" = load i32, ptr addrspace(5) 
%"8", align 4 - %"21" = inttoptr i64 %"15" to ptr addrspace(1) - store i32 %"16", ptr addrspace(1) %"21", align 4 + store i32 %"15", ptr addrspace(1) %"20", align 4 ret void } diff --git a/ptx/src/test/spirv_run/lanemask_lt.ll b/ptx/src/test/spirv_run/lanemask_lt.ll index d36d4a2..cc81383 100644 --- a/ptx/src/test/spirv_run/lanemask_lt.ll +++ b/ptx/src/test/spirv_run/lanemask_lt.ll @@ -3,41 +3,41 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__sreg_lanemask_lt() #0 -define protected amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"28", ptr addrspace(4) byref(i64) %"29") #1 { -"40": +define protected amdgpu_kernel void @lanemask_lt(ptr addrspace(4) byref(i64) %"27", ptr addrspace(4) byref(i64) %"28") #1 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + %1 = alloca i32, align 4, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"14" = load i64, ptr addrspace(4) %"27", align 8 + store i64 %"14", ptr addrspace(5) %"4", align 8 %"15" = load i64, ptr addrspace(4) %"28", align 8 - store i64 %"15", ptr addrspace(5) %"4", align 8 - %"16" = load i64, ptr addrspace(4) %"29", align 8 - store i64 %"16", ptr addrspace(5) %"5", align 8 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"31" = inttoptr i64 %"18" to ptr - %"30" = load i32, ptr %"31", align 4 - store i32 %"30", ptr addrspace(5) %"6", align 4 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"32" = add i32 %"20", 1 - store i32 %"32", ptr addrspace(5) %"7", align 4 - %"12" = call i32 @__zluda_ptx_impl__sreg_lanemask_lt() - %0 = alloca i32, align 4, addrspace(5) - store i32 %"12", ptr 
addrspace(5) %0, align 4 - %"34" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"34", ptr addrspace(5) %"8", align 4 - %"23" = load i32, ptr addrspace(5) %"7", align 4 - %"24" = load i32, ptr addrspace(5) %"8", align 4 - %"35" = add i32 %"23", %"24" - store i32 %"35", ptr addrspace(5) %"7", align 4 - %"25" = load i64, ptr addrspace(5) %"5", align 8 - %"26" = load i32, ptr addrspace(5) %"7", align 4 - %"38" = inttoptr i64 %"25" to ptr - store i32 %"26", ptr %"38", align 4 + store i64 %"15", ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"30" = inttoptr i64 %"17" to ptr + %"29" = load i32, ptr %"30", align 4 + store i32 %"29", ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"31" = add i32 %"19", 1 + store i32 %"31", ptr addrspace(5) %"7", align 4 + %"11" = call i32 @__zluda_ptx_impl__sreg_lanemask_lt() + store i32 %"11", ptr addrspace(5) %1, align 4 + %"33" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"33", ptr addrspace(5) %"8", align 4 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %"34" = add i32 %"22", %"23" + store i32 %"34", ptr addrspace(5) %"7", align 4 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = load i32, ptr addrspace(5) %"7", align 4 + %"37" = inttoptr i64 %"24" to ptr + store i32 %"25", ptr %"37", align 4 ret void } diff --git a/ptx/src/test/spirv_run/ld_st.ll b/ptx/src/test/spirv_run/ld_st.ll index c8d6eb1..4b23120 100644 --- a/ptx/src/test/spirv_run/ld_st.ll +++ b/ptx/src/test/spirv_run/ld_st.ll @@ -1,27 +1,27 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { -"19": +define protected 
amdgpu_kernel void @ld_st(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"14", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"15", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"16", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"16" = inttoptr i64 %"11" to ptr + %"10" = load i64, ptr %"16", align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"6", align 8 %"17" = inttoptr i64 %"12" to ptr - %"11" = load i64, ptr %"17", align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"6", align 8 - %"18" = inttoptr i64 %"13" to ptr - store i64 %"14", ptr %"18", align 8 + store i64 %"13", ptr %"17", align 8 ret void } diff --git a/ptx/src/test/spirv_run/ld_st_implicit.ll b/ptx/src/test/spirv_run/ld_st_implicit.ll index da47ad8..71baa92 100644 --- a/ptx/src/test/spirv_run/ld_st_implicit.ll +++ b/ptx/src/test/spirv_run/ld_st_implicit.ll @@ -1,35 +1,35 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define 
protected amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"23": +define protected amdgpu_kernel void @ld_st_implicit(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %0 = alloca i64, align 8, addrspace(5) - store i64 81985529216486895, ptr addrspace(5) %0, align 8 - %"11" = load i64, ptr addrspace(5) %0, align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 + store i64 81985529216486895, ptr addrspace(5) %1, align 8 + %"10" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr addrspace(1) + %"18" = load float, ptr addrspace(1) %"19", align 4 + %"22" = bitcast float %"18" to i32 + %"11" = zext i32 %"22" to i64 store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 %"20" = inttoptr i64 %"13" to ptr addrspace(1) - %"19" = load float, ptr addrspace(1) %"20", align 4 - %"24" = bitcast float %"19" to i32 - %"12" = zext i32 %"24" to i64 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"14" = load 
i64, ptr addrspace(5) %"5", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"21" = inttoptr i64 %"14" to ptr addrspace(1) - %"26" = trunc i64 %"15" to i32 - %"22" = bitcast i32 %"26" to float - store float %"22", ptr addrspace(1) %"21", align 4 + %"24" = trunc i64 %"14" to i32 + %"21" = bitcast i32 %"24" to float + store float %"21", ptr addrspace(1) %"20", align 4 ret void } diff --git a/ptx/src/test/spirv_run/ld_st_offset.ll b/ptx/src/test/spirv_run/ld_st_offset.ll index 1b020cb..959aa53 100644 --- a/ptx/src/test/spirv_run/ld_st_offset.ll +++ b/ptx/src/test/spirv_run/ld_st_offset.ll @@ -1,38 +1,38 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { -"30": +define protected amdgpu_kernel void @ld_st_offset(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"26", align 4 - store i32 %"12", ptr 
addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"25", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"14" to ptr + %"30" = getelementptr inbounds i8, ptr %"26", i64 4 + %"13" = load i32, ptr %"30", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"7", align 4 %"27" = inttoptr i64 %"15" to ptr - %"32" = getelementptr inbounds i8, ptr %"27", i64 4 - %"14" = load i32, ptr %"32", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i32, ptr addrspace(5) %"7", align 4 - %"28" = inttoptr i64 %"16" to ptr - store i32 %"17", ptr %"28", align 4 - %"18" = load i64, ptr addrspace(5) %"5", align 8 - %"19" = load i32, ptr addrspace(5) %"6", align 4 - %"29" = inttoptr i64 %"18" to ptr - %"34" = getelementptr inbounds i8, ptr %"29", i64 4 - store i32 %"19", ptr %"34", align 4 + store i32 %"16", ptr %"27", align 4 + %"17" = load i64, ptr addrspace(5) %"5", align 8 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"28" = inttoptr i64 %"17" to ptr + %"32" = getelementptr inbounds i8, ptr %"28", i64 4 + store i32 %"18", ptr %"32", align 4 ret void } diff --git a/ptx/src/test/spirv_run/lg2.ll b/ptx/src/test/spirv_run/lg2.ll index 5e29fe2..9e4500e 100644 --- a/ptx/src/test/spirv_run/lg2.ll +++ b/ptx/src/test/spirv_run/lg2.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) 
%"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @lg2(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"19", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = call afn float @llvm.log2.f32(float %"14") - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"18", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = call afn float @llvm.log2.f32(float %"13") + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"19", align 4 ret void } diff --git 
a/ptx/src/test/spirv_run/local_align.ll b/ptx/src/test/spirv_run/local_align.ll index 035d1f7..284a081 100644 --- a/ptx/src/test/spirv_run/local_align.ll +++ b/ptx/src/test/spirv_run/local_align.ll @@ -1,28 +1,28 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { -"20": +define protected amdgpu_kernel void @local_align(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca [8 x i8], align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"15", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 %"10" = load i64, ptr addrspace(4) %"16", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"11" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load i64, ptr addrspace(5) %"5", align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"17", align 8 + store i64 %"11", ptr addrspace(5) %"7", align 8 + %"13" = load i64, ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"7", align 8 %"18" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"18", align 8 - store i64 %"12", ptr addrspace(5) %"7", align 8 - %"14" = load i64, ptr addrspace(5) %"6", align 8 - 
%"15" = load i64, ptr addrspace(5) %"7", align 8 - %"19" = inttoptr i64 %"14" to ptr - store i64 %"15", ptr %"19", align 8 + store i64 %"14", ptr %"18", align 8 ret void } diff --git a/ptx/src/test/spirv_run/mad_hi_cc.ll b/ptx/src/test/spirv_run/mad_hi_cc.ll new file mode 100644 index 0000000..f9a27b4 --- /dev/null +++ b/ptx/src/test/spirv_run/mad_hi_cc.ll @@ -0,0 +1,90 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @mad_hi_cc(ptr addrspace(4) byref(i64) %"60", ptr addrspace(4) byref(i64) %"61") #0 { + %"14" = alloca i1, align 1, addrspace(5) + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + %"11" = alloca i32, align 4, addrspace(5) + %"12" = alloca i32, align 4, addrspace(5) + %"13" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = load i64, ptr addrspace(4) %"60", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"61", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"63" = inttoptr i64 %"18" to ptr + %"62" = load i32, ptr %"63", align 4 + store i32 %"62", ptr addrspace(5) %"8", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"64" = inttoptr i64 %"20" to ptr + %"78" = getelementptr inbounds i8, ptr %"64", i64 4 + %"65" = load i32, ptr %"78", align 4 + store i32 %"65", ptr addrspace(5) %"9", align 4 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + %"66" = inttoptr i64 %"22" to ptr + 
%"80" = getelementptr inbounds i8, ptr %"66", i64 8 + %"21" = load i32, ptr %"80", align 4 + store i32 %"21", ptr addrspace(5) %"10", align 4 + %"25" = load i32, ptr addrspace(5) %"8", align 4 + %"26" = load i32, ptr addrspace(5) %"9", align 4 + %"27" = load i32, ptr addrspace(5) %"10", align 4 + %2 = sext i32 %"25" to i64 + %3 = sext i32 %"26" to i64 + %4 = mul nsw i64 %2, %3 + %5 = lshr i64 %4, 32 + %6 = trunc i64 %5 to i32 + %7 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %6, i32 %"27") + %"23" = extractvalue { i32, i1 } %7, 0 + %"24" = extractvalue { i32, i1 } %7, 1 + store i32 %"23", ptr addrspace(5) %"7", align 4 + store i1 %"24", ptr addrspace(5) %"14", align 1 + %8 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -2) + %"28" = extractvalue { i32, i1 } %8, 0 + %"29" = extractvalue { i32, i1 } %8, 1 + store i32 %"28", ptr addrspace(5) %"6", align 4 + store i1 %"29", ptr addrspace(5) %"14", align 1 + %"31" = load i1, ptr addrspace(5) %"14", align 1 + %9 = zext i1 %"31" to i32 + %"70" = add i32 0, %9 + store i32 %"70", ptr addrspace(5) %"12", align 4 + %10 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -1) + %"32" = extractvalue { i32, i1 } %10, 0 + %"33" = extractvalue { i32, i1 } %10, 1 + store i32 %"32", ptr addrspace(5) %"6", align 4 + store i1 %"33", ptr addrspace(5) %"14", align 1 + %"35" = load i1, ptr addrspace(5) %"14", align 1 + %11 = zext i1 %"35" to i32 + %"71" = add i32 0, %11 + store i32 %"71", ptr addrspace(5) %"13", align 4 + %"36" = load i64, ptr addrspace(5) %"5", align 8 + %"37" = load i32, ptr addrspace(5) %"7", align 4 + %"72" = inttoptr i64 %"36" to ptr + store i32 %"37", ptr %"72", align 4 + %"38" = load i64, ptr addrspace(5) %"5", align 8 + %"39" = load i32, ptr addrspace(5) %"12", align 4 + %"73" = inttoptr i64 %"38" to ptr + %"82" = getelementptr inbounds i8, ptr %"73", i64 4 + store i32 %"39", ptr %"82", align 4 + %"40" = load i64, ptr addrspace(5) %"5", align 8 + %"41" = load i32, ptr addrspace(5) %"13", 
align 4 + %"75" = inttoptr i64 %"40" to ptr + %"84" = getelementptr inbounds i8, ptr %"75", i64 8 + store i32 %"41", ptr %"84", align 4 + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/mad_hi_cc.ptx b/ptx/src/test/spirv_run/mad_hi_cc.ptx new file mode 100644 index 0000000..4a8cac3 --- /dev/null +++ b/ptx/src/test/spirv_run/mad_hi_cc.ptx @@ -0,0 +1,41 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry mad_hi_cc( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 unused; + + .reg .s32 dst1; + .reg .b32 src1; + .reg .b32 src2; + .reg .b32 src3; + + .reg .b32 result_1; + .reg .b32 carry_out_1; + .reg .b32 carry_out_2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + // test valid computational results + ld.s32 src1, [in_addr]; + ld.s32 src2, [in_addr+4]; + ld.b32 src3, [in_addr+8]; + mad.hi.cc.s32 dst1, src1, src2, src3; + + mad.hi.cc.u32 unused, 65536, 65536, 4294967294; // non-overflowing + addc.u32 carry_out_1, 0, 0; // carry_out_1 should be 0 + mad.hi.cc.u32 unused, 65536, 65536, 4294967295; // overflowing + addc.u32 carry_out_2, 0, 0; // carry_out_2 should be 1 + + st.s32 [out_addr], dst1; + st.s32 [out_addr+4], carry_out_1; + st.s32 [out_addr+8], carry_out_2; + ret; +} diff --git a/ptx/src/test/spirv_run/mad_s32.ll b/ptx/src/test/spirv_run/mad_s32.ll index 75a204a..f1c15cf 100644 --- a/ptx/src/test/spirv_run/mad_s32.ll +++ b/ptx/src/test/spirv_run/mad_s32.ll @@ -1,12 +1,8 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"53", ptr addrspace(4) byref(i64) %"54") #0 { -"76": +define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"52", ptr addrspace(4) byref(i64) %"53") #0 { %"13" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"13", align 1 - %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -16,67 +12,71 @@ define protected amdgpu_kernel void @mad_s32(ptr addrspace(4) byref(i64) %"53", %"10" = alloca i32, align 4, addrspace(5) %"11" = alloca i32, align 4, addrspace(5) %"12" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"13", align 1 + %"14" = load i64, ptr addrspace(4) %"52", align 8 + store i64 %"14", ptr addrspace(5) %"4", align 8 %"15" = load i64, ptr addrspace(4) %"53", align 8 - store i64 %"15", ptr addrspace(5) %"4", align 8 - %"16" = load i64, ptr addrspace(4) %"54", align 8 - store i64 %"16", ptr addrspace(5) %"5", align 8 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"56" = inttoptr i64 %"18" to ptr - %"55" = load i32, ptr %"56", align 4 - store i32 %"55", ptr addrspace(5) %"9", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"57" = inttoptr i64 %"20" to ptr - %"78" = getelementptr inbounds i8, ptr %"57", i64 4 - %"58" = load i32, ptr %"78", align 4 - store i32 %"58", ptr addrspace(5) %"10", align 4 - %"22" = load i64, ptr addrspace(5) %"4", align 8 - %"59" = inttoptr i64 %"22" to ptr - %"80" = getelementptr inbounds i8, ptr %"59", i64 8 - %"21" = load i64, ptr %"80", align 8 - store i64 %"21", ptr 
addrspace(5) %"12", align 8 - %"24" = load i64, ptr addrspace(5) %"4", align 8 - %"60" = inttoptr i64 %"24" to ptr - %"82" = getelementptr inbounds i8, ptr %"60", i64 16 - %"61" = load i32, ptr %"82", align 4 - store i32 %"61", ptr addrspace(5) %"11", align 4 - %"26" = load i32, ptr addrspace(5) %"9", align 4 - %"27" = load i32, ptr addrspace(5) %"10", align 4 - %"28" = load i32, ptr addrspace(5) %"11", align 4 - %0 = mul i32 %"26", %"27" - %"25" = add i32 %0, %"28" - store i32 %"25", ptr addrspace(5) %"6", align 4 - %"30" = load i32, ptr addrspace(5) %"9", align 4 - %"31" = load i32, ptr addrspace(5) %"10", align 4 - %"32" = load i32, ptr addrspace(5) %"11", align 4 - %1 = sext i32 %"30" to i64 - %2 = sext i32 %"31" to i64 - %3 = mul nsw i64 %1, %2 - %4 = lshr i64 %3, 32 - %5 = trunc i64 %4 to i32 - %"29" = add i32 %5, %"32" - store i32 %"29", ptr addrspace(5) %"7", align 4 - %"34" = load i32, ptr addrspace(5) %"9", align 4 - %"35" = load i32, ptr addrspace(5) %"10", align 4 - %"36" = load i64, ptr addrspace(5) %"12", align 8 - %6 = sext i32 %"34" to i64 - %7 = sext i32 %"35" to i64 - %8 = mul nsw i64 %6, %7 - %"68" = add i64 %8, %"36" - store i64 %"68", ptr addrspace(5) %"8", align 8 - %"37" = load i64, ptr addrspace(5) %"5", align 8 - %"38" = load i32, ptr addrspace(5) %"6", align 4 - %"72" = inttoptr i64 %"37" to ptr - store i32 %"38", ptr %"72", align 4 - %"39" = load i64, ptr addrspace(5) %"5", align 8 - %"40" = load i32, ptr addrspace(5) %"7", align 4 - %"73" = inttoptr i64 %"39" to ptr - %"84" = getelementptr inbounds i8, ptr %"73", i64 8 - store i32 %"40", ptr %"84", align 4 - %"41" = load i64, ptr addrspace(5) %"5", align 8 - %"42" = load i64, ptr addrspace(5) %"8", align 8 - %"74" = inttoptr i64 %"41" to ptr - %"86" = getelementptr inbounds i8, ptr %"74", i64 16 - store i64 %"42", ptr %"86", align 8 + store i64 %"15", ptr addrspace(5) %"5", align 8 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"55" = inttoptr i64 %"17" to ptr + %"54" = load i32, 
ptr %"55", align 4 + store i32 %"54", ptr addrspace(5) %"9", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"56" = inttoptr i64 %"19" to ptr + %"76" = getelementptr inbounds i8, ptr %"56", i64 4 + %"57" = load i32, ptr %"76", align 4 + store i32 %"57", ptr addrspace(5) %"10", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"58" = inttoptr i64 %"21" to ptr + %"78" = getelementptr inbounds i8, ptr %"58", i64 8 + %"20" = load i64, ptr %"78", align 8 + store i64 %"20", ptr addrspace(5) %"12", align 8 + %"23" = load i64, ptr addrspace(5) %"4", align 8 + %"59" = inttoptr i64 %"23" to ptr + %"80" = getelementptr inbounds i8, ptr %"59", i64 16 + %"60" = load i32, ptr %"80", align 4 + store i32 %"60", ptr addrspace(5) %"11", align 4 + %"25" = load i32, ptr addrspace(5) %"9", align 4 + %"26" = load i32, ptr addrspace(5) %"10", align 4 + %"27" = load i32, ptr addrspace(5) %"11", align 4 + %2 = mul i32 %"25", %"26" + %"24" = add i32 %2, %"27" + store i32 %"24", ptr addrspace(5) %"6", align 4 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %"30" = load i32, ptr addrspace(5) %"10", align 4 + %"31" = load i32, ptr addrspace(5) %"11", align 4 + %3 = sext i32 %"29" to i64 + %4 = sext i32 %"30" to i64 + %5 = mul nsw i64 %3, %4 + %6 = lshr i64 %5, 32 + %7 = trunc i64 %6 to i32 + %"28" = add i32 %7, %"31" + store i32 %"28", ptr addrspace(5) %"7", align 4 + %"33" = load i32, ptr addrspace(5) %"9", align 4 + %"34" = load i32, ptr addrspace(5) %"10", align 4 + %"35" = load i64, ptr addrspace(5) %"12", align 8 + %8 = sext i32 %"33" to i64 + %9 = sext i32 %"34" to i64 + %10 = mul nsw i64 %8, %9 + %"67" = add i64 %10, %"35" + store i64 %"67", ptr addrspace(5) %"8", align 8 + %"36" = load i64, ptr addrspace(5) %"5", align 8 + %"37" = load i32, ptr addrspace(5) %"6", align 4 + %"71" = inttoptr i64 %"36" to ptr + store i32 %"37", ptr %"71", align 4 + %"38" = load i64, ptr addrspace(5) %"5", align 8 + %"39" = load i32, ptr addrspace(5) %"7", align 4 + %"72" = 
inttoptr i64 %"38" to ptr + %"82" = getelementptr inbounds i8, ptr %"72", i64 8 + store i32 %"39", ptr %"82", align 4 + %"40" = load i64, ptr addrspace(5) %"5", align 8 + %"41" = load i64, ptr addrspace(5) %"8", align 8 + %"73" = inttoptr i64 %"40" to ptr + %"84" = getelementptr inbounds i8, ptr %"73", i64 16 + store i64 %"41", ptr %"84", align 8 ret void } diff --git a/ptx/src/test/spirv_run/madc_cc.ll b/ptx/src/test/spirv_run/madc_cc.ll index 626149c..0c9df2b 100644 --- a/ptx/src/test/spirv_run/madc_cc.ll +++ b/ptx/src/test/spirv_run/madc_cc.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { -"55": +define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -14,54 +10,58 @@ define protected amdgpu_kernel void @madc_cc(ptr addrspace(4) byref(i64) %"41", %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) %"10" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 %"13" = load i64, ptr addrspace(4) %"41", align 8 - store i64 %"13", ptr addrspace(5) %"4", align 8 - %"14" = load i64, ptr addrspace(4) %"42", align 8 - store i64 %"14", ptr addrspace(5) %"5", align 8 - %"16" = load i64, ptr 
addrspace(5) %"4", align 8 - %"44" = inttoptr i64 %"16" to ptr - %"43" = load i32, ptr %"44", align 4 - store i32 %"43", ptr addrspace(5) %"8", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"45" = inttoptr i64 %"18" to ptr - %"57" = getelementptr inbounds i8, ptr %"45", i64 4 - %"46" = load i32, ptr %"57", align 4 - store i32 %"46", ptr addrspace(5) %"9", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"47" = inttoptr i64 %"20" to ptr - %"59" = getelementptr inbounds i8, ptr %"47", i64 8 - %"19" = load i32, ptr %"59", align 4 - store i32 %"19", ptr addrspace(5) %"10", align 4 - %"23" = load i32, ptr addrspace(5) %"8", align 4 - %"24" = load i32, ptr addrspace(5) %"9", align 4 - %"25" = load i32, ptr addrspace(5) %"10", align 4 - %0 = mul i32 %"23", %"24" - %1 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %0, i32 %"25") - %"21" = extractvalue { i32, i1 } %1, 0 - %"22" = extractvalue { i32, i1 } %1, 1 - store i32 %"21", ptr addrspace(5) %"6", align 4 - store i1 %"22", ptr addrspace(5) %"11", align 1 - %"27" = load i1, ptr addrspace(5) %"11", align 1 - %"28" = load i32, ptr addrspace(5) %"8", align 4 - %"29" = load i32, ptr addrspace(5) %"9", align 4 - %2 = sext i32 %"28" to i64 - %3 = sext i32 %"29" to i64 - %4 = mul nsw i64 %2, %3 - %5 = lshr i64 %4, 32 - %6 = trunc i64 %5 to i32 - %7 = zext i1 %"27" to i32 - %8 = add i32 %6, 3 - %"26" = add i32 %8, %7 - store i32 %"26", ptr addrspace(5) %"7", align 4 - %"30" = load i64, ptr addrspace(5) %"5", align 8 - %"31" = load i32, ptr addrspace(5) %"6", align 4 - %"53" = inttoptr i64 %"30" to ptr - store i32 %"31", ptr %"53", align 4 - %"32" = load i64, ptr addrspace(5) %"5", align 8 - %"33" = load i32, ptr addrspace(5) %"7", align 4 - %"54" = inttoptr i64 %"32" to ptr - %"61" = getelementptr inbounds i8, ptr %"54", i64 4 - store i32 %"33", ptr %"61", align 4 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"43" = inttoptr i64 %"15" to 
ptr + %"42" = load i32, ptr %"43", align 4 + store i32 %"42", ptr addrspace(5) %"8", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"17" to ptr + %"55" = getelementptr inbounds i8, ptr %"44", i64 4 + %"45" = load i32, ptr %"55", align 4 + store i32 %"45", ptr addrspace(5) %"9", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"46" = inttoptr i64 %"19" to ptr + %"57" = getelementptr inbounds i8, ptr %"46", i64 8 + %"18" = load i32, ptr %"57", align 4 + store i32 %"18", ptr addrspace(5) %"10", align 4 + %"22" = load i32, ptr addrspace(5) %"8", align 4 + %"23" = load i32, ptr addrspace(5) %"9", align 4 + %"24" = load i32, ptr addrspace(5) %"10", align 4 + %2 = mul i32 %"22", %"23" + %3 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %2, i32 %"24") + %"20" = extractvalue { i32, i1 } %3, 0 + %"21" = extractvalue { i32, i1 } %3, 1 + store i32 %"20", ptr addrspace(5) %"6", align 4 + store i1 %"21", ptr addrspace(5) %"11", align 1 + %"26" = load i1, ptr addrspace(5) %"11", align 1 + %"27" = load i32, ptr addrspace(5) %"8", align 4 + %"28" = load i32, ptr addrspace(5) %"9", align 4 + %4 = sext i32 %"27" to i64 + %5 = sext i32 %"28" to i64 + %6 = mul nsw i64 %4, %5 + %7 = lshr i64 %6, 32 + %8 = trunc i64 %7 to i32 + %9 = zext i1 %"26" to i32 + %10 = add i32 %8, 3 + %"25" = add i32 %10, %9 + store i32 %"25", ptr addrspace(5) %"7", align 4 + %"29" = load i64, ptr addrspace(5) %"5", align 8 + %"30" = load i32, ptr addrspace(5) %"6", align 4 + %"52" = inttoptr i64 %"29" to ptr + store i32 %"30", ptr %"52", align 4 + %"31" = load i64, ptr addrspace(5) %"5", align 8 + %"32" = load i32, ptr addrspace(5) %"7", align 4 + %"53" = inttoptr i64 %"31" to ptr + %"59" = getelementptr inbounds i8, ptr %"53", i64 4 + store i32 %"32", ptr %"59", align 4 ret void } diff --git a/ptx/src/test/spirv_run/madc_cc2.ll b/ptx/src/test/spirv_run/madc_cc2.ll deleted file mode 100644 index bea7193..0000000 --- a/ptx/src/test/spirv_run/madc_cc2.ll +++ 
/dev/null @@ -1,73 +0,0 @@ -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" -target triple = "amdgcn-amd-amdhsa" - -define protected amdgpu_kernel void @madc_cc2(ptr addrspace(4) byref(i64) %"52", ptr addrspace(4) byref(i64) %"53") #0 { -"66": - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 - %"4" = alloca i64, align 8, addrspace(5) - %"5" = alloca i64, align 8, addrspace(5) - %"6" = alloca i32, align 4, addrspace(5) - %"7" = alloca i32, align 4, addrspace(5) - %"8" = alloca i32, align 4, addrspace(5) - %"9" = alloca i32, align 4, addrspace(5) - %"10" = alloca i32, align 4, addrspace(5) - %"13" = load i64, ptr addrspace(4) %"53", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 0, i32 -1) - %"14" = extractvalue { i32, i1 } %0, 0 - %"15" = extractvalue { i32, i1 } %0, 1 - store i32 %"14", ptr addrspace(5) %"6", align 4 - store i1 %"15", ptr addrspace(5) %"11", align 1 - %"18" = load i1, ptr addrspace(5) %"11", align 1 - %1 = zext i1 %"18" to i32 - %2 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 1, i32 -1) - %3 = extractvalue { i32, i1 } %2, 0 - %4 = extractvalue { i32, i1 } %2, 1 - %5 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %3, i32 %1) - %"54" = extractvalue { i32, i1 } %5, 0 - %6 = extractvalue { i32, i1 } %5, 1 - %"17" = xor i1 %4, %6 - store i32 %"54", ptr addrspace(5) %"7", align 4 - store i1 %"17", ptr addrspace(5) %"11", align 1 - %"20" = load i1, ptr addrspace(5) %"11", align 1 - %7 = zext i1 %"20" to i32 - %"55" = add i32 0, %7 - store i32 %"55", ptr addrspace(5) %"8", align 4 - %"22" = load i1, ptr addrspace(5) %"11", align 1 - %8 = zext i1 %"22" to i32 - %"56" = add i32 0, %8 - store i32 
%"56", ptr addrspace(5) %"9", align 4 - %"24" = load i1, ptr addrspace(5) %"12", align 1 - %9 = zext i1 %"24" to i32 - %"57" = sub i32 2, %9 - store i32 %"57", ptr addrspace(5) %"10", align 4 - %"25" = load i64, ptr addrspace(5) %"5", align 8 - %"26" = load i32, ptr addrspace(5) %"7", align 4 - %"58" = inttoptr i64 %"25" to ptr - store i32 %"26", ptr %"58", align 4 - %"27" = load i64, ptr addrspace(5) %"5", align 8 - %"28" = load i32, ptr addrspace(5) %"8", align 4 - %"60" = inttoptr i64 %"27" to ptr - %"68" = getelementptr inbounds i8, ptr %"60", i64 4 - store i32 %"28", ptr %"68", align 4 - %"29" = load i64, ptr addrspace(5) %"5", align 8 - %"30" = load i32, ptr addrspace(5) %"9", align 4 - %"62" = inttoptr i64 %"29" to ptr - %"70" = getelementptr inbounds i8, ptr %"62", i64 8 - store i32 %"30", ptr %"70", align 4 - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load i32, ptr addrspace(5) %"10", align 4 - %"64" = inttoptr i64 %"31" to ptr - %"72" = getelementptr inbounds i8, ptr %"64", i64 12 - store i32 %"32", ptr %"72", align 4 - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn -declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 - -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/madc_cc2.ptx b/ptx/src/test/spirv_run/madc_cc2.ptx deleted file mode 100644 index 163c39b..0000000 --- a/ptx/src/test/spirv_run/madc_cc2.ptx +++ /dev/null @@ -1,38 +0,0 @@ -.version 6.5 -.target sm_30 -.address_size 64 - -.visible .entry madc_cc2( - .param .u64 input, - .param .u64 output -) -{ - .reg .u64 in_addr; - .reg .u64 out_addr; - .reg .u32 unused; - - .reg .b32 result_1; - .reg .b32 carry_out_1_1; - .reg .b32 carry_out_1_2; - .reg .b32 carry_out_1_3; - - 
ld.param.u64 out_addr, [output]; - - // set carry=1 - mad.lo.cc.u32 unused, 0, 0, 4294967295; - // overflow addition - madc.lo.cc.u32 result_1, 1, 1, 4294967295; - // write carry - madc.lo.u32 carry_out_1_1, 0, 0, 0; - // overflow is also detected by addc - addc.u32 carry_out_1_2, 0, 0; - // but not subc - subc.u32 carry_out_1_3, 2, 0; - - st.s32 [out_addr], result_1; - st.s32 [out_addr+4], carry_out_1_1; - st.s32 [out_addr+8], carry_out_1_2; - st.s32 [out_addr+12], carry_out_1_3; - - ret; -} diff --git a/ptx/src/test/spirv_run/max.ll b/ptx/src/test/spirv_run/max.ll index 79b6f48..ef0b39d 100644 --- a/ptx/src/test/spirv_run/max.ll +++ b/ptx/src/test/spirv_run/max.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @max(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load 
i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load i32, ptr %"30", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %"16" = call i32 @llvm.smax.i32(i32 %"17", i32 %"18") - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"27" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"27", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load i32, ptr %"28", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"15" = call i32 @llvm.smax.i32(i32 %"16", i32 %"17") + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store i32 %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/test/spirv_run/membar.ll b/ptx/src/test/spirv_run/membar.ll index c9ec8b9..f24c0fb 100644 --- a/ptx/src/test/spirv_run/membar.ll +++ b/ptx/src/test/spirv_run/membar.ll @@ -1,28 +1,28 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected 
amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { -"20": +define protected amdgpu_kernel void @membar(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"14", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"15", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"16", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"18" = inttoptr i64 %"12" to ptr - %"17" = load i32, ptr %"18", align 4 - store i32 %"17", ptr addrspace(5) %"6", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"17" = inttoptr i64 %"11" to ptr + %"16" = load i32, ptr %"17", align 4 + store i32 %"16", ptr addrspace(5) %"6", align 4 fence seq_cst - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"19" = inttoptr i64 %"13" to ptr - store i32 %"14", ptr %"19", align 4 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"13" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = inttoptr i64 %"12" to ptr + store i32 %"13", ptr %"18", align 4 ret void } diff --git a/ptx/src/test/spirv_run/min.ll b/ptx/src/test/spirv_run/min.ll index 0828070..b40c4db 100644 --- a/ptx/src/test/spirv_run/min.ll +++ b/ptx/src/test/spirv_run/min.ll @@ -1,37 +1,37 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @min(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load i32, ptr %"30", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %"16" = call i32 @llvm.smin.i32(i32 %"17", i32 %"18") - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"27" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"27", align 4 + store i64 %"10", ptr 
addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load i32, ptr %"28", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"15" = call i32 @llvm.smin.i32(i32 %"16", i32 %"17") + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store i32 %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 1ad0cb2..71dbd06 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -294,7 +294,11 @@ test_ptx!( [65521u32, 2147549199, 0x1000], [2147487519u32, 4294934539] ); -test_ptx!(madc_cc2, [0xDEADu32], [0u32, 1, 1, 2]); +test_ptx!( + mad_hi_cc, + [0x26223377u32, 0x70777766u32, 0x60666633u32], + [0x71272866u32, 0u32, 1u32] +); // Multi-tap :) test_ptx!(mov_vector_cast, [0x200000001u64], [2u32, 1u32]); test_ptx!( cvt_clamp, @@ -327,11 +331,13 @@ test_ptx!( ], [4294967295u32, 0, 2] ); -test_ptx!(carry_mixed, [0xDEADu32], [1u32, 1u32]); test_ptx!( - subc_cc2, + carry_set_all, [0xDEADu32], - [0u32, 1, 0, 4294967295, 1, 4294967295, 1] + [ + 1u32, 0, 0, 1, 0, 1, 0, 0, 0u32, 4294967295, 4294967295, 0, 4294967295, 0, 4294967295, + 4294967295 + ] ); test_ptx!(vshr, [0x6f3650f4u32, 22, 0xc62d4586], [0xC62D4742u32]); test_ptx!(bfind, [0u32, 1u32, 0x64eb0414], [u32::MAX, 0, 30]); @@ -360,6 +366,7 @@ test_ptx!( [1923569713u64, 1923569712] ); test_ptx!(isspacep, [0xDEADu32], [1u32, 0]); +test_ptx!(sad, [2147483648u32, 2, 13], [2147483659u32, 2147483663]); 
test_ptx_warp!( shfl, diff --git a/ptx/src/test/spirv_run/mov.ll b/ptx/src/test/spirv_run/mov.ll index e876ced..d43fe68 100644 --- a/ptx/src/test/spirv_run/mov.ll +++ b/ptx/src/test/spirv_run/mov.ll @@ -1,33 +1,33 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"22": +define protected amdgpu_kernel void @mov(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"20", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"15", ptr addrspace(5) %0, align 8 - %"14" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"21" = inttoptr 
i64 %"16" to ptr - store i64 %"17", ptr %"21", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"19", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + store i64 %"14", ptr addrspace(5) %1, align 8 + %"13" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"20" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"20", align 8 ret void } diff --git a/ptx/src/test/spirv_run/mov_address.ll b/ptx/src/test/spirv_run/mov_address.ll index b9f3a8a..42d987f 100644 --- a/ptx/src/test/spirv_run/mov_address.ll +++ b/ptx/src/test/spirv_run/mov_address.ll @@ -1,19 +1,19 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"9", ptr addrspace(4) byref(i64) %"10") #0 { -"12": +define protected amdgpu_kernel void @mov_address(ptr addrspace(4) byref(i64) %"8", ptr addrspace(4) byref(i64) %"9") #0 { %"6" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"6", align 1 - %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 %"4" = alloca [8 x i8], align 1, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) - %"11" = ptrtoint ptr addrspace(5) %"4" to i64 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"11", ptr addrspace(5) %0, align 8 - %"8" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"8", ptr addrspace(5) %"5", align 8 + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"6", 
align 1 + %"10" = ptrtoint ptr addrspace(5) %"4" to i64 + store i64 %"10", ptr addrspace(5) %1, align 8 + %"7" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"7", ptr addrspace(5) %"5", align 8 ret void } diff --git a/ptx/src/test/spirv_run/mov_vector_cast.ll b/ptx/src/test/spirv_run/mov_vector_cast.ll index 1f52a3b..eb81724 100644 --- a/ptx/src/test/spirv_run/mov_vector_cast.ll +++ b/ptx/src/test/spirv_run/mov_vector_cast.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { -"50": +define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 - %"16" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"16", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) @@ -16,51 +12,55 @@ define protected amdgpu_kernel void @mov_vector_cast(ptr addrspace(4) byref(i64) %"10" = alloca half, align 2, addrspace(5) %"11" = alloca half, align 2, addrspace(5) %"12" = alloca half, align 2, addrspace(5) - %"17" = load i64, ptr addrspace(4) %"35", align 8 - store i64 %"17", ptr addrspace(5) %"4", align 8 - %"18" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"18", ptr addrspace(5) %"5", align 8 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"37" = inttoptr i64 %"20" to ptr - %"19" = load i64, ptr %"37", align 8 - store i64 %"19", ptr addrspace(5) %"6", align 8 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"21", ptr addrspace(5) 
%0, align 8 - %"13" = load i64, ptr addrspace(5) %0, align 8 - %"39" = bitcast i64 %"13" to <2 x i32> - %"40" = extractelement <2 x i32> %"39", i32 0 - %"41" = extractelement <2 x i32> %"39", i32 1 - %"22" = bitcast i32 %"40" to float - %"23" = bitcast i32 %"41" to float - store float %"22", ptr addrspace(5) %"7", align 4 - store float %"23", ptr addrspace(5) %"8", align 4 - %"24" = load i64, ptr addrspace(5) %"6", align 8 %1 = alloca i64, align 8, addrspace(5) - store i64 %"24", ptr addrspace(5) %1, align 8 - %"14" = load i64, ptr addrspace(5) %1, align 8 - %"43" = bitcast i64 %"14" to <4 x i16> - %"44" = extractelement <4 x i16> %"43", i32 0 - %"45" = extractelement <4 x i16> %"43", i32 1 - %"46" = extractelement <4 x i16> %"43", i32 2 - %"47" = extractelement <4 x i16> %"43", i32 3 + %2 = alloca i64, align 8, addrspace(5) + br label %3 + +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"15", align 1 + %"16" = load i64, ptr addrspace(4) %"34", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"19" to ptr + %"18" = load i64, ptr %"36", align 8 + store i64 %"18", ptr addrspace(5) %"6", align 8 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + store i64 %"20", ptr addrspace(5) %1, align 8 + %"13" = load i64, ptr addrspace(5) %1, align 8 + %"38" = bitcast i64 %"13" to <2 x i32> + %"39" = extractelement <2 x i32> %"38", i32 0 + %"40" = extractelement <2 x i32> %"38", i32 1 + %"21" = bitcast i32 %"39" to float + %"22" = bitcast i32 %"40" to float + store float %"21", ptr addrspace(5) %"7", align 4 + store float %"22", ptr addrspace(5) %"8", align 4 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + store i64 %"23", ptr addrspace(5) %2, align 8 + %"14" = load i64, ptr addrspace(5) %2, align 8 + %"42" = bitcast i64 %"14" to <4 x i16> + %"43" = extractelement <4 x i16> %"42", i32 0 
+ %"44" = extractelement <4 x i16> %"42", i32 1 + %"45" = extractelement <4 x i16> %"42", i32 2 + %"46" = extractelement <4 x i16> %"42", i32 3 + %"24" = bitcast i16 %"43" to half %"25" = bitcast i16 %"44" to half %"26" = bitcast i16 %"45" to half %"27" = bitcast i16 %"46" to half - %"28" = bitcast i16 %"47" to half - store half %"25", ptr addrspace(5) %"9", align 2 - store half %"26", ptr addrspace(5) %"10", align 2 - store half %"27", ptr addrspace(5) %"11", align 2 - store half %"28", ptr addrspace(5) %"12", align 2 - %"29" = load i64, ptr addrspace(5) %"5", align 8 - %"30" = load float, ptr addrspace(5) %"8", align 4 - %"48" = inttoptr i64 %"29" to ptr - store float %"30", ptr %"48", align 4 - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load float, ptr addrspace(5) %"7", align 4 - %"49" = inttoptr i64 %"31" to ptr - %"52" = getelementptr inbounds i8, ptr %"49", i64 4 - store float %"32", ptr %"52", align 4 + store half %"24", ptr addrspace(5) %"9", align 2 + store half %"25", ptr addrspace(5) %"10", align 2 + store half %"26", ptr addrspace(5) %"11", align 2 + store half %"27", ptr addrspace(5) %"12", align 2 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load float, ptr addrspace(5) %"8", align 4 + %"47" = inttoptr i64 %"28" to ptr + store float %"29", ptr %"47", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load float, ptr addrspace(5) %"7", align 4 + %"48" = inttoptr i64 %"30" to ptr + %"50" = getelementptr inbounds i8, ptr %"48", i64 4 + store float %"31", ptr %"50", align 4 ret void } diff --git a/ptx/src/test/spirv_run/mul_ftz.ll b/ptx/src/test/spirv_run/mul_ftz.ll index 04de6f2..38867fe 100644 --- a/ptx/src/test/spirv_run/mul_ftz.ll +++ b/ptx/src/test/spirv_run/mul_ftz.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = 
"amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @mul_ftz(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load float, ptr %"25", align 4 - store float %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load float, ptr %"30", align 4 - store float %"14", ptr addrspace(5) %"7", align 4 - %"17" = load float, ptr addrspace(5) %"6", align 4 - %"18" = load float, ptr addrspace(5) %"7", align 4 - %"16" = fmul float %"17", %"18" - store float %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load float, ptr addrspace(5) %"6", align 4 - %"27" = inttoptr i64 %"19" to ptr - store float %"20", ptr %"27", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"24", align 4 + store float %"11", ptr 
addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load float, ptr %"28", align 4 + store float %"13", ptr addrspace(5) %"7", align 4 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"17" = load float, ptr addrspace(5) %"7", align 4 + %"15" = fmul float %"16", %"17" + store float %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load float, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store float %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/test/spirv_run/mul_hi.ll b/ptx/src/test/spirv_run/mul_hi.ll index e57141b..8043deb 100644 --- a/ptx/src/test/spirv_run/mul_hi.ll +++ b/ptx/src/test/spirv_run/mul_hi.ll @@ -3,31 +3,31 @@ target triple = "amdgcn-amd-amdhsa" declare i64 @__zluda_ptx_impl__mul_hi_u64(i64, i64) #0 -define protected amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #1 { -"23": +define protected amdgpu_kernel void @mul_hi(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #1 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to 
ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = call i64 @__zluda_ptx_impl__mul_hi_u64(i64 %"15", i64 2) - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = call i64 @__zluda_ptx_impl__mul_hi_u64(i64 %"14", i64 2) + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/mul_lo.ll b/ptx/src/test/spirv_run/mul_lo.ll index 1a915fa..9370500 100644 --- a/ptx/src/test/spirv_run/mul_lo.ll +++ b/ptx/src/test/spirv_run/mul_lo.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"23": +define protected amdgpu_kernel void @mul_lo(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, 
addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = mul i64 %"15", 2 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = mul i64 %"14", 2 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/mul_non_ftz.ll b/ptx/src/test/spirv_run/mul_non_ftz.ll index d0d2bcd..89f5e9f 100644 --- a/ptx/src/test/spirv_run/mul_non_ftz.ll +++ b/ptx/src/test/spirv_run/mul_non_ftz.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) 
%"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @mul_non_ftz(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load float, ptr %"25", align 4 - store float %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load float, ptr %"30", align 4 - store float %"14", ptr addrspace(5) %"7", align 4 - %"17" = load float, ptr addrspace(5) %"6", align 4 - %"18" = load float, ptr addrspace(5) %"7", align 4 - %"16" = fmul float %"17", %"18" - store float %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load float, ptr addrspace(5) %"6", align 4 - %"27" = inttoptr i64 %"19" to ptr - store float %"20", ptr %"27", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load float, ptr %"24", align 4 + store float %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr 
i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load float, ptr %"28", align 4 + store float %"13", ptr addrspace(5) %"7", align 4 + %"16" = load float, ptr addrspace(5) %"6", align 4 + %"17" = load float, ptr addrspace(5) %"7", align 4 + %"15" = fmul float %"16", %"17" + store float %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load float, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store float %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/test/spirv_run/mul_wide.ll b/ptx/src/test/spirv_run/mul_wide.ll index b1dec22..a0d84f4 100644 --- a/ptx/src/test/spirv_run/mul_wide.ll +++ b/ptx/src/test/spirv_run/mul_wide.ll @@ -1,40 +1,40 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { -"30": +define protected amdgpu_kernel void @mul_wide(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"12", ptr 
addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"14" to ptr addrspace(1) - %"13" = load i32, ptr addrspace(1) %"26", align 4 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"27" = inttoptr i64 %"16" to ptr addrspace(1) - %"32" = getelementptr inbounds i8, ptr addrspace(1) %"27", i64 4 - %"15" = load i32, ptr addrspace(1) %"32", align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i32, ptr addrspace(5) %"6", align 4 - %"19" = load i32, ptr addrspace(5) %"7", align 4 - %0 = sext i32 %"18" to i64 - %1 = sext i32 %"19" to i64 - %"17" = mul nsw i64 %0, %1 - store i64 %"17", ptr addrspace(5) %"8", align 8 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load i64, ptr addrspace(5) %"8", align 8 - %"28" = inttoptr i64 %"20" to ptr - store i64 %"21", ptr %"28", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i32, ptr addrspace(1) %"25", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"15" to ptr addrspace(1) + %"30" = getelementptr inbounds i8, ptr addrspace(1) %"26", i64 4 + %"14" = load i32, ptr addrspace(1) %"30", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %2 = sext i32 %"17" to i64 + %3 = sext i32 %"18" to i64 + %"16" = mul nsw i64 %2, %3 + store i64 %"16", ptr addrspace(5) %"8", align 8 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"8", align 8 + %"27" = inttoptr i64 %"19" to ptr + store i64 %"20", ptr %"27", align 8 ret void } diff --git a/ptx/src/test/spirv_run/multireg.ll b/ptx/src/test/spirv_run/multireg.ll index 3826c19..3eb31cb 100644 --- 
a/ptx/src/test/spirv_run/multireg.ll +++ b/ptx/src/test/spirv_run/multireg.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @multireg(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"23": +define protected amdgpu_kernel void @multireg(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = add i64 %"15", 1 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr 
addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = add i64 %"14", 1 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/neg.ll b/ptx/src/test/spirv_run/neg.ll index c1087b4..056b0a1 100644 --- a/ptx/src/test/spirv_run/neg.ll +++ b/ptx/src/test/spirv_run/neg.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @neg(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load i32, ptr %"19", align 4 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"13" = sub i32 0, %"14" - store i32 %"13", ptr addrspace(5) %"6", 
align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store i32 %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load i32, ptr %"18", align 4 + store i32 %"10", ptr addrspace(5) %"6", align 4 + %"13" = load i32, ptr addrspace(5) %"6", align 4 + %"12" = sub i32 0, %"13" + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store i32 %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll index 718a512..d0c71eb 100644 --- a/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll +++ b/ptx/src/test/spirv_run/non_scalar_ptr_offset.ll @@ -1,36 +1,36 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"27": +define protected amdgpu_kernel void @non_scalar_ptr_offset(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"22", align 8 + store i64 
%"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr addrspace(1) - %"29" = getelementptr inbounds i8, ptr addrspace(1) %"25", i64 8 - %"8" = load <2 x i32>, ptr addrspace(1) %"29", align 8 - %"14" = extractelement <2 x i32> %"8", i32 0 - %"15" = extractelement <2 x i32> %"8", i32 1 - store i32 %"14", ptr addrspace(5) %"6", align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %"16" = add i32 %"17", %"18" - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"26" = inttoptr i64 %"19" to ptr addrspace(1) - store i32 %"20", ptr addrspace(1) %"26", align 4 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr addrspace(1) + %"27" = getelementptr inbounds i8, ptr addrspace(1) %"24", i64 8 + %"8" = load <2 x i32>, ptr addrspace(1) %"27", align 8 + %"13" = extractelement <2 x i32> %"8", i32 0 + %"14" = extractelement <2 x i32> %"8", i32 1 + store i32 %"13", ptr addrspace(5) %"6", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"15" = add i32 %"16", %"17" + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"25" = inttoptr i64 %"18" to ptr addrspace(1) + store i32 %"19", ptr addrspace(1) %"25", align 4 ret void } diff --git a/ptx/src/test/spirv_run/not.ll b/ptx/src/test/spirv_run/not.ll index 
10dd56c..7c9a557 100644 --- a/ptx/src/test/spirv_run/not.ll +++ b/ptx/src/test/spirv_run/not.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @not(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"20", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"21" = xor i64 %"15", -1 - store i64 %"21", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"23" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"23", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"19", align 8 + store i64 %"11", ptr 
addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"20" = xor i64 %"14", -1 + store i64 %"20", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"22" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"22", align 8 ret void } diff --git a/ptx/src/test/spirv_run/ntid.ll b/ptx/src/test/spirv_run/ntid.ll index 93c95bf..29fccca 100644 --- a/ptx/src/test/spirv_run/ntid.ll +++ b/ptx/src/test/spirv_run/ntid.ll @@ -3,37 +3,37 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__sreg_ntid(i8) #0 -define protected amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #1 { -"30": +define protected amdgpu_kernel void @ntid(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #1 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + %1 = alloca i32, align 4, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"15" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 %"16" = load i64, ptr addrspace(4) %"26", align 8 - store i64 %"16", ptr addrspace(5) %"4", align 8 - %"17" = load i64, ptr addrspace(4) %"27", align 8 - store i64 %"17", ptr addrspace(5) %"5", align 8 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"28" = inttoptr i64 %"19" to ptr - %"18" = load i32, ptr %"28", align 4 - store i32 %"18", ptr addrspace(5) %"6", align 4 - %"12" = call i32 @__zluda_ptx_impl__sreg_ntid(i8 0) - %0 = alloca i32, align 4, addrspace(5) - store i32 %"12", ptr addrspace(5) %0, align 4 - %"20" = load i32, ptr addrspace(5) 
%0, align 4 - store i32 %"20", ptr addrspace(5) %"7", align 4 - %"22" = load i32, ptr addrspace(5) %"6", align 4 - %"23" = load i32, ptr addrspace(5) %"7", align 4 - %"21" = add i32 %"22", %"23" - store i32 %"21", ptr addrspace(5) %"6", align 4 - %"24" = load i64, ptr addrspace(5) %"5", align 8 - %"25" = load i32, ptr addrspace(5) %"6", align 4 - %"29" = inttoptr i64 %"24" to ptr - store i32 %"25", ptr %"29", align 4 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"18" to ptr + %"17" = load i32, ptr %"27", align 4 + store i32 %"17", ptr addrspace(5) %"6", align 4 + %"11" = call i32 @__zluda_ptx_impl__sreg_ntid(i8 0) + store i32 %"11", ptr addrspace(5) %1, align 4 + %"19" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"19", ptr addrspace(5) %"7", align 4 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"20" = add i32 %"21", %"22" + store i32 %"20", ptr addrspace(5) %"6", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"6", align 4 + %"28" = inttoptr i64 %"23" to ptr + store i32 %"24", ptr %"28", align 4 ret void } diff --git a/ptx/src/test/spirv_run/or.ll b/ptx/src/test/spirv_run/or.ll index 13e844b..f929205 100644 --- a/ptx/src/test/spirv_run/or.ll +++ b/ptx/src/test/spirv_run/or.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"31": +define protected amdgpu_kernel void @or(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 
1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"25", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"33" = getelementptr inbounds i8, ptr %"26", i64 8 - %"14" = load i64, ptr %"33", align 8 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"17" = load i64, ptr addrspace(5) %"6", align 8 - %"18" = load i64, ptr addrspace(5) %"7", align 8 - %"27" = or i64 %"17", %"18" - store i64 %"27", ptr addrspace(5) %"6", align 8 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i64, ptr addrspace(5) %"6", align 8 - %"30" = inttoptr i64 %"19" to ptr - store i64 %"20", ptr %"30", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"24", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"31" = getelementptr inbounds i8, ptr %"25", i64 8 + %"13" = load i64, ptr %"31", align 8 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"26" = or i64 %"16", %"17" + store i64 %"26", ptr 
addrspace(5) %"6", align 8 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"6", align 8 + %"29" = inttoptr i64 %"18" to ptr + store i64 %"19", ptr %"29", align 8 ret void } diff --git a/ptx/src/test/spirv_run/param_ptr.ll b/ptx/src/test/spirv_run/param_ptr.ll index 3634669..75451de 100644 --- a/ptx/src/test/spirv_run/param_ptr.ll +++ b/ptx/src/test/spirv_run/param_ptr.ll @@ -1,39 +1,39 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @param_ptr(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { -"29": +define protected amdgpu_kernel void @param_ptr(ptr addrspace(4) byref(i64) %"21", ptr addrspace(4) byref(i64) %"22") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) - %"25" = ptrtoint ptr addrspace(4) %"22" to i64 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"25", ptr addrspace(5) %0, align 8 - %"24" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"24", ptr addrspace(5) %"4", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"13" to ptr addrspace(4) - %"12" = load i64, ptr addrspace(4) %"26", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"14", ptr addrspace(5) %"6", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"27" = inttoptr i64 %"16" to ptr - %"15" = load i64, ptr %"27", align 8 - store i64 %"15", ptr 
addrspace(5) %"7", align 8 - %"18" = load i64, ptr addrspace(5) %"7", align 8 - %"17" = add i64 %"18", 1 - store i64 %"17", ptr addrspace(5) %"8", align 8 - %"19" = load i64, ptr addrspace(5) %"6", align 8 - %"20" = load i64, ptr addrspace(5) %"8", align 8 - %"28" = inttoptr i64 %"19" to ptr - store i64 %"20", ptr %"28", align 8 + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"24" = ptrtoint ptr addrspace(4) %"21" to i64 + store i64 %"24", ptr addrspace(5) %1, align 8 + %"23" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"23", ptr addrspace(5) %"4", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"12" to ptr addrspace(4) + %"11" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"13", ptr addrspace(5) %"6", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = inttoptr i64 %"15" to ptr + %"14" = load i64, ptr %"26", align 8 + store i64 %"14", ptr addrspace(5) %"7", align 8 + %"17" = load i64, ptr addrspace(5) %"7", align 8 + %"16" = add i64 %"17", 1 + store i64 %"16", ptr addrspace(5) %"8", align 8 + %"18" = load i64, ptr addrspace(5) %"6", align 8 + %"19" = load i64, ptr addrspace(5) %"8", align 8 + %"27" = inttoptr i64 %"18" to ptr + store i64 %"19", ptr %"27", align 8 ret void } diff --git a/ptx/src/test/spirv_run/popc.ll b/ptx/src/test/spirv_run/popc.ll index e93f8ad..15befc4 100644 --- a/ptx/src/test/spirv_run/popc.ll +++ b/ptx/src/test/spirv_run/popc.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { 
-"21": +define protected amdgpu_kernel void @popc(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load i32, ptr %"19", align 4 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"13" = call i32 @llvm.ctpop.i32(i32 %"14") - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load i32, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store i32 %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load i32, ptr %"18", align 4 + store i32 %"10", ptr addrspace(5) %"6", align 4 + %"13" = load i32, ptr addrspace(5) %"6", align 4 + %"12" = call i32 @llvm.ctpop.i32(i32 %"13") + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store i32 %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/pred_not.ll b/ptx/src/test/spirv_run/pred_not.ll index 047f94a..8315512 100644 --- 
a/ptx/src/test/spirv_run/pred_not.ll +++ b/ptx/src/test/spirv_run/pred_not.ll @@ -1,64 +1,64 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { -"42": +define protected amdgpu_kernel void @pred_not(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 - %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i1, align 1, addrspace(5) - %"16" = load i64, ptr addrspace(4) %"37", align 8 - store i64 %"16", ptr addrspace(5) %"4", align 8 - %"17" = load i64, ptr addrspace(4) %"38", align 8 - store i64 %"17", ptr addrspace(5) %"5", align 8 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"39" = inttoptr i64 %"19" to ptr - %"18" = load i64, ptr %"39", align 8 - store i64 %"18", ptr addrspace(5) %"6", align 8 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"40" = inttoptr i64 %"21" to ptr - %"44" = getelementptr inbounds i8, ptr %"40", i64 8 - %"20" = load i64, ptr %"44", align 8 - store i64 %"20", ptr addrspace(5) %"7", align 8 - %"23" = load i64, ptr addrspace(5) %"6", align 8 - %"24" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = icmp ult i64 %"23", %"24" - store i1 %"22", ptr addrspace(5) %"9", align 1 - %"26" = load i1, ptr addrspace(5) %"9", align 1 - %"25" = xor i1 %"26", true - store i1 %"25", ptr addrspace(5) %"9", align 1 - %"27" = load i1, ptr addrspace(5) %"9", align 1 - br i1 
%"27", label %"10", label %"11" + %1 = alloca i64, align 8, addrspace(5) + %2 = alloca i64, align 8, addrspace(5) + br label %3 -"10": ; preds = %"42" - %0 = alloca i64, align 8, addrspace(5) - store i64 1, ptr addrspace(5) %0, align 8 - %"28" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"28", ptr addrspace(5) %"8", align 8 +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"18" to ptr + %"17" = load i64, ptr %"38", align 8 + store i64 %"17", ptr addrspace(5) %"6", align 8 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"20" to ptr + %"42" = getelementptr inbounds i8, ptr %"39", i64 8 + %"19" = load i64, ptr %"42", align 8 + store i64 %"19", ptr addrspace(5) %"7", align 8 + %"22" = load i64, ptr addrspace(5) %"6", align 8 + %"23" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = icmp ult i64 %"22", %"23" + store i1 %"21", ptr addrspace(5) %"9", align 1 + %"25" = load i1, ptr addrspace(5) %"9", align 1 + %"24" = xor i1 %"25", true + store i1 %"24", ptr addrspace(5) %"9", align 1 + %"26" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"26", label %"10", label %"11" + +"10": ; preds = %3 + store i64 1, ptr addrspace(5) %1, align 8 + %"27" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"27", ptr addrspace(5) %"8", align 8 br label %"11" -"11": ; preds = %"10", %"42" - %"29" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"29", label %"13", label %"12" +"11": ; preds = %"10", %3 + %"28" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"28", label %"13", label %"12" "12": ; preds = %"11" - %1 = alloca i64, align 8, addrspace(5) - store i64 2, ptr addrspace(5) %1, align 8 - %"30" = load i64, ptr addrspace(5) %1, align 8 - store 
i64 %"30", ptr addrspace(5) %"8", align 8 + store i64 2, ptr addrspace(5) %2, align 8 + %"29" = load i64, ptr addrspace(5) %2, align 8 + store i64 %"29", ptr addrspace(5) %"8", align 8 br label %"13" "13": ; preds = %"12", %"11" - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load i64, ptr addrspace(5) %"8", align 8 - %"41" = inttoptr i64 %"31" to ptr - store i64 %"32", ptr %"41", align 8 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i64, ptr addrspace(5) %"8", align 8 + %"40" = inttoptr i64 %"30" to ptr + store i64 %"31", ptr %"40", align 8 ret void } diff --git a/ptx/src/test/spirv_run/prmt.ll b/ptx/src/test/spirv_run/prmt.ll index 87313c6..76efedc 100644 --- a/ptx/src/test/spirv_run/prmt.ll +++ b/ptx/src/test/spirv_run/prmt.ll @@ -1,60 +1,60 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"32", ptr addrspace(4) byref(i64) %"33") #0 { -"44": +define protected amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"31", ptr addrspace(4) byref(i64) %"32") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"31", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 %"12" = load i64, ptr addrspace(4) %"32", align 8 - store i64 %"12", ptr addrspace(5) %"4", align 8 - %"13" = load 
i64, ptr addrspace(4) %"33", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"34" = inttoptr i64 %"15" to ptr - %"14" = load i32, ptr %"34", align 4 - store i32 %"14", ptr addrspace(5) %"6", align 4 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"35" = inttoptr i64 %"17" to ptr - %"46" = getelementptr inbounds i8, ptr %"35", i64 4 - %"16" = load i32, ptr %"46", align 4 - store i32 %"16", ptr addrspace(5) %"7", align 4 - %"19" = load i32, ptr addrspace(5) %"6", align 4 - %"20" = load i32, ptr addrspace(5) %"7", align 4 - %0 = bitcast i32 %"19" to <4 x i8> - %1 = bitcast i32 %"20" to <4 x i8> - %2 = shufflevector <4 x i8> %0, <4 x i8> %1, <4 x i32> - %"36" = bitcast <4 x i8> %2 to i32 - store i32 %"36", ptr addrspace(5) %"8", align 4 - %"22" = load i32, ptr addrspace(5) %"6", align 4 - %"23" = load i32, ptr addrspace(5) %"7", align 4 - %3 = bitcast i32 %"22" to <4 x i8> - %4 = bitcast i32 %"23" to <4 x i8> - %5 = shufflevector <4 x i8> %3, <4 x i8> %4, <4 x i32> - %6 = extractelement <4 x i8> %5, i32 0 - %7 = ashr i8 %6, 7 - %8 = insertelement <4 x i8> %5, i8 %7, i32 0 - %9 = extractelement <4 x i8> %8, i32 2 - %10 = ashr i8 %9, 7 - %11 = insertelement <4 x i8> %8, i8 %10, i32 2 - %"39" = bitcast <4 x i8> %11 to i32 - store i32 %"39", ptr addrspace(5) %"9", align 4 - %"24" = load i64, ptr addrspace(5) %"5", align 8 - %"25" = load i32, ptr addrspace(5) %"8", align 4 - %"42" = inttoptr i64 %"24" to ptr - store i32 %"25", ptr %"42", align 4 - %"26" = load i64, ptr addrspace(5) %"5", align 8 - %"27" = load i32, ptr addrspace(5) %"9", align 4 - %"43" = inttoptr i64 %"26" to ptr - %"48" = getelementptr inbounds i8, ptr %"43", i64 4 - store i32 %"27", ptr %"48", align 4 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"14" to ptr + %"13" = load i32, ptr %"33", align 4 + store i32 %"13", ptr addrspace(5) %"6", align 4 + %"16" = 
load i64, ptr addrspace(5) %"4", align 8 + %"34" = inttoptr i64 %"16" to ptr + %"44" = getelementptr inbounds i8, ptr %"34", i64 4 + %"15" = load i32, ptr %"44", align 4 + store i32 %"15", ptr addrspace(5) %"7", align 4 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %2 = bitcast i32 %"18" to <4 x i8> + %3 = bitcast i32 %"19" to <4 x i8> + %4 = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> + %"35" = bitcast <4 x i8> %4 to i32 + store i32 %"35", ptr addrspace(5) %"8", align 4 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %5 = bitcast i32 %"21" to <4 x i8> + %6 = bitcast i32 %"22" to <4 x i8> + %7 = shufflevector <4 x i8> %5, <4 x i8> %6, <4 x i32> + %8 = extractelement <4 x i8> %7, i32 0 + %9 = ashr i8 %8, 7 + %10 = insertelement <4 x i8> %7, i8 %9, i32 0 + %11 = extractelement <4 x i8> %10, i32 2 + %12 = ashr i8 %11, 7 + %13 = insertelement <4 x i8> %10, i8 %12, i32 2 + %"38" = bitcast <4 x i8> %13 to i32 + store i32 %"38", ptr addrspace(5) %"9", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %"41" = inttoptr i64 %"23" to ptr + store i32 %"24", ptr %"41", align 4 + %"25" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = load i32, ptr addrspace(5) %"9", align 4 + %"42" = inttoptr i64 %"25" to ptr + %"46" = getelementptr inbounds i8, ptr %"42", i64 4 + store i32 %"26", ptr %"46", align 4 ret void } diff --git a/ptx/src/test/spirv_run/prmt_non_immediate.ll b/ptx/src/test/spirv_run/prmt_non_immediate.ll index c1a1b9d..104c56d 100644 --- a/ptx/src/test/spirv_run/prmt_non_immediate.ll +++ b/ptx/src/test/spirv_run/prmt_non_immediate.ll @@ -1,45 +1,45 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" 
-define protected amdgpu_kernel void @prmt_non_immediate(ptr addrspace(4) byref(i64) %"26", ptr addrspace(4) byref(i64) %"27") #0 { -"34": +define protected amdgpu_kernel void @prmt_non_immediate(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + %1 = alloca i32, align 4, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"25", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"26", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"27", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"28" = inttoptr i64 %"14" to ptr - %"13" = load i32, ptr %"28", align 4 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"29" = inttoptr i64 %"16" to ptr - %"36" = getelementptr inbounds i8, ptr %"29", i64 4 - %"15" = load i32, ptr %"36", align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %0 = alloca i32, align 4, addrspace(5) - store i32 64, ptr addrspace(5) %0, align 4 - %"17" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"17", ptr addrspace(5) %"8", align 4 - %"19" = load i32, ptr addrspace(5) %"6", align 4 - %"20" = load i32, ptr addrspace(5) %"7", align 4 - %1 = bitcast i32 %"19" to <4 x i8> - %2 = bitcast i32 %"20" to <4 x i8> - %3 = shufflevector <4 x i8> %1, <4 x i8> %2, <4 x i32> - %"30" = bitcast <4 x i8> %3 to i32 - store i32 %"30", ptr addrspace(5) %"7", align 4 - 
%"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load i32, ptr addrspace(5) %"7", align 4 - %"33" = inttoptr i64 %"21" to ptr - store i32 %"22", ptr %"33", align 4 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"27", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"28" = inttoptr i64 %"15" to ptr + %"34" = getelementptr inbounds i8, ptr %"28", i64 4 + %"14" = load i32, ptr %"34", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + store i32 64, ptr addrspace(5) %1, align 4 + %"16" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"16", ptr addrspace(5) %"8", align 4 + %"18" = load i32, ptr addrspace(5) %"6", align 4 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %3 = bitcast i32 %"18" to <4 x i8> + %4 = bitcast i32 %"19" to <4 x i8> + %5 = shufflevector <4 x i8> %3, <4 x i8> %4, <4 x i32> + %"29" = bitcast <4 x i8> %5 to i32 + store i32 %"29", ptr addrspace(5) %"7", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"7", align 4 + %"32" = inttoptr i64 %"20" to ptr + store i32 %"21", ptr %"32", align 4 ret void } diff --git a/ptx/src/test/spirv_run/rcp.ll b/ptx/src/test/spirv_run/rcp.ll index cb55c6a..dc03416 100644 --- a/ptx/src/test/spirv_run/rcp.ll +++ b/ptx/src/test/spirv_run/rcp.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @rcp(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - 
store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"19", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = fdiv arcp afn float 1.000000e+00, %"14" - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"18", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = fdiv arcp afn float 1.000000e+00, %"13" + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/reg_local.ll b/ptx/src/test/spirv_run/reg_local.ll index c01a5e0..52bb3d1 100644 --- a/ptx/src/test/spirv_run/reg_local.ll +++ b/ptx/src/test/spirv_run/reg_local.ll @@ -1,37 +1,37 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { -"34": +define protected amdgpu_kernel void @reg_local(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca [8 x i8], align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 %"10" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"11" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"11", ptr addrspace(5) %"6", align 8 - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"27" = inttoptr i64 %"13" to ptr addrspace(1) - %"26" = load i64, ptr addrspace(1) %"27", align 8 - store i64 %"26", ptr addrspace(5) %"7", align 8 - %"14" = load i64, ptr addrspace(5) %"7", align 8 - %"19" = add i64 %"14", 1 - %"28" = addrspacecast ptr addrspace(5) %"4" to ptr - store i64 %"19", ptr %"28", align 8 - %"30" = addrspacecast ptr addrspace(5) %"4" to ptr - %"38" = getelementptr inbounds i8, ptr %"30", i64 0 - %"31" = load i64, ptr %"38", align 8 - store i64 %"31", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"6", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"32" = inttoptr i64 %"16" to ptr addrspace(1) - %"40" = getelementptr inbounds i8, ptr addrspace(1) %"32", i64 0 - store i64 
%"17", ptr addrspace(1) %"40", align 8 + store i64 %"10", ptr addrspace(5) %"6", align 8 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = inttoptr i64 %"12" to ptr addrspace(1) + %"25" = load i64, ptr addrspace(1) %"26", align 8 + store i64 %"25", ptr addrspace(5) %"7", align 8 + %"13" = load i64, ptr addrspace(5) %"7", align 8 + %"18" = add i64 %"13", 1 + %"27" = addrspacecast ptr addrspace(5) %"4" to ptr + store i64 %"18", ptr %"27", align 8 + %"29" = addrspacecast ptr addrspace(5) %"4" to ptr + %"36" = getelementptr inbounds i8, ptr %"29", i64 0 + %"30" = load i64, ptr %"36", align 8 + store i64 %"30", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"6", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"31" = inttoptr i64 %"15" to ptr addrspace(1) + %"38" = getelementptr inbounds i8, ptr addrspace(1) %"31", i64 0 + store i64 %"16", ptr addrspace(1) %"38", align 8 ret void } diff --git a/ptx/src/test/spirv_run/rem.ll b/ptx/src/test/spirv_run/rem.ll index 3a1e26c..0fb9cd8 100644 --- a/ptx/src/test/spirv_run/rem.ll +++ b/ptx/src/test/spirv_run/rem.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @rem(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 
false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load i32, ptr %"30", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %"16" = srem i32 %"17", %"18" - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"27" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"27", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load i32, ptr %"28", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"15" = srem i32 %"16", %"17" + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store i32 %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/test/spirv_run/rsqrt.ll 
b/ptx/src/test/spirv_run/rsqrt.ll index ffdd662..40833ac 100644 --- a/ptx/src/test/spirv_run/rsqrt.ll +++ b/ptx/src/test/spirv_run/rsqrt.ll @@ -1,31 +1,31 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @rsqrt(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca double, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load double, ptr %"19", align 8 - store double %"11", ptr addrspace(5) %"6", align 8 - %"14" = load double, ptr addrspace(5) %"6", align 8 - %0 = call afn double @llvm.sqrt.f64(double %"14") - %"13" = fdiv arcp afn double 1.000000e+00, %0 - store double %"13", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load double, ptr addrspace(5) %"6", align 8 - %"20" = inttoptr i64 %"15" to ptr - store double %"16", ptr %"20", align 8 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + 
%"18" = inttoptr i64 %"11" to ptr + %"10" = load double, ptr %"18", align 8 + store double %"10", ptr addrspace(5) %"6", align 8 + %"13" = load double, ptr addrspace(5) %"6", align 8 + %2 = call afn double @llvm.sqrt.f64(double %"13") + %"12" = fdiv arcp afn double 1.000000e+00, %2 + store double %"12", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load double, ptr addrspace(5) %"6", align 8 + %"19" = inttoptr i64 %"14" to ptr + store double %"15", ptr %"19", align 8 ret void } diff --git a/ptx/src/test/spirv_run/s64_min.ll b/ptx/src/test/spirv_run/s64_min.ll index 3f741e7..a96f0a4 100644 --- a/ptx/src/test/spirv_run/s64_min.ll +++ b/ptx/src/test/spirv_run/s64_min.ll @@ -1,24 +1,24 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @s64_min(ptr addrspace(4) byref(i64) %"13", ptr addrspace(4) byref(i64) %"14") #0 { -"16": +define protected amdgpu_kernel void @s64_min(ptr addrspace(4) byref(i64) %"12", ptr addrspace(4) byref(i64) %"13") #0 { %"6" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"6", align 1 - %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) - %"8" = load i64, ptr addrspace(4) %"14", align 8 - store i64 %"8", ptr addrspace(5) %"4", align 8 - %0 = alloca i64, align 8, addrspace(5) - store i64 -9223372036854775808, ptr addrspace(5) %0, align 8 - %"9" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"9", ptr addrspace(5) %"5", align 8 - %"10" = load i64, ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(5) %"5", align 8 - %"15" = inttoptr i64 %"10" to ptr - store i64 %"11", ptr %"15", align 8 + %1 = alloca i64, align 8, 
addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"6", align 1 + %"7" = load i64, ptr addrspace(4) %"13", align 8 + store i64 %"7", ptr addrspace(5) %"4", align 8 + store i64 -9223372036854775808, ptr addrspace(5) %1, align 8 + %"8" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"8", ptr addrspace(5) %"5", align 8 + %"9" = load i64, ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(5) %"5", align 8 + %"14" = inttoptr i64 %"9" to ptr + store i64 %"10", ptr %"14", align 8 ret void } diff --git a/ptx/src/test/spirv_run/sad.ll b/ptx/src/test/spirv_run/sad.ll new file mode 100644 index 0000000..aa65fce --- /dev/null +++ b/ptx/src/test/spirv_run/sad.ll @@ -0,0 +1,65 @@ +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" +target triple = "amdgcn-amd-amdhsa" + +define protected amdgpu_kernel void @sad(ptr addrspace(4) byref(i64) %"38", ptr addrspace(4) byref(i64) %"39") #0 { + %"11" = alloca i1, align 1, addrspace(5) + %"4" = alloca i64, align 8, addrspace(5) + %"5" = alloca i64, align 8, addrspace(5) + %"6" = alloca i32, align 4, addrspace(5) + %"7" = alloca i32, align 4, addrspace(5) + %"8" = alloca i32, align 4, addrspace(5) + %"9" = alloca i32, align 4, addrspace(5) + %"10" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 + %"13" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"41" = inttoptr i64 %"15" to ptr + %"40" = load i32, ptr %"41", align 4 + store i32 %"40", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"42" = inttoptr i64 %"17" to ptr + %"57" = 
getelementptr inbounds i8, ptr %"42", i64 4 + %"43" = load i32, ptr %"57", align 4 + store i32 %"43", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"19" to ptr + %"59" = getelementptr inbounds i8, ptr %"44", i64 8 + %"45" = load i32, ptr %"59", align 4 + store i32 %"45", ptr addrspace(5) %"8", align 4 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %"23" = load i32, ptr addrspace(5) %"8", align 4 + %2 = icmp ugt i32 %"21", %"22" + %3 = sub i32 %"21", %"22" + %4 = sub i32 %"22", %"21" + %5 = select i1 %2, i32 %3, i32 %4 + %"46" = add i32 %"23", %5 + store i32 %"46", ptr addrspace(5) %"9", align 4 + %"25" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = load i32, ptr addrspace(5) %"7", align 4 + %"27" = load i32, ptr addrspace(5) %"8", align 4 + %6 = icmp sgt i32 %"25", %"26" + %7 = sub i32 %"25", %"26" + %8 = sub i32 %"26", %"25" + %9 = select i1 %6, i32 %7, i32 %8 + %"50" = add i32 %"27", %9 + store i32 %"50", ptr addrspace(5) %"10", align 4 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %"54" = inttoptr i64 %"28" to ptr + store i32 %"29", ptr %"54", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"10", align 4 + %"55" = inttoptr i64 %"30" to ptr + %"61" = getelementptr inbounds i8, ptr %"55", i64 4 + store i32 %"31", ptr %"61", align 4 + ret void +} + +attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/sad.ptx b/ptx/src/test/spirv_run/sad.ptx new file mode 100644 index 0000000..c7ed6c6 --- /dev/null +++ b/ptx/src/test/spirv_run/sad.ptx @@ -0,0 +1,29 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.entry sad( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; 
+ .reg .u64 out_addr; + .reg .b32 a; + .reg .b32 b; + .reg .b32 c; + .reg .b32 result_u32; + .reg .b32 result_s32; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 a, [in_addr]; + ld.u32 b, [in_addr+4]; + ld.u32 c, [in_addr+8]; + sad.u32 result_u32, a, b, c; + sad.s32 result_s32, a, b, c; + st.b32 [out_addr], result_u32; + st.b32 [out_addr+4], result_s32; + ret; +} diff --git a/ptx/src/test/spirv_run/selp.ll b/ptx/src/test/spirv_run/selp.ll index 6124887..0e20d6d 100644 --- a/ptx/src/test/spirv_run/selp.ll +++ b/ptx/src/test/spirv_run/selp.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { -"29": +define protected amdgpu_kernel void @selp(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i16, align 2, addrspace(5) %"7" = alloca i16, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"13" to ptr - %"12" = load i16, ptr %"26", align 2 - store i16 %"12", ptr addrspace(5) %"6", align 2 - %"15" = load 
i64, ptr addrspace(5) %"4", align 8 - %"27" = inttoptr i64 %"15" to ptr - %"31" = getelementptr inbounds i8, ptr %"27", i64 2 - %"14" = load i16, ptr %"31", align 2 - store i16 %"14", ptr addrspace(5) %"7", align 2 - %"17" = load i16, ptr addrspace(5) %"6", align 2 - %"18" = load i16, ptr addrspace(5) %"7", align 2 - %"16" = select i1 false, i16 %"17", i16 %"18" - store i16 %"16", ptr addrspace(5) %"6", align 2 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i16, ptr addrspace(5) %"6", align 2 - %"28" = inttoptr i64 %"19" to ptr - store i16 %"20", ptr %"28", align 2 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"12" to ptr + %"11" = load i16, ptr %"25", align 2 + store i16 %"11", ptr addrspace(5) %"6", align 2 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"14" to ptr + %"29" = getelementptr inbounds i8, ptr %"26", i64 2 + %"13" = load i16, ptr %"29", align 2 + store i16 %"13", ptr addrspace(5) %"7", align 2 + %"16" = load i16, ptr addrspace(5) %"6", align 2 + %"17" = load i16, ptr addrspace(5) %"7", align 2 + %"15" = select i1 false, i16 %"16", i16 %"17" + store i16 %"15", ptr addrspace(5) %"6", align 2 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i16, ptr addrspace(5) %"6", align 2 + %"27" = inttoptr i64 %"18" to ptr + store i16 %"19", ptr %"27", align 2 ret void } diff --git a/ptx/src/test/spirv_run/selp_true.ll b/ptx/src/test/spirv_run/selp_true.ll index 283eb81..9b6b41a 100644 --- a/ptx/src/test/spirv_run/selp_true.ll +++ b/ptx/src/test/spirv_run/selp_true.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) 
%"25") #0 { -"29": +define protected amdgpu_kernel void @selp_true(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i16, align 2, addrspace(5) %"7" = alloca i16, align 2, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"23", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"13" to ptr - %"12" = load i16, ptr %"26", align 2 - store i16 %"12", ptr addrspace(5) %"6", align 2 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"27" = inttoptr i64 %"15" to ptr - %"31" = getelementptr inbounds i8, ptr %"27", i64 2 - %"14" = load i16, ptr %"31", align 2 - store i16 %"14", ptr addrspace(5) %"7", align 2 - %"17" = load i16, ptr addrspace(5) %"6", align 2 - %"18" = load i16, ptr addrspace(5) %"7", align 2 - %"16" = select i1 true, i16 %"17", i16 %"18" - store i16 %"16", ptr addrspace(5) %"6", align 2 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i16, ptr addrspace(5) %"6", align 2 - %"28" = inttoptr i64 %"19" to ptr - store i16 %"20", ptr %"28", align 2 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"12" to ptr + %"11" = load i16, ptr %"25", align 2 + store i16 %"11", ptr addrspace(5) %"6", align 2 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"14" to ptr + %"29" = getelementptr inbounds 
i8, ptr %"26", i64 2 + %"13" = load i16, ptr %"29", align 2 + store i16 %"13", ptr addrspace(5) %"7", align 2 + %"16" = load i16, ptr addrspace(5) %"6", align 2 + %"17" = load i16, ptr addrspace(5) %"7", align 2 + %"15" = select i1 true, i16 %"16", i16 %"17" + store i16 %"15", ptr addrspace(5) %"6", align 2 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i16, ptr addrspace(5) %"6", align 2 + %"27" = inttoptr i64 %"18" to ptr + store i16 %"19", ptr %"27", align 2 ret void } diff --git a/ptx/src/test/spirv_run/set_f16x2.ll b/ptx/src/test/spirv_run/set_f16x2.ll index 4a2c8ea..d6bf7e0 100644 --- a/ptx/src/test/spirv_run/set_f16x2.ll +++ b/ptx/src/test/spirv_run/set_f16x2.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @set_f16x2(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #0 { -"59": +define protected amdgpu_kernel void @set_f16x2(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #0 { %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -14,54 +10,58 @@ define protected amdgpu_kernel void @set_f16x2(ptr addrspace(4) byref(i64) %"41" %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) %"10" = alloca <2 x half>, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 %"13" = load i64, ptr addrspace(4) %"41", align 8 - store i64 %"13", ptr 
addrspace(5) %"4", align 8 - %"14" = load i64, ptr addrspace(4) %"42", align 8 - store i64 %"14", ptr addrspace(5) %"5", align 8 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"44" = inttoptr i64 %"16" to ptr - %"43" = load i32, ptr %"44", align 4 - store i32 %"43", ptr addrspace(5) %"6", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"45" = inttoptr i64 %"18" to ptr - %"61" = getelementptr inbounds i8, ptr %"45", i64 4 - %"46" = load i32, ptr %"61", align 4 - store i32 %"46", ptr addrspace(5) %"7", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"47" = inttoptr i64 %"20" to ptr - %"63" = getelementptr inbounds i8, ptr %"47", i64 8 - %"48" = load i32, ptr %"63", align 4 - store i32 %"48", ptr addrspace(5) %"8", align 4 - %"22" = load i64, ptr addrspace(5) %"4", align 8 - %"49" = inttoptr i64 %"22" to ptr - %"65" = getelementptr inbounds i8, ptr %"49", i64 12 - %"50" = load i32, ptr %"65", align 4 - store i32 %"50", ptr addrspace(5) %"9", align 4 - %"24" = load i32, ptr addrspace(5) %"6", align 4 - %"25" = load i32, ptr addrspace(5) %"7", align 4 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"43" = inttoptr i64 %"15" to ptr + %"42" = load i32, ptr %"43", align 4 + store i32 %"42", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"44" = inttoptr i64 %"17" to ptr + %"59" = getelementptr inbounds i8, ptr %"44", i64 4 + %"45" = load i32, ptr %"59", align 4 + store i32 %"45", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"46" = inttoptr i64 %"19" to ptr + %"61" = getelementptr inbounds i8, ptr %"46", i64 8 + %"47" = load i32, ptr %"61", align 4 + store i32 %"47", ptr addrspace(5) %"8", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"48" = inttoptr i64 %"21" to ptr + %"63" = getelementptr inbounds i8, ptr %"48", i64 12 + %"49" = load i32, ptr %"63", align 4 + store i32 %"49", ptr 
addrspace(5) %"9", align 4 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %"24" = load i32, ptr addrspace(5) %"7", align 4 + %"51" = bitcast i32 %"23" to <2 x half> %"52" = bitcast i32 %"24" to <2 x half> - %"53" = bitcast i32 %"25" to <2 x half> - %0 = fcmp ugt <2 x half> %"52", %"53" - %1 = sext <2 x i1> %0 to <2 x i16> - %"51" = bitcast <2 x i16> %1 to i32 - store i32 %"51", ptr addrspace(5) %"6", align 4 - %"27" = load i32, ptr addrspace(5) %"8", align 4 - %"28" = load i32, ptr addrspace(5) %"9", align 4 + %2 = fcmp ugt <2 x half> %"51", %"52" + %3 = sext <2 x i1> %2 to <2 x i16> + %"50" = bitcast <2 x i16> %3 to i32 + store i32 %"50", ptr addrspace(5) %"6", align 4 + %"26" = load i32, ptr addrspace(5) %"8", align 4 + %"27" = load i32, ptr addrspace(5) %"9", align 4 + %"54" = bitcast i32 %"26" to <2 x half> %"55" = bitcast i32 %"27" to <2 x half> - %"56" = bitcast i32 %"28" to <2 x half> - %2 = fcmp oeq <2 x half> %"55", %"56" - %"54" = uitofp <2 x i1> %2 to <2 x half> - %"26" = bitcast <2 x half> %"54" to i32 - store i32 %"26", ptr addrspace(5) %"8", align 4 - %"29" = load i64, ptr addrspace(5) %"5", align 8 - %"30" = load i32, ptr addrspace(5) %"6", align 4 - %"57" = inttoptr i64 %"29" to ptr - store i32 %"30", ptr %"57", align 4 - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load i32, ptr addrspace(5) %"8", align 4 - %"58" = inttoptr i64 %"31" to ptr - %"67" = getelementptr inbounds i8, ptr %"58", i64 4 - store i32 %"32", ptr %"67", align 4 + %4 = fcmp oeq <2 x half> %"54", %"55" + %"53" = uitofp <2 x i1> %4 to <2 x half> + %"25" = bitcast <2 x half> %"53" to i32 + store i32 %"25", ptr addrspace(5) %"8", align 4 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load i32, ptr addrspace(5) %"6", align 4 + %"56" = inttoptr i64 %"28" to ptr + store i32 %"29", ptr %"56", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"8", align 4 + %"57" = inttoptr i64 %"30" to ptr + %"65" = 
getelementptr inbounds i8, ptr %"57", i64 4 + store i32 %"31", ptr %"65", align 4 ret void } diff --git a/ptx/src/test/spirv_run/setp.ll b/ptx/src/test/spirv_run/setp.ll index a54f8f6..1e9e1e5 100644 --- a/ptx/src/test/spirv_run/setp.ll +++ b/ptx/src/test/spirv_run/setp.ll @@ -1,61 +1,61 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { -"40": +define protected amdgpu_kernel void @setp(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 - %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i1, align 1, addrspace(5) - %"16" = load i64, ptr addrspace(4) %"35", align 8 - store i64 %"16", ptr addrspace(5) %"4", align 8 - %"17" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"17", ptr addrspace(5) %"5", align 8 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"37" = inttoptr i64 %"19" to ptr - %"18" = load i64, ptr %"37", align 8 - store i64 %"18", ptr addrspace(5) %"6", align 8 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"38" = inttoptr i64 %"21" to ptr - %"42" = getelementptr inbounds i8, ptr %"38", i64 8 - %"20" = load i64, ptr %"42", align 8 - store i64 %"20", ptr addrspace(5) %"7", align 8 - %"23" = load i64, ptr addrspace(5) %"6", align 8 - %"24" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = icmp ult i64 %"23", %"24" - store i1 %"22", ptr addrspace(5) %"9", align 1 - 
%"25" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"25", label %"10", label %"11" + %1 = alloca i64, align 8, addrspace(5) + %2 = alloca i64, align 8, addrspace(5) + br label %3 -"10": ; preds = %"40" - %0 = alloca i64, align 8, addrspace(5) - store i64 1, ptr addrspace(5) %0, align 8 - %"26" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"26", ptr addrspace(5) %"8", align 8 +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = load i64, ptr addrspace(4) %"34", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"18" to ptr + %"17" = load i64, ptr %"36", align 8 + store i64 %"17", ptr addrspace(5) %"6", align 8 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"20" to ptr + %"40" = getelementptr inbounds i8, ptr %"37", i64 8 + %"19" = load i64, ptr %"40", align 8 + store i64 %"19", ptr addrspace(5) %"7", align 8 + %"22" = load i64, ptr addrspace(5) %"6", align 8 + %"23" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = icmp ult i64 %"22", %"23" + store i1 %"21", ptr addrspace(5) %"9", align 1 + %"24" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"24", label %"10", label %"11" + +"10": ; preds = %3 + store i64 1, ptr addrspace(5) %1, align 8 + %"25" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"25", ptr addrspace(5) %"8", align 8 br label %"11" -"11": ; preds = %"10", %"40" - %"27" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"27", label %"13", label %"12" +"11": ; preds = %"10", %3 + %"26" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"26", label %"13", label %"12" "12": ; preds = %"11" - %1 = alloca i64, align 8, addrspace(5) - store i64 2, ptr addrspace(5) %1, align 8 - %"28" = load i64, ptr addrspace(5) %1, align 8 - store i64 %"28", ptr addrspace(5) %"8", align 8 + store i64 2, ptr addrspace(5) 
%2, align 8 + %"27" = load i64, ptr addrspace(5) %2, align 8 + store i64 %"27", ptr addrspace(5) %"8", align 8 br label %"13" "13": ; preds = %"12", %"11" - %"29" = load i64, ptr addrspace(5) %"5", align 8 - %"30" = load i64, ptr addrspace(5) %"8", align 8 - %"39" = inttoptr i64 %"29" to ptr - store i64 %"30", ptr %"39", align 8 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load i64, ptr addrspace(5) %"8", align 8 + %"38" = inttoptr i64 %"28" to ptr + store i64 %"29", ptr %"38", align 8 ret void } diff --git a/ptx/src/test/spirv_run/setp_bool.ll b/ptx/src/test/spirv_run/setp_bool.ll index 1707a3d..f0b659f 100644 --- a/ptx/src/test/spirv_run/setp_bool.ll +++ b/ptx/src/test/spirv_run/setp_bool.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp_bool(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { -"51": +define protected amdgpu_kernel void @setp_bool(ptr addrspace(4) byref(i64) %"44", ptr addrspace(4) byref(i64) %"45") #0 { %"16" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"16", align 1 - %"17" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"17", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) @@ -15,65 +11,69 @@ define protected amdgpu_kernel void @setp_bool(ptr addrspace(4) byref(i64) %"45" %"9" = alloca i1, align 1, addrspace(5) %"10" = alloca i1, align 1, addrspace(5) %"11" = alloca i1, align 1, addrspace(5) - %"18" = load i64, ptr addrspace(4) %"45", align 8 - store i64 %"18", ptr addrspace(5) %"4", align 8 - %"19" = load i64, ptr addrspace(4) %"46", align 8 - store i64 %"19", ptr addrspace(5) %"5", align 8 - %"21" = load i64, ptr addrspace(5) 
%"4", align 8 - %"47" = inttoptr i64 %"21" to ptr - %"20" = load float, ptr %"47", align 4 - store float %"20", ptr addrspace(5) %"6", align 4 - %"23" = load i64, ptr addrspace(5) %"4", align 8 - %"48" = inttoptr i64 %"23" to ptr - %"53" = getelementptr inbounds i8, ptr %"48", i64 4 - %"22" = load float, ptr %"53", align 4 - store float %"22", ptr addrspace(5) %"7", align 4 - %"25" = load i64, ptr addrspace(5) %"4", align 8 - %"49" = inttoptr i64 %"25" to ptr - %"55" = getelementptr inbounds i8, ptr %"49", i64 8 - %"24" = load float, ptr %"55", align 4 - store float %"24", ptr addrspace(5) %"8", align 4 - %0 = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %0, align 1 - %"26" = load i1, ptr addrspace(5) %0, align 1 - store i1 %"26", ptr addrspace(5) %"9", align 1 - %"29" = load float, ptr addrspace(5) %"6", align 4 - %"30" = load float, ptr addrspace(5) %"7", align 4 - %"31" = load i1, ptr addrspace(5) %"9", align 1 - %1 = fcmp ogt float %"29", %"30" - %2 = xor i1 %1, true - %"27" = and i1 %1, %"31" - %"28" = and i1 %2, %"31" - store i1 %"27", ptr addrspace(5) %"10", align 1 - store i1 %"28", ptr addrspace(5) %"11", align 1 - %"32" = load i1, ptr addrspace(5) %"10", align 1 - br i1 %"32", label %"12", label %"13" - -"12": ; preds = %"51" - %"34" = load float, ptr addrspace(5) %"6", align 4 + %1 = alloca i1, align 1, addrspace(5) + %2 = alloca float, align 4, addrspace(5) %3 = alloca float, align 4, addrspace(5) - store float %"34", ptr addrspace(5) %3, align 4 - %"33" = load float, ptr addrspace(5) %3, align 4 - store float %"33", ptr addrspace(5) %"8", align 4 + br label %4 + +4: ; preds = %0 + store i1 false, ptr addrspace(5) %"16", align 1 + %"17" = load i64, ptr addrspace(4) %"44", align 8 + store i64 %"17", ptr addrspace(5) %"4", align 8 + %"18" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"18", ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"46" = inttoptr i64 %"20" to ptr + %"19" = load 
float, ptr %"46", align 4 + store float %"19", ptr addrspace(5) %"6", align 4 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + %"47" = inttoptr i64 %"22" to ptr + %"51" = getelementptr inbounds i8, ptr %"47", i64 4 + %"21" = load float, ptr %"51", align 4 + store float %"21", ptr addrspace(5) %"7", align 4 + %"24" = load i64, ptr addrspace(5) %"4", align 8 + %"48" = inttoptr i64 %"24" to ptr + %"53" = getelementptr inbounds i8, ptr %"48", i64 8 + %"23" = load float, ptr %"53", align 4 + store float %"23", ptr addrspace(5) %"8", align 4 + store i1 false, ptr addrspace(5) %1, align 1 + %"25" = load i1, ptr addrspace(5) %1, align 1 + store i1 %"25", ptr addrspace(5) %"9", align 1 + %"28" = load float, ptr addrspace(5) %"6", align 4 + %"29" = load float, ptr addrspace(5) %"7", align 4 + %"30" = load i1, ptr addrspace(5) %"9", align 1 + %5 = fcmp ogt float %"28", %"29" + %6 = xor i1 %5, true + %"26" = and i1 %5, %"30" + %"27" = and i1 %6, %"30" + store i1 %"26", ptr addrspace(5) %"10", align 1 + store i1 %"27", ptr addrspace(5) %"11", align 1 + %"31" = load i1, ptr addrspace(5) %"10", align 1 + br i1 %"31", label %"12", label %"13" + +"12": ; preds = %4 + %"33" = load float, ptr addrspace(5) %"6", align 4 + store float %"33", ptr addrspace(5) %2, align 4 + %"32" = load float, ptr addrspace(5) %2, align 4 + store float %"32", ptr addrspace(5) %"8", align 4 br label %"13" -"13": ; preds = %"12", %"51" - %"35" = load i1, ptr addrspace(5) %"11", align 1 - br i1 %"35", label %"14", label %"15" +"13": ; preds = %"12", %4 + %"34" = load i1, ptr addrspace(5) %"11", align 1 + br i1 %"34", label %"14", label %"15" "14": ; preds = %"13" - %"37" = load float, ptr addrspace(5) %"7", align 4 - %4 = alloca float, align 4, addrspace(5) - store float %"37", ptr addrspace(5) %4, align 4 - %"36" = load float, ptr addrspace(5) %4, align 4 - store float %"36", ptr addrspace(5) %"8", align 4 + %"36" = load float, ptr addrspace(5) %"7", align 4 + store float %"36", ptr addrspace(5) %3, 
align 4 + %"35" = load float, ptr addrspace(5) %3, align 4 + store float %"35", ptr addrspace(5) %"8", align 4 br label %"15" "15": ; preds = %"14", %"13" - %"38" = load i64, ptr addrspace(5) %"5", align 8 - %"39" = load float, ptr addrspace(5) %"8", align 4 - %"50" = inttoptr i64 %"38" to ptr - store float %"39", ptr %"50", align 4 + %"37" = load i64, ptr addrspace(5) %"5", align 8 + %"38" = load float, ptr addrspace(5) %"8", align 4 + %"49" = inttoptr i64 %"37" to ptr + store float %"38", ptr %"49", align 4 ret void } diff --git a/ptx/src/test/spirv_run/setp_gt.ll b/ptx/src/test/spirv_run/setp_gt.ll index 0aa4831..dbaf20a 100644 --- a/ptx/src/test/spirv_run/setp_gt.ll +++ b/ptx/src/test/spirv_run/setp_gt.ll @@ -1,63 +1,63 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { -"40": +define protected amdgpu_kernel void @setp_gt(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 - %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) %"8" = alloca float, align 4, addrspace(5) %"9" = alloca i1, align 1, addrspace(5) - %"16" = load i64, ptr addrspace(4) %"35", align 8 - store i64 %"16", ptr addrspace(5) %"4", align 8 - %"17" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"17", ptr addrspace(5) %"5", align 8 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"37" = inttoptr i64 %"19" to ptr - %"18" = load float, ptr %"37", align 4 - store 
float %"18", ptr addrspace(5) %"6", align 4 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"38" = inttoptr i64 %"21" to ptr - %"42" = getelementptr inbounds i8, ptr %"38", i64 4 - %"20" = load float, ptr %"42", align 4 - store float %"20", ptr addrspace(5) %"7", align 4 - %"23" = load float, ptr addrspace(5) %"6", align 4 - %"24" = load float, ptr addrspace(5) %"7", align 4 - %"22" = fcmp ogt float %"23", %"24" - store i1 %"22", ptr addrspace(5) %"9", align 1 - %"25" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"25", label %"10", label %"11" + %1 = alloca float, align 4, addrspace(5) + %2 = alloca float, align 4, addrspace(5) + br label %3 -"10": ; preds = %"40" - %"27" = load float, ptr addrspace(5) %"6", align 4 - %0 = alloca float, align 4, addrspace(5) - store float %"27", ptr addrspace(5) %0, align 4 - %"26" = load float, ptr addrspace(5) %0, align 4 - store float %"26", ptr addrspace(5) %"8", align 4 +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = load i64, ptr addrspace(4) %"34", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"36" = inttoptr i64 %"18" to ptr + %"17" = load float, ptr %"36", align 4 + store float %"17", ptr addrspace(5) %"6", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"20" to ptr + %"40" = getelementptr inbounds i8, ptr %"37", i64 4 + %"19" = load float, ptr %"40", align 4 + store float %"19", ptr addrspace(5) %"7", align 4 + %"22" = load float, ptr addrspace(5) %"6", align 4 + %"23" = load float, ptr addrspace(5) %"7", align 4 + %"21" = fcmp ogt float %"22", %"23" + store i1 %"21", ptr addrspace(5) %"9", align 1 + %"24" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"24", label %"10", label %"11" + +"10": ; preds = %3 + %"26" = load float, ptr addrspace(5) %"6", align 4 + store float %"26", 
ptr addrspace(5) %1, align 4 + %"25" = load float, ptr addrspace(5) %1, align 4 + store float %"25", ptr addrspace(5) %"8", align 4 br label %"11" -"11": ; preds = %"10", %"40" - %"28" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"28", label %"13", label %"12" +"11": ; preds = %"10", %3 + %"27" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"27", label %"13", label %"12" "12": ; preds = %"11" - %"30" = load float, ptr addrspace(5) %"7", align 4 - %1 = alloca float, align 4, addrspace(5) - store float %"30", ptr addrspace(5) %1, align 4 - %"29" = load float, ptr addrspace(5) %1, align 4 - store float %"29", ptr addrspace(5) %"8", align 4 + %"29" = load float, ptr addrspace(5) %"7", align 4 + store float %"29", ptr addrspace(5) %2, align 4 + %"28" = load float, ptr addrspace(5) %2, align 4 + store float %"28", ptr addrspace(5) %"8", align 4 br label %"13" "13": ; preds = %"12", %"11" - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load float, ptr addrspace(5) %"8", align 4 - %"39" = inttoptr i64 %"31" to ptr - store float %"32", ptr %"39", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load float, ptr addrspace(5) %"8", align 4 + %"38" = inttoptr i64 %"30" to ptr + store float %"31", ptr %"38", align 4 ret void } diff --git a/ptx/src/test/spirv_run/setp_leu.ll b/ptx/src/test/spirv_run/setp_leu.ll index 4105d59..d27b96a 100644 --- a/ptx/src/test/spirv_run/setp_leu.ll +++ b/ptx/src/test/spirv_run/setp_leu.ll @@ -1,63 +1,63 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"35", ptr addrspace(4) byref(i64) %"36") #0 { -"40": +define protected amdgpu_kernel void @setp_leu(ptr addrspace(4) byref(i64) %"34", ptr addrspace(4) byref(i64) %"35") #0 { %"14" = alloca i1, 
align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 - %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) %"7" = alloca float, align 4, addrspace(5) %"8" = alloca float, align 4, addrspace(5) %"9" = alloca i1, align 1, addrspace(5) - %"16" = load i64, ptr addrspace(4) %"35", align 8 - store i64 %"16", ptr addrspace(5) %"4", align 8 - %"17" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"17", ptr addrspace(5) %"5", align 8 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"37" = inttoptr i64 %"19" to ptr - %"18" = load float, ptr %"37", align 4 - store float %"18", ptr addrspace(5) %"6", align 4 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"38" = inttoptr i64 %"21" to ptr - %"42" = getelementptr inbounds i8, ptr %"38", i64 4 - %"20" = load float, ptr %"42", align 4 - store float %"20", ptr addrspace(5) %"7", align 4 - %"23" = load float, ptr addrspace(5) %"6", align 4 - %"24" = load float, ptr addrspace(5) %"7", align 4 - %"22" = fcmp ule float %"23", %"24" - store i1 %"22", ptr addrspace(5) %"9", align 1 - %"25" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"25", label %"10", label %"11" + %1 = alloca float, align 4, addrspace(5) + %2 = alloca float, align 4, addrspace(5) + br label %3 -"10": ; preds = %"40" - %"27" = load float, ptr addrspace(5) %"6", align 4 - %0 = alloca float, align 4, addrspace(5) - store float %"27", ptr addrspace(5) %0, align 4 - %"26" = load float, ptr addrspace(5) %0, align 4 - store float %"26", ptr addrspace(5) %"8", align 4 +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"14", align 1 + %"15" = load i64, ptr addrspace(4) %"34", align 8 + store i64 %"15", ptr addrspace(5) %"4", align 8 + %"16" = load i64, ptr addrspace(4) %"35", align 8 + store i64 %"16", ptr addrspace(5) %"5", align 8 + %"18" = load i64, ptr addrspace(5) 
%"4", align 8 + %"36" = inttoptr i64 %"18" to ptr + %"17" = load float, ptr %"36", align 4 + store float %"17", ptr addrspace(5) %"6", align 4 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"37" = inttoptr i64 %"20" to ptr + %"40" = getelementptr inbounds i8, ptr %"37", i64 4 + %"19" = load float, ptr %"40", align 4 + store float %"19", ptr addrspace(5) %"7", align 4 + %"22" = load float, ptr addrspace(5) %"6", align 4 + %"23" = load float, ptr addrspace(5) %"7", align 4 + %"21" = fcmp ule float %"22", %"23" + store i1 %"21", ptr addrspace(5) %"9", align 1 + %"24" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"24", label %"10", label %"11" + +"10": ; preds = %3 + %"26" = load float, ptr addrspace(5) %"6", align 4 + store float %"26", ptr addrspace(5) %1, align 4 + %"25" = load float, ptr addrspace(5) %1, align 4 + store float %"25", ptr addrspace(5) %"8", align 4 br label %"11" -"11": ; preds = %"10", %"40" - %"28" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"28", label %"13", label %"12" +"11": ; preds = %"10", %3 + %"27" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"27", label %"13", label %"12" "12": ; preds = %"11" - %"30" = load float, ptr addrspace(5) %"7", align 4 - %1 = alloca float, align 4, addrspace(5) - store float %"30", ptr addrspace(5) %1, align 4 - %"29" = load float, ptr addrspace(5) %1, align 4 - store float %"29", ptr addrspace(5) %"8", align 4 + %"29" = load float, ptr addrspace(5) %"7", align 4 + store float %"29", ptr addrspace(5) %2, align 4 + %"28" = load float, ptr addrspace(5) %2, align 4 + store float %"28", ptr addrspace(5) %"8", align 4 br label %"13" "13": ; preds = %"12", %"11" - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load float, ptr addrspace(5) %"8", align 4 - %"39" = inttoptr i64 %"31" to ptr - store float %"32", ptr %"39", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load float, ptr addrspace(5) %"8", align 4 + %"38" = inttoptr i64 %"30" to ptr + store float 
%"31", ptr %"38", align 4 ret void } diff --git a/ptx/src/test/spirv_run/setp_nan.ll b/ptx/src/test/spirv_run/setp_nan.ll index da9c62a..709ed89 100644 --- a/ptx/src/test/spirv_run/setp_nan.ll +++ b/ptx/src/test/spirv_run/setp_nan.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"116", ptr addrspace(4) byref(i64) %"117") #0 { -"130": +define protected amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"115", ptr addrspace(4) byref(i64) %"116") #0 { %"32" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"32", align 1 - %"33" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"33", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) @@ -19,172 +15,176 @@ define protected amdgpu_kernel void @setp_nan(ptr addrspace(4) byref(i64) %"116" %"13" = alloca float, align 4, addrspace(5) %"14" = alloca i32, align 4, addrspace(5) %"15" = alloca i1, align 1, addrspace(5) - %"34" = load i64, ptr addrspace(4) %"116", align 8 - store i64 %"34", ptr addrspace(5) %"4", align 8 - %"35" = load i64, ptr addrspace(4) %"117", align 8 - store i64 %"35", ptr addrspace(5) %"5", align 8 - %"37" = load i64, ptr addrspace(5) %"4", align 8 - %"118" = inttoptr i64 %"37" to ptr - %"36" = load float, ptr %"118", align 4 - store float %"36", ptr addrspace(5) %"6", align 4 - %"39" = load i64, ptr addrspace(5) %"4", align 8 - %"119" = inttoptr i64 %"39" to ptr - %"132" = getelementptr inbounds i8, ptr %"119", i64 4 - %"38" = load float, ptr %"132", align 4 - store float %"38", ptr addrspace(5) %"7", align 4 - %"41" = load i64, ptr addrspace(5) %"4", align 8 - %"120" = inttoptr i64 %"41" to ptr - 
%"134" = getelementptr inbounds i8, ptr %"120", i64 8 - %"40" = load float, ptr %"134", align 4 - store float %"40", ptr addrspace(5) %"8", align 4 - %"43" = load i64, ptr addrspace(5) %"4", align 8 - %"121" = inttoptr i64 %"43" to ptr - %"136" = getelementptr inbounds i8, ptr %"121", i64 12 - %"42" = load float, ptr %"136", align 4 - store float %"42", ptr addrspace(5) %"9", align 4 - %"45" = load i64, ptr addrspace(5) %"4", align 8 - %"122" = inttoptr i64 %"45" to ptr - %"138" = getelementptr inbounds i8, ptr %"122", i64 16 - %"44" = load float, ptr %"138", align 4 - store float %"44", ptr addrspace(5) %"10", align 4 - %"47" = load i64, ptr addrspace(5) %"4", align 8 - %"123" = inttoptr i64 %"47" to ptr - %"140" = getelementptr inbounds i8, ptr %"123", i64 20 - %"46" = load float, ptr %"140", align 4 - store float %"46", ptr addrspace(5) %"11", align 4 - %"49" = load i64, ptr addrspace(5) %"4", align 8 - %"124" = inttoptr i64 %"49" to ptr - %"142" = getelementptr inbounds i8, ptr %"124", i64 24 - %"48" = load float, ptr %"142", align 4 - store float %"48", ptr addrspace(5) %"12", align 4 - %"51" = load i64, ptr addrspace(5) %"4", align 8 - %"125" = inttoptr i64 %"51" to ptr - %"144" = getelementptr inbounds i8, ptr %"125", i64 28 - %"50" = load float, ptr %"144", align 4 - store float %"50", ptr addrspace(5) %"13", align 4 - %"53" = load float, ptr addrspace(5) %"6", align 4 - %"54" = load float, ptr addrspace(5) %"7", align 4 - %"52" = fcmp uno float %"53", %"54" - store i1 %"52", ptr addrspace(5) %"15", align 1 - %"55" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"55", label %"16", label %"17" + %1 = alloca i32, align 4, addrspace(5) + %2 = alloca i32, align 4, addrspace(5) + %3 = alloca i32, align 4, addrspace(5) + %4 = alloca i32, align 4, addrspace(5) + %5 = alloca i32, align 4, addrspace(5) + %6 = alloca i32, align 4, addrspace(5) + %7 = alloca i32, align 4, addrspace(5) + %8 = alloca i32, align 4, addrspace(5) + br label %9 -"16": ; preds = %"130" - 
%0 = alloca i32, align 4, addrspace(5) - store i32 1, ptr addrspace(5) %0, align 4 - %"56" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"56", ptr addrspace(5) %"14", align 4 +9: ; preds = %0 + store i1 false, ptr addrspace(5) %"32", align 1 + %"33" = load i64, ptr addrspace(4) %"115", align 8 + store i64 %"33", ptr addrspace(5) %"4", align 8 + %"34" = load i64, ptr addrspace(4) %"116", align 8 + store i64 %"34", ptr addrspace(5) %"5", align 8 + %"36" = load i64, ptr addrspace(5) %"4", align 8 + %"117" = inttoptr i64 %"36" to ptr + %"35" = load float, ptr %"117", align 4 + store float %"35", ptr addrspace(5) %"6", align 4 + %"38" = load i64, ptr addrspace(5) %"4", align 8 + %"118" = inttoptr i64 %"38" to ptr + %"130" = getelementptr inbounds i8, ptr %"118", i64 4 + %"37" = load float, ptr %"130", align 4 + store float %"37", ptr addrspace(5) %"7", align 4 + %"40" = load i64, ptr addrspace(5) %"4", align 8 + %"119" = inttoptr i64 %"40" to ptr + %"132" = getelementptr inbounds i8, ptr %"119", i64 8 + %"39" = load float, ptr %"132", align 4 + store float %"39", ptr addrspace(5) %"8", align 4 + %"42" = load i64, ptr addrspace(5) %"4", align 8 + %"120" = inttoptr i64 %"42" to ptr + %"134" = getelementptr inbounds i8, ptr %"120", i64 12 + %"41" = load float, ptr %"134", align 4 + store float %"41", ptr addrspace(5) %"9", align 4 + %"44" = load i64, ptr addrspace(5) %"4", align 8 + %"121" = inttoptr i64 %"44" to ptr + %"136" = getelementptr inbounds i8, ptr %"121", i64 16 + %"43" = load float, ptr %"136", align 4 + store float %"43", ptr addrspace(5) %"10", align 4 + %"46" = load i64, ptr addrspace(5) %"4", align 8 + %"122" = inttoptr i64 %"46" to ptr + %"138" = getelementptr inbounds i8, ptr %"122", i64 20 + %"45" = load float, ptr %"138", align 4 + store float %"45", ptr addrspace(5) %"11", align 4 + %"48" = load i64, ptr addrspace(5) %"4", align 8 + %"123" = inttoptr i64 %"48" to ptr + %"140" = getelementptr inbounds i8, ptr %"123", i64 24 + %"47" = load float, 
ptr %"140", align 4 + store float %"47", ptr addrspace(5) %"12", align 4 + %"50" = load i64, ptr addrspace(5) %"4", align 8 + %"124" = inttoptr i64 %"50" to ptr + %"142" = getelementptr inbounds i8, ptr %"124", i64 28 + %"49" = load float, ptr %"142", align 4 + store float %"49", ptr addrspace(5) %"13", align 4 + %"52" = load float, ptr addrspace(5) %"6", align 4 + %"53" = load float, ptr addrspace(5) %"7", align 4 + %"51" = fcmp uno float %"52", %"53" + store i1 %"51", ptr addrspace(5) %"15", align 1 + %"54" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"54", label %"16", label %"17" + +"16": ; preds = %9 + store i32 1, ptr addrspace(5) %1, align 4 + %"55" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"55", ptr addrspace(5) %"14", align 4 br label %"17" -"17": ; preds = %"16", %"130" - %"57" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"57", label %"19", label %"18" +"17": ; preds = %"16", %9 + %"56" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"56", label %"19", label %"18" "18": ; preds = %"17" - %1 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %1, align 4 - %"58" = load i32, ptr addrspace(5) %1, align 4 - store i32 %"58", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %2, align 4 + %"57" = load i32, ptr addrspace(5) %2, align 4 + store i32 %"57", ptr addrspace(5) %"14", align 4 br label %"19" "19": ; preds = %"18", %"17" - %"59" = load i64, ptr addrspace(5) %"5", align 8 - %"60" = load i32, ptr addrspace(5) %"14", align 4 - %"126" = inttoptr i64 %"59" to ptr - store i32 %"60", ptr %"126", align 4 - %"62" = load float, ptr addrspace(5) %"8", align 4 - %"63" = load float, ptr addrspace(5) %"9", align 4 - %"61" = fcmp uno float %"62", %"63" - store i1 %"61", ptr addrspace(5) %"15", align 1 - %"64" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"64", label %"20", label %"21" + %"58" = load i64, ptr addrspace(5) %"5", align 8 + %"59" = load i32, ptr addrspace(5) %"14", align 4 + %"125" = 
inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"125", align 4 + %"61" = load float, ptr addrspace(5) %"8", align 4 + %"62" = load float, ptr addrspace(5) %"9", align 4 + %"60" = fcmp uno float %"61", %"62" + store i1 %"60", ptr addrspace(5) %"15", align 1 + %"63" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"63", label %"20", label %"21" "20": ; preds = %"19" - %2 = alloca i32, align 4, addrspace(5) - store i32 1, ptr addrspace(5) %2, align 4 - %"65" = load i32, ptr addrspace(5) %2, align 4 - store i32 %"65", ptr addrspace(5) %"14", align 4 + store i32 1, ptr addrspace(5) %3, align 4 + %"64" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"64", ptr addrspace(5) %"14", align 4 br label %"21" "21": ; preds = %"20", %"19" - %"66" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"66", label %"23", label %"22" + %"65" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"65", label %"23", label %"22" "22": ; preds = %"21" - %3 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %3, align 4 - %"67" = load i32, ptr addrspace(5) %3, align 4 - store i32 %"67", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %4, align 4 + %"66" = load i32, ptr addrspace(5) %4, align 4 + store i32 %"66", ptr addrspace(5) %"14", align 4 br label %"23" "23": ; preds = %"22", %"21" - %"68" = load i64, ptr addrspace(5) %"5", align 8 - %"69" = load i32, ptr addrspace(5) %"14", align 4 - %"127" = inttoptr i64 %"68" to ptr - %"146" = getelementptr inbounds i8, ptr %"127", i64 4 - store i32 %"69", ptr %"146", align 4 - %"71" = load float, ptr addrspace(5) %"10", align 4 - %"72" = load float, ptr addrspace(5) %"11", align 4 - %"70" = fcmp uno float %"71", %"72" - store i1 %"70", ptr addrspace(5) %"15", align 1 - %"73" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"73", label %"24", label %"25" + %"67" = load i64, ptr addrspace(5) %"5", align 8 + %"68" = load i32, ptr addrspace(5) %"14", align 4 + %"126" = inttoptr i64 %"67" to ptr + %"144" = 
getelementptr inbounds i8, ptr %"126", i64 4 + store i32 %"68", ptr %"144", align 4 + %"70" = load float, ptr addrspace(5) %"10", align 4 + %"71" = load float, ptr addrspace(5) %"11", align 4 + %"69" = fcmp uno float %"70", %"71" + store i1 %"69", ptr addrspace(5) %"15", align 1 + %"72" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"72", label %"24", label %"25" "24": ; preds = %"23" - %4 = alloca i32, align 4, addrspace(5) - store i32 1, ptr addrspace(5) %4, align 4 - %"74" = load i32, ptr addrspace(5) %4, align 4 - store i32 %"74", ptr addrspace(5) %"14", align 4 + store i32 1, ptr addrspace(5) %5, align 4 + %"73" = load i32, ptr addrspace(5) %5, align 4 + store i32 %"73", ptr addrspace(5) %"14", align 4 br label %"25" "25": ; preds = %"24", %"23" - %"75" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"75", label %"27", label %"26" + %"74" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"74", label %"27", label %"26" "26": ; preds = %"25" - %5 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %5, align 4 - %"76" = load i32, ptr addrspace(5) %5, align 4 - store i32 %"76", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %6, align 4 + %"75" = load i32, ptr addrspace(5) %6, align 4 + store i32 %"75", ptr addrspace(5) %"14", align 4 br label %"27" "27": ; preds = %"26", %"25" - %"77" = load i64, ptr addrspace(5) %"5", align 8 - %"78" = load i32, ptr addrspace(5) %"14", align 4 - %"128" = inttoptr i64 %"77" to ptr - %"148" = getelementptr inbounds i8, ptr %"128", i64 8 - store i32 %"78", ptr %"148", align 4 - %"80" = load float, ptr addrspace(5) %"12", align 4 - %"81" = load float, ptr addrspace(5) %"13", align 4 - %"79" = fcmp uno float %"80", %"81" - store i1 %"79", ptr addrspace(5) %"15", align 1 - %"82" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"82", label %"28", label %"29" + %"76" = load i64, ptr addrspace(5) %"5", align 8 + %"77" = load i32, ptr addrspace(5) %"14", align 4 + %"127" = inttoptr i64 %"76" 
to ptr + %"146" = getelementptr inbounds i8, ptr %"127", i64 8 + store i32 %"77", ptr %"146", align 4 + %"79" = load float, ptr addrspace(5) %"12", align 4 + %"80" = load float, ptr addrspace(5) %"13", align 4 + %"78" = fcmp uno float %"79", %"80" + store i1 %"78", ptr addrspace(5) %"15", align 1 + %"81" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"81", label %"28", label %"29" "28": ; preds = %"27" - %6 = alloca i32, align 4, addrspace(5) - store i32 1, ptr addrspace(5) %6, align 4 - %"83" = load i32, ptr addrspace(5) %6, align 4 - store i32 %"83", ptr addrspace(5) %"14", align 4 + store i32 1, ptr addrspace(5) %7, align 4 + %"82" = load i32, ptr addrspace(5) %7, align 4 + store i32 %"82", ptr addrspace(5) %"14", align 4 br label %"29" "29": ; preds = %"28", %"27" - %"84" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"84", label %"31", label %"30" + %"83" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"83", label %"31", label %"30" "30": ; preds = %"29" - %7 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %7, align 4 - %"85" = load i32, ptr addrspace(5) %7, align 4 - store i32 %"85", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %8, align 4 + %"84" = load i32, ptr addrspace(5) %8, align 4 + store i32 %"84", ptr addrspace(5) %"14", align 4 br label %"31" "31": ; preds = %"30", %"29" - %"86" = load i64, ptr addrspace(5) %"5", align 8 - %"87" = load i32, ptr addrspace(5) %"14", align 4 - %"129" = inttoptr i64 %"86" to ptr - %"150" = getelementptr inbounds i8, ptr %"129", i64 12 - store i32 %"87", ptr %"150", align 4 + %"85" = load i64, ptr addrspace(5) %"5", align 8 + %"86" = load i32, ptr addrspace(5) %"14", align 4 + %"128" = inttoptr i64 %"85" to ptr + %"148" = getelementptr inbounds i8, ptr %"128", i64 12 + store i32 %"86", ptr %"148", align 4 ret void } diff --git a/ptx/src/test/spirv_run/setp_num.ll b/ptx/src/test/spirv_run/setp_num.ll index 07cf161..bebecc4 100644 --- a/ptx/src/test/spirv_run/setp_num.ll 
+++ b/ptx/src/test/spirv_run/setp_num.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"116", ptr addrspace(4) byref(i64) %"117") #0 { -"130": +define protected amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"115", ptr addrspace(4) byref(i64) %"116") #0 { %"32" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"32", align 1 - %"33" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"33", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) @@ -19,172 +15,176 @@ define protected amdgpu_kernel void @setp_num(ptr addrspace(4) byref(i64) %"116" %"13" = alloca float, align 4, addrspace(5) %"14" = alloca i32, align 4, addrspace(5) %"15" = alloca i1, align 1, addrspace(5) - %"34" = load i64, ptr addrspace(4) %"116", align 8 - store i64 %"34", ptr addrspace(5) %"4", align 8 - %"35" = load i64, ptr addrspace(4) %"117", align 8 - store i64 %"35", ptr addrspace(5) %"5", align 8 - %"37" = load i64, ptr addrspace(5) %"4", align 8 - %"118" = inttoptr i64 %"37" to ptr - %"36" = load float, ptr %"118", align 4 - store float %"36", ptr addrspace(5) %"6", align 4 - %"39" = load i64, ptr addrspace(5) %"4", align 8 - %"119" = inttoptr i64 %"39" to ptr - %"132" = getelementptr inbounds i8, ptr %"119", i64 4 - %"38" = load float, ptr %"132", align 4 - store float %"38", ptr addrspace(5) %"7", align 4 - %"41" = load i64, ptr addrspace(5) %"4", align 8 - %"120" = inttoptr i64 %"41" to ptr - %"134" = getelementptr inbounds i8, ptr %"120", i64 8 - %"40" = load float, ptr %"134", align 4 - store float %"40", ptr addrspace(5) %"8", align 4 - %"43" = load i64, ptr addrspace(5) %"4", 
align 8 - %"121" = inttoptr i64 %"43" to ptr - %"136" = getelementptr inbounds i8, ptr %"121", i64 12 - %"42" = load float, ptr %"136", align 4 - store float %"42", ptr addrspace(5) %"9", align 4 - %"45" = load i64, ptr addrspace(5) %"4", align 8 - %"122" = inttoptr i64 %"45" to ptr - %"138" = getelementptr inbounds i8, ptr %"122", i64 16 - %"44" = load float, ptr %"138", align 4 - store float %"44", ptr addrspace(5) %"10", align 4 - %"47" = load i64, ptr addrspace(5) %"4", align 8 - %"123" = inttoptr i64 %"47" to ptr - %"140" = getelementptr inbounds i8, ptr %"123", i64 20 - %"46" = load float, ptr %"140", align 4 - store float %"46", ptr addrspace(5) %"11", align 4 - %"49" = load i64, ptr addrspace(5) %"4", align 8 - %"124" = inttoptr i64 %"49" to ptr - %"142" = getelementptr inbounds i8, ptr %"124", i64 24 - %"48" = load float, ptr %"142", align 4 - store float %"48", ptr addrspace(5) %"12", align 4 - %"51" = load i64, ptr addrspace(5) %"4", align 8 - %"125" = inttoptr i64 %"51" to ptr - %"144" = getelementptr inbounds i8, ptr %"125", i64 28 - %"50" = load float, ptr %"144", align 4 - store float %"50", ptr addrspace(5) %"13", align 4 - %"53" = load float, ptr addrspace(5) %"6", align 4 - %"54" = load float, ptr addrspace(5) %"7", align 4 - %"52" = fcmp ord float %"53", %"54" - store i1 %"52", ptr addrspace(5) %"15", align 1 - %"55" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"55", label %"16", label %"17" + %1 = alloca i32, align 4, addrspace(5) + %2 = alloca i32, align 4, addrspace(5) + %3 = alloca i32, align 4, addrspace(5) + %4 = alloca i32, align 4, addrspace(5) + %5 = alloca i32, align 4, addrspace(5) + %6 = alloca i32, align 4, addrspace(5) + %7 = alloca i32, align 4, addrspace(5) + %8 = alloca i32, align 4, addrspace(5) + br label %9 -"16": ; preds = %"130" - %0 = alloca i32, align 4, addrspace(5) - store i32 2, ptr addrspace(5) %0, align 4 - %"56" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"56", ptr addrspace(5) %"14", align 4 +9: ; 
preds = %0 + store i1 false, ptr addrspace(5) %"32", align 1 + %"33" = load i64, ptr addrspace(4) %"115", align 8 + store i64 %"33", ptr addrspace(5) %"4", align 8 + %"34" = load i64, ptr addrspace(4) %"116", align 8 + store i64 %"34", ptr addrspace(5) %"5", align 8 + %"36" = load i64, ptr addrspace(5) %"4", align 8 + %"117" = inttoptr i64 %"36" to ptr + %"35" = load float, ptr %"117", align 4 + store float %"35", ptr addrspace(5) %"6", align 4 + %"38" = load i64, ptr addrspace(5) %"4", align 8 + %"118" = inttoptr i64 %"38" to ptr + %"130" = getelementptr inbounds i8, ptr %"118", i64 4 + %"37" = load float, ptr %"130", align 4 + store float %"37", ptr addrspace(5) %"7", align 4 + %"40" = load i64, ptr addrspace(5) %"4", align 8 + %"119" = inttoptr i64 %"40" to ptr + %"132" = getelementptr inbounds i8, ptr %"119", i64 8 + %"39" = load float, ptr %"132", align 4 + store float %"39", ptr addrspace(5) %"8", align 4 + %"42" = load i64, ptr addrspace(5) %"4", align 8 + %"120" = inttoptr i64 %"42" to ptr + %"134" = getelementptr inbounds i8, ptr %"120", i64 12 + %"41" = load float, ptr %"134", align 4 + store float %"41", ptr addrspace(5) %"9", align 4 + %"44" = load i64, ptr addrspace(5) %"4", align 8 + %"121" = inttoptr i64 %"44" to ptr + %"136" = getelementptr inbounds i8, ptr %"121", i64 16 + %"43" = load float, ptr %"136", align 4 + store float %"43", ptr addrspace(5) %"10", align 4 + %"46" = load i64, ptr addrspace(5) %"4", align 8 + %"122" = inttoptr i64 %"46" to ptr + %"138" = getelementptr inbounds i8, ptr %"122", i64 20 + %"45" = load float, ptr %"138", align 4 + store float %"45", ptr addrspace(5) %"11", align 4 + %"48" = load i64, ptr addrspace(5) %"4", align 8 + %"123" = inttoptr i64 %"48" to ptr + %"140" = getelementptr inbounds i8, ptr %"123", i64 24 + %"47" = load float, ptr %"140", align 4 + store float %"47", ptr addrspace(5) %"12", align 4 + %"50" = load i64, ptr addrspace(5) %"4", align 8 + %"124" = inttoptr i64 %"50" to ptr + %"142" = getelementptr 
inbounds i8, ptr %"124", i64 28 + %"49" = load float, ptr %"142", align 4 + store float %"49", ptr addrspace(5) %"13", align 4 + %"52" = load float, ptr addrspace(5) %"6", align 4 + %"53" = load float, ptr addrspace(5) %"7", align 4 + %"51" = fcmp ord float %"52", %"53" + store i1 %"51", ptr addrspace(5) %"15", align 1 + %"54" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"54", label %"16", label %"17" + +"16": ; preds = %9 + store i32 2, ptr addrspace(5) %1, align 4 + %"55" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"55", ptr addrspace(5) %"14", align 4 br label %"17" -"17": ; preds = %"16", %"130" - %"57" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"57", label %"19", label %"18" +"17": ; preds = %"16", %9 + %"56" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"56", label %"19", label %"18" "18": ; preds = %"17" - %1 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %1, align 4 - %"58" = load i32, ptr addrspace(5) %1, align 4 - store i32 %"58", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %2, align 4 + %"57" = load i32, ptr addrspace(5) %2, align 4 + store i32 %"57", ptr addrspace(5) %"14", align 4 br label %"19" "19": ; preds = %"18", %"17" - %"59" = load i64, ptr addrspace(5) %"5", align 8 - %"60" = load i32, ptr addrspace(5) %"14", align 4 - %"126" = inttoptr i64 %"59" to ptr - store i32 %"60", ptr %"126", align 4 - %"62" = load float, ptr addrspace(5) %"8", align 4 - %"63" = load float, ptr addrspace(5) %"9", align 4 - %"61" = fcmp ord float %"62", %"63" - store i1 %"61", ptr addrspace(5) %"15", align 1 - %"64" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"64", label %"20", label %"21" + %"58" = load i64, ptr addrspace(5) %"5", align 8 + %"59" = load i32, ptr addrspace(5) %"14", align 4 + %"125" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"125", align 4 + %"61" = load float, ptr addrspace(5) %"8", align 4 + %"62" = load float, ptr addrspace(5) %"9", align 4 + %"60" = fcmp ord 
float %"61", %"62" + store i1 %"60", ptr addrspace(5) %"15", align 1 + %"63" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"63", label %"20", label %"21" "20": ; preds = %"19" - %2 = alloca i32, align 4, addrspace(5) - store i32 2, ptr addrspace(5) %2, align 4 - %"65" = load i32, ptr addrspace(5) %2, align 4 - store i32 %"65", ptr addrspace(5) %"14", align 4 + store i32 2, ptr addrspace(5) %3, align 4 + %"64" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"64", ptr addrspace(5) %"14", align 4 br label %"21" "21": ; preds = %"20", %"19" - %"66" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"66", label %"23", label %"22" + %"65" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"65", label %"23", label %"22" "22": ; preds = %"21" - %3 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %3, align 4 - %"67" = load i32, ptr addrspace(5) %3, align 4 - store i32 %"67", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %4, align 4 + %"66" = load i32, ptr addrspace(5) %4, align 4 + store i32 %"66", ptr addrspace(5) %"14", align 4 br label %"23" "23": ; preds = %"22", %"21" - %"68" = load i64, ptr addrspace(5) %"5", align 8 - %"69" = load i32, ptr addrspace(5) %"14", align 4 - %"127" = inttoptr i64 %"68" to ptr - %"146" = getelementptr inbounds i8, ptr %"127", i64 4 - store i32 %"69", ptr %"146", align 4 - %"71" = load float, ptr addrspace(5) %"10", align 4 - %"72" = load float, ptr addrspace(5) %"11", align 4 - %"70" = fcmp ord float %"71", %"72" - store i1 %"70", ptr addrspace(5) %"15", align 1 - %"73" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"73", label %"24", label %"25" + %"67" = load i64, ptr addrspace(5) %"5", align 8 + %"68" = load i32, ptr addrspace(5) %"14", align 4 + %"126" = inttoptr i64 %"67" to ptr + %"144" = getelementptr inbounds i8, ptr %"126", i64 4 + store i32 %"68", ptr %"144", align 4 + %"70" = load float, ptr addrspace(5) %"10", align 4 + %"71" = load float, ptr addrspace(5) %"11", align 4 + 
%"69" = fcmp ord float %"70", %"71" + store i1 %"69", ptr addrspace(5) %"15", align 1 + %"72" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"72", label %"24", label %"25" "24": ; preds = %"23" - %4 = alloca i32, align 4, addrspace(5) - store i32 2, ptr addrspace(5) %4, align 4 - %"74" = load i32, ptr addrspace(5) %4, align 4 - store i32 %"74", ptr addrspace(5) %"14", align 4 + store i32 2, ptr addrspace(5) %5, align 4 + %"73" = load i32, ptr addrspace(5) %5, align 4 + store i32 %"73", ptr addrspace(5) %"14", align 4 br label %"25" "25": ; preds = %"24", %"23" - %"75" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"75", label %"27", label %"26" + %"74" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"74", label %"27", label %"26" "26": ; preds = %"25" - %5 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %5, align 4 - %"76" = load i32, ptr addrspace(5) %5, align 4 - store i32 %"76", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %6, align 4 + %"75" = load i32, ptr addrspace(5) %6, align 4 + store i32 %"75", ptr addrspace(5) %"14", align 4 br label %"27" "27": ; preds = %"26", %"25" - %"77" = load i64, ptr addrspace(5) %"5", align 8 - %"78" = load i32, ptr addrspace(5) %"14", align 4 - %"128" = inttoptr i64 %"77" to ptr - %"148" = getelementptr inbounds i8, ptr %"128", i64 8 - store i32 %"78", ptr %"148", align 4 - %"80" = load float, ptr addrspace(5) %"12", align 4 - %"81" = load float, ptr addrspace(5) %"13", align 4 - %"79" = fcmp ord float %"80", %"81" - store i1 %"79", ptr addrspace(5) %"15", align 1 - %"82" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"82", label %"28", label %"29" + %"76" = load i64, ptr addrspace(5) %"5", align 8 + %"77" = load i32, ptr addrspace(5) %"14", align 4 + %"127" = inttoptr i64 %"76" to ptr + %"146" = getelementptr inbounds i8, ptr %"127", i64 8 + store i32 %"77", ptr %"146", align 4 + %"79" = load float, ptr addrspace(5) %"12", align 4 + %"80" = load float, ptr addrspace(5) 
%"13", align 4 + %"78" = fcmp ord float %"79", %"80" + store i1 %"78", ptr addrspace(5) %"15", align 1 + %"81" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"81", label %"28", label %"29" "28": ; preds = %"27" - %6 = alloca i32, align 4, addrspace(5) - store i32 2, ptr addrspace(5) %6, align 4 - %"83" = load i32, ptr addrspace(5) %6, align 4 - store i32 %"83", ptr addrspace(5) %"14", align 4 + store i32 2, ptr addrspace(5) %7, align 4 + %"82" = load i32, ptr addrspace(5) %7, align 4 + store i32 %"82", ptr addrspace(5) %"14", align 4 br label %"29" "29": ; preds = %"28", %"27" - %"84" = load i1, ptr addrspace(5) %"15", align 1 - br i1 %"84", label %"31", label %"30" + %"83" = load i1, ptr addrspace(5) %"15", align 1 + br i1 %"83", label %"31", label %"30" "30": ; preds = %"29" - %7 = alloca i32, align 4, addrspace(5) - store i32 0, ptr addrspace(5) %7, align 4 - %"85" = load i32, ptr addrspace(5) %7, align 4 - store i32 %"85", ptr addrspace(5) %"14", align 4 + store i32 0, ptr addrspace(5) %8, align 4 + %"84" = load i32, ptr addrspace(5) %8, align 4 + store i32 %"84", ptr addrspace(5) %"14", align 4 br label %"31" "31": ; preds = %"30", %"29" - %"86" = load i64, ptr addrspace(5) %"5", align 8 - %"87" = load i32, ptr addrspace(5) %"14", align 4 - %"129" = inttoptr i64 %"86" to ptr - %"150" = getelementptr inbounds i8, ptr %"129", i64 12 - store i32 %"87", ptr %"150", align 4 + %"85" = load i64, ptr addrspace(5) %"5", align 8 + %"86" = load i32, ptr addrspace(5) %"14", align 4 + %"128" = inttoptr i64 %"85" to ptr + %"148" = getelementptr inbounds i8, ptr %"128", i64 12 + store i32 %"86", ptr %"148", align 4 ret void } diff --git a/ptx/src/test/spirv_run/setp_pred2.ll b/ptx/src/test/spirv_run/setp_pred2.ll index 9ce8135..01ae23e 100644 --- a/ptx/src/test/spirv_run/setp_pred2.ll +++ b/ptx/src/test/spirv_run/setp_pred2.ll @@ -1,12 +1,8 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @setp_pred2(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { -"42": +define protected amdgpu_kernel void @setp_pred2(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 - %"16" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"16", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) @@ -14,53 +10,57 @@ define protected amdgpu_kernel void @setp_pred2(ptr addrspace(4) byref(i64) %"37 %"8" = alloca float, align 4, addrspace(5) %"9" = alloca i1, align 1, addrspace(5) %"10" = alloca i1, align 1, addrspace(5) - %"17" = load i64, ptr addrspace(4) %"37", align 8 - store i64 %"17", ptr addrspace(5) %"4", align 8 - %"18" = load i64, ptr addrspace(4) %"38", align 8 - store i64 %"18", ptr addrspace(5) %"5", align 8 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"39" = inttoptr i64 %"20" to ptr - %"19" = load float, ptr %"39", align 4 - store float %"19", ptr addrspace(5) %"6", align 4 - %"22" = load i64, ptr addrspace(5) %"4", align 8 - %"40" = inttoptr i64 %"22" to ptr - %"44" = getelementptr inbounds i8, ptr %"40", i64 4 - %"21" = load float, ptr %"44", align 4 - store float %"21", ptr addrspace(5) %"7", align 4 - %"25" = load float, ptr addrspace(5) %"6", align 4 - %"26" = load float, ptr addrspace(5) %"7", align 4 - %"23" = fcmp ogt float %"25", %"26" - %"24" = xor i1 %"23", true - store i1 %"23", ptr addrspace(5) %"9", align 1 - store i1 %"24", ptr addrspace(5) %"10", align 1 - %"27" = load i1, ptr addrspace(5) %"9", align 1 - br i1 %"27", label %"11", label %"12" + %1 = alloca 
float, align 4, addrspace(5) + %2 = alloca float, align 4, addrspace(5) + br label %3 -"11": ; preds = %"42" - %"29" = load float, ptr addrspace(5) %"6", align 4 - %0 = alloca float, align 4, addrspace(5) - store float %"29", ptr addrspace(5) %0, align 4 - %"28" = load float, ptr addrspace(5) %0, align 4 - store float %"28", ptr addrspace(5) %"8", align 4 +3: ; preds = %0 + store i1 false, ptr addrspace(5) %"15", align 1 + %"16" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"16", ptr addrspace(5) %"4", align 8 + %"17" = load i64, ptr addrspace(4) %"37", align 8 + store i64 %"17", ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"19" to ptr + %"18" = load float, ptr %"38", align 4 + store float %"18", ptr addrspace(5) %"6", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"21" to ptr + %"42" = getelementptr inbounds i8, ptr %"39", i64 4 + %"20" = load float, ptr %"42", align 4 + store float %"20", ptr addrspace(5) %"7", align 4 + %"24" = load float, ptr addrspace(5) %"6", align 4 + %"25" = load float, ptr addrspace(5) %"7", align 4 + %"22" = fcmp ogt float %"24", %"25" + %"23" = xor i1 %"22", true + store i1 %"22", ptr addrspace(5) %"9", align 1 + store i1 %"23", ptr addrspace(5) %"10", align 1 + %"26" = load i1, ptr addrspace(5) %"9", align 1 + br i1 %"26", label %"11", label %"12" + +"11": ; preds = %3 + %"28" = load float, ptr addrspace(5) %"6", align 4 + store float %"28", ptr addrspace(5) %1, align 4 + %"27" = load float, ptr addrspace(5) %1, align 4 + store float %"27", ptr addrspace(5) %"8", align 4 br label %"12" -"12": ; preds = %"11", %"42" - %"30" = load i1, ptr addrspace(5) %"10", align 1 - br i1 %"30", label %"13", label %"14" +"12": ; preds = %"11", %3 + %"29" = load i1, ptr addrspace(5) %"10", align 1 + br i1 %"29", label %"13", label %"14" "13": ; preds = %"12" - %"32" = load float, ptr addrspace(5) %"7", align 4 - %1 = alloca float, align 4, 
addrspace(5) - store float %"32", ptr addrspace(5) %1, align 4 - %"31" = load float, ptr addrspace(5) %1, align 4 - store float %"31", ptr addrspace(5) %"8", align 4 + %"31" = load float, ptr addrspace(5) %"7", align 4 + store float %"31", ptr addrspace(5) %2, align 4 + %"30" = load float, ptr addrspace(5) %2, align 4 + store float %"30", ptr addrspace(5) %"8", align 4 br label %"14" "14": ; preds = %"13", %"12" - %"33" = load i64, ptr addrspace(5) %"5", align 8 - %"34" = load float, ptr addrspace(5) %"8", align 4 - %"41" = inttoptr i64 %"33" to ptr - store float %"34", ptr %"41", align 4 + %"32" = load i64, ptr addrspace(5) %"5", align 8 + %"33" = load float, ptr addrspace(5) %"8", align 4 + %"40" = inttoptr i64 %"32" to ptr + store float %"33", ptr %"40", align 4 ret void } diff --git a/ptx/src/test/spirv_run/shared_ptr_32.ll b/ptx/src/test/spirv_run/shared_ptr_32.ll index a132a58..f3e0269 100644 --- a/ptx/src/test/spirv_run/shared_ptr_32.ll +++ b/ptx/src/test/spirv_run/shared_ptr_32.ll @@ -3,42 +3,42 @@ target triple = "amdgcn-amd-amdhsa" @"4" = private addrspace(3) global [128 x i8] undef, align 4 -define protected amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { -"32": +define protected amdgpu_kernel void @shared_ptr_32(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) + %1 = alloca i32, align 4, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 
%"12" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(4) %"26", align 8 - store i64 %"13", ptr addrspace(5) %"6", align 8 - %0 = alloca i32, align 4, addrspace(5) - store i32 ptrtoint (ptr addrspace(3) @"4" to i32), ptr addrspace(5) %0, align 4 - %"14" = load i32, ptr addrspace(5) %0, align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"28" = inttoptr i64 %"16" to ptr addrspace(1) - %"15" = load i64, ptr addrspace(1) %"28", align 8 - store i64 %"15", ptr addrspace(5) %"8", align 8 - %"17" = load i32, ptr addrspace(5) %"7", align 4 - %"18" = load i64, ptr addrspace(5) %"8", align 8 - %"29" = inttoptr i32 %"17" to ptr addrspace(3) - store i64 %"18", ptr addrspace(3) %"29", align 8 - %"20" = load i32, ptr addrspace(5) %"7", align 4 - %"30" = inttoptr i32 %"20" to ptr addrspace(3) - %"34" = getelementptr inbounds i8, ptr addrspace(3) %"30", i64 0 - %"19" = load i64, ptr addrspace(3) %"34", align 8 - store i64 %"19", ptr addrspace(5) %"9", align 8 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %"22" = load i64, ptr addrspace(5) %"9", align 8 - %"31" = inttoptr i64 %"21" to ptr addrspace(1) - store i64 %"22", ptr addrspace(1) %"31", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + store i32 ptrtoint (ptr addrspace(3) @"4" to i32), ptr addrspace(5) %1, align 4 + %"13" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = inttoptr i64 %"15" to ptr addrspace(1) + %"14" = load i64, ptr addrspace(1) %"27", align 8 + store i64 %"14", ptr addrspace(5) %"8", align 8 + %"16" = load i32, ptr addrspace(5) %"7", align 4 + %"17" = load i64, ptr addrspace(5) %"8", align 8 + %"28" = inttoptr i32 %"16" to ptr addrspace(3) + store i64 %"17", ptr addrspace(3) %"28", align 8 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"29" = 
inttoptr i32 %"19" to ptr addrspace(3) + %"32" = getelementptr inbounds i8, ptr addrspace(3) %"29", i64 0 + %"18" = load i64, ptr addrspace(3) %"32", align 8 + store i64 %"18", ptr addrspace(5) %"9", align 8 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = load i64, ptr addrspace(5) %"9", align 8 + %"30" = inttoptr i64 %"20" to ptr addrspace(1) + store i64 %"21", ptr addrspace(1) %"30", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shared_ptr_take_address.ll b/ptx/src/test/spirv_run/shared_ptr_take_address.ll index a3d3e5d..fd61d71 100644 --- a/ptx/src/test/spirv_run/shared_ptr_take_address.ll +++ b/ptx/src/test/spirv_run/shared_ptr_take_address.ll @@ -3,41 +3,41 @@ target triple = "amdgcn-amd-amdhsa" @shared_mem = external hidden addrspace(3) global [0 x i8], align 4 -define protected amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"30": +define protected amdgpu_kernel void @shared_ptr_take_address(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) %"9" = alloca i64, align 8, addrspace(5) + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 %"12" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"13", ptr addrspace(5) %"6", align 8 - %0 = alloca i64, align 8, addrspace(5) - store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), 
ptr addrspace(5) %0, align 8 - %"14" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"26" = inttoptr i64 %"16" to ptr addrspace(1) - %"15" = load i64, ptr addrspace(1) %"26", align 8 - store i64 %"15", ptr addrspace(5) %"8", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"18" = load i64, ptr addrspace(5) %"8", align 8 - %"27" = inttoptr i64 %"17" to ptr addrspace(3) - store i64 %"18", ptr addrspace(3) %"27", align 8 - %"20" = load i64, ptr addrspace(5) %"7", align 8 - %"28" = inttoptr i64 %"20" to ptr addrspace(3) - %"19" = load i64, ptr addrspace(3) %"28", align 8 - store i64 %"19", ptr addrspace(5) %"9", align 8 - %"21" = load i64, ptr addrspace(5) %"6", align 8 - %"22" = load i64, ptr addrspace(5) %"9", align 8 - %"29" = inttoptr i64 %"21" to ptr addrspace(1) - store i64 %"22", ptr addrspace(1) %"29", align 8 + store i64 %"12", ptr addrspace(5) %"6", align 8 + store i64 ptrtoint (ptr addrspace(3) @shared_mem to i64), ptr addrspace(5) %1, align 8 + %"13" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"13", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"25" = inttoptr i64 %"15" to ptr addrspace(1) + %"14" = load i64, ptr addrspace(1) %"25", align 8 + store i64 %"14", ptr addrspace(5) %"8", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"17" = load i64, ptr addrspace(5) %"8", align 8 + %"26" = inttoptr i64 %"16" to ptr addrspace(3) + store i64 %"17", ptr addrspace(3) %"26", align 8 + %"19" = load i64, ptr addrspace(5) %"7", align 8 + %"27" = inttoptr i64 %"19" to ptr addrspace(3) + %"18" = load i64, ptr addrspace(3) %"27", align 8 + store i64 %"18", ptr addrspace(5) %"9", align 8 + %"20" = load i64, ptr addrspace(5) %"6", align 8 + %"21" = load i64, ptr addrspace(5) %"9", align 8 + %"28" = inttoptr i64 %"20" to ptr addrspace(1) + store i64 %"21", ptr addrspace(1) %"28", align 8 ret void } diff 
--git a/ptx/src/test/spirv_run/shared_unify_decl.ll b/ptx/src/test/spirv_run/shared_unify_decl.ll index 1079e59..61d62d7 100644 --- a/ptx/src/test/spirv_run/shared_unify_decl.ll +++ b/ptx/src/test/spirv_run/shared_unify_decl.ll @@ -4,76 +4,76 @@ target triple = "amdgcn-amd-amdhsa" @shared_ex = external hidden addrspace(3) global [0 x i32] @shared_mod = private addrspace(3) global [4 x i32] undef -define private i64 @"3"(ptr addrspace(3) %"69", ptr addrspace(3) %"70") #0 { -"62": +define private i64 @"3"(ptr addrspace(3) %"63", ptr addrspace(3) %"64") #0 { %"8" = alloca i64, align 8, addrspace(5) %"20" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"20", align 1 - %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) - %"26" = load i64, ptr addrspace(3) %"70", align 8 - store i64 %"26", ptr addrspace(5) %"9", align 8 - %"27" = load i64, ptr addrspace(3) %"69", align 8 - store i64 %"27", ptr addrspace(5) %"10", align 8 - %"29" = load i64, ptr addrspace(5) %"10", align 8 - %"30" = load i64, ptr addrspace(5) %"9", align 8 - %"53" = add i64 %"29", %"30" - store i64 %"53", ptr addrspace(5) %"8", align 8 - %"31" = load i64, ptr addrspace(5) %"8", align 8 - ret i64 %"31" + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"20", align 1 + %"23" = load i64, ptr addrspace(3) %"64", align 8 + store i64 %"23", ptr addrspace(5) %"9", align 8 + %"24" = load i64, ptr addrspace(3) %"63", align 8 + store i64 %"24", ptr addrspace(5) %"10", align 8 + %"26" = load i64, ptr addrspace(5) %"10", align 8 + %"27" = load i64, ptr addrspace(5) %"9", align 8 + %"50" = add i64 %"26", %"27" + store i64 %"50", ptr addrspace(5) %"8", align 8 + %"28" = load i64, ptr addrspace(5) %"8", align 8 + ret i64 %"28" } -define private i64 @"5"(i64 %"32", ptr addrspace(3) %"71", ptr addrspace(3) %"72") #0 { -"63": +define private i64 @"5"(i64 %"29", 
ptr addrspace(3) %"65", ptr addrspace(3) %"66") #0 { %"12" = alloca i64, align 8, addrspace(5) %"11" = alloca i64, align 8, addrspace(5) - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 - %"23" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"23", align 1 - store i64 %"32", ptr addrspace(5) %"12", align 8 - %"33" = load i64, ptr addrspace(5) %"12", align 8 - store i64 %"33", ptr addrspace(3) %"71", align 8 - %"34" = call i64 @"3"(ptr addrspace(3) %"71", ptr addrspace(3) %"72") - store i64 %"34", ptr addrspace(5) %"11", align 8 - %"35" = load i64, ptr addrspace(5) %"11", align 8 - ret i64 %"35" + %"21" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"29", ptr addrspace(5) %"12", align 8 + store i1 false, ptr addrspace(5) %"21", align 1 + %"30" = load i64, ptr addrspace(5) %"12", align 8 + store i64 %"30", ptr addrspace(3) %"65", align 8 + %"31" = call i64 @"3"(ptr addrspace(3) %"65", ptr addrspace(3) %"66") + store i64 %"31", ptr addrspace(5) %"11", align 8 + %"32" = load i64, ptr addrspace(5) %"11", align 8 + ret i64 %"32" } -define protected amdgpu_kernel void @shared_unify_decl(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #0 { -"64": - %"24" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"24", align 1 - %"25" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"25", align 1 +define protected amdgpu_kernel void @shared_unify_decl(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { + %"22" = alloca i1, align 1, addrspace(5) %"16" = alloca i64, align 8, addrspace(5) %"17" = alloca i64, align 8, addrspace(5) %"18" = alloca i64, align 8, addrspace(5) %"19" = alloca i64, align 8, addrspace(5) - %"36" = load i64, ptr addrspace(4) %"49", align 8 - store i64 %"36", ptr addrspace(5) %"16", align 8 - %"37" = load i64, ptr addrspace(4) %"50", align 8 - store i64 %"37", ptr addrspace(5) 
%"17", align 8 - %"39" = load i64, ptr addrspace(5) %"16", align 8 - %"56" = inttoptr i64 %"39" to ptr addrspace(1) - %"38" = load i64, ptr addrspace(1) %"56", align 8 - store i64 %"38", ptr addrspace(5) %"18", align 8 - %"41" = load i64, ptr addrspace(5) %"16", align 8 - %"57" = inttoptr i64 %"41" to ptr addrspace(1) - %"74" = getelementptr inbounds i8, ptr addrspace(1) %"57", i64 8 - %"40" = load i64, ptr addrspace(1) %"74", align 8 - store i64 %"40", ptr addrspace(5) %"19", align 8 - %"42" = load i64, ptr addrspace(5) %"19", align 8 - store i64 %"42", ptr addrspace(3) @shared_mod, align 8 - %"44" = load i64, ptr addrspace(5) %"18", align 8 - %"59" = call i64 @"5"(i64 %"44", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) - store i64 %"59", ptr addrspace(5) %"19", align 8 - %"45" = load i64, ptr addrspace(5) %"17", align 8 - %"46" = load i64, ptr addrspace(5) %"19", align 8 - %"61" = inttoptr i64 %"45" to ptr - store i64 %"46", ptr %"61", align 8 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"22", align 1 + %"33" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"33", ptr addrspace(5) %"16", align 8 + %"34" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"34", ptr addrspace(5) %"17", align 8 + %"36" = load i64, ptr addrspace(5) %"16", align 8 + %"53" = inttoptr i64 %"36" to ptr addrspace(1) + %"35" = load i64, ptr addrspace(1) %"53", align 8 + store i64 %"35", ptr addrspace(5) %"18", align 8 + %"38" = load i64, ptr addrspace(5) %"16", align 8 + %"54" = inttoptr i64 %"38" to ptr addrspace(1) + %"68" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 + %"37" = load i64, ptr addrspace(1) %"68", align 8 + store i64 %"37", ptr addrspace(5) %"19", align 8 + %"39" = load i64, ptr addrspace(5) %"19", align 8 + store i64 %"39", ptr addrspace(3) @shared_mod, align 8 + %"41" = load i64, ptr addrspace(5) %"18", align 8 + %"56" = call i64 @"5"(i64 %"41", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) + 
store i64 %"56", ptr addrspace(5) %"19", align 8 + %"42" = load i64, ptr addrspace(5) %"17", align 8 + %"43" = load i64, ptr addrspace(5) %"19", align 8 + %"58" = inttoptr i64 %"42" to ptr + store i64 %"43", ptr %"58", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shared_unify_extern.ll b/ptx/src/test/spirv_run/shared_unify_extern.ll index d83ea7a..769fd9f 100644 --- a/ptx/src/test/spirv_run/shared_unify_extern.ll +++ b/ptx/src/test/spirv_run/shared_unify_extern.ll @@ -4,76 +4,76 @@ target triple = "amdgcn-amd-amdhsa" @shared_ex = external hidden addrspace(3) global [0 x i32] @shared_mod = private addrspace(3) global [4 x i32] undef -define private i64 @"3"(ptr addrspace(3) %"62", ptr addrspace(3) %"63") #0 { -"59": +define private i64 @"3"(ptr addrspace(3) %"56", ptr addrspace(3) %"57") #0 { %"4" = alloca i64, align 8, addrspace(5) %"17" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"17", align 1 - %"18" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"18", align 1 %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) - %"23" = load i64, ptr addrspace(3) %"63", align 8 - store i64 %"23", ptr addrspace(5) %"5", align 8 - %"24" = load i64, ptr addrspace(3) %"62", align 8 - store i64 %"24", ptr addrspace(5) %"6", align 8 - %"26" = load i64, ptr addrspace(5) %"6", align 8 - %"27" = load i64, ptr addrspace(5) %"5", align 8 - %"50" = add i64 %"26", %"27" - store i64 %"50", ptr addrspace(5) %"4", align 8 - %"28" = load i64, ptr addrspace(5) %"4", align 8 - ret i64 %"28" + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"17", align 1 + %"20" = load i64, ptr addrspace(3) %"57", align 8 + store i64 %"20", ptr addrspace(5) %"5", align 8 + %"21" = load i64, ptr addrspace(3) %"56", align 8 + store i64 %"21", ptr addrspace(5) %"6", align 8 + %"23" = load i64, ptr addrspace(5) %"6", align 8 + %"24" = load i64, ptr addrspace(5) %"5", align 8 + %"47" = add i64 %"23", %"24" + 
store i64 %"47", ptr addrspace(5) %"4", align 8 + %"25" = load i64, ptr addrspace(5) %"4", align 8 + ret i64 %"25" } -define private i64 @"7"(i64 %"29", ptr addrspace(3) %"64", ptr addrspace(3) %"65") #0 { -"60": +define private i64 @"7"(i64 %"26", ptr addrspace(3) %"58", ptr addrspace(3) %"59") #0 { %"9" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) - %"19" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"19", align 1 - %"20" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"20", align 1 - store i64 %"29", ptr addrspace(5) %"9", align 8 - %"30" = load i64, ptr addrspace(5) %"9", align 8 - store i64 %"30", ptr addrspace(3) %"64", align 8 - %"31" = call i64 @"3"(ptr addrspace(3) %"64", ptr addrspace(3) %"65") - store i64 %"31", ptr addrspace(5) %"8", align 8 - %"32" = load i64, ptr addrspace(5) %"8", align 8 - ret i64 %"32" + %"18" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"26", ptr addrspace(5) %"9", align 8 + store i1 false, ptr addrspace(5) %"18", align 1 + %"27" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"27", ptr addrspace(3) %"58", align 8 + %"28" = call i64 @"3"(ptr addrspace(3) %"58", ptr addrspace(3) %"59") + store i64 %"28", ptr addrspace(5) %"8", align 8 + %"29" = load i64, ptr addrspace(5) %"8", align 8 + ret i64 %"29" } -define protected amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"46", ptr addrspace(4) byref(i64) %"47") #0 { -"61": - %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 +define protected amdgpu_kernel void @shared_unify_extern(ptr addrspace(4) byref(i64) %"43", ptr addrspace(4) byref(i64) %"44") #0 { + %"19" = alloca i1, align 1, addrspace(5) %"13" = alloca i64, align 8, addrspace(5) %"14" = alloca i64, align 8, addrspace(5) %"15" = alloca i64, align 8, 
addrspace(5) %"16" = alloca i64, align 8, addrspace(5) - %"33" = load i64, ptr addrspace(4) %"46", align 8 - store i64 %"33", ptr addrspace(5) %"13", align 8 - %"34" = load i64, ptr addrspace(4) %"47", align 8 - store i64 %"34", ptr addrspace(5) %"14", align 8 - %"36" = load i64, ptr addrspace(5) %"13", align 8 - %"53" = inttoptr i64 %"36" to ptr addrspace(1) - %"35" = load i64, ptr addrspace(1) %"53", align 8 - store i64 %"35", ptr addrspace(5) %"15", align 8 - %"38" = load i64, ptr addrspace(5) %"13", align 8 - %"54" = inttoptr i64 %"38" to ptr addrspace(1) - %"67" = getelementptr inbounds i8, ptr addrspace(1) %"54", i64 8 - %"37" = load i64, ptr addrspace(1) %"67", align 8 - store i64 %"37", ptr addrspace(5) %"16", align 8 - %"39" = load i64, ptr addrspace(5) %"16", align 8 - store i64 %"39", ptr addrspace(3) @shared_mod, align 8 - %"41" = load i64, ptr addrspace(5) %"15", align 8 - %"56" = call i64 @"7"(i64 %"41", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) - store i64 %"56", ptr addrspace(5) %"16", align 8 - %"42" = load i64, ptr addrspace(5) %"14", align 8 - %"43" = load i64, ptr addrspace(5) %"16", align 8 - %"58" = inttoptr i64 %"42" to ptr - store i64 %"43", ptr %"58", align 8 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"19", align 1 + %"30" = load i64, ptr addrspace(4) %"43", align 8 + store i64 %"30", ptr addrspace(5) %"13", align 8 + %"31" = load i64, ptr addrspace(4) %"44", align 8 + store i64 %"31", ptr addrspace(5) %"14", align 8 + %"33" = load i64, ptr addrspace(5) %"13", align 8 + %"50" = inttoptr i64 %"33" to ptr addrspace(1) + %"32" = load i64, ptr addrspace(1) %"50", align 8 + store i64 %"32", ptr addrspace(5) %"15", align 8 + %"35" = load i64, ptr addrspace(5) %"13", align 8 + %"51" = inttoptr i64 %"35" to ptr addrspace(1) + %"61" = getelementptr inbounds i8, ptr addrspace(1) %"51", i64 8 + %"34" = load i64, ptr addrspace(1) %"61", align 8 + store i64 %"34", ptr addrspace(5) %"16", align 8 + %"36" = load 
i64, ptr addrspace(5) %"16", align 8 + store i64 %"36", ptr addrspace(3) @shared_mod, align 8 + %"38" = load i64, ptr addrspace(5) %"15", align 8 + %"53" = call i64 @"7"(i64 %"38", ptr addrspace(3) @shared_ex, ptr addrspace(3) @shared_mod) + store i64 %"53", ptr addrspace(5) %"16", align 8 + %"39" = load i64, ptr addrspace(5) %"14", align 8 + %"40" = load i64, ptr addrspace(5) %"16", align 8 + %"55" = inttoptr i64 %"39" to ptr + store i64 %"40", ptr %"55", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shared_unify_local.ll b/ptx/src/test/spirv_run/shared_unify_local.ll index e3a1db7..522e0f5 100644 --- a/ptx/src/test/spirv_run/shared_unify_local.ll +++ b/ptx/src/test/spirv_run/shared_unify_local.ll @@ -4,81 +4,81 @@ target triple = "amdgcn-amd-amdhsa" @shared_ex = external hidden addrspace(3) global [0 x i32] @"5" = private addrspace(3) global i64 undef, align 4 -define private i64 @"2"(i64 %"24", ptr addrspace(3) %"65", ptr addrspace(3) %"66") #0 { -"62": +define private i64 @"2"(i64 %"21", ptr addrspace(3) %"59", ptr addrspace(3) %"60") #0 { %"4" = alloca i64, align 8, addrspace(5) %"3" = alloca i64, align 8, addrspace(5) %"18" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"18", align 1 - %"19" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"19", align 1 %"6" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"21", ptr addrspace(5) %"4", align 8 + store i1 false, ptr addrspace(5) %"18", align 1 + %"22" = load i64, ptr addrspace(5) %"4", align 8 + store i64 %"22", ptr addrspace(3) %"60", align 8 + %"23" = load i64, ptr addrspace(3) %"60", align 8 + store i64 %"23", ptr addrspace(5) %"6", align 8 + %"24" = load i64, ptr addrspace(3) %"59", align 8 store i64 %"24", ptr addrspace(5) %"4", align 8 - %"25" = load i64, ptr addrspace(5) %"4", align 8 - store i64 %"25", ptr addrspace(3) %"66", align 8 - %"26" = load i64, ptr addrspace(3) %"66", align 8 - store i64 %"26", ptr 
addrspace(5) %"6", align 8 - %"27" = load i64, ptr addrspace(3) %"65", align 8 - store i64 %"27", ptr addrspace(5) %"4", align 8 - %"29" = load i64, ptr addrspace(5) %"4", align 8 - %"30" = load i64, ptr addrspace(5) %"6", align 8 - %"54" = add i64 %"29", %"30" - store i64 %"54", ptr addrspace(5) %"3", align 8 - %"31" = load i64, ptr addrspace(5) %"3", align 8 - ret i64 %"31" + %"26" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = load i64, ptr addrspace(5) %"6", align 8 + %"51" = add i64 %"26", %"27" + store i64 %"51", ptr addrspace(5) %"3", align 8 + %"28" = load i64, ptr addrspace(5) %"3", align 8 + ret i64 %"28" } -define private i64 @"7"(i64 %"32", i64 %"33", ptr addrspace(3) %"67", ptr addrspace(3) %"68") #0 { -"63": +define private i64 @"7"(i64 %"29", i64 %"30", ptr addrspace(3) %"61", ptr addrspace(3) %"62") #0 { %"9" = alloca i64, align 8, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) - %"20" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"20", align 1 - %"21" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"21", align 1 - store i64 %"32", ptr addrspace(5) %"9", align 8 - store i64 %"33", ptr addrspace(5) %"10", align 8 - %"34" = load i64, ptr addrspace(5) %"9", align 8 - store i64 %"34", ptr addrspace(3) %"67", align 8 - %"36" = load i64, ptr addrspace(5) %"10", align 8 - %"35" = call i64 @"2"(i64 %"36", ptr addrspace(3) %"67", ptr addrspace(3) %"68") - store i64 %"35", ptr addrspace(5) %"8", align 8 - %"37" = load i64, ptr addrspace(5) %"8", align 8 - ret i64 %"37" + %"19" = alloca i1, align 1, addrspace(5) + br label %1 + +1: ; preds = %0 + store i64 %"29", ptr addrspace(5) %"9", align 8 + store i64 %"30", ptr addrspace(5) %"10", align 8 + store i1 false, ptr addrspace(5) %"19", align 1 + %"31" = load i64, ptr addrspace(5) %"9", align 8 + store i64 %"31", ptr addrspace(3) %"61", align 8 + %"33" = load i64, ptr addrspace(5) %"10", align 8 + %"32" = call 
i64 @"2"(i64 %"33", ptr addrspace(3) %"61", ptr addrspace(3) %"62") + store i64 %"32", ptr addrspace(5) %"8", align 8 + %"34" = load i64, ptr addrspace(5) %"8", align 8 + ret i64 %"34" } -define protected amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"51", ptr addrspace(4) byref(i64) %"52") #0 { -"64": - %"22" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"22", align 1 - %"23" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"23", align 1 +define protected amdgpu_kernel void @shared_unify_local(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #0 { + %"20" = alloca i1, align 1, addrspace(5) %"14" = alloca i64, align 8, addrspace(5) %"15" = alloca i64, align 8, addrspace(5) %"16" = alloca i64, align 8, addrspace(5) %"17" = alloca i64, align 8, addrspace(5) - %"38" = load i64, ptr addrspace(4) %"51", align 8 - store i64 %"38", ptr addrspace(5) %"14", align 8 - %"39" = load i64, ptr addrspace(4) %"52", align 8 - store i64 %"39", ptr addrspace(5) %"15", align 8 - %"41" = load i64, ptr addrspace(5) %"14", align 8 - %"57" = inttoptr i64 %"41" to ptr addrspace(1) - %"40" = load i64, ptr addrspace(1) %"57", align 8 - store i64 %"40", ptr addrspace(5) %"16", align 8 - %"43" = load i64, ptr addrspace(5) %"14", align 8 - %"58" = inttoptr i64 %"43" to ptr addrspace(1) - %"70" = getelementptr inbounds i8, ptr addrspace(1) %"58", i64 8 - %"42" = load i64, ptr addrspace(1) %"70", align 8 - store i64 %"42", ptr addrspace(5) %"17", align 8 - %"45" = load i64, ptr addrspace(5) %"16", align 8 - %"46" = load i64, ptr addrspace(5) %"17", align 8 - %"59" = call i64 @"7"(i64 %"45", i64 %"46", ptr addrspace(3) @shared_ex, ptr addrspace(3) @"5") - store i64 %"59", ptr addrspace(5) %"17", align 8 - %"47" = load i64, ptr addrspace(5) %"15", align 8 - %"48" = load i64, ptr addrspace(5) %"17", align 8 - %"61" = inttoptr i64 %"47" to ptr - store i64 %"48", ptr %"61", align 8 + br label %1 + +1: ; preds = %0 
+ store i1 false, ptr addrspace(5) %"20", align 1 + %"35" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"35", ptr addrspace(5) %"14", align 8 + %"36" = load i64, ptr addrspace(4) %"49", align 8 + store i64 %"36", ptr addrspace(5) %"15", align 8 + %"38" = load i64, ptr addrspace(5) %"14", align 8 + %"54" = inttoptr i64 %"38" to ptr addrspace(1) + %"37" = load i64, ptr addrspace(1) %"54", align 8 + store i64 %"37", ptr addrspace(5) %"16", align 8 + %"40" = load i64, ptr addrspace(5) %"14", align 8 + %"55" = inttoptr i64 %"40" to ptr addrspace(1) + %"64" = getelementptr inbounds i8, ptr addrspace(1) %"55", i64 8 + %"39" = load i64, ptr addrspace(1) %"64", align 8 + store i64 %"39", ptr addrspace(5) %"17", align 8 + %"42" = load i64, ptr addrspace(5) %"16", align 8 + %"43" = load i64, ptr addrspace(5) %"17", align 8 + %"56" = call i64 @"7"(i64 %"42", i64 %"43", ptr addrspace(3) @shared_ex, ptr addrspace(3) @"5") + store i64 %"56", ptr addrspace(5) %"17", align 8 + %"44" = load i64, ptr addrspace(5) %"15", align 8 + %"45" = load i64, ptr addrspace(5) %"17", align 8 + %"58" = inttoptr i64 %"44" to ptr + store i64 %"45", ptr %"58", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shared_variable.ll b/ptx/src/test/spirv_run/shared_variable.ll index 2c2678a..ac1e519 100644 --- a/ptx/src/test/spirv_run/shared_variable.ll +++ b/ptx/src/test/spirv_run/shared_variable.ll @@ -3,32 +3,32 @@ target triple = "amdgcn-amd-amdhsa" @"4" = private addrspace(3) global [128 x i8] undef, align 4 -define protected amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"25": +define protected amdgpu_kernel void @shared_variable(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"5" = alloca i64, align 8, 
addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = inttoptr i64 %"14" to ptr addrspace(1) - %"13" = load i64, ptr addrspace(1) %"21", align 8 - store i64 %"13", ptr addrspace(5) %"7", align 8 - %"15" = load i64, ptr addrspace(5) %"7", align 8 - store i64 %"15", ptr addrspace(3) @"4", align 8 - %"16" = load i64, ptr addrspace(3) @"4", align 8 - store i64 %"16", ptr addrspace(5) %"8", align 8 - %"17" = load i64, ptr addrspace(5) %"6", align 8 - %"18" = load i64, ptr addrspace(5) %"8", align 8 - %"24" = inttoptr i64 %"17" to ptr addrspace(1) - store i64 %"18", ptr addrspace(1) %"24", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = inttoptr i64 %"13" to ptr addrspace(1) + %"12" = load i64, ptr addrspace(1) %"20", align 8 + store i64 %"12", ptr addrspace(5) %"7", align 8 + %"14" = load i64, ptr addrspace(5) %"7", align 8 + store i64 %"14", ptr addrspace(3) @"4", align 8 + %"15" = load i64, ptr addrspace(3) @"4", align 8 + store i64 %"15", ptr addrspace(5) %"8", align 8 + %"16" = load i64, ptr addrspace(5) %"6", align 8 + %"17" = load i64, ptr addrspace(5) %"8", align 8 + %"23" = inttoptr i64 %"16" to ptr addrspace(1) + store i64 %"17", ptr addrspace(1) %"23", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shf.ll b/ptx/src/test/spirv_run/shf.ll index 6eb5aa0..317a60f 100644 --- a/ptx/src/test/spirv_run/shf.ll +++ b/ptx/src/test/spirv_run/shf.ll @@ -1,38 +1,38 @@ target 
datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @shf(ptr addrspace(4) byref(i64) %"25", ptr addrspace(4) byref(i64) %"26") #0 { -"33": +define protected amdgpu_kernel void @shf(ptr addrspace(4) byref(i64) %"24", ptr addrspace(4) byref(i64) %"25") #0 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"24", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"25", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"26", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"4", align 8 - %"27" = inttoptr i64 %"14" to ptr - %"13" = load i32, ptr %"27", align 4 - store i32 %"13", ptr addrspace(5) %"6", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"28" = inttoptr i64 %"16" to ptr - %"35" = getelementptr inbounds i8, ptr %"28", i64 4 - %"15" = load i32, ptr %"35", align 4 - store i32 %"15", ptr addrspace(5) %"7", align 4 - %"18" = load i32, ptr addrspace(5) %"6", align 4 - %"19" = load i32, ptr addrspace(5) %"7", align 4 - %"29" = call i32 @llvm.fshl.i32(i32 %"19", i32 %"18", i32 14) - store i32 %"29", ptr addrspace(5) %"8", align 4 - %"20" = load i64, ptr addrspace(5) %"5", align 8 - %"21" = load i32, ptr addrspace(5) %"8", align 4 - %"32" = inttoptr i64 %"20" to 
ptr - store i32 %"21", ptr %"32", align 4 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"4", align 8 + %"26" = inttoptr i64 %"13" to ptr + %"12" = load i32, ptr %"26", align 4 + store i32 %"12", ptr addrspace(5) %"6", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"27" = inttoptr i64 %"15" to ptr + %"33" = getelementptr inbounds i8, ptr %"27", i64 4 + %"14" = load i32, ptr %"33", align 4 + store i32 %"14", ptr addrspace(5) %"7", align 4 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"18" = load i32, ptr addrspace(5) %"7", align 4 + %"28" = call i32 @llvm.fshl.i32(i32 %"18", i32 %"17", i32 14) + store i32 %"28", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(5) %"5", align 8 + %"20" = load i32, ptr addrspace(5) %"8", align 4 + %"31" = inttoptr i64 %"19" to ptr + store i32 %"20", ptr %"31", align 4 ret void } diff --git a/ptx/src/test/spirv_run/shl.ll b/ptx/src/test/spirv_run/shl.ll index a353e07..9f9b609 100644 --- a/ptx/src/test/spirv_run/shl.ll +++ b/ptx/src/test/spirv_run/shl.ll @@ -1,32 +1,32 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"25": +define protected amdgpu_kernel void @shl(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 
+ %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %0 = shl i64 %"15", 2 - %"22" = select i1 false, i64 0, i64 %0 - store i64 %"22", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"24" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"24", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %2 = shl i64 %"14", 2 + %"21" = select i1 false, i64 0, i64 %2 + store i64 %"21", ptr addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"23" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"23", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shl_link_hack.ll b/ptx/src/test/spirv_run/shl_link_hack.ll index 8d695ad..29d1c74 100644 --- a/ptx/src/test/spirv_run/shl_link_hack.ll +++ b/ptx/src/test/spirv_run/shl_link_hack.ll @@ -3,37 +3,37 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr, i32) #0 -define protected amdgpu_kernel void @shl_link_hack(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #1 { -"30": +define protected amdgpu_kernel void @shl_link_hack(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") 
#1 { %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 - %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"9", align 1 + %"10" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"10", ptr addrspace(5) %"4", align 8 %"11" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"11", ptr addrspace(5) %"4", align 8 - %"12" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"12", ptr addrspace(5) %"5", align 8 - %"14" = load i64, ptr addrspace(5) %"5", align 8 - %"25" = inttoptr i64 %"14" to ptr - %"13" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"25", i32 2000000) - store i32 %"13", ptr addrspace(5) %"8", align 4 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"16" to ptr - %"15" = load i64, ptr %"26", align 8 - store i64 %"15", ptr addrspace(5) %"6", align 8 - %"18" = load i64, ptr addrspace(5) %"6", align 8 - %0 = shl i64 %"18", 2 - %"27" = select i1 false, i64 0, i64 %0 - store i64 %"27", ptr addrspace(5) %"7", align 8 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i64, ptr addrspace(5) %"7", align 8 - %"29" = inttoptr i64 %"19" to ptr - store i64 %"20", ptr %"29", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"13" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = inttoptr i64 %"13" to ptr + %"12" = call i32 @__zluda_ptx_impl__atom_relaxed_gpu_generic_inc(ptr %"24", i32 2000000) + store i32 %"12", ptr addrspace(5) %"8", align 4 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"15" to ptr + %"14" = load i64, ptr %"25", align 8 + store i64 %"14", ptr addrspace(5) %"6", align 8 + %"17" = load i64, 
ptr addrspace(5) %"6", align 8 + %2 = shl i64 %"17", 2 + %"26" = select i1 false, i64 0, i64 %2 + store i64 %"26", ptr addrspace(5) %"7", align 8 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i64, ptr addrspace(5) %"7", align 8 + %"28" = inttoptr i64 %"18" to ptr + store i64 %"19", ptr %"28", align 8 ret void } diff --git a/ptx/src/test/spirv_run/shl_overflow.ll b/ptx/src/test/spirv_run/shl_overflow.ll index 0213149..86178d8 100644 --- a/ptx/src/test/spirv_run/shl_overflow.ll +++ b/ptx/src/test/spirv_run/shl_overflow.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @shl_overflow(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #0 { -"63": +define protected amdgpu_kernel void @shl_overflow(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -14,61 +10,65 @@ define protected amdgpu_kernel void @shl_overflow(ptr addrspace(4) byref(i64) %" %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) %"10" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"47", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 %"13" = load i64, ptr addrspace(4) %"48", align 8 - store i64 %"13", ptr addrspace(5) %"4", align 8 - %"14" = load i64, ptr addrspace(4) %"49", align 8 - store i64 %"14", ptr addrspace(5) %"5", align 8 - %"16" = load i64, 
ptr addrspace(5) %"4", align 8 - %"50" = inttoptr i64 %"16" to ptr - %"15" = load i32, ptr %"50", align 4 - store i32 %"15", ptr addrspace(5) %"6", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"51" = inttoptr i64 %"18" to ptr - %"65" = getelementptr inbounds i8, ptr %"51", i64 4 - %"17" = load i32, ptr %"65", align 4 - store i32 %"17", ptr addrspace(5) %"8", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"52" = inttoptr i64 %"20" to ptr - %"67" = getelementptr inbounds i8, ptr %"52", i64 8 - %"19" = load i32, ptr %"67", align 4 - store i32 %"19", ptr addrspace(5) %"9", align 4 - %"22" = load i64, ptr addrspace(5) %"4", align 8 - %"53" = inttoptr i64 %"22" to ptr - %"69" = getelementptr inbounds i8, ptr %"53", i64 12 - %"21" = load i32, ptr %"69", align 4 - store i32 %"21", ptr addrspace(5) %"10", align 4 - %"24" = load i32, ptr addrspace(5) %"6", align 4 - %"25" = load i32, ptr addrspace(5) %"8", align 4 - %0 = icmp ugt i32 %"25", 31 - %1 = shl i32 %"24", %"25" - %"54" = select i1 %0, i32 0, i32 %1 - store i32 %"54", ptr addrspace(5) %"7", align 4 - %"26" = load i64, ptr addrspace(5) %"5", align 8 - %"27" = load i32, ptr addrspace(5) %"7", align 4 - %"56" = inttoptr i64 %"26" to ptr - store i32 %"27", ptr %"56", align 4 - %"29" = load i32, ptr addrspace(5) %"6", align 4 - %"30" = load i32, ptr addrspace(5) %"9", align 4 - %2 = icmp ugt i32 %"30", 31 - %3 = shl i32 %"29", %"30" - %"57" = select i1 %2, i32 0, i32 %3 - store i32 %"57", ptr addrspace(5) %"7", align 4 - %"31" = load i64, ptr addrspace(5) %"5", align 8 - %"32" = load i32, ptr addrspace(5) %"7", align 4 - %"59" = inttoptr i64 %"31" to ptr - %"71" = getelementptr inbounds i8, ptr %"59", i64 4 - store i32 %"32", ptr %"71", align 4 - %"34" = load i32, ptr addrspace(5) %"6", align 4 - %"35" = load i32, ptr addrspace(5) %"10", align 4 - %4 = icmp ugt i32 %"35", 31 - %5 = shl i32 %"34", %"35" - %"60" = select i1 %4, i32 0, i32 %5 - store i32 %"60", ptr addrspace(5) %"7", align 4 
- %"36" = load i64, ptr addrspace(5) %"5", align 8 - %"37" = load i32, ptr addrspace(5) %"7", align 4 - %"62" = inttoptr i64 %"36" to ptr - %"73" = getelementptr inbounds i8, ptr %"62", i64 8 - store i32 %"37", ptr %"73", align 4 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"49" = inttoptr i64 %"15" to ptr + %"14" = load i32, ptr %"49", align 4 + store i32 %"14", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"50" = inttoptr i64 %"17" to ptr + %"63" = getelementptr inbounds i8, ptr %"50", i64 4 + %"16" = load i32, ptr %"63", align 4 + store i32 %"16", ptr addrspace(5) %"8", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"51" = inttoptr i64 %"19" to ptr + %"65" = getelementptr inbounds i8, ptr %"51", i64 8 + %"18" = load i32, ptr %"65", align 4 + store i32 %"18", ptr addrspace(5) %"9", align 4 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"52" = inttoptr i64 %"21" to ptr + %"67" = getelementptr inbounds i8, ptr %"52", i64 12 + %"20" = load i32, ptr %"67", align 4 + store i32 %"20", ptr addrspace(5) %"10", align 4 + %"23" = load i32, ptr addrspace(5) %"6", align 4 + %"24" = load i32, ptr addrspace(5) %"8", align 4 + %2 = icmp ugt i32 %"24", 31 + %3 = shl i32 %"23", %"24" + %"53" = select i1 %2, i32 0, i32 %3 + store i32 %"53", ptr addrspace(5) %"7", align 4 + %"25" = load i64, ptr addrspace(5) %"5", align 8 + %"26" = load i32, ptr addrspace(5) %"7", align 4 + %"55" = inttoptr i64 %"25" to ptr + store i32 %"26", ptr %"55", align 4 + %"28" = load i32, ptr addrspace(5) %"6", align 4 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %4 = icmp ugt i32 %"29", 31 + %5 = shl i32 %"28", %"29" + %"56" = select i1 %4, i32 0, i32 %5 + store i32 %"56", ptr addrspace(5) %"7", align 4 + %"30" = load i64, ptr addrspace(5) %"5", align 8 + %"31" = load i32, ptr addrspace(5) %"7", align 4 + %"58" = inttoptr i64 %"30" to ptr + %"69" = getelementptr inbounds i8, ptr 
%"58", i64 4 + store i32 %"31", ptr %"69", align 4 + %"33" = load i32, ptr addrspace(5) %"6", align 4 + %"34" = load i32, ptr addrspace(5) %"10", align 4 + %6 = icmp ugt i32 %"34", 31 + %7 = shl i32 %"33", %"34" + %"59" = select i1 %6, i32 0, i32 %7 + store i32 %"59", ptr addrspace(5) %"7", align 4 + %"35" = load i64, ptr addrspace(5) %"5", align 8 + %"36" = load i32, ptr addrspace(5) %"7", align 4 + %"61" = inttoptr i64 %"35" to ptr + %"71" = getelementptr inbounds i8, ptr %"61", i64 8 + store i32 %"36", ptr %"71", align 4 ret void } diff --git a/ptx/src/test/spirv_run/shr_s32.ll b/ptx/src/test/spirv_run/shr_s32.ll index 7bc5489..a6a6d98 100644 --- a/ptx/src/test/spirv_run/shr_s32.ll +++ b/ptx/src/test/spirv_run/shr_s32.ll @@ -1,39 +1,39 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @shr_s32(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"29": +define protected amdgpu_kernel void @shr_s32(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr 
addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"31" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load i32, ptr %"31", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %0 = icmp ugt i32 %"18", 31 - %1 = ashr i32 %"17", %"18" - %"16" = select i1 %0, i32 -1, i32 %1 - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"28" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"28", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"29" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load i32, ptr %"29", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %2 = icmp ugt i32 %"17", 31 + %3 = ashr i32 %"16", %"17" + %"15" = select i1 %2, i32 -1, i32 %3 + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"27" = inttoptr i64 %"18" to ptr + store i32 %"19", ptr %"27", align 4 ret void } diff --git a/ptx/src/test/spirv_run/shr_u32.ll b/ptx/src/test/spirv_run/shr_u32.ll index f337c1b..52153d9 100644 --- a/ptx/src/test/spirv_run/shr_u32.ll +++ b/ptx/src/test/spirv_run/shr_u32.ll @@ -1,12 +1,8 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @shr_u32(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #0 { -"46": +define protected amdgpu_kernel void @shr_u32(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 - %"12" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"12", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -14,45 +10,49 @@ define protected amdgpu_kernel void @shr_u32(ptr addrspace(4) byref(i64) %"37", %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) %"10" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"11", align 1 + %"12" = load i64, ptr addrspace(4) %"36", align 8 + store i64 %"12", ptr addrspace(5) %"4", align 8 %"13" = load i64, ptr addrspace(4) %"37", align 8 - store i64 %"13", ptr addrspace(5) %"4", align 8 - %"14" = load i64, ptr addrspace(4) %"38", align 8 - store i64 %"14", ptr addrspace(5) %"5", align 8 - %"16" = load i64, ptr addrspace(5) %"4", align 8 - %"39" = inttoptr i64 %"16" to ptr - %"15" = load i32, ptr %"39", align 4 - store i32 %"15", ptr addrspace(5) %"6", align 4 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"40" = inttoptr i64 %"18" to ptr - %"48" = getelementptr inbounds i8, ptr %"40", i64 4 - %"17" = load i32, ptr %"48", align 4 - store i32 %"17", ptr addrspace(5) %"7", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"41" = inttoptr i64 %"20" to ptr - %"50" = getelementptr inbounds i8, ptr %"41", i64 8 - %"19" = load i32, ptr %"50", align 4 - store i32 %"19", ptr 
addrspace(5) %"8", align 4 - %"22" = load i32, ptr addrspace(5) %"6", align 4 - %"23" = load i32, ptr addrspace(5) %"7", align 4 - %0 = icmp ugt i32 %"23", 31 - %1 = lshr i32 %"22", %"23" - %"21" = select i1 %0, i32 0, i32 %1 - store i32 %"21", ptr addrspace(5) %"9", align 4 - %"25" = load i32, ptr addrspace(5) %"6", align 4 - %"26" = load i32, ptr addrspace(5) %"8", align 4 - %2 = icmp ugt i32 %"26", 31 - %3 = lshr i32 %"25", %"26" - %"24" = select i1 %2, i32 0, i32 %3 - store i32 %"24", ptr addrspace(5) %"10", align 4 - %"27" = load i64, ptr addrspace(5) %"5", align 8 - %"28" = load i32, ptr addrspace(5) %"9", align 4 - %"44" = inttoptr i64 %"27" to ptr - store i32 %"28", ptr %"44", align 4 - %"29" = load i64, ptr addrspace(5) %"5", align 8 - %"30" = load i32, ptr addrspace(5) %"10", align 4 - %"45" = inttoptr i64 %"29" to ptr - %"52" = getelementptr inbounds i8, ptr %"45", i64 4 - store i32 %"30", ptr %"52", align 4 + store i64 %"13", ptr addrspace(5) %"5", align 8 + %"15" = load i64, ptr addrspace(5) %"4", align 8 + %"38" = inttoptr i64 %"15" to ptr + %"14" = load i32, ptr %"38", align 4 + store i32 %"14", ptr addrspace(5) %"6", align 4 + %"17" = load i64, ptr addrspace(5) %"4", align 8 + %"39" = inttoptr i64 %"17" to ptr + %"46" = getelementptr inbounds i8, ptr %"39", i64 4 + %"16" = load i32, ptr %"46", align 4 + store i32 %"16", ptr addrspace(5) %"7", align 4 + %"19" = load i64, ptr addrspace(5) %"4", align 8 + %"40" = inttoptr i64 %"19" to ptr + %"48" = getelementptr inbounds i8, ptr %"40", i64 8 + %"18" = load i32, ptr %"48", align 4 + store i32 %"18", ptr addrspace(5) %"8", align 4 + %"21" = load i32, ptr addrspace(5) %"6", align 4 + %"22" = load i32, ptr addrspace(5) %"7", align 4 + %2 = icmp ugt i32 %"22", 31 + %3 = lshr i32 %"21", %"22" + %"20" = select i1 %2, i32 0, i32 %3 + store i32 %"20", ptr addrspace(5) %"9", align 4 + %"24" = load i32, ptr addrspace(5) %"6", align 4 + %"25" = load i32, ptr addrspace(5) %"8", align 4 + %4 = icmp ugt i32 %"25", 31 
+ %5 = lshr i32 %"24", %"25" + %"23" = select i1 %4, i32 0, i32 %5 + store i32 %"23", ptr addrspace(5) %"10", align 4 + %"26" = load i64, ptr addrspace(5) %"5", align 8 + %"27" = load i32, ptr addrspace(5) %"9", align 4 + %"43" = inttoptr i64 %"26" to ptr + store i32 %"27", ptr %"43", align 4 + %"28" = load i64, ptr addrspace(5) %"5", align 8 + %"29" = load i32, ptr addrspace(5) %"10", align 4 + %"44" = inttoptr i64 %"28" to ptr + %"50" = getelementptr inbounds i8, ptr %"44", i64 4 + store i32 %"29", ptr %"50", align 4 ret void } diff --git a/ptx/src/test/spirv_run/sign_extend.ll b/ptx/src/test/spirv_run/sign_extend.ll index bb72576..98494e3 100644 --- a/ptx/src/test/spirv_run/sign_extend.ll +++ b/ptx/src/test/spirv_run/sign_extend.ll @@ -1,28 +1,28 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"15", ptr addrspace(4) byref(i64) %"16") #0 { -"20": +define protected amdgpu_kernel void @sign_extend(ptr addrspace(4) byref(i64) %"14", ptr addrspace(4) byref(i64) %"15") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"14", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"15", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"16", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 + store 
i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"17" = inttoptr i64 %"11" to ptr + %"16" = load i16, ptr %"17", align 2 + %"10" = sext i16 %"16" to i32 + store i32 %"10", ptr addrspace(5) %"6", align 4 + %"12" = load i64, ptr addrspace(5) %"5", align 8 + %"13" = load i32, ptr addrspace(5) %"6", align 4 %"18" = inttoptr i64 %"12" to ptr - %"17" = load i16, ptr %"18", align 2 - %"11" = sext i16 %"17" to i32 - store i32 %"11", ptr addrspace(5) %"6", align 4 - %"13" = load i64, ptr addrspace(5) %"5", align 8 - %"14" = load i32, ptr addrspace(5) %"6", align 4 - %"19" = inttoptr i64 %"13" to ptr - store i32 %"14", ptr %"19", align 4 + store i32 %"13", ptr %"18", align 4 ret void } diff --git a/ptx/src/test/spirv_run/sin.ll b/ptx/src/test/spirv_run/sin.ll index 40ce553..33f510c 100644 --- a/ptx/src/test/spirv_run/sin.ll +++ b/ptx/src/test/spirv_run/sin.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @sin(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = 
load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"19", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = call afn float @llvm.sin.f32(float %"14") - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"18", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = call afn float @llvm.sin.f32(float %"13") + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/sqrt.ll b/ptx/src/test/spirv_run/sqrt.ll index 332f67a..f86753e 100644 --- a/ptx/src/test/spirv_run/sqrt.ll +++ b/ptx/src/test/spirv_run/sqrt.ll @@ -1,30 +1,30 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"17", ptr addrspace(4) byref(i64) %"18") #0 { -"21": +define protected amdgpu_kernel void @sqrt(ptr addrspace(4) byref(i64) %"16", ptr addrspace(4) byref(i64) %"17") #0 { %"7" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"7", align 1 - %"8" = alloca i1, align 1, addrspace(5) - 
store i1 false, ptr addrspace(5) %"8", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca float, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"7", align 1 + %"8" = load i64, ptr addrspace(4) %"16", align 8 + store i64 %"8", ptr addrspace(5) %"4", align 8 %"9" = load i64, ptr addrspace(4) %"17", align 8 - store i64 %"9", ptr addrspace(5) %"4", align 8 - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"5", align 8 - %"12" = load i64, ptr addrspace(5) %"4", align 8 - %"19" = inttoptr i64 %"12" to ptr - %"11" = load float, ptr %"19", align 4 - store float %"11", ptr addrspace(5) %"6", align 4 - %"14" = load float, ptr addrspace(5) %"6", align 4 - %"13" = call afn float @llvm.sqrt.f32(float %"14") - store float %"13", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"5", align 8 - %"16" = load float, ptr addrspace(5) %"6", align 4 - %"20" = inttoptr i64 %"15" to ptr - store float %"16", ptr %"20", align 4 + store i64 %"9", ptr addrspace(5) %"5", align 8 + %"11" = load i64, ptr addrspace(5) %"4", align 8 + %"18" = inttoptr i64 %"11" to ptr + %"10" = load float, ptr %"18", align 4 + store float %"10", ptr addrspace(5) %"6", align 4 + %"13" = load float, ptr addrspace(5) %"6", align 4 + %"12" = call afn float @llvm.sqrt.f32(float %"13") + store float %"12", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"5", align 8 + %"15" = load float, ptr addrspace(5) %"6", align 4 + %"19" = inttoptr i64 %"14" to ptr + store float %"15", ptr %"19", align 4 ret void } diff --git a/ptx/src/test/spirv_run/sub.ll b/ptx/src/test/spirv_run/sub.ll index 2383be0..24a12bd 100644 --- a/ptx/src/test/spirv_run/sub.ll +++ b/ptx/src/test/spirv_run/sub.ll @@ -1,31 +1,31 @@ target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"19", ptr addrspace(4) byref(i64) %"20") #0 { -"23": +define protected amdgpu_kernel void @sub(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i64, align 8, addrspace(5) %"7" = alloca i64, align 8, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"20", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"21" = inttoptr i64 %"13" to ptr - %"12" = load i64, ptr %"21", align 8 - store i64 %"12", ptr addrspace(5) %"6", align 8 - %"15" = load i64, ptr addrspace(5) %"6", align 8 - %"14" = sub i64 %"15", 1 - store i64 %"14", ptr addrspace(5) %"7", align 8 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i64, ptr addrspace(5) %"7", align 8 - %"22" = inttoptr i64 %"16" to ptr - store i64 %"17", ptr %"22", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"20" = inttoptr i64 %"12" to ptr + %"11" = load i64, ptr %"20", align 8 + store i64 %"11", ptr addrspace(5) %"6", align 8 + %"14" = load i64, ptr addrspace(5) %"6", align 8 + %"13" = sub i64 %"14", 1 + store i64 %"13", ptr 
addrspace(5) %"7", align 8 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i64, ptr addrspace(5) %"7", align 8 + %"21" = inttoptr i64 %"15" to ptr + store i64 %"16", ptr %"21", align 8 ret void } diff --git a/ptx/src/test/spirv_run/subc_cc.ll b/ptx/src/test/spirv_run/subc_cc.ll index 9a08872..cdd5c0b 100644 --- a/ptx/src/test/spirv_run/subc_cc.ll +++ b/ptx/src/test/spirv_run/subc_cc.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @subc_cc(ptr addrspace(4) byref(i64) %"54", ptr addrspace(4) byref(i64) %"55") #0 { -"69": +define protected amdgpu_kernel void @subc_cc(ptr addrspace(4) byref(i64) %"57", ptr addrspace(4) byref(i64) %"58") #0 { %"13" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"13", align 1 - %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) @@ -16,70 +12,78 @@ define protected amdgpu_kernel void @subc_cc(ptr addrspace(4) byref(i64) %"54", %"10" = alloca i32, align 4, addrspace(5) %"11" = alloca i32, align 4, addrspace(5) %"12" = alloca i32, align 4, addrspace(5) - %"15" = load i64, ptr addrspace(4) %"54", align 8 - store i64 %"15", ptr addrspace(5) %"4", align 8 - %"16" = load i64, ptr addrspace(4) %"55", align 8 - store i64 %"16", ptr addrspace(5) %"5", align 8 - %"18" = load i64, ptr addrspace(5) %"4", align 8 - %"57" = inttoptr i64 %"18" to ptr - %"56" = load i32, ptr %"57", align 4 - store i32 %"56", ptr addrspace(5) %"9", align 4 - %"20" = load i64, ptr addrspace(5) %"4", align 8 - %"58" = inttoptr i64 %"20" to ptr - %"71" = getelementptr inbounds i8, ptr %"58", i64 4 - %"59" = load i32, ptr %"71", align 
4 - store i32 %"59", ptr addrspace(5) %"10", align 4 - %"22" = load i64, ptr addrspace(5) %"4", align 8 - %"60" = inttoptr i64 %"22" to ptr - %"73" = getelementptr inbounds i8, ptr %"60", i64 8 - %"21" = load i32, ptr %"73", align 4 - store i32 %"21", ptr addrspace(5) %"11", align 4 - %"24" = load i64, ptr addrspace(5) %"4", align 8 - %"61" = inttoptr i64 %"24" to ptr - %"75" = getelementptr inbounds i8, ptr %"61", i64 12 - %"23" = load i32, ptr %"75", align 4 - store i32 %"23", ptr addrspace(5) %"12", align 4 - %"27" = load i32, ptr addrspace(5) %"9", align 4 - %"28" = load i32, ptr addrspace(5) %"10", align 4 - %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"27", i32 %"28") - %"25" = extractvalue { i32, i1 } %0, 0 - %"26" = extractvalue { i32, i1 } %0, 1 - store i32 %"25", ptr addrspace(5) %"6", align 4 - store i1 %"26", ptr addrspace(5) %"14", align 1 - %"31" = load i1, ptr addrspace(5) %"14", align 1 - %"32" = load i32, ptr addrspace(5) %"6", align 4 - %"33" = load i32, ptr addrspace(5) %"11", align 4 - %1 = zext i1 %"31" to i32 - %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"32", i32 %"33") - %3 = extractvalue { i32, i1 } %2, 0 - %4 = extractvalue { i32, i1 } %2, 1 - %5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %3, i32 %1) - %"29" = extractvalue { i32, i1 } %5, 0 - %6 = extractvalue { i32, i1 } %5, 1 - %"30" = xor i1 %4, %6 - store i32 %"29", ptr addrspace(5) %"7", align 4 - store i1 %"30", ptr addrspace(5) %"14", align 1 - %"35" = load i1, ptr addrspace(5) %"14", align 1 - %"36" = load i32, ptr addrspace(5) %"7", align 4 - %"37" = load i32, ptr addrspace(5) %"12", align 4 - %7 = zext i1 %"35" to i32 - %8 = sub i32 %"36", %"37" - %"34" = sub i32 %8, %7 - store i32 %"34", ptr addrspace(5) %"8", align 4 - %"38" = load i64, ptr addrspace(5) %"5", align 8 - %"39" = load i32, ptr addrspace(5) %"6", align 4 - %"66" = inttoptr i64 %"38" to ptr - store i32 %"39", ptr %"66", align 4 - %"40" = load i64, ptr addrspace(5) %"5", align 8 - 
%"41" = load i32, ptr addrspace(5) %"7", align 4 - %"67" = inttoptr i64 %"40" to ptr - %"77" = getelementptr inbounds i8, ptr %"67", i64 4 - store i32 %"41", ptr %"77", align 4 - %"42" = load i64, ptr addrspace(5) %"5", align 8 - %"43" = load i32, ptr addrspace(5) %"8", align 4 - %"68" = inttoptr i64 %"42" to ptr - %"79" = getelementptr inbounds i8, ptr %"68", i64 8 - store i32 %"43", ptr %"79", align 4 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"13", align 1 + %"18" = load i64, ptr addrspace(4) %"57", align 8 + store i64 %"18", ptr addrspace(5) %"4", align 8 + %"19" = load i64, ptr addrspace(4) %"58", align 8 + store i64 %"19", ptr addrspace(5) %"5", align 8 + %"21" = load i64, ptr addrspace(5) %"4", align 8 + %"60" = inttoptr i64 %"21" to ptr + %"59" = load i32, ptr %"60", align 4 + store i32 %"59", ptr addrspace(5) %"9", align 4 + %"23" = load i64, ptr addrspace(5) %"4", align 8 + %"61" = inttoptr i64 %"23" to ptr + %"73" = getelementptr inbounds i8, ptr %"61", i64 4 + %"62" = load i32, ptr %"73", align 4 + store i32 %"62", ptr addrspace(5) %"10", align 4 + %"25" = load i64, ptr addrspace(5) %"4", align 8 + %"63" = inttoptr i64 %"25" to ptr + %"75" = getelementptr inbounds i8, ptr %"63", i64 8 + %"24" = load i32, ptr %"75", align 4 + store i32 %"24", ptr addrspace(5) %"11", align 4 + %"27" = load i64, ptr addrspace(5) %"4", align 8 + %"64" = inttoptr i64 %"27" to ptr + %"77" = getelementptr inbounds i8, ptr %"64", i64 12 + %"26" = load i32, ptr %"77", align 4 + store i32 %"26", ptr addrspace(5) %"12", align 4 + %"29" = load i32, ptr addrspace(5) %"9", align 4 + %"30" = load i32, ptr addrspace(5) %"10", align 4 + %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"29", i32 %"30") + %"28" = extractvalue { i32, i1 } %2, 0 + %"14" = extractvalue { i32, i1 } %2, 1 + store i32 %"28", ptr addrspace(5) %"6", align 4 + %"31" = xor i1 %"14", true + store i1 %"31", ptr addrspace(5) %"13", align 1 + %"32" = load i1, ptr addrspace(5) %"13", 
align 1 + %"15" = xor i1 %"32", true + %"34" = load i32, ptr addrspace(5) %"6", align 4 + %"35" = load i32, ptr addrspace(5) %"11", align 4 + %3 = zext i1 %"15" to i32 + %4 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %"34", i32 %"35") + %5 = extractvalue { i32, i1 } %4, 0 + %6 = extractvalue { i32, i1 } %4, 1 + %7 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %5, i32 %3) + %"33" = extractvalue { i32, i1 } %7, 0 + %8 = extractvalue { i32, i1 } %7, 1 + %"16" = xor i1 %6, %8 + store i32 %"33", ptr addrspace(5) %"7", align 4 + %"36" = xor i1 %"16", true + store i1 %"36", ptr addrspace(5) %"13", align 1 + %"37" = load i1, ptr addrspace(5) %"13", align 1 + %"17" = xor i1 %"37", true + %"39" = load i32, ptr addrspace(5) %"7", align 4 + %"40" = load i32, ptr addrspace(5) %"12", align 4 + %9 = zext i1 %"17" to i32 + %10 = sub i32 %"39", %"40" + %"38" = sub i32 %10, %9 + store i32 %"38", ptr addrspace(5) %"8", align 4 + %"41" = load i64, ptr addrspace(5) %"5", align 8 + %"42" = load i32, ptr addrspace(5) %"6", align 4 + %"69" = inttoptr i64 %"41" to ptr + store i32 %"42", ptr %"69", align 4 + %"43" = load i64, ptr addrspace(5) %"5", align 8 + %"44" = load i32, ptr addrspace(5) %"7", align 4 + %"70" = inttoptr i64 %"43" to ptr + %"79" = getelementptr inbounds i8, ptr %"70", i64 4 + store i32 %"44", ptr %"79", align 4 + %"45" = load i64, ptr addrspace(5) %"5", align 8 + %"46" = load i32, ptr addrspace(5) %"8", align 4 + %"71" = inttoptr i64 %"45" to ptr + %"81" = getelementptr inbounds i8, ptr %"71", i64 8 + store i32 %"46", ptr %"81", align 4 ret void } diff --git a/ptx/src/test/spirv_run/subc_cc2.ll b/ptx/src/test/spirv_run/subc_cc2.ll deleted file mode 100644 index aded371..0000000 --- a/ptx/src/test/spirv_run/subc_cc2.ll +++ /dev/null @@ -1,127 +0,0 @@ -target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" -target 
triple = "amdgcn-amd-amdhsa" - -define protected amdgpu_kernel void @subc_cc2(ptr addrspace(4) byref(i64) %"86", ptr addrspace(4) byref(i64) %"87") #0 { -"112": - %"14" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"14", align 1 - %"15" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"15", align 1 - %"4" = alloca i64, align 8, addrspace(5) - %"5" = alloca i64, align 8, addrspace(5) - %"6" = alloca i32, align 4, addrspace(5) - %"7" = alloca i32, align 4, addrspace(5) - %"8" = alloca i32, align 4, addrspace(5) - %"9" = alloca i32, align 4, addrspace(5) - %"10" = alloca i32, align 4, addrspace(5) - %"11" = alloca i32, align 4, addrspace(5) - %"12" = alloca i32, align 4, addrspace(5) - %"13" = alloca i32, align 4, addrspace(5) - %"16" = load i64, ptr addrspace(4) %"87", align 8 - store i64 %"16", ptr addrspace(5) %"5", align 8 - %0 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) - %"88" = extractvalue { i32, i1 } %0, 0 - %"18" = extractvalue { i32, i1 } %0, 1 - store i32 %"88", ptr addrspace(5) %"6", align 4 - store i1 %"18", ptr addrspace(5) %"15", align 1 - %"21" = load i1, ptr addrspace(5) %"15", align 1 - %1 = zext i1 %"21" to i32 - %2 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 -1) - %3 = extractvalue { i32, i1 } %2, 0 - %4 = extractvalue { i32, i1 } %2, 1 - %5 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %3, i32 %1) - %"89" = extractvalue { i32, i1 } %5, 0 - %6 = extractvalue { i32, i1 } %5, 1 - %"20" = xor i1 %4, %6 - store i32 %"89", ptr addrspace(5) %"7", align 4 - store i1 %"20", ptr addrspace(5) %"15", align 1 - %"23" = load i1, ptr addrspace(5) %"15", align 1 - %7 = zext i1 %"23" to i32 - %"90" = sub i32 2, %7 - store i32 %"90", ptr addrspace(5) %"8", align 4 - %"25" = load i1, ptr addrspace(5) %"14", align 1 - %8 = zext i1 %"25" to i32 - %"91" = add i32 0, %8 - store i32 %"91", ptr addrspace(5) %"9", align 4 - %9 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) 
- %"92" = extractvalue { i32, i1 } %9, 0 - %"27" = extractvalue { i32, i1 } %9, 1 - store i32 %"92", ptr addrspace(5) %"6", align 4 - store i1 %"27", ptr addrspace(5) %"15", align 1 - %"30" = load i1, ptr addrspace(5) %"15", align 1 - %10 = zext i1 %"30" to i32 - %11 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0) - %12 = extractvalue { i32, i1 } %11, 0 - %13 = extractvalue { i32, i1 } %11, 1 - %14 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %12, i32 %10) - %"93" = extractvalue { i32, i1 } %14, 0 - %15 = extractvalue { i32, i1 } %14, 1 - %"29" = xor i1 %13, %15 - store i32 %"93", ptr addrspace(5) %"10", align 4 - store i1 %"29", ptr addrspace(5) %"15", align 1 - %"32" = load i1, ptr addrspace(5) %"15", align 1 - %16 = zext i1 %"32" to i32 - %"94" = sub i32 2, %16 - store i32 %"94", ptr addrspace(5) %"11", align 4 - %17 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 0) - %"95" = extractvalue { i32, i1 } %17, 0 - %"34" = extractvalue { i32, i1 } %17, 1 - store i32 %"95", ptr addrspace(5) %"6", align 4 - store i1 %"34", ptr addrspace(5) %"15", align 1 - %"37" = load i1, ptr addrspace(5) %"15", align 1 - %18 = zext i1 %"37" to i32 - %19 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 0, i32 1) - %20 = extractvalue { i32, i1 } %19, 0 - %21 = extractvalue { i32, i1 } %19, 1 - %22 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %20, i32 %18) - %"96" = extractvalue { i32, i1 } %22, 0 - %23 = extractvalue { i32, i1 } %22, 1 - %"36" = xor i1 %21, %23 - store i32 %"96", ptr addrspace(5) %"12", align 4 - store i1 %"36", ptr addrspace(5) %"15", align 1 - %"39" = load i1, ptr addrspace(5) %"15", align 1 - %24 = zext i1 %"39" to i32 - %"97" = sub i32 2, %24 - store i32 %"97", ptr addrspace(5) %"13", align 4 - %"40" = load i64, ptr addrspace(5) %"5", align 8 - %"41" = load i32, ptr addrspace(5) %"7", align 4 - %"98" = inttoptr i64 %"40" to ptr - store i32 %"41", ptr %"98", align 4 - %"42" = load i64, ptr addrspace(5) %"5", align 8 - %"43" 
= load i32, ptr addrspace(5) %"8", align 4 - %"100" = inttoptr i64 %"42" to ptr - %"114" = getelementptr inbounds i8, ptr %"100", i64 4 - store i32 %"43", ptr %"114", align 4 - %"44" = load i64, ptr addrspace(5) %"5", align 8 - %"45" = load i32, ptr addrspace(5) %"9", align 4 - %"102" = inttoptr i64 %"44" to ptr - %"116" = getelementptr inbounds i8, ptr %"102", i64 8 - store i32 %"45", ptr %"116", align 4 - %"46" = load i64, ptr addrspace(5) %"5", align 8 - %"47" = load i32, ptr addrspace(5) %"10", align 4 - %"104" = inttoptr i64 %"46" to ptr - %"118" = getelementptr inbounds i8, ptr %"104", i64 12 - store i32 %"47", ptr %"118", align 4 - %"48" = load i64, ptr addrspace(5) %"5", align 8 - %"49" = load i32, ptr addrspace(5) %"11", align 4 - %"106" = inttoptr i64 %"48" to ptr - %"120" = getelementptr inbounds i8, ptr %"106", i64 16 - store i32 %"49", ptr %"120", align 4 - %"50" = load i64, ptr addrspace(5) %"5", align 8 - %"51" = load i32, ptr addrspace(5) %"12", align 4 - %"108" = inttoptr i64 %"50" to ptr - %"122" = getelementptr inbounds i8, ptr %"108", i64 20 - store i32 %"51", ptr %"122", align 4 - %"52" = load i64, ptr addrspace(5) %"5", align 8 - %"53" = load i32, ptr addrspace(5) %"13", align 4 - %"110" = inttoptr i64 %"52" to ptr - %"124" = getelementptr inbounds i8, ptr %"110", i64 24 - store i32 %"53", ptr %"124", align 4 - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn -declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 - -attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "no-trapping-math"="true" "uniform-work-group-size"="true" } -attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn } diff --git a/ptx/src/test/spirv_run/subc_cc2.ptx b/ptx/src/test/spirv_run/subc_cc2.ptx deleted file mode 100644 index 2c776a4..0000000 --- a/ptx/src/test/spirv_run/subc_cc2.ptx +++ /dev/null @@ -1,55 +0,0 @@ 
-.version 6.5 -.target sm_30 -.address_size 64 - -.visible .entry subc_cc2( - .param .u64 input, - .param .u64 output -) -{ - .reg .u64 in_addr; - .reg .u64 out_addr; - .reg .b32 unused; - - .reg .b32 result_1; - .reg .b32 carry_out_1_1; - .reg .b32 carry_out_1_2; - .reg .b32 result_2; - .reg .b32 carry_out_2; - .reg .b32 result_3; - .reg .b32 carry_out_3; - - ld.param.u64 out_addr, [output]; - - // set carry=1 - sub.cc.s32 unused, 0, 1; - // overflow (b + CC.CF), no underflow in whole operation - subc.cc.s32 result_1, 0, 4294967295; - // write carry - subc.s32 carry_out_1_1, 2, 0; - // make sure the overflow in (b + CC.CF) is not detected by addc - addc.s32 carry_out_1_2, 0, 0; - - // set carry=1 - sub.cc.s32 unused, 0, 1; - // underflow in substraction, underflow in whole operation - subc.cc.s32 result_2, 0, 0; - // write carry - subc.s32 carry_out_2, 2, 0; - - // set carry=0 - sub.cc.s32 unused, 0, 0; - // same operation as bove, but 0-1-0 instead of 0-0-1 - subc.cc.s32 result_3, 0, 1; - // write carry - subc.s32 carry_out_3, 2, 0; - - st.s32 [out_addr], result_1; - st.s32 [out_addr+4], carry_out_1_1; - st.s32 [out_addr+8], carry_out_1_2; - st.s32 [out_addr+12], result_2; - st.s32 [out_addr+16], carry_out_2; - st.s32 [out_addr+20], result_3; - st.s32 [out_addr+24], carry_out_3; - ret; -} diff --git a/ptx/src/test/spirv_run/vector.ll b/ptx/src/test/spirv_run/vector.ll index a53904e..f311be7 100644 --- a/ptx/src/test/spirv_run/vector.ll +++ b/ptx/src/test/spirv_run/vector.ll @@ -1,95 +1,95 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define private <2 x i32> @"1"(<2 x i32> %"20") #0 { -"52": +define private <2 x i32> @"1"(<2 x i32> %"18") #0 { %"3" = alloca <2 x i32>, align 8, addrspace(5) %"2" = alloca <2 x i32>, align 8, addrspace(5) %"16" = alloca i1, align 1, 
addrspace(5) - store i1 false, ptr addrspace(5) %"16", align 1 - %"17" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"17", align 1 %"4" = alloca <2 x i32>, align 8, addrspace(5) %"5" = alloca i32, align 4, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) - store <2 x i32> %"20", ptr addrspace(5) %"3", align 8 - %0 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 0 - %"22" = load i32, ptr addrspace(5) %0, align 4 %1 = alloca i32, align 4, addrspace(5) - store i32 %"22", ptr addrspace(5) %1, align 4 - %"21" = load i32, ptr addrspace(5) %1, align 4 - store i32 %"21", ptr addrspace(5) %"5", align 4 - %2 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 1 - %"24" = load i32, ptr addrspace(5) %2, align 4 + %2 = alloca i32, align 4, addrspace(5) %3 = alloca i32, align 4, addrspace(5) - store i32 %"24", ptr addrspace(5) %3, align 4 - %"23" = load i32, ptr addrspace(5) %3, align 4 - store i32 %"23", ptr addrspace(5) %"6", align 4 - %"26" = load i32, ptr addrspace(5) %"5", align 4 - %"27" = load i32, ptr addrspace(5) %"6", align 4 - %"25" = add i32 %"26", %"27" - store i32 %"25", ptr addrspace(5) %"6", align 4 - %"29" = load i32, ptr addrspace(5) %"6", align 4 %4 = alloca i32, align 4, addrspace(5) + %5 = alloca i32, align 4, addrspace(5) + %6 = alloca <2 x i32>, align 8, addrspace(5) + br label %7 + +7: ; preds = %0 + store <2 x i32> %"18", ptr addrspace(5) %"3", align 8 + store i1 false, ptr addrspace(5) %"16", align 1 + %8 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 0 + %"20" = load i32, ptr addrspace(5) %8, align 4 + store i32 %"20", ptr addrspace(5) %1, align 4 + %"19" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"19", ptr addrspace(5) %"5", align 4 + %9 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"3", i32 0, i32 1 + %"22" = load i32, ptr addrspace(5) %9, align 4 + store i32 %"22", ptr addrspace(5) %2, align 4 + %"21" = load i32, ptr addrspace(5) %2, align 
4 + store i32 %"21", ptr addrspace(5) %"6", align 4 + %"24" = load i32, ptr addrspace(5) %"5", align 4 + %"25" = load i32, ptr addrspace(5) %"6", align 4 + %"23" = add i32 %"24", %"25" + store i32 %"23", ptr addrspace(5) %"6", align 4 + %"27" = load i32, ptr addrspace(5) %"6", align 4 + store i32 %"27", ptr addrspace(5) %3, align 4 + %"26" = load i32, ptr addrspace(5) %3, align 4 + %10 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0 + store i32 %"26", ptr addrspace(5) %10, align 4 + %"29" = load i32, ptr addrspace(5) %"6", align 4 store i32 %"29", ptr addrspace(5) %4, align 4 %"28" = load i32, ptr addrspace(5) %4, align 4 - %5 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0 - store i32 %"28", ptr addrspace(5) %5, align 4 - %"31" = load i32, ptr addrspace(5) %"6", align 4 - %6 = alloca i32, align 4, addrspace(5) - store i32 %"31", ptr addrspace(5) %6, align 4 - %"30" = load i32, ptr addrspace(5) %6, align 4 - %7 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1 - store i32 %"30", ptr addrspace(5) %7, align 4 - %8 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1 - %"33" = load i32, ptr addrspace(5) %8, align 4 - %9 = alloca i32, align 4, addrspace(5) - store i32 %"33", ptr addrspace(5) %9, align 4 - %"32" = load i32, ptr addrspace(5) %9, align 4 - %10 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0 - store i32 %"32", ptr addrspace(5) %10, align 4 - %"35" = load <2 x i32>, ptr addrspace(5) %"4", align 8 - %11 = alloca <2 x i32>, align 8, addrspace(5) - store <2 x i32> %"35", ptr addrspace(5) %11, align 8 - %"34" = load <2 x i32>, ptr addrspace(5) %11, align 8 - store <2 x i32> %"34", ptr addrspace(5) %"2", align 8 - %"36" = load <2 x i32>, ptr addrspace(5) %"2", align 8 - ret <2 x i32> %"36" + %11 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 1 + store i32 %"28", ptr addrspace(5) %11, align 4 + %12 = getelementptr inbounds <2 x 
i32>, ptr addrspace(5) %"4", i32 0, i32 1 + %"31" = load i32, ptr addrspace(5) %12, align 4 + store i32 %"31", ptr addrspace(5) %5, align 4 + %"30" = load i32, ptr addrspace(5) %5, align 4 + %13 = getelementptr inbounds <2 x i32>, ptr addrspace(5) %"4", i32 0, i32 0 + store i32 %"30", ptr addrspace(5) %13, align 4 + %"33" = load <2 x i32>, ptr addrspace(5) %"4", align 8 + store <2 x i32> %"33", ptr addrspace(5) %6, align 8 + %"32" = load <2 x i32>, ptr addrspace(5) %6, align 8 + store <2 x i32> %"32", ptr addrspace(5) %"2", align 8 + %"34" = load <2 x i32>, ptr addrspace(5) %"2", align 8 + ret <2 x i32> %"34" } -define protected amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"47", ptr addrspace(4) byref(i64) %"48") #0 { -"53": - %"18" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"18", align 1 - %"19" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"19", align 1 +define protected amdgpu_kernel void @vector(ptr addrspace(4) byref(i64) %"45", ptr addrspace(4) byref(i64) %"46") #0 { + %"17" = alloca i1, align 1, addrspace(5) %"10" = alloca i64, align 8, addrspace(5) %"11" = alloca i64, align 8, addrspace(5) %"12" = alloca <2 x i32>, align 8, addrspace(5) %"13" = alloca i32, align 4, addrspace(5) %"14" = alloca i32, align 4, addrspace(5) %"15" = alloca i64, align 8, addrspace(5) - %"37" = load i64, ptr addrspace(4) %"47", align 8 - store i64 %"37", ptr addrspace(5) %"10", align 8 - %"38" = load i64, ptr addrspace(4) %"48", align 8 - store i64 %"38", ptr addrspace(5) %"11", align 8 - %"40" = load i64, ptr addrspace(5) %"10", align 8 - %"49" = inttoptr i64 %"40" to ptr - %"39" = load <2 x i32>, ptr %"49", align 8 + %1 = alloca i64, align 8, addrspace(5) + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"17", align 1 + %"35" = load i64, ptr addrspace(4) %"45", align 8 + store i64 %"35", ptr addrspace(5) %"10", align 8 + %"36" = load i64, ptr addrspace(4) %"46", align 8 + store i64 %"36", ptr 
addrspace(5) %"11", align 8 + %"38" = load i64, ptr addrspace(5) %"10", align 8 + %"47" = inttoptr i64 %"38" to ptr + %"37" = load <2 x i32>, ptr %"47", align 8 + store <2 x i32> %"37", ptr addrspace(5) %"12", align 8 + %"40" = load <2 x i32>, ptr addrspace(5) %"12", align 8 + %"39" = call <2 x i32> @"1"(<2 x i32> %"40") store <2 x i32> %"39", ptr addrspace(5) %"12", align 8 %"42" = load <2 x i32>, ptr addrspace(5) %"12", align 8 - %"41" = call <2 x i32> @"1"(<2 x i32> %"42") - store <2 x i32> %"41", ptr addrspace(5) %"12", align 8 + %"48" = bitcast <2 x i32> %"42" to i64 + store i64 %"48", ptr addrspace(5) %1, align 8 + %"41" = load i64, ptr addrspace(5) %1, align 8 + store i64 %"41", ptr addrspace(5) %"15", align 8 + %"43" = load i64, ptr addrspace(5) %"11", align 8 %"44" = load <2 x i32>, ptr addrspace(5) %"12", align 8 - %"50" = bitcast <2 x i32> %"44" to i64 - %0 = alloca i64, align 8, addrspace(5) - store i64 %"50", ptr addrspace(5) %0, align 8 - %"43" = load i64, ptr addrspace(5) %0, align 8 - store i64 %"43", ptr addrspace(5) %"15", align 8 - %"45" = load i64, ptr addrspace(5) %"11", align 8 - %"46" = load <2 x i32>, ptr addrspace(5) %"12", align 8 - %"51" = inttoptr i64 %"45" to ptr - store <2 x i32> %"46", ptr %"51", align 8 + %"49" = inttoptr i64 %"43" to ptr + store <2 x i32> %"44", ptr %"49", align 8 ret void } diff --git a/ptx/src/test/spirv_run/vector4.ll b/ptx/src/test/spirv_run/vector4.ll index 53187f7..7d92885 100644 --- a/ptx/src/test/spirv_run/vector4.ll +++ b/ptx/src/test/spirv_run/vector4.ll @@ -1,34 +1,34 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) %"18", ptr addrspace(4) byref(i64) %"19") #0 { -"24": +define protected amdgpu_kernel void @vector4(ptr addrspace(4) byref(i64) 
%"17", ptr addrspace(4) byref(i64) %"18") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca <4 x i32>, align 16, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) - %"10" = load i64, ptr addrspace(4) %"18", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"19", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"20" = inttoptr i64 %"13" to ptr - %"12" = load <4 x i32>, ptr %"20", align 16 - store <4 x i32> %"12", ptr addrspace(5) %"6", align 16 - %0 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %"6", i32 0, i32 3 - %"15" = load i32, ptr addrspace(5) %0, align 4 %1 = alloca i32, align 4, addrspace(5) - store i32 %"15", ptr addrspace(5) %1, align 4 - %"21" = load i32, ptr addrspace(5) %1, align 4 - store i32 %"21", ptr addrspace(5) %"7", align 4 - %"16" = load i64, ptr addrspace(5) %"5", align 8 - %"17" = load i32, ptr addrspace(5) %"7", align 4 - %"23" = inttoptr i64 %"16" to ptr - store i32 %"17", ptr %"23", align 4 + br label %2 + +2: ; preds = %0 + store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"17", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 + %"10" = load i64, ptr addrspace(4) %"18", align 8 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"19" = inttoptr i64 %"12" to ptr + %"11" = load <4 x i32>, ptr %"19", align 16 + store <4 x i32> %"11", ptr addrspace(5) %"6", align 16 + %3 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %"6", i32 0, i32 3 + %"14" = load i32, ptr addrspace(5) %3, align 4 + store i32 %"14", ptr addrspace(5) %1, align 4 + %"20" = load i32, ptr addrspace(5) %1, align 4 + store i32 %"20", ptr 
addrspace(5) %"7", align 4 + %"15" = load i64, ptr addrspace(5) %"5", align 8 + %"16" = load i32, ptr addrspace(5) %"7", align 4 + %"22" = inttoptr i64 %"15" to ptr + store i32 %"16", ptr %"22", align 4 ret void } diff --git a/ptx/src/test/spirv_run/vector_extract.ll b/ptx/src/test/spirv_run/vector_extract.ll index bceac42..ea2e2db 100644 --- a/ptx/src/test/spirv_run/vector_extract.ll +++ b/ptx/src/test/spirv_run/vector_extract.ll @@ -1,12 +1,8 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"49", ptr addrspace(4) byref(i64) %"50") #0 { -"61": +define protected amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"48", ptr addrspace(4) byref(i64) %"49") #0 { %"17" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"17", align 1 - %"18" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"18", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i16, align 2, addrspace(5) @@ -14,83 +10,87 @@ define protected amdgpu_kernel void @vector_extract(ptr addrspace(4) byref(i64) %"8" = alloca i16, align 2, addrspace(5) %"9" = alloca i16, align 2, addrspace(5) %"10" = alloca <4 x i16>, align 8, addrspace(5) + %1 = alloca <4 x i16>, align 8, addrspace(5) + %2 = alloca <4 x i16>, align 8, addrspace(5) + %3 = alloca <4 x i16>, align 8, addrspace(5) + br label %4 + +4: ; preds = %0 + store i1 false, ptr addrspace(5) %"17", align 1 + %"18" = load i64, ptr addrspace(4) %"48", align 8 + store i64 %"18", ptr addrspace(5) %"4", align 8 %"19" = load i64, ptr addrspace(4) %"49", align 8 - store i64 %"19", ptr addrspace(5) %"4", align 8 - %"20" = load i64, ptr addrspace(4) %"50", align 8 - store i64 %"20", ptr 
addrspace(5) %"5", align 8 - %"21" = load i64, ptr addrspace(5) %"4", align 8 - %"51" = inttoptr i64 %"21" to ptr addrspace(1) - %"11" = load <4 x i8>, ptr addrspace(1) %"51", align 4 - %"52" = extractelement <4 x i8> %"11", i32 0 - %"53" = extractelement <4 x i8> %"11", i32 1 - %"54" = extractelement <4 x i8> %"11", i32 2 - %"55" = extractelement <4 x i8> %"11", i32 3 + store i64 %"19", ptr addrspace(5) %"5", align 8 + %"20" = load i64, ptr addrspace(5) %"4", align 8 + %"50" = inttoptr i64 %"20" to ptr addrspace(1) + %"11" = load <4 x i8>, ptr addrspace(1) %"50", align 4 + %"51" = extractelement <4 x i8> %"11", i32 0 + %"52" = extractelement <4 x i8> %"11", i32 1 + %"53" = extractelement <4 x i8> %"11", i32 2 + %"54" = extractelement <4 x i8> %"11", i32 3 + %"21" = zext i8 %"51" to i16 %"22" = zext i8 %"52" to i16 %"23" = zext i8 %"53" to i16 %"24" = zext i8 %"54" to i16 - %"25" = zext i8 %"55" to i16 - store i16 %"22", ptr addrspace(5) %"6", align 2 - store i16 %"23", ptr addrspace(5) %"7", align 2 - store i16 %"24", ptr addrspace(5) %"8", align 2 - store i16 %"25", ptr addrspace(5) %"9", align 2 - %"26" = load i16, ptr addrspace(5) %"7", align 2 - %"27" = load i16, ptr addrspace(5) %"8", align 2 - %"28" = load i16, ptr addrspace(5) %"9", align 2 - %"29" = load i16, ptr addrspace(5) %"6", align 2 - %0 = insertelement <4 x i16> undef, i16 %"26", i32 0 - %1 = insertelement <4 x i16> %0, i16 %"27", i32 1 - %2 = insertelement <4 x i16> %1, i16 %"28", i32 2 - %"12" = insertelement <4 x i16> %2, i16 %"29", i32 3 - %3 = alloca <4 x i16>, align 8, addrspace(5) - store <4 x i16> %"12", ptr addrspace(5) %3, align 8 - %"30" = load <4 x i16>, ptr addrspace(5) %3, align 8 - store <4 x i16> %"30", ptr addrspace(5) %"10", align 8 - %"31" = load <4 x i16>, ptr addrspace(5) %"10", align 8 - %4 = alloca <4 x i16>, align 8, addrspace(5) - store <4 x i16> %"31", ptr addrspace(5) %4, align 8 - %"13" = load <4 x i16>, ptr addrspace(5) %4, align 8 - %"32" = extractelement <4 x i16> 
%"13", i32 0 - %"33" = extractelement <4 x i16> %"13", i32 1 - %"34" = extractelement <4 x i16> %"13", i32 2 - %"35" = extractelement <4 x i16> %"13", i32 3 - store i16 %"32", ptr addrspace(5) %"8", align 2 - store i16 %"33", ptr addrspace(5) %"9", align 2 - store i16 %"34", ptr addrspace(5) %"6", align 2 - store i16 %"35", ptr addrspace(5) %"7", align 2 - %"36" = load i16, ptr addrspace(5) %"8", align 2 - %"37" = load i16, ptr addrspace(5) %"9", align 2 - %"38" = load i16, ptr addrspace(5) %"6", align 2 - %"39" = load i16, ptr addrspace(5) %"7", align 2 - %5 = insertelement <4 x i16> undef, i16 %"36", i32 0 - %6 = insertelement <4 x i16> %5, i16 %"37", i32 1 - %7 = insertelement <4 x i16> %6, i16 %"38", i32 2 - %"15" = insertelement <4 x i16> %7, i16 %"39", i32 3 - %8 = alloca <4 x i16>, align 8, addrspace(5) - store <4 x i16> %"15", ptr addrspace(5) %8, align 8 - %"14" = load <4 x i16>, ptr addrspace(5) %8, align 8 - %"40" = extractelement <4 x i16> %"14", i32 0 - %"41" = extractelement <4 x i16> %"14", i32 1 - %"42" = extractelement <4 x i16> %"14", i32 2 - %"43" = extractelement <4 x i16> %"14", i32 3 - store i16 %"40", ptr addrspace(5) %"9", align 2 - store i16 %"41", ptr addrspace(5) %"6", align 2 - store i16 %"42", ptr addrspace(5) %"7", align 2 - store i16 %"43", ptr addrspace(5) %"8", align 2 - %"44" = load i16, ptr addrspace(5) %"6", align 2 - %"45" = load i16, ptr addrspace(5) %"7", align 2 - %"46" = load i16, ptr addrspace(5) %"8", align 2 - %"47" = load i16, ptr addrspace(5) %"9", align 2 + store i16 %"21", ptr addrspace(5) %"6", align 2 + store i16 %"22", ptr addrspace(5) %"7", align 2 + store i16 %"23", ptr addrspace(5) %"8", align 2 + store i16 %"24", ptr addrspace(5) %"9", align 2 + %"25" = load i16, ptr addrspace(5) %"7", align 2 + %"26" = load i16, ptr addrspace(5) %"8", align 2 + %"27" = load i16, ptr addrspace(5) %"9", align 2 + %"28" = load i16, ptr addrspace(5) %"6", align 2 + %5 = insertelement <4 x i16> undef, i16 %"25", i32 0 + %6 = 
insertelement <4 x i16> %5, i16 %"26", i32 1 + %7 = insertelement <4 x i16> %6, i16 %"27", i32 2 + %"12" = insertelement <4 x i16> %7, i16 %"28", i32 3 + store <4 x i16> %"12", ptr addrspace(5) %1, align 8 + %"29" = load <4 x i16>, ptr addrspace(5) %1, align 8 + store <4 x i16> %"29", ptr addrspace(5) %"10", align 8 + %"30" = load <4 x i16>, ptr addrspace(5) %"10", align 8 + store <4 x i16> %"30", ptr addrspace(5) %2, align 8 + %"13" = load <4 x i16>, ptr addrspace(5) %2, align 8 + %"31" = extractelement <4 x i16> %"13", i32 0 + %"32" = extractelement <4 x i16> %"13", i32 1 + %"33" = extractelement <4 x i16> %"13", i32 2 + %"34" = extractelement <4 x i16> %"13", i32 3 + store i16 %"31", ptr addrspace(5) %"8", align 2 + store i16 %"32", ptr addrspace(5) %"9", align 2 + store i16 %"33", ptr addrspace(5) %"6", align 2 + store i16 %"34", ptr addrspace(5) %"7", align 2 + %"35" = load i16, ptr addrspace(5) %"8", align 2 + %"36" = load i16, ptr addrspace(5) %"9", align 2 + %"37" = load i16, ptr addrspace(5) %"6", align 2 + %"38" = load i16, ptr addrspace(5) %"7", align 2 + %8 = insertelement <4 x i16> undef, i16 %"35", i32 0 + %9 = insertelement <4 x i16> %8, i16 %"36", i32 1 + %10 = insertelement <4 x i16> %9, i16 %"37", i32 2 + %"15" = insertelement <4 x i16> %10, i16 %"38", i32 3 + store <4 x i16> %"15", ptr addrspace(5) %3, align 8 + %"14" = load <4 x i16>, ptr addrspace(5) %3, align 8 + %"39" = extractelement <4 x i16> %"14", i32 0 + %"40" = extractelement <4 x i16> %"14", i32 1 + %"41" = extractelement <4 x i16> %"14", i32 2 + %"42" = extractelement <4 x i16> %"14", i32 3 + store i16 %"39", ptr addrspace(5) %"9", align 2 + store i16 %"40", ptr addrspace(5) %"6", align 2 + store i16 %"41", ptr addrspace(5) %"7", align 2 + store i16 %"42", ptr addrspace(5) %"8", align 2 + %"43" = load i16, ptr addrspace(5) %"6", align 2 + %"44" = load i16, ptr addrspace(5) %"7", align 2 + %"45" = load i16, ptr addrspace(5) %"8", align 2 + %"46" = load i16, ptr addrspace(5) %"9", align 
2 + %"55" = trunc i16 %"43" to i8 %"56" = trunc i16 %"44" to i8 %"57" = trunc i16 %"45" to i8 %"58" = trunc i16 %"46" to i8 - %"59" = trunc i16 %"47" to i8 - %9 = insertelement <4 x i8> undef, i8 %"56", i32 0 - %10 = insertelement <4 x i8> %9, i8 %"57", i32 1 - %11 = insertelement <4 x i8> %10, i8 %"58", i32 2 - %"16" = insertelement <4 x i8> %11, i8 %"59", i32 3 - %"48" = load i64, ptr addrspace(5) %"5", align 8 - %"60" = inttoptr i64 %"48" to ptr addrspace(1) - store <4 x i8> %"16", ptr addrspace(1) %"60", align 4 + %11 = insertelement <4 x i8> undef, i8 %"55", i32 0 + %12 = insertelement <4 x i8> %11, i8 %"56", i32 1 + %13 = insertelement <4 x i8> %12, i8 %"57", i32 2 + %"16" = insertelement <4 x i8> %13, i8 %"58", i32 3 + %"47" = load i64, ptr addrspace(5) %"5", align 8 + %"59" = inttoptr i64 %"47" to ptr addrspace(1) + store <4 x i8> %"16", ptr addrspace(1) %"59", align 4 ret void } diff --git a/ptx/src/test/spirv_run/vote_ballot.ll b/ptx/src/test/spirv_run/vote_ballot.ll index 200eccc..efba70a 100644 --- a/ptx/src/test/spirv_run/vote_ballot.ll +++ b/ptx/src/test/spirv_run/vote_ballot.ll @@ -3,48 +3,48 @@ target triple = "amdgcn-amd-amdhsa" declare i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1, i32) #0 -define protected amdgpu_kernel void @vote_ballot(ptr addrspace(4) byref(i64) %"41", ptr addrspace(4) byref(i64) %"42") #1 { -"51": +define protected amdgpu_kernel void @vote_ballot(ptr addrspace(4) byref(i64) %"40", ptr addrspace(4) byref(i64) %"41") #1 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) - %"12" = load i64, ptr addrspace(4) %"42", align 8 - store i64 %"12", ptr 
addrspace(5) %"5", align 8 - %"43" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 1) - store i32 %"43", ptr addrspace(5) %"6", align 4 - %"44" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 false, i32 16777215) - store i32 %"44", ptr addrspace(5) %"7", align 4 - %"45" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 2) - store i32 %"45", ptr addrspace(5) %"8", align 4 - %"46" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 3) - store i32 %"46", ptr addrspace(5) %"9", align 4 - %"17" = load i64, ptr addrspace(5) %"5", align 8 - %"18" = load i32, ptr addrspace(5) %"6", align 4 - %"47" = inttoptr i64 %"17" to ptr - %"57" = getelementptr inbounds i8, ptr %"47", i64 0 - store i32 %"18", ptr %"57", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"7", align 4 - %"48" = inttoptr i64 %"19" to ptr - %"59" = getelementptr inbounds i8, ptr %"48", i64 4 - store i32 %"20", ptr %"59", align 4 - %"21" = load i64, ptr addrspace(5) %"5", align 8 - %"22" = load i32, ptr addrspace(5) %"8", align 4 - %"49" = inttoptr i64 %"21" to ptr - %"61" = getelementptr inbounds i8, ptr %"49", i64 8 - store i32 %"22", ptr %"61", align 4 - %"23" = load i64, ptr addrspace(5) %"5", align 8 - %"24" = load i32, ptr addrspace(5) %"9", align 4 - %"50" = inttoptr i64 %"23" to ptr - %"63" = getelementptr inbounds i8, ptr %"50", i64 12 - store i32 %"24", ptr %"63", align 4 + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"41", align 8 + store i64 %"11", ptr addrspace(5) %"5", align 8 + %"42" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 1) + store i32 %"42", ptr addrspace(5) %"6", align 4 + %"43" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 false, i32 16777215) + store i32 %"43", ptr addrspace(5) %"7", align 4 + %"44" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 2) + 
store i32 %"44", ptr addrspace(5) %"8", align 4 + %"45" = call i32 @__zluda_ptx_impl__vote_sync_ballot_b32_32(i1 true, i32 3) + store i32 %"45", ptr addrspace(5) %"9", align 4 + %"16" = load i64, ptr addrspace(5) %"5", align 8 + %"17" = load i32, ptr addrspace(5) %"6", align 4 + %"46" = inttoptr i64 %"16" to ptr + %"55" = getelementptr inbounds i8, ptr %"46", i64 0 + store i32 %"17", ptr %"55", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"7", align 4 + %"47" = inttoptr i64 %"18" to ptr + %"57" = getelementptr inbounds i8, ptr %"47", i64 4 + store i32 %"19", ptr %"57", align 4 + %"20" = load i64, ptr addrspace(5) %"5", align 8 + %"21" = load i32, ptr addrspace(5) %"8", align 4 + %"48" = inttoptr i64 %"20" to ptr + %"59" = getelementptr inbounds i8, ptr %"48", i64 8 + store i32 %"21", ptr %"59", align 4 + %"22" = load i64, ptr addrspace(5) %"5", align 8 + %"23" = load i32, ptr addrspace(5) %"9", align 4 + %"49" = inttoptr i64 %"22" to ptr + %"61" = getelementptr inbounds i8, ptr %"49", i64 12 + store i32 %"23", ptr %"61", align 4 ret void } diff --git a/ptx/src/test/spirv_run/vshr.ll b/ptx/src/test/spirv_run/vshr.ll index e3b6b5e..3d24770 100644 --- a/ptx/src/test/spirv_run/vshr.ll +++ b/ptx/src/test/spirv_run/vshr.ll @@ -1,48 +1,48 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @vshr(ptr addrspace(4) byref(i64) %"30", ptr addrspace(4) byref(i64) %"31") #0 { -"39": +define protected amdgpu_kernel void @vshr(ptr addrspace(4) byref(i64) %"29", ptr addrspace(4) byref(i64) %"30") #0 { %"10" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"10", align 1 - %"11" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"11", align 1 %"4" = alloca i64, align 8, 
addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) %"8" = alloca i32, align 4, addrspace(5) %"9" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + store i1 false, ptr addrspace(5) %"10", align 1 + %"11" = load i64, ptr addrspace(4) %"29", align 8 + store i64 %"11", ptr addrspace(5) %"4", align 8 %"12" = load i64, ptr addrspace(4) %"30", align 8 - store i64 %"12", ptr addrspace(5) %"4", align 8 - %"13" = load i64, ptr addrspace(4) %"31", align 8 - store i64 %"13", ptr addrspace(5) %"5", align 8 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"33" = inttoptr i64 %"15" to ptr - %"32" = load i32, ptr %"33", align 4 - store i32 %"32", ptr addrspace(5) %"7", align 4 - %"17" = load i64, ptr addrspace(5) %"4", align 8 - %"34" = inttoptr i64 %"17" to ptr - %"41" = getelementptr inbounds i8, ptr %"34", i64 4 - %"35" = load i32, ptr %"41", align 4 - store i32 %"35", ptr addrspace(5) %"8", align 4 - %"19" = load i64, ptr addrspace(5) %"4", align 8 - %"36" = inttoptr i64 %"19" to ptr - %"43" = getelementptr inbounds i8, ptr %"36", i64 8 - %"37" = load i32, ptr %"43", align 4 - store i32 %"37", ptr addrspace(5) %"9", align 4 - %"21" = load i32, ptr addrspace(5) %"7", align 4 - %"22" = load i32, ptr addrspace(5) %"8", align 4 - %"23" = load i32, ptr addrspace(5) %"9", align 4 - %0 = icmp ugt i32 %"22", 31 - %1 = lshr i32 %"21", %"22" - %2 = select i1 %0, i32 0, i32 %1 - %"20" = add i32 %2, %"23" - store i32 %"20", ptr addrspace(5) %"6", align 4 - %"24" = load i64, ptr addrspace(5) %"5", align 8 - %"25" = load i32, ptr addrspace(5) %"6", align 4 - %"38" = inttoptr i64 %"24" to ptr - store i32 %"25", ptr %"38", align 4 + store i64 %"12", ptr addrspace(5) %"5", align 8 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"32" = inttoptr i64 %"14" to ptr + %"31" = load i32, ptr %"32", align 4 + store i32 %"31", ptr addrspace(5) %"7", align 4 + %"16" = load i64, ptr 
addrspace(5) %"4", align 8 + %"33" = inttoptr i64 %"16" to ptr + %"39" = getelementptr inbounds i8, ptr %"33", i64 4 + %"34" = load i32, ptr %"39", align 4 + store i32 %"34", ptr addrspace(5) %"8", align 4 + %"18" = load i64, ptr addrspace(5) %"4", align 8 + %"35" = inttoptr i64 %"18" to ptr + %"41" = getelementptr inbounds i8, ptr %"35", i64 8 + %"36" = load i32, ptr %"41", align 4 + store i32 %"36", ptr addrspace(5) %"9", align 4 + %"20" = load i32, ptr addrspace(5) %"7", align 4 + %"21" = load i32, ptr addrspace(5) %"8", align 4 + %"22" = load i32, ptr addrspace(5) %"9", align 4 + %2 = icmp ugt i32 %"21", 31 + %3 = lshr i32 %"20", %"21" + %4 = select i1 %2, i32 0, i32 %3 + %"19" = add i32 %4, %"22" + store i32 %"19", ptr addrspace(5) %"6", align 4 + %"23" = load i64, ptr addrspace(5) %"5", align 8 + %"24" = load i32, ptr addrspace(5) %"6", align 4 + %"37" = inttoptr i64 %"23" to ptr + store i32 %"24", ptr %"37", align 4 ret void } diff --git a/ptx/src/test/spirv_run/xor.ll b/ptx/src/test/spirv_run/xor.ll index 7181bd1..bc0ad26 100644 --- a/ptx/src/test/spirv_run/xor.ll +++ b/ptx/src/test/spirv_run/xor.ll @@ -1,37 +1,37 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn-amd-amdhsa" -define protected amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"23", ptr addrspace(4) byref(i64) %"24") #0 { -"28": +define protected amdgpu_kernel void @xor(ptr addrspace(4) byref(i64) %"22", ptr addrspace(4) byref(i64) %"23") #0 { %"8" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"8", align 1 - %"9" = alloca i1, align 1, addrspace(5) - store i1 false, ptr addrspace(5) %"9", align 1 %"4" = alloca i64, align 8, addrspace(5) %"5" = alloca i64, align 8, addrspace(5) %"6" = alloca i32, align 4, addrspace(5) %"7" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + 
store i1 false, ptr addrspace(5) %"8", align 1 + %"9" = load i64, ptr addrspace(4) %"22", align 8 + store i64 %"9", ptr addrspace(5) %"4", align 8 %"10" = load i64, ptr addrspace(4) %"23", align 8 - store i64 %"10", ptr addrspace(5) %"4", align 8 - %"11" = load i64, ptr addrspace(4) %"24", align 8 - store i64 %"11", ptr addrspace(5) %"5", align 8 - %"13" = load i64, ptr addrspace(5) %"4", align 8 - %"25" = inttoptr i64 %"13" to ptr - %"12" = load i32, ptr %"25", align 4 - store i32 %"12", ptr addrspace(5) %"6", align 4 - %"15" = load i64, ptr addrspace(5) %"4", align 8 - %"26" = inttoptr i64 %"15" to ptr - %"30" = getelementptr inbounds i8, ptr %"26", i64 4 - %"14" = load i32, ptr %"30", align 4 - store i32 %"14", ptr addrspace(5) %"7", align 4 - %"17" = load i32, ptr addrspace(5) %"6", align 4 - %"18" = load i32, ptr addrspace(5) %"7", align 4 - %"16" = xor i32 %"17", %"18" - store i32 %"16", ptr addrspace(5) %"6", align 4 - %"19" = load i64, ptr addrspace(5) %"5", align 8 - %"20" = load i32, ptr addrspace(5) %"6", align 4 - %"27" = inttoptr i64 %"19" to ptr - store i32 %"20", ptr %"27", align 4 + store i64 %"10", ptr addrspace(5) %"5", align 8 + %"12" = load i64, ptr addrspace(5) %"4", align 8 + %"24" = inttoptr i64 %"12" to ptr + %"11" = load i32, ptr %"24", align 4 + store i32 %"11", ptr addrspace(5) %"6", align 4 + %"14" = load i64, ptr addrspace(5) %"4", align 8 + %"25" = inttoptr i64 %"14" to ptr + %"28" = getelementptr inbounds i8, ptr %"25", i64 4 + %"13" = load i32, ptr %"28", align 4 + store i32 %"13", ptr addrspace(5) %"7", align 4 + %"16" = load i32, ptr addrspace(5) %"6", align 4 + %"17" = load i32, ptr addrspace(5) %"7", align 4 + %"15" = xor i32 %"16", %"17" + store i32 %"15", ptr addrspace(5) %"6", align 4 + %"18" = load i64, ptr addrspace(5) %"5", align 8 + %"19" = load i32, ptr addrspace(5) %"6", align 4 + %"26" = inttoptr i64 %"18" to ptr + store i32 %"19", ptr %"26", align 4 ret void } diff --git a/ptx/src/translate.rs b/ptx/src/translate.rs 
index fbf286b..99fc356 100644 --- a/ptx/src/translate.rs +++ b/ptx/src/translate.rs @@ -1931,30 +1931,26 @@ fn insert_hardware_registers<'input>( } // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#extended-precision-integer-arithmetic-instructions -// NVIDIA documentation is misleading. In fact there is no single CC.CF, -// but separate registers for overflow (`add` and `mad`) and underflow (`sub`) -// For reference check the .ptx tests +// NVIDIA documentation is slightly misleading when it comes to subc and sub.cc. +// They both invert the CC flag. Meaning that for sub: +// * sub.cc x, 0,1 will set CC to 0 +// * sub.cc x, 0,0 will set CC to 1 +// and for subc: +// * if CC is 1 then subc will compute d = a - b +// * if CC is 0 then subc will compute d = a - b - 1 fn insert_hardware_registers_impl<'input>( id_defs: &mut IdNameMapBuilder<'input>, typed_statements: Vec, ) -> Result, TranslateError> { let mut result = Vec::with_capacity(typed_statements.len()); - let overflow_flag_var = id_defs.register_variable_def( + let carry_flag_variable = id_defs.register_variable_def( None, ast::Type::Scalar(ast::ScalarType::Pred), ast::StateSpace::Reg, Some(ast::Initializer::Constant(ast::ImmediateValue::U64(0))), ); - let underflow_flag_var = id_defs.register_variable_def( - None, - ast::Type::Scalar(ast::ScalarType::Pred), - ast::StateSpace::Reg, - Some(ast::Initializer::Constant(ast::ImmediateValue::U64(0))), - ); - let overflow_flag = overflow_flag_var.name; - let underflow_flag = underflow_flag_var.name; - result.push(Statement::Variable(overflow_flag_var)); - result.push(Statement::Variable(underflow_flag_var)); + let carry_flag = carry_flag_variable.name; + result.push(Statement::Variable(carry_flag_variable)); for statement in typed_statements { match statement { Statement::Instruction(ast::Instruction::MadC { @@ -1965,37 +1961,88 @@ fn insert_hardware_registers_impl<'input>( }) => result.push(Statement::MadC(MadCDetails { type_, is_hi, - arg: 
Arg4CarryIn::new(arg, carry_out, TypedOperand::Reg(overflow_flag)), + arg: Arg4CarryIn::new(arg, carry_out, TypedOperand::Reg(carry_flag)), })), - Statement::Instruction(ast::Instruction::MadCC { type_, arg }) => { + Statement::Instruction(ast::Instruction::MadCC { type_, is_hi, arg }) => { result.push(Statement::MadCC(MadCCDetails { type_, - arg: Arg4CarryOut::new(arg, TypedOperand::Reg(overflow_flag)), + is_hi, + arg: Arg4CarryOut::new(arg, TypedOperand::Reg(carry_flag)), })) } Statement::Instruction(ast::Instruction::AddC(details, args)) => { result.push(Statement::AddC( details.type_, - Arg3CarryIn::new(args, details.carry_out, TypedOperand::Reg(overflow_flag)), + Arg3CarryIn::new(args, details.carry_out, TypedOperand::Reg(carry_flag)), )) } Statement::Instruction(ast::Instruction::AddCC(details, args)) => { result.push(Statement::AddCC( details, - Arg3CarryOut::new(args, TypedOperand::Reg(overflow_flag)), + Arg3CarryOut::new(args, TypedOperand::Reg(carry_flag)), )) } Statement::Instruction(ast::Instruction::SubC(details, args)) => { + let inverted_carry_in = id_defs.register_intermediate(Some(( + ast::Type::Scalar(ast::ScalarType::Pred), + ast::StateSpace::Reg, + ))); + result.push(Statement::Instruction(ast::Instruction::Not( + ast::ScalarType::Pred, + ast::Arg2 { + dst: TypedOperand::Reg(inverted_carry_in), + src: TypedOperand::Reg(carry_flag), + }, + ))); + let (carry_out_id, carry_out_postprocess) = if details.carry_out { + let inverted_carry_out = id_defs.register_intermediate(Some(( + ast::Type::Scalar(ast::ScalarType::Pred), + ast::StateSpace::Reg, + ))); + let invert_statement = Statement::Instruction(ast::Instruction::Not( + ast::ScalarType::Pred, + ast::Arg2 { + dst: TypedOperand::Reg(carry_flag), + src: TypedOperand::Reg(inverted_carry_out), + }, + )); + ( + Some(TypedOperand::Reg(inverted_carry_out)), + Some(invert_statement), + ) + } else { + (None, None) + }; result.push(Statement::SubC( details.type_, - Arg3CarryIn::new(args, details.carry_out, 
TypedOperand::Reg(underflow_flag)), - )) + Arg3CarryIn { + dst: args.dst, + carry_out: carry_out_id, + carry_in: TypedOperand::Reg(inverted_carry_in), + src1: args.src1, + src2: args.src2, + }, + )); + if let Some(carry_out_postprocess) = carry_out_postprocess { + result.push(carry_out_postprocess); + } } - Statement::Instruction(ast::Instruction::SubCC(details, args)) => { + Statement::Instruction(ast::Instruction::SubCC(type_, args)) => { + let temp = id_defs.register_intermediate(Some(( + ast::Type::Scalar(ast::ScalarType::Pred), + ast::StateSpace::Reg, + ))); result.push(Statement::SubCC( - details, - Arg3CarryOut::new(args, TypedOperand::Reg(underflow_flag)), - )) + type_, + Arg3CarryOut::new(args, TypedOperand::Reg(temp)), + )); + result.push(Statement::Instruction(ast::Instruction::Not( + ast::ScalarType::Pred, + ast::Arg2 { + dst: TypedOperand::Reg(carry_flag), + src: TypedOperand::Reg(temp), + }, + ))); } s => result.push(s), } @@ -2447,58 +2494,6 @@ fn insert_implicit_conversions2_impl<'input>( Ok(result) } -fn normalize_labels<'input>( - module: TranslationModule<'input, ExpandedArgParams>, -) -> Result, TranslateError> { - convert_methods_simple(module, normalize_labels2_impl) -} - -fn normalize_labels2_impl<'input>( - id_defs: &mut IdNameMapBuilder<'input>, - fn_body: Vec, -) -> Result, TranslateError> { - let mut labels_in_use = FxHashSet::default(); - for statement in fn_body.iter() { - match statement { - Statement::Instruction(i) => { - if let Some(target) = i.jump_target() { - labels_in_use.insert(target); - } - } - Statement::Conditional(cond) => { - labels_in_use.insert(cond.if_true); - labels_in_use.insert(cond.if_false); - } - Statement::Call(..) - | Statement::Variable(..) - | Statement::LoadVar(..) - | Statement::StoreVar(..) - | Statement::RetValue(..) - | Statement::Conversion(..) - | Statement::Constant(..) - | Statement::Label(..) - | Statement::PtrAccess { .. } - | Statement::RepackVector(..) - | Statement::MadC(..) 
- | Statement::MadCC(..) - | Statement::AddC(..) - | Statement::AddCC(..) - | Statement::SubC(..) - | Statement::SubCC(..) - | Statement::AsmVolatile { .. } - | Statement::FunctionPointer(..) => {} - } - } - Ok( - iter::once(Statement::Label(id_defs.register_intermediate(None))) - .chain(fn_body.into_iter().filter(|s| match s { - Statement::Label(i) => labels_in_use.contains(i), - _ => true, - })) - .collect::>(), - ) -} - fn hoist_globals<'input, P: ast::ArgParams>( module: TranslationModule<'input, P>, ) -> TranslationModule<'input, P> { @@ -2860,7 +2855,7 @@ fn replace_instructions_with_builtins_impl<'input>( vector, "_", suld.type_.to_ptx_name(), - "_trap", + "_zero", ] .concat(); statements.push(instruction_to_fn_call( @@ -2881,7 +2876,7 @@ fn replace_instructions_with_builtins_impl<'input>( vector, "_", sust.type_.to_ptx_name(), - "_trap", + "_zero", ] .concat(); statements.push(instruction_to_fn_call( @@ -3337,9 +3332,7 @@ fn to_llvm_module_impl2<'a, 'input>( } let translation_module = insert_implicit_conversions(translation_module)?; let translation_module = insert_compilation_mode_prologue(translation_module); - let translation_module = normalize_labels(translation_module)?; let translation_module = hoist_globals(translation_module); - let translation_module = move_variables_to_start(translation_module)?; let mut translation_module = replace_instructions_with_builtins(translation_module)?; if raytracing.is_some() { translation_module = raytracing::replace_tex_builtins_hack(translation_module)?; @@ -3392,49 +3385,6 @@ fn return_from_noreturn( translation_module } -// From "Performance Tips for Frontend Authors" (https://llvm.org/docs/Frontend/PerformanceTips.html): -// "The SROA (Scalar Replacement Of Aggregates) and Mem2Reg passes only attempt to eliminate alloca -// instructions that are in the entry basic block. 
Given SSA is the canonical form expected by much -// of the optimizer; if allocas can not be eliminated by Mem2Reg or SROA, the optimizer is likely to -// be less effective than it could be." -// Empirically, this is true. Moving allocas to the start gives us less spill-happy assembly -fn move_variables_to_start<'input, P: ast::ArgParams>( - module: TranslationModule<'input, P>, -) -> Result, TranslateError> { - convert_methods_simple(module, move_variables_to_start_impl) -} - -fn move_variables_to_start_impl<'input, P: ast::ArgParams>( - _: &mut IdNameMapBuilder<'input>, - fn_body: Vec, P>>, -) -> Result, P>>, TranslateError> { - if fn_body.is_empty() { - return Ok(fn_body); - } - let mut result = (0..fn_body.len()) - .into_iter() - .map(|_| mem::MaybeUninit::<_>::uninit()) - .collect::>(); - let variables_count = fn_body.iter().fold(0, |acc, statement| { - acc + matches!(statement, Statement::Variable(..)) as usize - }); - let mut variable = 1usize; - let mut non_variable = variables_count + 1; - // methods always start with an entry label - let mut statements = fn_body.into_iter(); - let start_label = statements.next().ok_or_else(TranslateError::unreachable)?; - unsafe { result.get_unchecked_mut(0).write(start_label) }; - for statement in statements { - let index = match statement { - Statement::Variable(_) => &mut variable, - _ => &mut non_variable, - }; - unsafe { result.get_unchecked_mut(*index).write(statement) }; - *index += 1; - } - Ok(unsafe { mem::transmute(result) }) -} - // PTX definition of param state space does not translate cleanly into AMDGPU notion of an address space: //  .param in kernel arguments matches AMDGPU constant address space // .param in function arguments and variables matches AMDGPU private address space @@ -5570,6 +5520,7 @@ impl, U: ArgParamsEx> Visitable for MadCD pub(crate) struct MadCCDetails { pub(crate) type_: ast::ScalarType, + pub(crate) is_hi: bool, pub(crate) arg: Arg4CarryOut

, } @@ -5580,6 +5531,7 @@ impl, U: ArgParamsEx> Visitable for MadCC ) -> Result, U>, TranslateError> { Ok(Statement::MadCC(MadCCDetails { type_: self.type_, + is_hi: self.is_hi, arg: self.arg.map(visitor, self.type_)?, })) } @@ -6488,8 +6440,9 @@ impl ast::Instruction { carry_out, arg: arg.map(visitor, &ast::Type::Scalar(type_), false)?, }, - ast::Instruction::MadCC { type_, arg } => ast::Instruction::MadCC { + ast::Instruction::MadCC { type_, arg, is_hi } => ast::Instruction::MadCC { type_, + is_hi, arg: arg.map(visitor, &ast::Type::Scalar(type_), false)?, }, ast::Instruction::Tex(details, arg) => { @@ -6604,6 +6557,9 @@ impl ast::Instruction { ast::StateSpace::Reg, )), )?), + ast::Instruction::Sad(type_, a) => { + ast::Instruction::Sad(type_, a.map(visitor, &ast::Type::Scalar(type_), false)?) + } ast::Instruction::Isspacep(space, arg) => ast::Instruction::Isspacep( space, arg.map_different_types( @@ -6612,6 +6568,7 @@ impl ast::Instruction { &ast::Type::Scalar(ast::ScalarType::U64), )?, ), + }) } } @@ -6866,15 +6823,6 @@ pub(crate) enum TypeKind { Struct, } -impl> ast::Instruction { - fn jump_target(&self) -> Option { - match self { - ast::Instruction::Bra(_, a) => Some(a.src), - _ => None, - } - } -} - impl ast::Instruction { // .wide instructions don't support ftz, so it's enough to just look at the // type declared by the instruction @@ -6969,6 +6917,7 @@ impl ast::Instruction { ast::Instruction::Shf(..) => None, ast::Instruction::Vote(..) => None, ast::Instruction::Nanosleep(..) 
=> None, + ast::Instruction::Sad(_, _) => None, ast::Instruction::Sub(ast::ArithDetails::Float(float_control), _) | ast::Instruction::Add(ast::ArithDetails::Float(float_control), _) | ast::Instruction::Mul(ast::MulDetails::Float(float_control), _) diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 2a214e4..edc0965 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -11,4 +11,12 @@ cargo_metadata = "=0.17.0" # cargo-platform is a cargo_metadata, version 0.1.6 requires rust 1.70 or higher cargo-platform = "=0.1.5" serde = "1.0.193" -serde_json = "1.0.108" \ No newline at end of file +serde_json = "1.0.108" +time = { version = "=0.3.23", features = ["local-offset"] } + +[target.'cfg(windows)'.dependencies] +zip = { version = "0.6.6", features = ["deflate", "time"], default-features = false } + +[target.'cfg(unix)'.dependencies] +flate2 = { version = "1.0.28", features = ["cloudflare_zlib"], default-features = false } +tar = "0.4" \ No newline at end of file diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 317ec01..d47659f 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -1,4 +1,5 @@ use argh::{EarlyExit, FromArgs, TopLevelCommand}; +use cargo_metadata::camino::Utf8PathBuf; use serde::Deserialize; use std::{ env, @@ -60,7 +61,7 @@ struct BuildCommand { } #[derive(FromArgs)] -/// Package build artifacts into an archive (.zip or .tar.gz) +/// Compile ZLUDA and package binaries into an archive (.zip or .tar.gz) #[argh(subcommand, name = "zip")] struct ZipCommand { /// use artifacts from release mode @@ -73,10 +74,15 @@ fn main() -> Result<(), DynError> { let args: Arguments = argh::from_env(); std::process::exit(match args.command { Subcommand::Build(BuildCommand { release }) => build(!release)?, - Subcommand::Zip(_) => panic!(), + Subcommand::Zip(ZipCommand { release }) => build_and_zip(!release), }) } +fn build_and_zip(is_debug: bool) -> i32 { + let workspace = build_impl(is_debug).unwrap(); + os::zip(workspace) +} + #[derive(Deserialize)] 
struct ZludaMetadata { zluda: Project, @@ -92,8 +98,6 @@ struct Project { #[serde(skip_deserializing)] kind: TargetKind, #[serde(default)] - top_level: bool, - #[serde(default)] windows_only: bool, #[serde(default)] linux_only: bool, @@ -104,9 +108,13 @@ struct Project { #[serde(default)] skip_dump_link: bool, #[serde(default)] + skip_zip: bool, + #[serde(default)] linux_names: Vec, #[serde(default)] dump_names: Vec, + #[serde(default)] + dump_nvidia_names: Vec, } #[derive(Clone, Copy, Default, PartialEq, Debug)] @@ -116,14 +124,56 @@ enum TargetKind { Cdylib, } +struct Workspace { + pub cargo: String, + pub project_root: PathBuf, + pub projects: Vec, + pub target_directory: Utf8PathBuf, +} + +impl Workspace { + fn open(is_debug: bool) -> Result { + let cargo = env::var("CARGO").unwrap_or_else(|_| "cargo".to_string()); + let project_root = Self::project_root()?; + let mut cmd = cargo_metadata::MetadataCommand::new(); + cmd.cargo_path(&cargo).current_dir(&project_root).no_deps(); + let cargo_metadata = cmd.exec()?; + let projects = cargo_metadata + .packages + .into_iter() + .filter_map(Project::new) + .filter(|p| !p.skip_build(is_debug)) + .collect::>(); + let mut target_directory = cargo_metadata.target_directory; + target_directory.push(if is_debug { "debug" } else { "release" }); + Ok(Workspace { + cargo, + project_root, + projects, + target_directory, + }) + } + + fn project_root() -> Result { + Ok(Path::new(&env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(1) + .ok_or::("CARGO_MANIFEST_DIR".into())? 
+ .to_path_buf()) + } + + fn cargo_command(&self) -> Command { + let mut command = Command::new(&self.cargo); + command.current_dir(&self.project_root); + command + } +} + impl Project { - fn new(json_pkg: cargo_metadata::Package) -> Self { - let mut project = serde_json::from_value::>(json_pkg.metadata) - .unwrap() - .map_or(Default::default(), |x| x.zluda); - if project != Default::default() { - project.top_level = true; - } + fn new(json_pkg: cargo_metadata::Package) -> Option { + let project_metadata = + serde_json::from_value::>(json_pkg.metadata).unwrap()?; + let mut project = project_metadata.zluda; project.name = json_pkg.name; if let Some((target_name, kind)) = json_pkg.targets.into_iter().find_map(|t| { match t.kind.first().map(std::ops::Deref::deref) { @@ -135,13 +185,10 @@ impl Project { project.target_name = target_name; project.kind = kind; } - project + Some(project) } fn skip_build(&self, is_debug: bool) -> bool { - if !self.top_level { - return true; - } if self.broken { return true; } @@ -159,67 +206,77 @@ impl Project { } fn build(is_debug: bool) -> Result { - let cargo = env::var("CARGO").unwrap_or_else(|_| "cargo".to_string()); - let project_root = project_root()?; - let mut cmd = cargo_metadata::MetadataCommand::new(); - cmd.cargo_path(&cargo).current_dir(&project_root).no_deps(); - let metadata = cmd.exec()?; - let projects = metadata - .packages - .into_iter() - .map(Project::new) - .filter(|p| !p.skip_build(is_debug)) - .collect::>(); - let mut command = Command::new(&cargo); - command.current_dir(&project_root).arg("build"); - projects.iter().fold(&mut command, |command, proj| { - command.args(["-p", &proj.name]) - }); + build_impl(is_debug)?; + Ok(0) +} + +fn build_impl(is_debug: bool) -> Result { + let workspace = Workspace::open(is_debug)?; + let mut command = workspace.cargo_command(); + command.arg("build"); + command.arg("--locked"); + workspace + .projects + .iter() + .fold(&mut command, |command, proj| { + command.args(["-p", 
&proj.name]) + }); if !is_debug { command.arg("--release"); } let build_result = command.status()?.code().unwrap(); if build_result != 0 { - return Ok(build_result); + return Err(format!("{command:?} failed with exit code {build_result}").into()); } - os::create_dump_dir_and_symlinks(is_debug, metadata.target_directory, projects); - Ok(0) + os::create_dump_dir_and_symlinks(&workspace); + Ok(workspace) } -fn project_root() -> Result { - Ok(Path::new(&env!("CARGO_MANIFEST_DIR")) - .ancestors() - .nth(1) - .ok_or::("CARGO_MANIFEST_DIR".into())? - .to_path_buf()) -} +impl TargetKind { + #[cfg(unix)] + fn prefix(self) -> &'static str { + match self { + TargetKind::Binary => "", + TargetKind::Cdylib => "lib", + } + } -#[cfg(not(unix))] -mod os { - use super::Project; - use cargo_metadata::camino::Utf8PathBuf; + #[cfg(unix)] + fn suffix(self) -> &'static str { + match self { + TargetKind::Binary => "", + TargetKind::Cdylib => ".so", + } + } - // This is 100% intentional, we don't want symlinks on Windows since - // we use completely different scheme for injections here - pub(crate) fn create_dump_dir_and_symlinks(_: bool, _: Utf8PathBuf, _: Vec) {} + #[cfg(windows)] + fn suffix(self) -> &'static str { + match self { + TargetKind::Binary => ".exe", + TargetKind::Cdylib => ".dll", + } + } } #[cfg(unix)] mod os { - use super::{Project, TargetKind}; + use crate::Workspace; use cargo_metadata::camino::Utf8PathBuf; + use flate2::{write::GzEncoder, Compression}; + use std::{ + fs::File, + time::{Duration, SystemTime}, + }; - pub(crate) fn create_dump_dir_and_symlinks( - is_debug: bool, - mut target_directory: Utf8PathBuf, - projects: Vec, - ) { + pub(crate) fn create_dump_dir_and_symlinks(workspace: &Workspace) { use std::fs; - target_directory.push(if is_debug { "debug" } else { "release" }); - let mut dump_dir = target_directory.clone(); + let mut dump_dir = workspace.target_directory.clone(); dump_dir.push("dump"); fs::create_dir_all(&dump_dir).unwrap(); - for project in 
projects { + let mut dump_nvidia_dir = dump_dir.clone(); + dump_nvidia_dir.set_file_name("dump_nvidia"); + fs::create_dir_all(&dump_nvidia_dir).unwrap(); + for project in workspace.projects.iter() { let dst = format!( "{}{}{}", project.kind.prefix(), @@ -227,15 +284,18 @@ mod os { project.kind.suffix() ); let dump_dst = format!("../{}", dst); - for src_file in project.linux_names { - force_symlink(&dst, &target_directory, &src_file); + for src_file in project.linux_names.iter() { + force_symlink(&dst, &workspace.target_directory, src_file); if project.skip_dump_link { continue; } - force_symlink(&dump_dst, &dump_dir, &src_file); + force_symlink(&dump_dst, &dump_dir, src_file); } - for src_file in project.dump_names { - force_symlink(&dump_dst, &dump_dir, &src_file); + for src_file in project.dump_names.iter() { + force_symlink(&dump_dst, &dump_dir, src_file); + } + for src_file in project.dump_nvidia_names.iter() { + force_symlink(&dump_dst, &dump_nvidia_dir, src_file); } } } @@ -263,19 +323,128 @@ mod os { } } - impl TargetKind { - fn prefix(self) -> &'static str { - match self { - TargetKind::Binary => "", - TargetKind::Cdylib => "lib", + pub(crate) fn zip(workspace: Workspace) -> i32 { + let mut target_file = workspace.target_directory.clone(); + target_file.push("zluda.tar.gz"); + let gz_file = File::create(target_file).unwrap(); + let gz = GzEncoder::new(gz_file, Compression::default()); + let mut tar = tar::Builder::new(gz); + let time = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or(Duration::ZERO); + for project in workspace.projects { + if project.skip_zip { + continue; + } + let mut src_file = File::open(format!( + "{}/{}{}{}", + &workspace.target_directory, + project.kind.prefix(), + project.target_name, + project.kind.suffix() + )) + .unwrap(); + let file_name = format!( + "{}{}{}", + project.kind.prefix(), + project.target_name, + project.kind.suffix() + ); + tar.append_file(format!("zluda/{file_name}"), &mut src_file) + 
.unwrap(); + for linux_name in project.linux_names.iter() { + let mut header = tar_header_symlink(time); + tar.append_link(&mut header, format!("zluda/{}", linux_name), &file_name) + .unwrap(); + if project.skip_dump_link { + continue; + } + let mut header = tar_header_symlink(time); + tar.append_link( + &mut header, + format!("zluda/dump/{}", linux_name), + format!("../{file_name}"), + ) + .unwrap(); + } + for dump_name in project.dump_names.iter() { + let mut header = tar_header_symlink(time); + tar.append_link( + &mut header, + format!("zluda/dump/{}", dump_name), + format!("../{file_name}"), + ) + .unwrap(); + } + for dump_name in project.dump_nvidia_names.iter() { + let mut header = tar_header_symlink(time); + tar.append_link( + &mut header, + format!("zluda/dump_nvidia/{}", dump_name), + format!("../{file_name}"), + ) + .unwrap(); } } + tar.finish().unwrap(); + 0 + } - fn suffix(self) -> &'static str { - match self { - TargetKind::Binary => "", - TargetKind::Cdylib => ".so", - } - } + fn tar_header_symlink(time: Duration) -> tar::Header { + let mut header = tar::Header::new_gnu(); + header.set_mtime(time.as_secs()); + header.set_entry_type(tar::EntryType::Symlink); + header + } +} + +#[cfg(windows)] +mod os { + use crate::Workspace; + use std::{convert::TryFrom, fs::File}; + + // This is 100% intentional, we don't want symlinks on Windows since + // we use a completely different scheme for injections there + pub(crate) fn create_dump_dir_and_symlinks(_: &Workspace) {} + + pub(crate) fn zip(workspace: Workspace) -> i32 { + fn get_zip_entry_options( + f: &File, + time_offset: time::UtcOffset, + ) -> Option { + let time = f.metadata().unwrap().modified().unwrap(); + let time = time::OffsetDateTime::from(time).to_offset(time_offset); + Some( + zip::write::FileOptions::default() + .last_modified_time(zip::DateTime::try_from(time).unwrap()), + ) + } + let mut target_file = workspace.target_directory.clone(); + target_file.push("zluda.zip"); + let zip_archive = 
File::create(target_file).unwrap(); + let mut zip_writer = zip::write::ZipWriter::new(zip_archive); + let time_offset = time::UtcOffset::current_local_offset().unwrap_or(time::UtcOffset::UTC); + for p in workspace.projects { + if p.skip_zip { + continue; + } + let mut src_file = File::open(format!( + "{}/{}{}", + &workspace.target_directory, + p.target_name, + p.kind.suffix() + )) + .unwrap(); + zip_writer + .start_file( + format!("zluda/{}{}", p.target_name, p.kind.suffix()), + get_zip_entry_options(&src_file, time_offset) + .unwrap_or(zip::write::FileOptions::default()), + ) + .unwrap(); + std::io::copy(&mut src_file, &mut zip_writer).unwrap(); + } + zip_writer.finish().unwrap(); + 0 } } diff --git a/zluda/src/cuda.rs b/zluda/src/cuda.rs index 9de0111..f8a0584 100644 --- a/zluda/src/cuda.rs +++ b/zluda/src/cuda.rs @@ -69,6 +69,7 @@ cuda_function_declarations!( cuCtxGetDevice, cuCtxGetLimit, cuCtxSetLimit, + cuCtxSetFlags, cuCtxGetStreamPriorityRange, cuCtxSynchronize, cuCtxSetCacheConfig, @@ -488,6 +489,10 @@ mod definitions { context::set_limit(limit, value) } + pub(crate) unsafe fn cuCtxSetFlags(flags: u32) -> Result<(), CUresult> { + context::set_flags(flags) + } + pub(crate) unsafe fn cuCtxGetStreamPriorityRange( leastPriority: *mut ::std::os::raw::c_int, greatestPriority: *mut ::std::os::raw::c_int, @@ -1241,7 +1246,7 @@ mod definitions { } pub(crate) unsafe fn cuSurfObjectDestroy(surfObject: hipSurfaceObject_t) -> hipError_t { - hipDestroySurfaceObject(surfObject) + surface::destroy(surfObject) } pub(crate) unsafe fn cuTexObjectCreate( diff --git a/zluda/src/impl/context.rs b/zluda/src/impl/context.rs index 429338b..d1b3e7b 100644 --- a/zluda/src/impl/context.rs +++ b/zluda/src/impl/context.rs @@ -7,7 +7,7 @@ use cuda_types::*; use hip_runtime_sys::*; use rustc_hash::{FxHashMap, FxHashSet}; use std::ptr; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use std::sync::Mutex; use std::{cell::RefCell, ffi::c_void}; @@ -28,57 
+28,104 @@ impl ZludaObject for ContextData { const LIVENESS_FAIL: CUresult = CUresult::CUDA_ERROR_INVALID_CONTEXT; fn drop_with_result(&mut self, _: bool) -> Result<(), CUresult> { - let mutable = self - .mutable - .get_mut() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - fold_cuda_errors(mutable.streams.iter().copied().map(|s| { - unsafe { LiveCheck::drop_box_with_result(s, true)? }; - Ok(()) - })) + self.with_inner_mut(|mutable| { + fold_cuda_errors( + mutable + .streams + .iter() + .copied() + .map(|s| unsafe { LiveCheck::drop_box_with_result(s, true) }), + ) + })? } } pub(crate) struct ContextData { - pub(crate) flags: AtomicU32, - is_primary: bool, - pub(crate) ref_count: AtomicU32, pub(crate) device: hipDevice_t, - pub(crate) mutable: Mutex, + pub(crate) variant: ContextVariant, +} + +pub(crate) enum ContextVariant { + NonPrimary(NonPrimaryContextData), + Primary(Mutex), +} + +pub(crate) struct PrimaryContextData { + pub(crate) ref_count: u32, + pub(crate) flags: u32, + pub(crate) mutable: ContextInnerMutable, +} + +pub(crate) struct NonPrimaryContextData { + flags: AtomicU32, + mutable: Mutex, } impl ContextData { - pub(crate) fn new( - flags: u32, - device: hipDevice_t, - is_primary: bool, - initial_refcount: u32, - ) -> Result { - Ok(ContextData { - flags: AtomicU32::new(flags), + pub(crate) fn new_non_primary(flags: u32, device: hipDevice_t) -> Self { + Self { device, - ref_count: AtomicU32::new(initial_refcount), - is_primary, - mutable: Mutex::new(ContextDataMutable::new()), + variant: ContextVariant::NonPrimary(NonPrimaryContextData { + flags: AtomicU32::new(flags), + mutable: Mutex::new(ContextInnerMutable::new()), + }), + } + } + + pub(crate) fn new_primary(device: hipDevice_t) -> Self { + Self { + device, + variant: ContextVariant::Primary(Mutex::new(PrimaryContextData { + ref_count: 0, + flags: 0, + mutable: ContextInnerMutable::new(), + })), + } + } + + pub(crate) fn with_inner_mut( + &self, + fn_: impl FnOnce(&mut ContextInnerMutable) -> T, + 
) -> Result { + Ok(match self.variant { + ContextVariant::Primary(ref mutex_over_primary_ctx_data) => { + let mut primary_ctx_data = mutex_over_primary_ctx_data + .lock() + .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + fn_(&mut primary_ctx_data.mutable) + } + ContextVariant::NonPrimary(NonPrimaryContextData { ref mutable, .. }) => { + let mut ctx_data_mutable = + mutable.lock().map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + fn_(&mut ctx_data_mutable) + } }) } } -pub(crate) struct ContextDataMutable { +pub(crate) struct ContextInnerMutable { pub(crate) streams: FxHashSet<*mut stream::Stream>, pub(crate) modules: FxHashSet<*mut module::Module>, // Field below is here to support CUDA Driver Dark API pub(crate) local_storage: FxHashMap<*mut c_void, LocalStorageValue>, } -impl ContextDataMutable { - fn new() -> Self { - ContextDataMutable { +impl ContextInnerMutable { + pub(crate) fn new() -> Self { + ContextInnerMutable { streams: FxHashSet::default(), modules: FxHashSet::default(), local_storage: FxHashMap::default(), } } + pub(crate) fn drop_with_result(&mut self) -> Result<(), CUresult> { + fold_cuda_errors( + self.streams + .iter() + .copied() + .map(|s| unsafe { LiveCheck::drop_box_with_result(s, true) }), + ) + } } pub(crate) struct LocalStorageValue { @@ -94,7 +141,7 @@ pub(crate) unsafe fn create( if pctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let context_box = Box::new(LiveCheck::new(ContextData::new(flags, dev, false, 1)?)); + let context_box = Box::new(LiveCheck::new(ContextData::new_non_primary(flags, dev))); let context_ptr = Box::into_raw(context_box); *pctx = context_ptr; push_context_stack(context_ptr) @@ -105,7 +152,7 @@ pub(crate) unsafe fn destroy(ctx: *mut Context) -> Result<(), CUresult> { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } let ctx_ref = LiveCheck::as_result(ctx)?; - if ctx_ref.is_primary { + if let ContextVariant::Primary { .. 
} = ctx_ref.variant { return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } CONTEXT_STACK.with(|stack| { @@ -175,14 +222,25 @@ pub(crate) fn set_limit(limit: hipLimit_t, value: usize) -> Result<(), CUresult> Ok(()) } +pub(crate) fn set_flags(flags: u32) -> Result<(), CUresult> { + with_current(|ctx| match ctx.variant { + ContextVariant::NonPrimary(ref context) => { + context + .flags + .store(flags, std::sync::atomic::Ordering::SeqCst); + Ok(()) + } + // This looks stupid, but this is an actual CUDA behavior, + // see primary_context.rs test + ContextVariant::Primary(_) => Ok(()), + })? +} + pub(crate) unsafe fn get_api_version(ctx: *mut Context, version: *mut u32) -> Result<(), CUresult> { if ctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } - let ctx = LiveCheck::as_result(ctx)?; - if ctx.ref_count.load(Ordering::Acquire) == 0 { - return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); - } + //let ctx = LiveCheck::as_result(ctx)?; //TODO: query device for properties roughly matching CUDA API version *version = 3020; Ok(()) diff --git a/zluda/src/impl/dark_api.rs b/zluda/src/impl/dark_api.rs index c3f4fca..c3b596c 100644 --- a/zluda/src/impl/dark_api.rs +++ b/zluda/src/impl/dark_api.rs @@ -121,20 +121,27 @@ impl CudaDarkApi for CudaDarkApiZluda { value: *mut c_void, dtor_callback: Option, ) -> CUresult { - with_context_or_current(cu_ctx, |ctx| { - let mut ctx_mutable = ctx - .mutable - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - ctx_mutable.local_storage.insert( - key, - LocalStorageValue { - value, - _dtor_callback: dtor_callback, - }, - ); - Ok(()) - }) + unsafe fn context_local_storage_insert_impl( + cu_ctx: cuda_types::CUcontext, + key: *mut c_void, + value: *mut c_void, + dtor_callback: Option< + extern "system" fn(cuda_types::CUcontext, *mut c_void, *mut c_void), + >, + ) -> Result<(), CUresult> { + with_context_or_current(cu_ctx, |ctx| { + ctx.with_inner_mut(|ctx_mutable| { + ctx_mutable.local_storage.insert( + key, + 
LocalStorageValue { + value, + _dtor_callback: dtor_callback, + }, + ); + }) + })? + } + context_local_storage_insert_impl(cu_ctx, key, value, dtor_callback).into_cuda() } // TODO @@ -143,29 +150,30 @@ impl CudaDarkApi for CudaDarkApiZluda { } unsafe extern "system" fn context_local_storage_get( - result: *mut *mut c_void, + cu_result: *mut *mut c_void, cu_ctx: cuda_types::CUcontext, key: *mut c_void, ) -> CUresult { - let mut cu_result = None; - let query_cu_result = with_context_or_current(cu_ctx, |ctx| { - let ctx_mutable = ctx - .mutable - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - cu_result = ctx_mutable.local_storage.get(&key).map(|v| v.value); - Ok(()) - }); - if query_cu_result != CUresult::CUDA_SUCCESS { - query_cu_result - } else { - match cu_result { - Some(value) => { - *result = value; - CUresult::CUDA_SUCCESS - } - None => CUresult::CUDA_ERROR_INVALID_VALUE, + unsafe fn context_local_storage_get_impl( + cu_ctx: cuda_types::CUcontext, + key: *mut c_void, + ) -> Result<*mut c_void, CUresult> { + with_context_or_current(cu_ctx, |ctx| { + ctx.with_inner_mut(|ctx_mutable| { + ctx_mutable + .local_storage + .get(&key) + .map(|v| v.value) + .ok_or(CUresult::CUDA_ERROR_INVALID_VALUE) + })? + })? 
+ } + match context_local_storage_get_impl(cu_ctx, key) { + Ok(result) => { + *cu_result = result; + CUresult::CUDA_SUCCESS } + Err(err) => err, } } @@ -386,14 +394,14 @@ impl CudaDarkApi for CudaDarkApiZluda { } } -unsafe fn with_context_or_current( +unsafe fn with_context_or_current( ctx: CUcontext, - f: impl FnOnce(&context::ContextData) -> Result<(), CUresult>, -) -> CUresult { + fn_: impl FnOnce(&context::ContextData) -> T, +) -> Result { if ctx == ptr::null_mut() { - context::with_current(|c| f(c)).into_cuda() + context::with_current(|c| fn_(c)) } else { let ctx = FromCuda::from_cuda(ctx); - LiveCheck::as_result(ctx).map(f).into_cuda() + Ok(fn_(LiveCheck::as_result(ctx)?)) } } diff --git a/zluda/src/impl/device.rs b/zluda/src/impl/device.rs index 59201e2..b7dd0f5 100644 --- a/zluda/src/impl/device.rs +++ b/zluda/src/impl/device.rs @@ -1,6 +1,8 @@ +use super::context::{ContextInnerMutable, ContextVariant, PrimaryContextData}; use super::{ - context, LiveCheck, GLOBAL_STATE, + context, LiveCheck, GLOBAL_STATE }; +use crate::r#impl::context::ContextData; use crate::{r#impl::IntoCuda, hip_call_cuda}; use crate::hip_call; use cuda_types::{CUdevice_attribute, CUdevprop, CUuuid_st, CUresult}; @@ -10,11 +12,7 @@ use paste::paste; use std::{ mem, os::raw::{c_char, c_uint}, - ptr, - sync::{ - atomic::AtomicU32, - Mutex, - }, ops::AddAssign, ffi::CString, + ptr,ffi::CString, }; const ZLUDA_SUFFIX: &'static [u8] = b" [ZLUDA]\0"; @@ -28,9 +26,7 @@ pub const COMPUTE_CAPABILITY_MINOR: u32 = 8; pub(crate) struct Device { pub(crate) compilation_mode: CompilationMode, pub(crate) comgr_isa: CString, - // Primary context is lazy-initialized, the mutex is here to secure retain - // from multiple threads - primary_context: Mutex>, + primary_context: context::Context, } impl Device { @@ -48,7 +44,7 @@ impl Device { Ok(Self { compilation_mode, comgr_isa, - primary_context: Mutex::new(None), + primary_context: LiveCheck::new(ContextData::new_primary(index as i32)), }) } } @@ -113,6 
+109,10 @@ pub(crate) unsafe fn get_attribute( return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } let hip_attrib = match attrib { + CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE => { + *pi = 32; + return Ok(()); + } CUdevice_attribute::CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT => { *pi = 1; return Ok(()); @@ -516,38 +516,29 @@ unsafe fn primary_ctx_get_or_retain( if pctx == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let ctx = primary_ctx(hip_dev, |ctx| { - let ctx = match ctx { - Some(ref mut ctx) => ctx, - None => { - ctx.insert(LiveCheck::new(context::ContextData::new(0, hip_dev, true, 0)?)) - }, - }; - if increment_refcount { - ctx.as_mut_unchecked().ref_count.get_mut().add_assign(1); + let ctx = primary_ctx(hip_dev, |ctx, raw_ctx| { + if increment_refcount || ctx.ref_count == 0 { + ctx.ref_count += 1; } - Ok(ctx as *mut _) + Ok(raw_ctx.cast_mut()) })??; *pctx = ctx; Ok(()) } pub(crate) unsafe fn primary_ctx_release(hip_dev: hipDevice_t) -> Result<(), CUresult> { - primary_ctx(hip_dev, move |maybe_ctx| { - if let Some(ctx) = maybe_ctx { - let ctx_data = ctx.as_mut_unchecked(); - let ref_count = ctx_data.ref_count.get_mut(); - *ref_count -= 1; - if *ref_count == 0 { - //TODO: fix - //ctx.try_drop(false) - Ok(()) - } else { - Ok(()) - } - } else { - Err(CUresult::CUDA_ERROR_INVALID_CONTEXT) + primary_ctx(hip_dev, |ctx, _| { + if ctx.ref_count == 0 { + return Err(CUresult::CUDA_ERROR_INVALID_CONTEXT); } + ctx.ref_count -= 1; + if ctx.ref_count == 0 { + // Even if we encounter errors we can't really surface them + ctx.mutable.drop_with_result().ok(); + ctx.mutable = ContextInnerMutable::new(); + ctx.flags = 0; + } + Ok(()) })? 
} @@ -566,53 +557,43 @@ pub(crate) unsafe fn primary_ctx_set_flags( hip_dev: hipDevice_t, flags: ::std::os::raw::c_uint, ) -> Result<(), CUresult> { - primary_ctx(hip_dev, move |maybe_ctx| { - if let Some(ctx) = maybe_ctx { - let ctx = ctx.as_mut_unchecked(); - ctx.flags = AtomicU32::new(flags); - Ok(()) - } else { - Err(CUresult::CUDA_ERROR_INVALID_CONTEXT) - } + primary_ctx(hip_dev, |ctx, _| { + ctx.flags = flags; + // TODO: actually use flags + Ok(()) })? } pub(crate) unsafe fn primary_ctx_get_state( hip_dev: hipDevice_t, - flags_ptr: *mut ::std::os::raw::c_uint, - active_ptr: *mut ::std::os::raw::c_int, + flags_ptr: *mut u32, + active_ptr: *mut i32, ) -> Result<(), CUresult> { if flags_ptr == ptr::null_mut() || active_ptr == ptr::null_mut() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } - let maybe_flags = primary_ctx(hip_dev, move |maybe_ctx| { - if let Some(ctx) = maybe_ctx { - let ctx = ctx.as_mut_unchecked(); - Some(*ctx.flags.get_mut()) - } else { - None - } + let (flags, active) = primary_ctx(hip_dev, |ctx, _| { + (ctx.flags, (ctx.ref_count > 0) as i32) })?; - if let Some(flags) = maybe_flags { - *flags_ptr = flags; - *active_ptr = 1; - } else { - *flags_ptr = 0; - *active_ptr = 0; - } + *flags_ptr = flags; + *active_ptr = active; Ok(()) } pub(crate) unsafe fn primary_ctx( dev: hipDevice_t, - f: impl FnOnce(&mut Option) -> T, + fn_: impl FnOnce(&mut PrimaryContextData, *const LiveCheck) -> T, ) -> Result { let device = GLOBAL_STATE.get()?.device(dev)?; - let mut maybe_primary_context = device - .primary_context - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - Ok(f(&mut maybe_primary_context)) + let raw_ptr = &device.primary_context as *const _; + let context = device.primary_context.as_ref_unchecked(); + match context.variant { + ContextVariant::Primary(ref mutex_over_primary_ctx) => { + let mut primary_ctx = mutex_over_primary_ctx.lock().map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; + Ok(fn_(&mut primary_ctx, raw_ptr)) + }, + 
ContextVariant::NonPrimary(..) => Err(CUresult::CUDA_ERROR_UNKNOWN) + } } pub(crate) unsafe fn get_name(name: *mut i8, len: i32, device: i32) -> hipError_t { diff --git a/zluda/src/impl/mod.rs b/zluda/src/impl/mod.rs index d0e58a2..73c6efa 100644 --- a/zluda/src/impl/mod.rs +++ b/zluda/src/impl/mod.rs @@ -148,6 +148,10 @@ impl LiveCheck { outer_ptr as *mut Self } + pub unsafe fn as_ref_unchecked(&self) -> & T { + &self.data + } + pub unsafe fn as_mut_unchecked(&mut self) -> &mut T { &mut self.data } diff --git a/zluda/src/impl/module.rs b/zluda/src/impl/module.rs index 6a6911a..8a49d43 100644 --- a/zluda/src/impl/module.rs +++ b/zluda/src/impl/module.rs @@ -31,13 +31,11 @@ impl ZludaObject for ModuleData { let deregistration_err = if !by_owner { if let Some(ctx) = self.owner { let ctx = unsafe { LiveCheck::as_result(ctx.as_ptr())? }; - let mut ctx_mutable = ctx - .mutable - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - ctx_mutable - .modules - .remove(&unsafe { LiveCheck::from_raw(self) }); + ctx.with_inner_mut(|ctx_mutable| { + ctx_mutable + .modules + .remove(&unsafe { LiveCheck::from_raw(self) }); + })?; } Ok(()) } else { @@ -104,11 +102,9 @@ pub(crate) unsafe fn load_impl( isa, input, )?); - let mut ctx_mutable = ctx - .mutable - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - ctx_mutable.modules.insert(module); + ctx.with_inner_mut(|ctx_mutable| { + ctx_mutable.modules.insert(module); + })?; *output = module; Ok(()) })? diff --git a/zluda/src/impl/stream.rs b/zluda/src/impl/stream.rs index fb53510..71ed20b 100644 --- a/zluda/src/impl/stream.rs +++ b/zluda/src/impl/stream.rs @@ -21,13 +21,11 @@ impl ZludaObject for StreamData { if !by_owner { let ctx = unsafe { LiveCheck::as_result(self.ctx)? 
}; { - let mut ctx_mutable = ctx - .mutable - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - ctx_mutable - .streams - .remove(&unsafe { LiveCheck::from_raw(&mut *self) }); + ctx.with_inner_mut(|ctx_mutable| { + ctx_mutable + .streams + .remove(&unsafe { LiveCheck::from_raw(&mut *self) }); + })?; } } hip_call_cuda!(hipStreamDestroy(self.base)); @@ -59,11 +57,9 @@ pub(crate) unsafe fn create_with_priority( ctx: ptr::null_mut(), }))); let ctx = context::with_current(|ctx| { - let mut ctx_mutable = ctx - .mutable - .lock() - .map_err(|_| CUresult::CUDA_ERROR_UNKNOWN)?; - ctx_mutable.streams.insert(stream); + ctx.with_inner_mut(|ctx_mutable| { + ctx_mutable.streams.insert(stream); + })?; Ok(LiveCheck::from_raw(ctx as *const _ as _)) })??; (*stream).as_mut_unchecked().ctx = ctx; diff --git a/zluda/src/impl/surface.rs b/zluda/src/impl/surface.rs index b07b52f..0f24fa3 100644 --- a/zluda/src/impl/surface.rs +++ b/zluda/src/impl/surface.rs @@ -1,23 +1,65 @@ +use super::hipfix; +use crate::hip_call_cuda; use cuda_types::*; use hip_runtime_sys::*; use std::{mem, ptr}; -use crate::hip_call_cuda; - -use super::{hipfix, FromCuda}; +// Same as in zluda_ptx_impl.cpp +const IMAGE_RESERVED_TOP_BITS: u32 = 3; pub(crate) unsafe fn create( - p_surf_object: *mut hipSurfaceObject_t, + result: *mut hipSurfaceObject_t, p_res_desc: *const CUDA_RESOURCE_DESC, ) -> Result<(), CUresult> { if p_res_desc == ptr::null() { return Err(CUresult::CUDA_ERROR_INVALID_VALUE); } let desc = to_surface_desc(*p_res_desc)?; - hip_call_cuda!(hipCreateSurfaceObject(p_surf_object, &desc)); + // We need to check array format and channel count to set top bits of the surface object. 
+ // HIP does not support non-Array sources anyway + if desc.resType != hipResourceType::hipResourceTypeArray { + return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED); + } + let mut surf_obj = mem::zeroed(); + hip_call_cuda!(hipCreateSurfaceObject(&mut surf_obj, &desc)); + let top_reserved_bits = surf_obj as usize >> (usize::BITS - IMAGE_RESERVED_TOP_BITS); + if top_reserved_bits != 0 { + #[allow(unused_must_use)] + { + hipDestroySurfaceObject(surf_obj); + } + return Err(CUresult::CUDA_ERROR_UNKNOWN); + } + let format_size = format_size((&*desc.res.array.array).Format)?; + let channels = (&*desc.res.array.array).NumChannels; + let pixel_size = format_size * channels as usize; + let shift_amount = + (pixel_size.trailing_zeros() as usize) << (usize::BITS - IMAGE_RESERVED_TOP_BITS); + surf_obj = (surf_obj as usize | shift_amount) as _; + *result = surf_obj; Ok(()) } +pub(crate) unsafe fn destroy(surf_object: hipSurfaceObject_t) -> hipError_t { + hipDestroySurfaceObject( + (((surf_object as usize) << IMAGE_RESERVED_TOP_BITS) >> IMAGE_RESERVED_TOP_BITS) as _, + ) +} + +pub(crate) fn format_size(f: hipArray_Format) -> Result { + Ok(match f { + hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 + | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => 1, + hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 + | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 + | hipArray_Format::HIP_AD_FORMAT_HALF => 2, + hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 + | hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 + | hipArray_Format::HIP_AD_FORMAT_FLOAT => 4, + _ => return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), + }) +} + unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result { let res_type = mem::transmute(res_desc.resType); let res: hipResourceDesc__bindgen_ty_1 = match res_desc.resType { @@ -26,92 +68,10 @@ unsafe fn to_surface_desc(res_desc: CUDA_RESOURCE_DESC) -> Result hipResourceDesc__bindgen_ty_1 { - mipmap: hipResourceDesc__bindgen_ty_1__bindgen_ty_2 { - mipmap: 
mem::transmute(res_desc.res.mipmap.hMipmappedArray), - }, - }, - CUresourcetype::CU_RESOURCE_TYPE_LINEAR => hipResourceDesc__bindgen_ty_1 { - linear: hipResourceDesc__bindgen_ty_1__bindgen_ty_3 { - devPtr: res_desc.res.linear.devPtr.0, - desc: channel_format_desc( - FromCuda::from_cuda(res_desc.res.linear.format), - res_desc.res.linear.numChannels, - )?, - sizeInBytes: res_desc.res.linear.sizeInBytes, - }, - }, - CUresourcetype::CU_RESOURCE_TYPE_PITCH2D => hipResourceDesc__bindgen_ty_1 { - pitch2D: hipResourceDesc__bindgen_ty_1__bindgen_ty_4 { - devPtr: res_desc.res.pitch2D.devPtr.0, - desc: channel_format_desc( - FromCuda::from_cuda(res_desc.res.pitch2D.format), - res_desc.res.pitch2D.numChannels, - )?, - width: res_desc.res.pitch2D.width, - height: res_desc.res.pitch2D.height, - pitchInBytes: res_desc.res.pitch2D.pitchInBytes, - }, - }, - _ => todo!(), + _ => return Err(CUresult::CUDA_ERROR_NOT_SUPPORTED), }; Ok(hipResourceDesc { resType: res_type, res, }) } - -fn channel_format_desc( - format: hipArray_Format, - num_channels: u32, -) -> Result { - let mut bits = match num_channels { - 1 => (1, 0, 0, 0), - 2 => (1, 1, 0, 0), - 3 => (1, 1, 1, 0), - 4 => (1, 1, 1, 1), - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - }; - let (kind, bit_width) = match format { - hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT8 => { - (hipChannelFormatKind::hipChannelFormatKindUnsigned, u8::BITS) - } - hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT16 => ( - hipChannelFormatKind::hipChannelFormatKindUnsigned, - u16::BITS, - ), - hipArray_Format::HIP_AD_FORMAT_UNSIGNED_INT32 => ( - hipChannelFormatKind::hipChannelFormatKindUnsigned, - u32::BITS, - ), - hipArray_Format::HIP_AD_FORMAT_SIGNED_INT8 => { - (hipChannelFormatKind::hipChannelFormatKindSigned, i8::BITS) - } - hipArray_Format::HIP_AD_FORMAT_SIGNED_INT16 => { - (hipChannelFormatKind::hipChannelFormatKindSigned, i16::BITS) - } - hipArray_Format::HIP_AD_FORMAT_SIGNED_INT32 => { - (hipChannelFormatKind::hipChannelFormatKindSigned, 
i32::BITS) - } - hipArray_Format::HIP_AD_FORMAT_HALF => ( - hipChannelFormatKind::hipChannelFormatKindFloat, - mem::size_of::() as u32 * u8::BITS, - ), - hipArray_Format::HIP_AD_FORMAT_FLOAT => ( - hipChannelFormatKind::hipChannelFormatKindFloat, - mem::size_of::() as u32 * u8::BITS, - ), - _ => return Err(CUresult::CUDA_ERROR_INVALID_VALUE), - }; - bits.0 *= bit_width; - bits.1 *= bit_width; - bits.2 *= bit_width; - bits.3 *= bit_width; - Ok(hipChannelFormatDesc { - x: bits.0 as i32, - y: bits.1 as i32, - z: bits.2 as i32, - w: bits.3 as i32, - f: kind, - }) -} diff --git a/zluda/tests/kernel_suld.rs b/zluda/tests/kernel_suld.rs index 8255f3d..7d368a2 100644 --- a/zluda/tests/kernel_suld.rs +++ b/zluda/tests/kernel_suld.rs @@ -340,10 +340,6 @@ unsafe fn kernel_suld_impl< if mem::size_of::() * CHANNELS < mem::size_of::() * SULD_N { return; } - // TODO: reenable those tests - if mem::size_of::() != mem::size_of::() || CHANNELS != SULD_N { - return; - } let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed); let size = 4usize; let random_size = rand::distributions::Uniform::::new(1, size as u32); diff --git a/zluda/tests/kernel_sust.rs b/zluda/tests/kernel_sust.rs index 6b5ef49..e6a07de 100644 --- a/zluda/tests/kernel_sust.rs +++ b/zluda/tests/kernel_sust.rs @@ -312,7 +312,9 @@ unsafe fn byte_fill(vec: &mut Vec, value: u8) { fn extend_bytes_with(slice: &[u8], elm: u8, desired_length: usize) -> Vec { let mut result = slice.to_vec(); - result.extend(std::iter::repeat(elm).take(desired_length - slice.len())); + if desired_length > slice.len() { + result.extend(std::iter::repeat(elm).take(desired_length - slice.len())); + } result } @@ -337,10 +339,6 @@ unsafe fn kernel_sust_impl< if mem::size_of::() * CHANNELS < mem::size_of::() * SUST_N { return; } - // TODO: reenable those tests - if mem::size_of::() != mem::size_of::() || CHANNELS != SUST_N { - return; - } let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed); let size = 4usize; let random_size = 
rand::distributions::Uniform::::new(1, size as u32); @@ -464,4 +462,8 @@ unsafe fn kernel_sust_impl< assert_eq!(expected, &*observed); let mut unused = mem::zeroed(); assert_eq!(cuda.cuCtxPopCurrent(&mut unused), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuDevicePrimaryCtxRelease_v2(CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); } diff --git a/zluda/tests/primary_context.rs b/zluda/tests/primary_context.rs new file mode 100644 index 0000000..f72c7b1 --- /dev/null +++ b/zluda/tests/primary_context.rs @@ -0,0 +1,84 @@ +use crate::common::CudaDriverFns; +use cuda_types::*; +use std::{mem, ptr}; +mod common; + +cuda_driver_test!(primary_context); + +unsafe fn primary_context(cuda: T) { + assert_eq!(cuda.cuInit(0), CUresult::CUDA_SUCCESS); + let mut flags = 0; + let mut active = 0; + assert_eq!( + cuda.cuDevicePrimaryCtxGetState(CUdevice_v1(0), &mut flags, &mut active), + CUresult::CUDA_SUCCESS + ); + assert_eq!((0, 0), (flags, active)); + assert_eq!( + cuda.cuDevicePrimaryCtxSetFlags_v2(CUdevice_v1(0), 1), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuDevicePrimaryCtxGetState(CUdevice_v1(0), &mut flags, &mut active), + CUresult::CUDA_SUCCESS + ); + assert_eq!((1, 0), (flags, active)); + let mut primary_ctx = ptr::null_mut(); + assert_eq!( + cuda.cuDevicePrimaryCtxRetain(&mut primary_ctx, CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuCtxPushCurrent_v2(primary_ctx), + CUresult::CUDA_SUCCESS + ); + assert_eq!(cuda.cuCtxSetFlags(2), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuCtxSetCurrent(ptr::null_mut()), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuDevicePrimaryCtxGetState(CUdevice_v1(0), &mut flags, &mut active), + CUresult::CUDA_SUCCESS + ); + assert_eq!((1, 1), (flags, active)); + assert_ne!(primary_ctx, ptr::null_mut()); + let mut active_ctx = ptr::null_mut(); + assert_eq!( + cuda.cuCtxGetCurrent(&mut active_ctx), + CUresult::CUDA_SUCCESS + ); + assert_eq!(active_ctx, ptr::null_mut()); + assert_ne!(primary_ctx, 
active_ctx); + assert_eq!( + cuda.cuDevicePrimaryCtxGetState(CUdevice_v1(0), &mut flags, &mut active), + CUresult::CUDA_SUCCESS + ); + assert_eq!((1, 1), (flags, active)); + let mut buffer = mem::zeroed(); + assert_eq!( + cuda.cuCtxPushCurrent_v2(primary_ctx), + CUresult::CUDA_SUCCESS + ); + assert_eq!(cuda.cuMemAlloc_v2(&mut buffer, 4), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuDevicePrimaryCtxRelease_v2(CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_eq!( + cuda.cuDevicePrimaryCtxGetState(CUdevice_v1(0), &mut flags, &mut active), + CUresult::CUDA_SUCCESS + ); + assert_ne!( + cuda.cuDevicePrimaryCtxRelease_v2(CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); + assert_eq!((0, 0), (flags, active)); + // Already freed on context destruction + // TODO: reenable when we start tracking allocations inside context + //assert_ne!(cuda.cuMemFree_v2(buffer), CUresult::CUDA_SUCCESS); + assert_eq!( + cuda.cuDevicePrimaryCtxReset_v2(CUdevice_v1(0)), + CUresult::CUDA_SUCCESS + ); +} diff --git a/zluda_api/Cargo.toml b/zluda_api/Cargo.toml index b708cbd..79231b0 100644 --- a/zluda_api/Cargo.toml +++ b/zluda_api/Cargo.toml @@ -26,3 +26,4 @@ features = [ [package.metadata.zluda] debug_only = true windows_only = true +skip_zip = true diff --git a/zluda_dark_api/Cargo.toml b/zluda_dark_api/Cargo.toml index 0aef25e..8266e36 100644 --- a/zluda_dark_api/Cargo.toml +++ b/zluda_dark_api/Cargo.toml @@ -14,6 +14,7 @@ either = "1.9" bit-vec = "0.6.3" paste = "1.0" lz4-sys = "1.9" +cloudflare-zlib = "0.2.10" thread-id = "4.1.0" # we don't need elf32, but goblin has a bug where elf64 does not build without elf32 goblin = { version = "0.5.1", default-features = false, features = ["elf64", "elf32"] } diff --git a/zluda_dark_api/src/lib.rs b/zluda_dark_api/src/lib.rs index 6849e0e..15c6091 100644 --- a/zluda_dark_api/src/lib.rs +++ b/zluda_dark_api/src/lib.rs @@ -687,13 +687,19 @@ pub enum FatbinModule { pub struct FatbinFile { data: *const u8, pub kind: FatbinFileKind, - pub 
compressed: bool, + pub compression: FatbinCompression, pub sm_version: u32, padded_payload_size: usize, payload_size: usize, uncompressed_payload: usize, } +pub enum FatbinCompression { + None, + Zlib, + Lz4, +} + impl FatbinFile { unsafe fn try_new(fatbin_file: &FatbinFileHeader) -> Result { let fatbin_file_version = fatbin_file.version; @@ -719,22 +725,19 @@ impl FatbinFile { }); } }; - if fatbin_file + let compression = if fatbin_file .flags .contains(FatbinFileHeaderFlags::CompressedOld) { - return Err(UnexpectedFieldError { - name: "FATBIN_FILE_HEADER_FLAGS", - expected: vec![ - AnyUInt::U64(FatbinFileHeaderFlags::empty().bits()), - AnyUInt::U64(FatbinFileHeaderFlags::CompressedNew.bits()), - ], - observed: AnyUInt::U64(fatbin_file.flags.bits()), - }); - } - let compressed = fatbin_file + FatbinCompression::Zlib + } else if fatbin_file .flags - .contains(FatbinFileHeaderFlags::CompressedNew); + .contains(FatbinFileHeaderFlags::CompressedNew) + { + FatbinCompression::Lz4 + } else { + FatbinCompression::None + }; let data = (fatbin_file as *const _ as *const u8).add(fatbin_file.header_size as usize); let padded_payload_size = fatbin_file.padded_payload_size as usize; let payload_size = fatbin_file.payload_size as usize; @@ -743,7 +746,7 @@ impl FatbinFile { Ok(Self { data, kind, - compressed, + compression, padded_payload_size, payload_size, uncompressed_payload, @@ -753,28 +756,36 @@ impl FatbinFile { // Returning static lifetime here because all known uses of this are related to fatbin files that // are constants inside files - pub unsafe fn get_or_decompress(&self) -> Result, Lz4DecompressionFailure> { - if self.compressed { - match self.decompress_kernel_module() { - Some(mut decompressed) => { - if self.kind == FatbinFileKind::Ptx { - decompressed.pop(); // remove trailing zero + pub unsafe fn get_or_decompress(&self) -> Result, DecompressionFailure> { + match self.compression { + FatbinCompression::Lz4 => { + match self.decompress_kernel_module_lz4() { + 
Some(mut decompressed) => { + if self.kind == FatbinFileKind::Ptx { + decompressed.pop(); // remove trailing zero + } + Ok(Cow::Owned(decompressed)) } - Ok(Cow::Owned(decompressed)) + None => Err(DecompressionFailure), } - None => Err(Lz4DecompressionFailure), } - } else { - Ok(Cow::Borrowed(slice::from_raw_parts( + FatbinCompression::Zlib => { + let compressed = + std::slice::from_raw_parts(self.data.cast(), self.padded_payload_size); + Ok(Cow::Owned( + cloudflare_zlib::inflate(compressed).map_err(|_| DecompressionFailure)?, + )) + } + FatbinCompression::None => Ok(Cow::Borrowed(slice::from_raw_parts( self.data, self.padded_payload_size as usize, - ))) + ))), } } const MAX_MODULE_DECOMPRESSION_BOUND: usize = 64 * 1024 * 1024; - unsafe fn decompress_kernel_module(&self) -> Option> { + unsafe fn decompress_kernel_module_lz4(&self) -> Option> { let decompressed_size = usize::max(1024, self.uncompressed_payload as usize); let mut decompressed_vec = vec![0u8; decompressed_size]; loop { @@ -801,7 +812,7 @@ impl FatbinFile { } #[derive(Debug)] -pub struct Lz4DecompressionFailure; +pub struct DecompressionFailure; pub fn anti_zluda_hash AntiZludaHashInputDevice>( return_known_value: bool, diff --git a/zluda_dump/Cargo.toml b/zluda_dump/Cargo.toml index 2ee1592..1499905 100644 --- a/zluda_dump/Cargo.toml +++ b/zluda_dump/Cargo.toml @@ -44,3 +44,4 @@ rand = "0.8.5" # Nominally debug_only, but useful for power users [package.metadata.zluda] dump_names = ["libcuda.so", "libcuda.so.1"] +dump_nvidia_names = ["libcuda.so", "libcuda.so.1"] diff --git a/zluda_dump/src/log.rs b/zluda_dump/src/log.rs index 2cfbda6..7777a61 100644 --- a/zluda_dump/src/log.rs +++ b/zluda_dump/src/log.rs @@ -19,7 +19,7 @@ use std::path::PathBuf; use std::str::Utf8Error; use zluda_dark_api::AnyUInt; use zluda_dark_api::FatbinFileKind; -use zluda_dark_api::Lz4DecompressionFailure; +use zluda_dark_api::DecompressionFailure; use zluda_dark_api::UnexpectedFieldError; const LOG_PREFIX: &[u8] = b"[ZLUDA_DUMP] 
"; @@ -447,7 +447,7 @@ impl Display for LogEntry { file_name ) } - LogEntry::Lz4DecompressionFailure => write!(f, "LZ4 decompression failure"), + LogEntry::Lz4DecompressionFailure => write!(f, "Decompression failure"), LogEntry::UnknownExportTableFn => write!(f, "Unknown export table function"), LogEntry::UnexpectedBinaryField { field_name, @@ -591,8 +591,8 @@ impl From for LogEntry { } } -impl From for LogEntry { - fn from(_err: Lz4DecompressionFailure) -> Self { +impl From for LogEntry { + fn from(_err: DecompressionFailure) -> Self { LogEntry::Lz4DecompressionFailure } } diff --git a/zluda_ml/Cargo.toml b/zluda_ml/Cargo.toml index 25d88a9..452cc0e 100644 --- a/zluda_ml/Cargo.toml +++ b/zluda_ml/Cargo.toml @@ -15,5 +15,4 @@ atiadlxx-sys = { path = "../atiadlxx-sys" } rocm_smi-sys = { path = "../rocm_smi-sys" } [package.metadata.zluda] -top_level = true linux_names = ["libnvidia-ml.so", "libnvidia-ml.so.1"]