From bf2aef9be0132106b4ee85343a9717911411e33b Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Fri, 4 Oct 2024 22:58:25 +0200 Subject: [PATCH] Add support for bfi --- ptx/lib/zluda_ptx_impl.bc | Bin 3720 -> 4136 bytes ptx/lib/zluda_ptx_impl.cpp | 29 ++++++++++++++++++ ...eplace_instructions_with_function_calls.rs | 4 +++ 3 files changed, 33 insertions(+) diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 6dbf916ce12c9f7d166c8f6999ff8f986e5db2a5..9533233ef8adad3c8cba1d5339c9bfaae496681d 100644 GIT binary patch delta 960 zcmeB>U7;{Rg~^3qx zF?4SbQJiAQ6rgd4vz0@!#c7VPlcp1g;uOV}fUaUc4rPX7mV+Sk0wow2x)~T4r2Q0s z{Qv*|KVt;*0g;VwQ&@ryh%wCIVt|4MUWuTK?S~l{9T*%KI2afh92jyAOz?8qz{a4) z3{}IBz}yyMC1mKppa2tLkXG2}xFF*!1A~JO8^h*o))OppCa2jL7!nm27+4t?7=&3I z*%aM)xKvpVGGsRKFy%}Rg`#5Ntdr#KSDxuHexilr#A)mjv6X`UI8*HIuoy zjTwz5+jFb&xG|jS_xl;Bae%=}d2%7QV!c7nBp!2?4v7Z`j~pm*KENSvQQ##YImbeC zM|RhV2S<1v&j|@I%d)F+G_&VGoGlJ^wwsOs0|NsS0|SEy$hBDv4B}j2jriVIzrQNxvU211JbuKtTX9U!WOm{)A(kDow&OR24p1G`bZqiObA6 z-6`BOTUCKgi!Z65rLn77Lgpb;n|{X^wMTs%U)mDVIlCIWoE5Gx++bi}a4S?`VBln6 zU@+nnP-_ zur}l5f80`rd<+Z>D;OaP4H+321Q{3@oERAxgh0s)A`F)Yarq{D^8BusU|?Xl!@$4* zqCp8zK!t(f3j+fK$TS!Y66aK5U|?c|sDsfUagcf`sBJJBB+j7&vV(ykmWP2M0^;B; zpy-N^ugWP+NsKQjsff?aEtp)*dsYceEGIrGEi*pJ*eE_eKRFx3%bom=SDZOBK54QW Gp927Ic=jm( delta 541 zcmZ3X&>=fPh3Ny+L^X9G0aqrcN!(5?0!AVh91`AKMotssrDL30B(yZNCK%Rih?%Hx zH$T%9XZhTbDBGn2J>L{11S=c)rTh>3xL zLFB1@fSvN>O7boB3QZaez9?q=Hz;KxNr~=z&9%lg_bCDK_CxwNc4V!tI4OEI6 z-3yq+6>3}-Y-Ty^a9oi?XaS4#WJ7KVwx{+6OaXS2{ke@9nI<=LtMMo^JarGR6JeEb zFlU;)m0Pi1r)Lt6IZKDcgM&v7lsF&Y5Vt7s5|E6UBXY;oHKm}?dBSFvBM!$KIS)Mm zIk!-Ofq|WYfx(DRK&geHfmP!{L<7rI){F^J4eWbZXE?+{Ln4)jfgzfKfkB9Q$^(#( zm>C!t-ZC&SurV+&_%JXqFfuSOM1a&z*5ze4(fW!|ABLf2{TtHz2 pQV5d=vAHLw^8RK6#o`@?$?N#UI6NU%nN2>!_h9lXemf=x1_1QWd%*wz diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 60a33e1..85823b4 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -101,4 +101,33 @@ extern "C" len = sub_sat(64, 
pos); return (base << (64U - pos - len)) >> (64U - len); } + + uint32_t __ockl_bfm_u32(uint32_t count, uint32_t offset) __attribute__((device)); + uint32_t FUNC(bfi_b32)(uint32_t insert, uint32_t base, uint32_t pos_32, uint32_t len_32) + { + uint32_t pos = pos_32 & 0xFFU; + uint32_t len = len_32 & 0xFFU; + if (pos >= 32) + return base; + uint32_t mask; + if (len >= 32) + mask = UINT32_MAX << pos; + else + mask = __ockl_bfm_u32(len, pos); + return (~mask & base) | (mask & (insert << pos)); + } + + uint64_t FUNC(bfi_b64)(uint64_t insert, uint64_t base, uint32_t pos, uint32_t len) + { + // NVIDIA docs are incorrect. In 64 bit `bfi` both `pos` and `len` + // parameters use whole 32 bit number and not just bottom 8 bits + if (pos >= 64) + return base; + uint64_t mask; + if (len >= 64) + mask = UINT64_MAX << pos; + else + mask = ((1UL << len) - 1UL) << (pos); + return (~mask & base) | (mask & (insert << pos)); + } } diff --git a/ptx/src/pass/replace_instructions_with_function_calls.rs b/ptx/src/pass/replace_instructions_with_function_calls.rs index 75ee676..70d77d3 100644 --- a/ptx/src/pass/replace_instructions_with_function_calls.rs +++ b/ptx/src/pass/replace_instructions_with_function_calls.rs @@ -100,6 +100,10 @@ fn run_instruction<'input>( let name = ["bfe_", scalar_to_ptx_name(data)].concat(); to_call(resolver, fn_declarations, name.into(), i)? } + i @ ptx_parser::Instruction::Bfi { data, .. } => { + let name = ["bfi_", scalar_to_ptx_name(data)].concat(); + to_call(resolver, fn_declarations, name.into(), i)? + } i => i, }) }