From bf560bd8ec17bd7bd406adbcbe83ab9d72d0ae5e Mon Sep 17 00:00:00 2001 From: Violet Date: Fri, 19 Sep 2025 17:50:36 +0000 Subject: [PATCH] Handle PrmtSlow Uses the HIP implementation of `__byte_perm`. --- ptx/lib/zluda_ptx_impl.bc | Bin 24456 -> 24824 bytes ptx/lib/zluda_ptx_impl.cpp | 5 ++ ptx/src/pass/llvm/emit.rs | 6 +-- .../replace_instructions_with_functions.rs | 3 ++ ptx/src/test/ll/prmt_slow.ll | 46 ++++++++++++++++++ ptx/src/test/spirv_run/mod.rs | 5 ++ ptx/src/test/spirv_run/prmt_slow.ptx | 25 ++++++++++ 7 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 ptx/src/test/ll/prmt_slow.ll create mode 100644 ptx/src/test/spirv_run/prmt_slow.ptx diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index bc375c3b14ff5032dff6fc592f683e1b081a93be..0eef0b42d9d7e1ff94779fd6bf7ebdd51498f91c 100644 GIT binary patch delta 8552 zcmeC!&-mjY;{+AP-4j)_8AB&dlxOtVxKWT%WyygD6002N9AlSS{;^bqA%D`c4k5Mq zLVOCFoYN&cI9Lz5a4>8>#JHc6aq{FQ?#y~eO$P=Z1{jbuSaOQ9pOI++Lj!{V0|P?? z!-)pPtFalp0t{>nEDa!KEDmfuE0dCo84|2OTrf6D@Hi?b#=^qD1m-d@Fic={))G9a z#4tk$Cepwo5%l(i3M1nJ1_uTX1_lNPhLi&n>QDJ@;b2(C0#l{H+!nGT$kTyAgFygn zngD~O!ID=~b(tD88DWY}G$@Mh^AQqY;6ip&SX(zkf)`8;gHeJ4 zBgjqr9?CGx5Cd^RxPhl$BIqHX8RLN?ATAiE9GJjWH;0p>fsKL50VKi%a?`)&Kn4R# z5EqOk4VG+D%wcNiM0V3uX-i=N23}-0{oq*4kWd8H0wRnOJY2maSQ#!egLoiZ4{}sl zk&Xg`0s{vF$TSX6K)n<$V>}=N;(`=`0?KQd5f6hJCz#8?zyNYncBrHSgAYsu6i!TQ z4Nfp5EP@LeC3uL&O<`kkU_}apRlk^;c^K5Vz^dw@W|^90IxvJFhq;rn@dJj0O<+|Z z!YIMxY1VXB)Hq&M#NEurpvDbS0Ky<=skO7oF!;d37?fP@<0-kfSiA7zu^irD~~oGlmw}Rs9^vrlRD~vVcgW@2Z;;@u_($;CLC&DH~<$i zO7KXX?0B8=z!ao#t)Gyk$87L}A>k6TI@8$)S=byHa^PZ;21|_2H~eZ~cmNkNO7K`Z z-%+B5!5|YZCTXzb*2M&LWlxtK{LXk_36iZ7R)sddWM*n$W`Ox0l%#jC-DhW*%my=U z0;BV)dfpZM91MIgAyDudOV`LV_(;KoK*9UOY=)fzg9S_o6ueXO4%DK#jO#uFj~&Ad zc&-I0YppMM(8 zh?34I!6WoLqjdcWMh2f$xB;Lr|G6r7(l0;B{OL>XA>5oM}RkOVU~gPJj1;fV&t&?qj9?7L3{ z(|xS-Fx=N=hsAw&SD?AC-e5^o@Isy&3=GRqEEZkEmHL2*Fun?nU^ZuHc!?4= ztlOG24lpwKWWXH`it>-59*lep2~H@%GjrnvU({l>e#x)o#!U?j9w_S0Zn{w8z)%1; zP|{!tQ~JW6XxZ=TIVNFtG-apK87DKN<%OeLCLCoHV32_s2TBlEq6$nD7(^KAkqX_1 zTN}^|)TUg;O=z}?-g7c#cVJk75-@L~Gjz~w?cE^|iRQRPg%5e&F>o*-)pqi2A=P>d z9OW1o!3hEE#FU03Y>kKdgcuki6*h+p`!X}SPhKk4W?-OTWNK(^tdMM;mTF>}oMw__ zVQ86bYH4YhmTF{Rk(iihX_lIjW@ut$ZfuZdm@+w9+*-H8>Wl!3G0S0|<^~C49?zsR z0s_W7u1RMM40;rm8#%pNd(W;rtJ=HfOmE(l1C#fQe`L&>yir2i8stW8mh?zPo6}s3 zQDGbxnHU(11rL4TTWXM$u!fPNsnLkv-5{x910zS%K^dQewlc>$7%#FZa;&@6mA;DOxm0((X4ROXiDRk9LakM9@}^d6&yh!rD(_i zZXJdDB<{%$<%v-jPc6?X9czdgVP%{4t6*)wkSv>Ja9NG zq$4H3%)kK37tw4A3EOxkUP19QPm%!Bg2~?$6_}XiCQB$;h}1jqMDQ>f-4&2H)X~V; zq7cz=P<(QVk^?r0?~`{Z`7kb=%&M%;s6JU&*^%+(9Sx8>w2Ul_ZE9$#7!mk&pp-Nb&|_k;r0BMuS-jERrlqM-DJJ3FsVnAUt`J zY9VWUgaM1gW)8Kf?0iKL0X$9&GY&H~OSCXN*t}Zz03#Cz`{rDIX-38;lRFJ8IaBx! z7C0NU9Aul!uO>SAfq@9u69Z_dg-rfuV8r-jvZ0|WG3`iEJ;m|Ny z+sK(Qdvd;!B;(7;oknu33EOyl4@_QTY#?K298tX12QfBuvW$s>=vx5? zaf=BDy)+FvSMQTGG-wcDVfLOJXrd&3seQrjgl3*FuSU-~5-Scub$y-OWun4ZI(fZ` zIAg@*<0dwi72A0HKIBNQd2rJx=O8HCKDcSL!0|(lM@h;+j_ka(I9JUHJNHT0X%lOpU?nN^e6p(AukW=jN=K%lD3H*N! zprOgTO%)t>HhbJTY>SC{SQ!~b1X~LgIyV{gex8ua*CGF-QT~Ub{HI1R15QqsF;kRy z#9sFRj83%IKo}5ma;TZ2I7lB%`6orV?#aDoO5#v$5Y-RbYfgYoc`*5ynW6;5Mwm{J zDG=Ky|3|hFsuis7M0?F-3v>1QBKC&a~9BKCS(CFSc}ITs`@jr5$6~WK}Ci#{HAitsEGaPF`W zb?GLk+)1Aa(BjS1mMW7WalQ+xq}15opiN-o38Pczhjug2#Lp-#MEa|~)s;^fPAy`u$?{EyKBXtV$t ziUkm#`@j05MotE84u4KU^R%8bOEL%B3TU=xHFh&-YiKzIl{9K*nLg(fv?1ea>hp4oWz& zDnn)e`iR2pH{*pFaKi#tv?YC(fGX^re8MB49@N-00JkX}bwJ~@E?0m?!J1_l!b1_mQO0i_n^eLOQ3+-l(7%zIe+hZP0kdKO=~v zQERrqjwJ3zoihS2gd`jFX9;RNmTELPEST|Gw$b9F;EH68M(a&NCxrDH?WPI;2)1f; z=o6LrlHcg`Ufg0ySEI{ziHMM%Mz<>x6(W6&9xo+kEa`9bZjii@GNZBHzeiF+b!KDm z49SS6GaJM9OIC=^Z;W^>HRJ2w#;7|gpw=}Ar}8i`^fNIq2r*A70F7#b`X9;M3=E** zDbRo{Xlx(?q>zDu0YrmF2n|3zTLuP(Ol}4SK?VkfDsBb_(BQZvR2@hj7J3YslM}tN z>qQtC7+M(^7(n#@|NsAU=rAzMWME(bjn=_vkT|Cb1H*O(1_sceAdH5IUu0llkY-?D zfYBgv0Tl*@Hw+96pfOq)4H5^L&&0^U0CEx&BSSry0g?b|kY;3H5C$0rr9t8#i}e^G z7Q<+eI7qz*BLjmd0|Nt$28n~z$1*}31fxOXAoUecb7A!V|Mma>gEVwAGJwZ!U^Gm_ za;OC`8YB)zgC;RS z;tWQ^#8)#xq5wvN#6fXxh%+!Sz-W*-$U&*h5C_3%khp+KJp)4< zGXsML1A>M~FfgoRW?)ceK+rJpi%|728Ycb|svbte#AR3*7}OaU7+^F^+?j=eK?yWA z4`whhfI|Z0gH#qs_Jh$d4b3c&kbu!3agfiZu`n>mGB7Z}XplI_oXsqd5QWh&@yk$i zKy*DQyMZJ?20dee#0`vwiT`4O1Rab9iGwVVW`#HiMuWsb=IF69Fvv47Fu-V#I7q!O zR6UIT56XWa4GB;UFdC+zo)zLC7!48!shtdQ&tqe0>z z^$%I=Awl$z6+*)#7}+3E0Ha~z@@$Y?0HZR~iU9ArTl8v_HV@P^Sa@g}Hv zQ$3UclUT>bzyPWOU^Gnp5>y;U!^FR_K@us9hKWnEGcag@rm~)q!z9w6 z3Sl%%d;&WIgDPmU6-vXzx3WWg2BTr(51|&oXqY$)2P7(CG)SCNg`u87f&=0r7!8sD z1)Vii1B?cVgYvyUR6UFaiG$P^LDj=(kT^(v8&o}v28n~zuLP^FXJCLZAlVP3VJ}nz zjD~4=0M@|30HZR~iU9AtqwC&b4v8YB)a4(4KD;ALQ7fYC7V1gJQShKc8LF@Wd!VKhvc_mpjCVFbQU8!wE*i z#6`FvWjKt6iCc3+>JID47lYk7pFx^yI{K3pLpnrqa>{b`ax&9WOEPm)^-}Xv^a@Nj ZuMFwsoV++vkgcF7w7_+$hMn`5EJWPDbI$%eXV^9~wF^h%g8+FflMN z2rx(*ERo_~%gA(qp@Bhwfq|ic;Y5RCXnYPIgAY3c6gaT)tSmU()WF~X6<}a6O7M6p z7s9*Llr9b$|JrhpZIh?D~p>bv~3I5`+t z8JNJPF)1*&g{<7<%3z=hQUJ!121}HtdonROFe16>>fQoD1|Lqa0tN;Kkekw$O=@5W zU{GLSVPIfT0J*8vYz{NSWn?#LT{@`7Fhc^St%0XrBIskEF(cyz1_uTXkUgMlGTA1ItIDW)?qEkJgYs8pf|gAX4}5y(vroU0iUs$e2uH+lE5 zvM{hPFe-pV8S5u7I%^dl(qovR1mc4*D4>1@w=o`20fie#1mvby%e0s|8sHuVxhY4K zO_IS!1grw&bT*!qQQems7*-&MfoNR91jYj{$VO=$QRC%k-~tDEJp%(c%uUkRWEgzJ zkj?t?W)oT*i)JSVFdj%jHp}bYg$Cw^lkhM;(V#f>T(2W5!(>*NS)in_&zYHzL5&+G z1akdTb2d>1pL!H40}L!M)a6TbU>Fy{W5dI+48^$j5o!RS$O|yYz=S}?oiXNU=4VhtQKl~C zgRZQ^>IC*Aw$DSWYg-eaK6MVY)9O7{h&y%dxmmP!z*`=2}x2 zr5O&Cz(TQ}0c7!`;DtPQ7#Nlr!<`K(_IGe)KVV}pn2QoN5n%$X77PuX@N58z@~69) zG!HN`_+-HKgQ8qgOoN-F0iF{;RmoII?z1R~x1P~?m8IA$ehvnBxe1Ew1v&|iz)J^EV$|B&dW;Fx)~UJN6IoFmx9YRlxd(h`#pTmSOj1u61;F(R*j@)V zj~oGwH7bk@6Ihu*)hz?V<|V=&%#3c6--@*v7%Lc=8X1`=q!^o-r6pRJq#38BBqybq zBpDi5n3$&~rI@B9nVK6VrWzV08k?jgO62?5I zlTLUfGzBFlZ_Y0ERMswDEUjuVnOou`WA@~~6595lqFKAeEmG0q6xYMRFpdeE5p@O_76*dvda*3FGF;GbD|K zd^|N01yUs0IwTAW?lG_&Twpl)j-)+f=VTcvbH5pL13oA$XclBWSj^rwaTShTKn^Ps48EzR(X?&6+c~7BX8&9EvEhxYg4H>}g67DWX z4u;7KrPNqjoi{K}mX{V|VrrlKUrLJc)8v0r(u|M51-x^0KoTw@ltGtHqI^D3Qm}VQ^#eURe#sn8{*tDU4qy+spe+J}D>4_;&Jrxp*)? z8O-*W{90a&@$2Mc3f_}_6(m^RItAoTmX&jvyjDR1EDTbnrRdMn;>K7qxmM9q{4P`L zK^Er2ypD`)n|Tfi^hk&}a5OPan|xMLTxuKl6`g~HrwkqRhrB(3H^d zb@D$&361>@XN7d61eh5ZK#4n=O(9|%&%`4rA;6O)z|=6=PDz33uBs>F zCjp%U54b1aQY~bSi!fke*c_lXm7T9JB7nzTTm!gr8O(GV0a+Ew$>a;y>CczicZ{$k{(e;TCnnO+7$B@0yL zSEq)U1P37AAnhkw<58p|J>K>Et$JO+K(0Ig(2fZW_fLWSYF!ID@TC&Crae zZ?d+Dob;MFfhKLWJqee%*-j)Z6`65F@mimRgab!YqwC}p69v&X0uJI96ApT58g#DS zCu?ZXAi%<`J$a^ylK83i1-lcPdBVIJJ?BWQI0)7Cbn+z=70FbgMyZ$vHi*9JW~&>5 z%*-5)1{=aA^PAdO7Hs43dypf!=DXO{5dWJ4d{E@DRj@&lVbfX0yN7KbFxz3GfLxP?oMMMx3;4em z@c&wXj-E(!GcjcLgyxf-p>WX zru^MQIT&~HRWl`Vs5Xe|3+>fAz@}W7%xA7B0kILL6Qm7d`($@>Bpabx!TNTzS5Iy* zSFcZEpYVcx_KNl@G_=4`a0=tkXAL(PRSJ~Ue;rhNwNMTjWdv|Nna!NUKH)&;oPzeb zXl5*MG@QZ=*L^2Zz3LE(85sdW5Z!@L?VR&xuy24HvcNHL3M-oSXN)>Wl5`*r$p{c+ z4QXIxl?gC3X1Fq&c_sVS0B$InTy7!Dm@#>(g<*ZkAq_X49t%)fD|@8D*V!s-Xz)mZ z<=}x=oeYvt<|j_Zolix}jvVM{7GPoK)@e$GGPQMDp3aH5A;_%E=OnJi25vk&pir<9J!sKv0xcxg zgm6xRst9rQ&@<$PX%1ydgA^lc!UShPRm8M5Ld~d}$|eP|g*!|v6RI#GafzTBB#)=K zE`+L_HMJG0vO7#D8>%vDa+>YW`lUSrJ&FRgMo9?_iX3d$P6-^8XnLp)Ez!322=*u{ z)WalCD;!i^x(Osqr~Os1wiI9D^E@Fj?BZceEH9 zErv#mp`lz1g}MK$-)iJ!(B|;xBs6F1Izzqvn0hjn$0;;fcvVdnoeW3ya1E|Yw#3!KC!n~TNqv2)) z*FxSC4YwNj*7I^C-fj@u%WF}1r$O`%Z^p-a4H6xEGXh^VNbl!6QTVn&?j9eU{BrqTSq z;0$N^Mymz&LOVX|G}^WZzc99JwErW*aV4+O@vgW=OJ}3=QV9=}?nc)`5*c558r`o; zbhPv}dcBi4;nLse_e+9D)iGe|g zd5VG#1EgDH#?8P08g=ktU;qyYM=(sb^5!y32elVLok9i%1{-b$20;b}248Ll1|bFp z2KXonR34->eR8RHcD)<}14AnV0|SWu|NsAgP89}*#S9D#psqWNhKcWGU|^7DU|@jJ zAaMZ|28J693=E*rM;Hwf=g?tbcn7r^MuWsb>e(3?7=#%Z7}yyZ>cI?<1ju43Mu^2Q z8YB+VV8Y12Aj-hN0HZ9ADh{Jz;t^1DU^GY^WKJ4XeLai;k^mW0#{@|& zFd8O4oe2^JFd8HdisLm*3=E143=A+DBn~p?EE59*Xh0oCgTz4=KV$+;DKIcV>3Rl` z1V{rTGbB;KXplHag9tMNgBk+^1B?cVgVb9wGcbrVFfhPqkT}Refy@vG!Dx^;NPQ7A z1A_(w0|ShPh}SbP%wlF>P-Z~T5D5l`y-*D>8Ycb}svbte#5q_P7}P=YGf)~PuFt~2 z0Gg73(J=8q76t}6(5M-h!N33yiF_7FNWf^229VF1SQr>&85kH~G)Nrev-vELAcfH| z@%>P9U^GY^WX>fPNYsGndQcp}B%ZNAf(}N5#6cFYvqBsMqe0>zgQQp)800}^1(XJf zgVb9>)x&6zI7qz@R2_`|56XWa4cV*^7r|(lhI*(57!48!shv)C9IR6uF29?F18?1S=QG)(*v8zhm!XqY%FI|G9j0|Nt$hKcL5L)61) zn0OFWJ&dl0Nz|}2Fo0$XVKhv9Av?rpFd8O)7HR>EhKYY*heRce28n~RF$)L8K`r!a!udGf)jM8m8d^SOWtCj0TB=EMVq@cmPIcE_b3#HCMuWsb>TRInU>cPF zVG@a)3=GU^GnpB-8^i8YX@XY7UHsi9hFp6xGkU q#32lr1Rpn~lHi*x8tTq@8`ALFuQxe0v|}@4STE=1 MethodEmitContext<'a> { ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments), ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments), ast::Instruction::BarWarp { .. } => self.emit_bar_warp(), - ast::Instruction::PrmtSlow { .. } => { - Err(error_todo_msg("PrmtSlow is not implemented yet")) - } ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments), ast::Instruction::Membar { data } => self.emit_membar(data), ast::Instruction::Trap {} => self.emit_trap(), @@ -534,7 +531,8 @@ impl<'a> MethodEmitContext<'a> { | ast::Instruction::Nanosleep { .. } | ast::Instruction::ReduxSync { .. } | ast::Instruction::LdMatrix { .. } - | ast::Instruction::Mma { .. } => return Err(error_unreachable()), + | ast::Instruction::Mma { .. } + | ast::Instruction::PrmtSlow { .. } => return Err(error_unreachable()), } } diff --git a/ptx/src/pass/replace_instructions_with_functions.rs b/ptx/src/pass/replace_instructions_with_functions.rs index f7c976e..2304538 100644 --- a/ptx/src/pass/replace_instructions_with_functions.rs +++ b/ptx/src/pass/replace_instructions_with_functions.rs @@ -519,6 +519,9 @@ fn run_instruction<'input>( i, )? } + i @ ptx_parser::Instruction::PrmtSlow { .. } => { + to_call(resolver, fn_declarations, "prmt_b32".into(), i)? + } i => i, }) } diff --git a/ptx/src/test/ll/prmt_slow.ll b/ptx/src/test/ll/prmt_slow.ll new file mode 100644 index 0000000..3943afa --- /dev/null +++ b/ptx/src/test/ll/prmt_slow.ll @@ -0,0 +1,46 @@ +declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0 + +define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"38" + +"38": ; preds = %1 + %"46" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"46", ptr addrspace(5) %"41", align 8 + %"47" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"47", ptr addrspace(5) %"42", align 8 + %"49" = load i64, ptr addrspace(5) %"41", align 8 + %"60" = inttoptr i64 %"49" to ptr + %"48" = load i32, ptr %"60", align 4 + store i32 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 8 + %"61" = inttoptr i64 %"50" to ptr + %"35" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load i32, ptr %"35", align 4 + store i32 %"51", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 8 + %"62" = inttoptr i64 %"52" to ptr + %"37" = getelementptr inbounds i8, ptr %"62", i64 8 + %"53" = load i32, ptr %"37", align 4 + store i32 %"53", ptr addrspace(5) %"45", align 4 + %"55" = load i32, ptr addrspace(5) %"43", align 4 + %"56" = load i32, ptr addrspace(5) %"44", align 4 + %"57" = load i32, ptr addrspace(5) %"45", align 4 + %"54" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57") + store i32 %"54", ptr addrspace(5) %"44", align 4 + %"58" = load i64, ptr addrspace(5) %"42", align 8 + %"59" = load i32, ptr addrspace(5) %"44", align 4 + %"63" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"63", align 4 + ret void +} + +attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index 46bdd0b..5d2fc85 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -274,6 +274,11 @@ test_ptx!(const_ident, [0u16], [0u64, 0u64]); test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]); test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]); test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]); +test_ptx!( + prmt_slow, + [0x70c507d6u32, 0x6fbd4b5cu32, 30212], + [0x6fbdd65cu32] +); test_ptx!(activemask, [0u32], [1u32]); test_ptx!(membar, [152731u32], [152731u32]); test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]); diff --git a/ptx/src/test/spirv_run/prmt_slow.ptx b/ptx/src/test/spirv_run/prmt_slow.ptx new file mode 100644 index 0000000..08668ae --- /dev/null +++ b/ptx/src/test/spirv_run/prmt_slow.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry prmt_slow( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 temp1; + .reg .u32 temp2; + .reg .u32 temp3; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp1, [in_addr]; + ld.u32 temp2, [in_addr+4]; + ld.u32 temp3, [in_addr+8]; + prmt.b32 temp2, temp1, temp2, temp3; + st.u32 [out_addr], temp2; + ret; +}