From ddc00895a72c756dd47d3adab748271bf7dfde52 Mon Sep 17 00:00:00 2001 From: Violet Date: Sat, 20 Sep 2025 00:43:15 +0000 Subject: [PATCH] Handle sign extend bit correctly Passes the prmt default mode ptx_tests (other modes are still unimplemented). --- ptx/lib/zluda_ptx_impl.bc | Bin 24824 -> 24796 bytes ptx/lib/zluda_ptx_impl.cpp | 46 ++++++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 0eef0b42d9d7e1ff94779fd6bf7ebdd51498f91c..b89e0d120aa69ba530b9a6a739cdbc6d1dcf250e 100644 GIT binary patch delta 8022 zcmexyknzqz#tABnTPLb!Gx|)ND9`Ayaibt3I|DAI3{p2fPkk21B2~kSaQ!CpsngD%?v@=;Ayq&2xn1A#WQe zvrGi@W?n8Pruv754h$j;0t`$H3=9Gck_JnpxYsf=9bjl+5MW?nXka+epcopT!^hym z4zeGF9oTqQ794JBU~mBOz}P6kM?27#SECI2w2) zf?jrrF&>x#RsbSW4os-;^3&qvU|?ln0-MI9z}yzHa+52AfhtG=7)u%~QJU__#N@z; zgMlGTA1ItIDW)?qEkJgY zs8pf|gAX4}5y(vroU0iUs$e2uH+lE5vM{hPFe-pV8S5u7I%^dl(qovR1mc4*D4>1@ zw=o`20fie#1mvby%e0s|8sHuVxhY4KO_IS!1grw&bT*!qQQems7*-&MfoNR91jYj{ z$VO=$QRC%k-~tDEJp%(c%uUkRWEgzJkj?t?W)oT*i)JSVFdj%jHp}bYg$Cw^lkhM; z(V#f>T(2W5!(>*NS)in_&zYHzK@F0IL9xT!7V^}bO_af>9>vN40}Bjw`4Sx%#zpYh z@GvYxG48#DkAwgNJZ3?G?_%`AieZKk%pIU`{hPwTCePp_1ru|qXX9D<&uGR0295?^ zm>?)@H%SJ_3oyvQgh0lfG3IFIXHY{?rY_}!uB^o5haJNV6PQkrtxq!zY8@B~U_zkO zlXUgLZ-#_R4D~QUqXdtw>lnW>G9Ew<-dFL>@0lAsnPI9x!F!(VGCKz46P-l|*wPso zCL^1uRnB+g0S`mMCz$&{DSYW=M#%+?3_gi)K~Ngs&v$}>mmxt2)y(pG1GY>ChRg7H z110rUN0?1Lm>FgqgBt*fk$kxxM=6E_E^Kg7NrNSmzB=$;VqjQ?$eAY^6h-&5r!HV) zFqjJW6Dax)*(ETWGBm8@f$KL)@VHw4>!3sggTYJ`r4Q^Em?$u4A(vbe7Z|XKG5Dmx zHG_iPSVRM!k)ZjgTBE0tm0_|ZTwTh638%zay_lI6Fd-FRQ*F4C6BrMqu)*UWlqgnt zHlk;H z0!p)9`&y4Nq1t-2o_iuIs^hdI#Lqq8LoQX7jS@UcpEF54VH5z@KVW+u*gSFsG z#_Y*|CA6(Tg|BvtTco1JDXxcsVH^`SC3=9ep0VJe4}41vk`gvBax^s>@w*!&HEdzz zXgVn4bI?}iSO??7$#Ie`<5$-NW4u;7KrPO44 zoi{KtBq}g4ure?(2(vh{HA@(3JW^me=pfWS`M;DXJDAM`X0w6WpC6L870!;}9t&>A!f*2JiFPAA~a|FqSO_r0L!?EvdwFZd6O*^gsWX0^d|%F- z@x)|td4zbiJVN}oyc*-*$;T8xdhHd|SiU+19tnjafu;mTw!#CVPKk^S z2>t_BC!?f>6O-drB^d=KSE^d4l_rQB$#7!mk&pp}pyVCKB9Xw|%NTEL*4LlS$lfZ`q9Eb$bn-<56SfnM98HWR zlerDG1)#zP(CAH>>}IF~S6gYQz^({NpaPbY7Z_?n^rcKbZ>Uxu2ojc(NyP9%#p$&K1*$KiIJXu8^}u$l0`8E8G6k!h6W7+EX>_v zEe|@L=y`l;Vl;}-WNj|wU{+>h6KG0Ecsu!`ksNEpHXh#%lfM{QML=}RZjRZIVF%OA zE#C5=lP+}BaKI=h+`JF*olOtB0R@DC3++z95|X9 zr%rA&*5m_QlOwq#;igf{L8i%jjWg`q)C|pd`jSDpK$#62_B z1)8+g_9R^5W&;^J0#K+l_XPPKU+-hI2 zJE55;%&XCJj>L+CP+hD-lP{YnOSTF%O2ss=K{Qr3Tip<3X6A4-*bp_D-_%C2U>lF$ zgB-~<2W}eWG=hr912>HpI6lace4tRUji*$=VsnnE5;MCiD6~DYCQr4{uII4?X|Fh> z;l=}DRz1?->ui-ZGyo;v1HU>MB%w?uZpNKYMazyHn9wZ1!tAZnlnP~L>$E(b6LCY3 z*_qEtfM?AnjyX`K&Sn7^bIs-s7*l7<1Q>J8mimYVFvVLFV9YgJ1D3!c45?5m;n2NJJ$H67OM1+~ML^8+uE~t`L zV}FA-fsH4il2*d3$sEDmJrWYOplo*3IM|>qU?)u0E|?W+$6udjr~#~I=M14vykT<; zYE05*Y5U~S#5|gq1$fp>4t4)uf7Qszpv~dWNoe}ecV`sf=a zCD?+p(6sYD2PK?Xm7%hYzM?Su&3Iu3+^~RUljP45P=ymG3wWl~YeSl+I-o%_P#48S zqy3PEgM-74CJhINIZhf5eS!^4A{q`8m{?R+I5;?TaIvtkv}iau{Ap)mVR6xL_}BbN zo`Hcu-L)#XMBJq+6ht_0X!k5{Tw=~r?YK#qjUCi%-RtSx2@!-_PWruki+1W1h&KC1A&yXy1+HjfoR{I=*H;#>QhhK6Y z%wu3+C{$oz01Z+Y@d+rkFz?~%m~g9sYZLE@3AY>g4)Agm-f0jz$7|7ew?XtRZwBLw z28o$`GZJ4nNMGPP(fF=G?mZvJruPj>?ff1}zZz6`)bnQ;{cTWx$=~6`(Wp5~U_}sj zqxNZm8%&aodNTzjE=x5U9}*W|27+M(^7(g^= z6qZwkfnhNN0|TfJ2cu!)dl?uQq!}0(U^GZvK!t(f1_J{FXtWAOgTy&>7#Q9$Fff4n z`!E_L4pPs~$iN`Xz`(%H$WRYvfFwW`OEE$$hS4B#kOmV*1_n_E1_l@n5(laGWP~^f zMuWsb>Oli>APZnLNF1cToRI-Mas{LR|F8f5AEaS2)B+d{)35@n0Y-zwLF$h%GJr?a zU^Gnp4kH7&9}c5I;vkD(LDj=(kT}R3rg|obK}=8@CLzTHF$hM3#6bpWK-Ix$kT}R7 z2dFrVhKWZ&&4JM%agaG_Q1$gN21o*AP#qH_&R{f5d^!^(3Scxy92CcEm>3uo85kH~ zG)Np|&RHe~2G9U3j0TB=EPluY8iHnEfYS90APJBLMrKH&fYBgvkOmQE1_m_-1_l@n z5(lZbVrF0vXJBA}(I9bxOkAIZfdMpx4WnV=fh-IRa-dNzFoS^s9uoO1 zkdT1UAPpd&HL)--$TBc6z-W*-$Y=9eAVCVFVdDFt=D=u>ILMq!ERd)H(ef=N7M zfdn0l28n|#U}uFm2u6d%K?X^&GBALOW*7|;2dTG&s)x}aagcf+sCpO;n$iJhzid{B zi(oWNLp@Xjj0TB=ESLwi07iqvK^AOeh2(D-4H5^bKg$XUAs7u4f6H1ADf!;ALTH!- zHyb1uz-W*-$O08Mh&mVz5(in}#Kyp&!@$4*qhaECP;nRy6Q9M#z@P$3bM;ULOky9D z52In?kJuoI6h_0uS=kvFv=|r|U^GlzpBl!)TB=NPQ7hJ&Xp4gVaxjs)x}aagh3zVDlU^Gnp6VwBrxZEK=gh~8|8U&~R|NjrNK!6)kw!>(c sdUI|_rD8r=G}N8rDH8*O5c3p`$tj_on;FA4ak3Po7UgbU9v{dE02}NH-T(jq delta 8050 zcmca}knzVs#tABnyC?9YSjJ zh4>UUIj2i@aIhY9;b3532$W!8IJNmC<7T${Ka2{DwE{PoHY~Jk{{R2~fBA-BhCd7w z8tgc75^pptaAZ(9z;xjtn+h+B0;AZ1TMjE289y9kOXB_a|NsAbh9$gQ(ku*`EQ}&c z4oc?)+;U*>P~=#kz~oc zY2c9vdV4~Jk#PZo0|O{99T-v$OsGHQyM=>c84FC60&`o)iXcx11`P%QuxSDek_Jm& zP1R*;&}4)uI?QfPo9yO<`@_3<+K^H4H`x9;IeeSy&pF7#I~mqKqIn?RzN0 zFhdN)1>pvsdWoQid}fRXj)1seoN{0SSKS;=js`XcCI^rR6Ua^fngbaOEJ0i_mNZzh zNim11p%d9nQ>8711sHgd-SmTFF+)NTSPO_SO7L*?j$mcD%nagza6QOTWkos)3NjA5;lQVfe51nkEdDF zSyAJ7RS|bH7lRr%NC60goTb*zCd1$Z4`Wbrxs%7E#V|t$qymINK0GQXRlr~n39=s~ z!~k;ssr`m4(5yV#d{7dk4x)wutW4^t1BP)^lOH597{sC|JDG5(f#CpL$SA=hb+Y4i z#sgE3!nJ-vmL9Xg4~B$G$m&dIA7o*3V90@sNg6CMI^XcCf#CsM$SA>M>3m0t8U}+* zxR|8Dl3N!O(3L%1cJMpnfh9<`PFNM%{F0feftdm3e^8R%!FHdWVKN)cv9WGYe_7=$HYVxS~>Y8tbd1vA488<-%-#Lkif zZ0QUPli6W{paj01?*s!cLjo5(*@IM`EOTJXsApi9jB4n(vECsCBTHZd(_WDHFq)X^n6t1 zn#iNWFyjo|4p4A@(g=_eU=U?ssYjHlK0y-9+ze{QaD^ut6hou9FtYDH5lr{7&ckqD zmmL=O-Ccp^zIuZtQNas&ZZI$`L$O$N4Oi*|HU@*aD8cwDG=kZjq2VP;*syME(m24# z;FAG&I4H_Lih3~eF(f#l1kcQk6MRvN(fTF7k{dTQFnFM-JG<#ZjRQjg+(1c#B~0lH zf1+i-tLK=6+0m4pN@twRjFuOUZkceDQGh`PW*jI%T!|_$QD6{Zs7ESvA8u_xFHoCu z6*r;TDtgb!l-+@01xmoYiO$eLv$c1JKqQ*u78O3^dB?!PfK=Pbw}n*eEpU`$U<4-w zuoF`nj<7Wz>JwsMh*a1-TiBPG(S7nuv409Yo=Il}1dMrHlg=0z^e8Gfa(cD)o?UlV zwRg>#-n=OXCNoQXWXziURYKbgWP>(KdZePwX)ea7Fpi5%3=GDChd%HvHAqTW!^qLp zXvFVskkqh&k)!FLjL$(^nPVM{7bizany~F)AaF7xk}-tQOu;x zsS?c!H;tw=e#w!%r{J-Tr%=HWX{$w)so*-lVvcOq^ zEy3XQ291Lqj*Kk|5(y6+&I;*B2{1D-fbv!}n?k}io{3jbB7i4JfN8;GDXQqV9T{Iv-mL71&6??xIaIR5gxD21#1aJ9V*@0@ zO(ry^>oQCbWLBBnp`yolVDfgA7>{@j77Z4rCMG2XF@_Tzj4<_41{%t29x@;YJIdrZ z8}vvhKwMLJK-4Lbu>rwSEp<@)29)d2)Z-AprG`Vj+T;QwJ$*Njmm(yKVhS?!nq>?P8U$FFv&C8- zbUe}X_|n8^6rsu5T*$$!%*H0rl#uXp@>wG})`V?5z6U0MFtQSZ=$73avmwI{ru((n zWCvpjt)ET}F$oTA+Dj7*)-6mpa)7}}fM-o1$1HBK6A4R2c#e5W^hiiJa5OcxPHr;R zez8GLN&hGsnHlR-&enaypozKNK$PP{;qw%VSAOWbTAQ)V1d zyw(RXC3JFvi2~!>$&Dt$jNX%{nkY$JYG1HBp_wPltI>0g#EOGZbzk`>pEpsKEEQ^$ zifLe*aL`NBptHK!>V_aQGl!$WhKR}BrZ$Qd+j#swm`W_3A_Oe47ZaffX#Ul;A&Q@7N15jc-@TrqQ z63YC=$++{WXxWhi9nAtP%-T9lsZgf3PRr9d5jO;xmHC_mcyu;#%z-l3Y!ZMmbvAdv zm}@pqfH8Hp)JH6UDc+I*W9n=TSOOE>8UbVKZ1Y$F6WtcD!jPrdq1~{hn4=vksI%ii zi)C9J$2CRf+kWOOdnU+AEI8Z;DJT-Ix)z)<;$SpCVeo}X|BMii`Uj?Sh93?oOx|Uw z#sZOz4SXqPfM5F0uUF%Rr zNafHpxx>b^UTJEA1SCbS4rzJFcCi5(Nq(vZZah2+EC&xbi5|3QHi4!tolwq6P!(aW z9(snnFwLP%X^`}%6D~Ldsv@qn5o$)$R5mGyE!tsXnNWq1iAw}kAxR?5bs<#cqN%M= zm8-*qvY{#`C))n1U)dwjqbN{ol$5}r$iXIdTHv5W(?fM=rrFse*rTXW50gBja8PyW zCaBy=p9#=xU~0?+Q*oB(IJo?kh%j}QNai@-1yxdN>~GK}un|%?OGH=-vnF!{clSt0 z*n;xWR^woUwt$^5S=(S%s2x9jnxO`KH9ltub>a=1V^BL1H;daRk0$2P#4Nz0Gda-x zfBjJ-CxbSJKPRE-L(iEdnS*TwGzqa9yBV}Kw48!U8a1;_pK}UYwz?WSLX&XVS!feN zBErmA2AU4<@+9^s+UUZRoU=J7;q*iaD*4t&q({+5-zX`;mO+t&E$qC{K?x^TWvJ|5 zA5ob7X1p*1ZdkywNz!KtsKVaKJf11_1`G@g!YqzDps_Ge+tfsZ(Mg1bmur#0sYI3q z4qS^9K<(;920NHFdn6>X8JZFnTyPe;)AV2iqoM`N1?Hv(9%+F_V}|Prx7wK-SwvWz z*qV2&ILIya_ON0m6Bm!WMpZ zZ*C2a79O_7#xllo7Heqp{g6=dhFk#wW>7afx=qnz8;|RR9LWg^H;qgjH>J*KN;$oN z6*gE@sKCHr!oa{_#3!KC!n}`X#)4Z7+?#oCEV$jke~?$A@lJ#Ad0vl+cN@gs@m2`F zXpo%6x1#WMgUm&~`WqA9HOPP9lQ{IgLAirJ!su6n+D`ror@svvulQ#KaWrbp7TA%* z-KcX$;DwN6qy8*GjmJ`rCWi$xKFc;*d=y-ftkGz_N$7;IUZdSK;UB?PjShXH5?}Hg zo!*OEEa_@=*)9XplIk3IoG-1_lPu2n&pciC<)3 zV31~DV1UseaRC(uhBpih44?ro7!48!na{+?z#z%Mz`(@FP!DE+BtROZ85tOaL54wT zkT}Re&z-W*-D2@*>F)%1HFfhPqkT}SkhfI*P2Bqs6KoTH}zcMi} zfCi*tG)NqzL6jMiCSWv39Hd@@nSnu#fq?-=gTz7V{g@dT#2FYEU^GY^ zNL)aro`Io_nSnur0YO6~7#P+uGcYJKAZVEQMW}ih4HN$fRS%Iz##$hK`IL*`@v|KhGrH>NWf^2ILK$ySQr>&85kH~G)Np|&Sn-! zh{9-?_+_X$Ai5ru-9Qo`gPyTK;s!>;#DB3sf(}N5#6cEFvqBsMqe0>zbM#ml7~~lk z7+^F=9HibCsvbsz=6wExG$cSZz-X9;dRB;oU^GY^q<%6~J&Xp4gDlttwE#wg#6jwh zvO=;qj0TB=)IVgchXm0>RtOD~U}S?t0gQ%;%d&A`9_qhaC;I2jn67#J8}G)(*vCj)~E0|Nt$hKYmbwm{V`j0TC< zgDke^g18h$gCszyI+%-rftP`S0Y<~b6QJTS8YZ60#Q>f(h0!qaN-juc2BTr(6S<)E z|3ofGwt`8l;)2w2Fd8Hd3c4LogJ3jB9OTdoT#zOWjE0FnB6Bprzl;JQMCT`6QsXMGE{|#~Hcm`>2=}h(s?G(w$Da+N%$xKTv$;?gF VOU+BsD=^)BHFOi_=IQaSi~y7T65#*< diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 58e8a44..f985ebc 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -781,8 +781,52 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2))); return d_out; } + struct byte4 + { + union + { + uint32_t u32; + uint8_t u8x4[4]; + }; + } __attribute__((aligned(4))); + + struct byte8 + { + union + { + uint32_t u32x2[2]; + uint8_t u8x8[8]; + }; + } __attribute__((aligned(8))); + uint32_t FUNC(prmt_b32)(uint32_t x, uint32_t y, uint32_t s) { - return __byte_perm(x, y, s); + byte4 v_perm_selector; + v_perm_selector.u32 = 0; + + byte8 input; + input.u32x2[0] = x; + input.u32x2[1] = y; + + for (size_t i = 0; i < 4; i++) + { + uint8_t sel = static_cast(s >> (i * 4)); + uint8_t addr = sel & 0x7; + v_perm_selector.u8x4[i] = addr; + } + + byte4 output; + output.u32 = __builtin_amdgcn_perm(input.u32x2[1], input.u32x2[0], v_perm_selector.u32); + + for (size_t i = 0; i < 4; i++) + { + uint8_t sel = static_cast(s >> (i * 4)); + if (sel & 0x8) + { + output.u8x4[i] = (output.u8x4[i] & 0x80) * 0xff; + } + } + + return output.u32; } }