From 93820e3159295b3c07693caffc1ecf3110a7c7d6 Mon Sep 17 00:00:00 2001 From: Violet Date: Tue, 23 Sep 2025 12:31:55 -0700 Subject: [PATCH] Handle PrmtSlow (#518) --- ptx/lib/zluda_ptx_impl.bc | Bin 24456 -> 24896 bytes ptx/lib/zluda_ptx_impl.cpp | 62 ++++++++++++++++++ ptx/src/pass/insert_post_saturation.rs | 1 - .../instruction_mode_to_global_mode/mod.rs | 1 - ptx/src/pass/llvm/emit.rs | 49 +------------- .../replace_instructions_with_functions.rs | 3 + ptx/src/test/ll/prmt.ll | 51 +++++++------- ptx/src/test/ll/prmt_slow.ll | 46 +++++++++++++ ptx/src/test/spirv_run/mod.rs | 5 ++ ptx/src/test/spirv_run/prmt_slow.ptx | 25 +++++++ ptx_parser/src/ast.rs | 9 --- ptx_parser/src/lib.rs | 14 +--- 12 files changed, 172 insertions(+), 94 deletions(-) create mode 100644 ptx/src/test/ll/prmt_slow.ll create mode 100644 ptx/src/test/spirv_run/prmt_slow.ptx diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index bc375c3b14ff5032dff6fc592f683e1b081a93be..afc9c2c502e413f473ecd473e5f781656d4a9e4c 100644 GIT binary patch delta 5420 zcmeC!&v@Vv;{+APa~o9`GcsP-e3Ef9TfK*aEJLlp4W!J~n7K{IcM6o&(oj>Fc*2xg`S&Acqq ze?W>EJfx?XaWDjNFll%+o8MTlqmf~PlYoPx)SiQEk9q(7|Np<9;TppPGo6AD|Hc0Q z|IfUEt%2o>V*)n=2zW{`FxXE1$+5{vV9r4nCf-X9yw?((4zLNgm=!d09+dXcxR;>7 zm3-(E8*`up_hk!y6UT<;M5hE_g?kAKU7Uxdd5*9=NjyRsIh}=0AU5@wvcQc2Vnt*Ob{20Conp%vK7WMFDw2C1nB5hofHL%q2pWf&L_FoO6X zoN{2osWq(b%uEi94GaPd3=9nnCmIw*W4UKAvK(LmsQ_V+t=GhqxY2Cgs~b|xz;Zwe zqy&sp4ovXc*LsWzYAZ+#FIg-IJvy_1X3?ax^hso*W>lEB=X1frIUgr$(YciX>Zygkix0 z29|>h94Akav|_u*z;du*=Hx4qMj}sm9Rzr0%nSISu%KCxnTf&Cz~IPaDXCP(-pLJ8 zB9@O7SPnV}wJS6;<}f5GFfgz(Ffa(SII=ZM7;1p!m~hBV-Y6x_23GNC@;fON#b)L*C6UeAIIbNoa-4P@ira$?*%xuOzlZ$0l*t@Ja z91Sj{Or9sJ31uftJ|}C#cw#cQoI2y*$=~Iy8Ba{kms4l_IyqY2obklu^GM>x@(A(C z@@kBKC)X)}^nRCDWBfWRZnA`ug~$R2o(LW$qq_nUhdLS=TNEN14ysR1QF35hFgZ|3dGZM* z8}3C3BA0nt1$!hUiYD_b%QBiz)>U?7d^2(ZTnNQ9eA zXiV2-m>|gPGTB8%kMY6eVwITs`5G)5EKE&IN(y2OCps8m>Z1%al-W9DKn`(~$#FL5 zkx(cSXi8vYD?A|Tl*rhC;6Gq>GD>PV;mFa%DskkXxBxR-e}j)XOOtU@0)rw4TY>;L zuZiqK4n~O_25=%Vh9nXlP`+eknCz!2S6`YSawNlvp+`an6n2t#7>h&}b21vtQecr} zNjh?X!AU^pzys;lLp%(Uh7L~@SPnArJWAkZNj<_au~~qH+n|G4PlC;ci4h@tK$$58 zF59dx(IUv~!Ur-&f$zWpjwv_W)EUyGnUrVba&RPWoBUc;f-!zFi<%{4(Plffcy_*` zhyWfZh8c&Mnk8Bo9!y@XJB9JhWCy)vj5j7f*K1+oVBegpFUrW?D$}AM;qY{Fr-2C{ zC;%8s_zo608?+o`o6N5!I{ASCp8!;e0W@8tO#Ww}1J`G0sKBlWO4kCGlOqf@UBPCg z@Ev4RGz1lJ+vgvd%@k-cXq zE(eiS&tX8<3o>)oWNjmT(Ke9RA|#7q3NrMXWeg1(1X!56C+8bUGQOSMX%s64k(b>Z zvmwI{CeJNC`Hzu=)?cTFm;?tl?WG9@>lP+}qmM_Yh+`JF*olOtB0R@DC3++z95|X9 zr%tvrUSk9?DOtf)+@;)cNhWU*M8}$VfhKLWJqee%*+AN698tX12ho-~S;j;`^qqi% zxW$BnUYZ7-tM|!59GE>h&_qe%R{Mh83C%oVUX7k}Bvu@R>S7g|+-;)F*gAQ=i8y1_ ztC>pRh2Gugsiy}pQj;t%#YJKC$!&;m!nDU3g#HQaGjDM?cQ$Eg19p&T;G z2;h1$o4JU6;)Tw+4ej&L%vj)PIE5Lm`@ur>Iwl=tGcp2%Ai4vg+Br9@VBZKgWPxMg z6jn6tuN-wwEP@)65g^DK(!j_n6JTh}aAh|0PWEjH+)y-G-$I<-6_m(5vL^dlXxH;t zayS}ns5qqI#sgtiJ<{OoY?U=M02Md~eswZPLYYk5j60u-mK`}Tp;>^1*;}V670S%k zX?Z#);)WozGoOppyn#~%Ois75W&Ls}lPU2G6!W(I|4fU1ET503)N!2@2R2Q8XS zpvA$OP|itE6%no;dWO6(&7n+b(Ci;BI0LF8p|ufeM%Pp}DTpoJVPct3g^`I%1l1t6 zWw_BUt~*mwde zX(h~>%n{t(BOzf6D*28Y2OG2n?1ahM1+zl!`0LXQHGtLZoFUYSH*AhUjY*n(*{*lA z#FGCpT4IfsSRANeQ-~ zX8g4CJ_jY7Se2o&jlQBV`^|V^2HdcKm4(TlC7=o?PCnt0P!DRkx*)atc^McOOf))$ zG@6?<+MP674{7u^X-;4gS>fQ|;LyRv!qTGQ;NZ~0#B!uw!@;46>509@e^Bq<-Ph`l zKtMu)SgUwI730?FfmU}FSdQk3#%*|6;C!I*V5NB6ftRdz<{f-l(kvii&L$scz-Dzv zpnd|Q#2m>fjByTZRg98vBvTe#=5-3VBh|wgw*e%-fl*?P^c2Rp18h}{(r+Ylz|sj5 z*vuQer0yKxJgB6~ZeHNDLA=5`sQ?rh!rCtm%CQ{gO-o=;C^#hjqQu#N<1laH1oi}j zI}2ViO7NKbR5P$$<}FrWKUmT%bEN)YfzyV|yhQ@+$2bql3BS-RUZBph8{{E|LInl} z69xtbBR&D87Un%X9TRRfaBbo}G2wOt-vM5Z!aEH@=Xfm|?>30O<;`Gx(I7FCZ${$l z2I&iYCmP>1$i3&|*z~?Zsh!_L=~sj54*m?IzYXdy`8%9A8Z~DLtO(+6)IKe6qn=5! zQE#T8#AT^Q<3oZGk7XOpKM2kU)@ZcaD71rFuhDj@@C#?FM*Ch-jwktzj_<@ZrgSwr zZ1lMmERn&|*XaI2qGL*bqt}0l6CpDi{kkPNL}xYzO_%hzI{KoJ{ zQXNnKHb&l7fsJ&e@-Q&;Gchnk3NcS9&|zR;0QG9(xfvMP7#J9Q7#J8pK@b5|4WdCK zM+RIFfmCh=20;b}hB9sj1|bFp21$r8156&o&D@;mWz10zk`m&AC;<(Z38*kIXmc@u zhkjr*NSs55fx()KfkA`;LBquTxfmEg$r47x#ABf1Fd8PF$;H6H!@$6h$;D6)X22v$ zxfmEE7#J8}G)%k~>OdF`6JN%~03P;&(I9b76$XYaP;+23NF3yWGhC2xfzki}=l}l? zlDNyoz#z)NzyPCR8a_dN0Ha~z|DguK>HiRO1h^R()EO8UU^GmX85vJ)KFPS5t^N*@5Yo!Uk3b1_nw9X@!XySTZm*FvHZHXiyZjE)QcoBK7&u^}4s0Gd0vc;n7#Sw8 zGHsqEe3jY2Si#8D$jC$?#n{X&Ez!ax%{VP3IVr^?$ z*d#4!@_cb?Mvl$5#S<7+F7PciNJ?14$kEg&#P4p9)Ubh(qv@cGkE5;3u@1(IY>FIg z+>_%a_1ShXax^h+o;*cTSNsv10tcIqr$(YciX>Zygkixw29|>h3@6`^v|_u%z;dvm zbFze#k;oBV2LYZLa|1poENB*FW@2zOFgP$dPAXNp)p-LWL!tr$11kdqgD{IDTeF0r z#sdYGgAPoS?@5UhEzJ%Vc+@)izLY2%n0;xok+ce<+vIR*7dAz9MGiLM$qS`d zvKI+7B^dm5nj9z-$S62@u}mSmAxQF?-ehUn*^FBzZ>h)@hvh+*G7@yKLr1vS>UP64?vvt}zGp>Rg!@Q1+ zY@2xw3G_&aIB+yEPUF6!<5+lN$zk3`cDBtt?SfG8ZId4=N=SWWGWDJyWBjtfS%EFV z;PeKKgB^~HEea9|`zH%4Ik6WuORzK_mYp1!5IK_J#Ly!l0}4FJE0YhXiaK()9^zq;G<0~Rz;cj@=TQPT zOX?AZ&Sn7?Zi5bHJqb1&CPswp0cEBXxNNh&L<`8#Vrqtrag#mNEEx+oH>t(5^A$z} z@HjC{ILy>6(ZX0aEQ-R&tbr-9xSen#R4@P8Ybr(>Fc|J z{2U=!6jP9)*DPab&>+CV>@C*vpyP?2$CoBXqXc;(pKA(aEY4@q-e$w#cO>KMXr+*OcWU3Ol~w0X4IZM z)kI0+RQrP63C%oVUX7k}Bvu@Rs(Z>m`Mim;WU5f3R7?Zggo9q12A$Q-RyPEhnK>K{ zHiS*)HZ5U{ncQG%;vK~Q<^UfQIcybdkYw0&mhtXk+Xu{cm?$9Eq#>u+;nxEG?*;t7 z7NDWY|4kJfH!^!%akj-oJ*&7tNH!q1qs-FSJ+h0Go1QGMBlc1jI&| zPLMW;?US9&k!*x&1?$_@q1)Uo@k+6-(vwvDkX@q%bp3c5(^IZL28MFi>?J{j5rw0PZ)e*(mx}_qyB;EoZ*K<3X^wPs7KutPHWs>xbbmTcexPHb|fl{Hj&-{jR+)@)$q3n#y^vIZBIjjEeYsi_GPkTRt^q~#&o#Rh1O^ief%Dw`C<7Va>yOsK+$#3h1iko=$Gx)7>z*3?$0 z%I+|sY^chp$w{_9>zDQj^e76{8YLw#C~~k}J0);XqUoVJwA|a;BiN&;P!E$lt#D9v z=_aV$QJ)FWGELQ(38vx<&v9_|D-mJhERoD{z6+`()!5&lO<*IWZkC9!5N1v02=4BY zkgx?6ic5`y4cY>B!eniNS)q1(^=XD0@YML6A=HWIZH__hNSG{c-#c2g$$uFw+D41E z(V`8xXbW=xSHIQB$)L^Q&q-)$q3g_&%)zz-TA+M2ax-XaXgLLyG;C&>KIas)U1e(Q z2rbaW&O-YW5)r1xGSE`#CQo9IqKz(0$vK;Y5>8K)ppq|rM0yl`^o^1dY(WL8*m<9W z5>BkjP}!e8qA>f7d0__Juz(esiJv8)3OgtBcqY^rDljk@GcYh1@d+rkFt6t6Xt>$H zwUGBj!>tCs^}HO3w;P1^@>&$$X%M}`oAL2pgG2}4jKEh7();;N6uxbcyT`||=v{-- zH$D%cpAD+Z`7@OMG^pR;?=bq`pxG&~!ilRMq}t&$&9aa8^bS3bzJ=e8>~p>VPI%xVqg$r zo}!?`z`y|NGnjERFt9N&F!(SqFo4uWFj#|{D+~+~3=9nEASM@xU|_J}W?&FxU|{g& zW?&FvU|^7h2s1E1U+uU^GnpB-8^i`u~6X|NlV-U4t3~qhT7Jb1^We rGcYi~XqY%3Hv@wP0|Nu!S&fAc9+OM~HW2h+C=J)Y|jEoEbyc$^5 diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index 6174ec1..f247f45 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -780,4 +780,66 @@ typedef uint32_t ShflSyncResult __attribute__((ext_vector_type(2))); return d_out; } + + struct byte4 + { + union + { + uint32_t u32; + uint8_t u8x4[4]; + }; + } __attribute__((aligned(4))); + + struct byte8 + { + union + { + uint32_t u32x2[2]; + uint8_t u8x8[8]; + }; + } __attribute__((aligned(8))); + + uint32_t FUNC(prmt_b32)(uint32_t x, uint32_t y, uint32_t s) + { + byte4 v_perm_selector; + v_perm_selector.u32 = 0; + + byte8 input; + input.u32x2[0] = x; + input.u32x2[1] = y; + + for (size_t i = 0; i < 4; i++) + { + uint8_t sel = static_cast(s >> (i * 4)); + uint8_t addr = sel & 0x7; + if (sel & 0x8) + { + if (addr % 2 == 1) + { + v_perm_selector.u8x4[i] = 0x8 + addr / 2; + continue; + } + } + v_perm_selector.u8x4[i] = addr; + } + + byte4 output; + output.u32 = __builtin_amdgcn_perm(input.u32x2[1], input.u32x2[0], v_perm_selector.u32); + + for (size_t i = 0; i < 4; i++) + { + uint8_t sel = static_cast(s >> (i * 4)); + uint8_t addr = sel & 0x7; + if (sel & 0x8) + { + if (addr % 2 != 1) + { + output.u8x4[i] = (output.u8x4[i] & 0x80) * 0xff; + continue; + } + } + } + + return output.u32; + } } diff --git a/ptx/src/pass/insert_post_saturation.rs b/ptx/src/pass/insert_post_saturation.rs index 525ae15..620e46b 100644 --- a/ptx/src/pass/insert_post_saturation.rs +++ b/ptx/src/pass/insert_post_saturation.rs @@ -164,7 +164,6 @@ fn run_instruction<'input>( | ast::Instruction::Or { .. } | ast::Instruction::Popc { .. } | ast::Instruction::Prmt { .. } - | ast::Instruction::PrmtSlow { .. } | ast::Instruction::Rcp { .. } | ast::Instruction::Rem { .. } | ast::Instruction::Ret { .. } diff --git a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs index d365e29..12851a6 100644 --- a/ptx/src/pass/instruction_mode_to_global_mode/mod.rs +++ b/ptx/src/pass/instruction_mode_to_global_mode/mod.rs @@ -1818,7 +1818,6 @@ fn get_modes(inst: &ast::Instruction) -> InstructionModes { | ast::Instruction::Mov { .. } | ast::Instruction::Ld { .. } | ast::Instruction::St { .. } - | ast::Instruction::PrmtSlow { .. } | ast::Instruction::Prmt { .. } | ast::Instruction::Activemask { .. } | ast::Instruction::Membar { .. } diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 76717e1..0a68f8b 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -510,10 +510,6 @@ impl<'a> MethodEmitContext<'a> { ast::Instruction::Xor { data, arguments } => self.emit_xor(data, arguments), ast::Instruction::Rem { data, arguments } => self.emit_rem(data, arguments), ast::Instruction::BarWarp { .. } => self.emit_bar_warp(), - ast::Instruction::PrmtSlow { .. } => { - Err(error_todo_msg("PrmtSlow is not implemented yet")) - } - ast::Instruction::Prmt { data, arguments } => self.emit_prmt(data, arguments), ast::Instruction::Membar { data } => self.emit_membar(data), ast::Instruction::Trap {} => self.emit_trap(), ast::Instruction::Tanh { data, arguments } => self.emit_tanh(data, arguments), @@ -533,7 +529,8 @@ impl<'a> MethodEmitContext<'a> { | ast::Instruction::Nanosleep { .. } | ast::Instruction::ReduxSync { .. } | ast::Instruction::LdMatrix { .. } - | ast::Instruction::Mma { .. } => return Err(error_unreachable()), + | ast::Instruction::Mma { .. } + | ast::Instruction::Prmt { .. } => return Err(error_unreachable()), } } @@ -2447,48 +2444,6 @@ impl<'a> MethodEmitContext<'a> { Ok(()) } - fn emit_prmt( - &mut self, - control: u16, - arguments: ptx_parser::PrmtArgs, - ) -> Result<(), TranslateError> { - let components = [ - (control >> 0) & 0b1111, - (control >> 4) & 0b1111, - (control >> 8) & 0b1111, - (control >> 12) & 0b1111, - ]; - if components.iter().any(|&c| c > 7) { - return Err(error_todo()); - } - let u32_type = get_scalar_type(self.context, ast::ScalarType::U32); - let v4u8_type = get_type(self.context, &ast::Type::Vector(4, ast::ScalarType::U8))?; - let mut components = [ - unsafe { LLVMConstInt(u32_type, components[0] as _, 0) }, - unsafe { LLVMConstInt(u32_type, components[1] as _, 0) }, - unsafe { LLVMConstInt(u32_type, components[2] as _, 0) }, - unsafe { LLVMConstInt(u32_type, components[3] as _, 0) }, - ]; - let components_indices = - unsafe { LLVMConstVector(components.as_mut_ptr(), components.len() as u32) }; - let src1 = self.resolver.value(arguments.src1)?; - let src1_vector = - unsafe { LLVMBuildBitCast(self.builder, src1, v4u8_type, LLVM_UNNAMED.as_ptr()) }; - let src2 = self.resolver.value(arguments.src2)?; - let src2_vector = - unsafe { LLVMBuildBitCast(self.builder, src2, v4u8_type, LLVM_UNNAMED.as_ptr()) }; - self.resolver.with_result(arguments.dst, |dst| unsafe { - LLVMBuildShuffleVector( - self.builder, - src1_vector, - src2_vector, - components_indices, - dst, - ) - }); - Ok(()) - } - fn emit_abs( &mut self, data: ast::TypeFtz, diff --git a/ptx/src/pass/replace_instructions_with_functions.rs b/ptx/src/pass/replace_instructions_with_functions.rs index a68008f..2951657 100644 --- a/ptx/src/pass/replace_instructions_with_functions.rs +++ b/ptx/src/pass/replace_instructions_with_functions.rs @@ -519,6 +519,9 @@ fn run_instruction<'input>( i, )? } + i @ ptx_parser::Instruction::Prmt { .. } => { + to_call(resolver, fn_declarations, "prmt_b32".into(), i)? + } i => i, }) } diff --git a/ptx/src/test/ll/prmt.ll b/ptx/src/test/ll/prmt.ll index 7753f5c..933229d 100644 --- a/ptx/src/test/ll/prmt.ll +++ b/ptx/src/test/ll/prmt.ll @@ -1,38 +1,39 @@ -define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"36", ptr addrspace(4) byref(i64) %"37") #0 { - %"38" = alloca i64, align 8, addrspace(5) +declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0 + +define amdgpu_kernel void @prmt(ptr addrspace(4) byref(i64) %"37", ptr addrspace(4) byref(i64) %"38") #1 { %"39" = alloca i64, align 8, addrspace(5) - %"40" = alloca i32, align 4, addrspace(5) + %"40" = alloca i64, align 8, addrspace(5) %"41" = alloca i32, align 4, addrspace(5) + %"42" = alloca i32, align 4, addrspace(5) br label %1 1: ; preds = %0 - br label %"35" + br label %"36" -"35": ; preds = %1 - %"42" = load i64, ptr addrspace(4) %"36", align 8 - store i64 %"42", ptr addrspace(5) %"38", align 8 +"36": ; preds = %1 %"43" = load i64, ptr addrspace(4) %"37", align 8 store i64 %"43", ptr addrspace(5) %"39", align 8 - %"45" = load i64, ptr addrspace(5) %"38", align 8 - %"53" = inttoptr i64 %"45" to ptr - %"44" = load i32, ptr %"53", align 4 - store i32 %"44", ptr addrspace(5) %"40", align 4 - %"46" = load i64, ptr addrspace(5) %"38", align 8 + %"44" = load i64, ptr addrspace(4) %"38", align 8 + store i64 %"44", ptr addrspace(5) %"40", align 8 + %"46" = load i64, ptr addrspace(5) %"39", align 8 %"54" = inttoptr i64 %"46" to ptr - %"34" = getelementptr inbounds i8, ptr %"54", i64 4 - %"47" = load i32, ptr %"34", align 4 - store i32 %"47", ptr addrspace(5) %"41", align 4 - %"49" = load i32, ptr addrspace(5) %"40", align 4 + %"45" = load i32, ptr %"54", align 4 + store i32 %"45", ptr addrspace(5) %"41", align 4 + %"47" = load i64, ptr addrspace(5) %"39", align 8 + %"55" = inttoptr i64 %"47" to ptr + %"34" = getelementptr inbounds i8, ptr %"55", i64 4 + %"48" = load i32, ptr %"34", align 4 + store i32 %"48", ptr addrspace(5) %"42", align 4 %"50" = load i32, ptr addrspace(5) %"41", align 4 - %2 = bitcast i32 %"49" to <4 x i8> - %3 = bitcast i32 %"50" to <4 x i8> - %"55" = shufflevector <4 x i8> %2, <4 x i8> %3, <4 x i32> - store <4 x i8> %"55", ptr addrspace(5) %"41", align 4 - %"51" = load i64, ptr addrspace(5) %"39", align 8 - %"52" = load i32, ptr addrspace(5) %"41", align 4 - %"58" = inttoptr i64 %"51" to ptr - store i32 %"52", ptr %"58", align 4 + %"51" = load i32, ptr addrspace(5) %"42", align 4 + %"56" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"50", i32 %"51", i32 30212) + store i32 %"56", ptr addrspace(5) %"42", align 4 + %"52" = load i64, ptr addrspace(5) %"40", align 8 + %"53" = load i32, ptr addrspace(5) %"42", align 4 + %"59" = inttoptr i64 %"52" to ptr + store i32 %"53", ptr %"59", align 4 ret void } -attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file +attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/ll/prmt_slow.ll b/ptx/src/test/ll/prmt_slow.ll new file mode 100644 index 0000000..f178332 --- /dev/null +++ b/ptx/src/test/ll/prmt_slow.ll @@ -0,0 +1,46 @@ +declare hidden i32 @__zluda_ptx_impl_prmt_b32(i32, i32, i32) #0 + +define amdgpu_kernel void @prmt_slow(ptr addrspace(4) byref(i64) %"39", ptr addrspace(4) byref(i64) %"40") #1 { + %"41" = alloca i64, align 8, addrspace(5) + %"42" = alloca i64, align 8, addrspace(5) + %"43" = alloca i32, align 4, addrspace(5) + %"44" = alloca i32, align 4, addrspace(5) + %"45" = alloca i32, align 4, addrspace(5) + br label %1 + +1: ; preds = %0 + br label %"38" + +"38": ; preds = %1 + %"46" = load i64, ptr addrspace(4) %"39", align 8 + store i64 %"46", ptr addrspace(5) %"41", align 8 + %"47" = load i64, ptr addrspace(4) %"40", align 8 + store i64 %"47", ptr addrspace(5) %"42", align 8 + %"49" = load i64, ptr addrspace(5) %"41", align 8 + %"60" = inttoptr i64 %"49" to ptr + %"48" = load i32, ptr %"60", align 4 + store i32 %"48", ptr addrspace(5) %"43", align 4 + %"50" = load i64, ptr addrspace(5) %"41", align 8 + %"61" = inttoptr i64 %"50" to ptr + %"35" = getelementptr inbounds i8, ptr %"61", i64 4 + %"51" = load i32, ptr %"35", align 4 + store i32 %"51", ptr addrspace(5) %"44", align 4 + %"52" = load i64, ptr addrspace(5) %"41", align 8 + %"62" = inttoptr i64 %"52" to ptr + %"37" = getelementptr inbounds i8, ptr %"62", i64 8 + %"53" = load i32, ptr %"37", align 4 + store i32 %"53", ptr addrspace(5) %"45", align 4 + %"55" = load i32, ptr addrspace(5) %"43", align 4 + %"56" = load i32, ptr addrspace(5) %"44", align 4 + %"57" = load i32, ptr addrspace(5) %"45", align 4 + %"63" = call i32 @__zluda_ptx_impl_prmt_b32(i32 %"55", i32 %"56", i32 %"57") + store i32 %"63", ptr addrspace(5) %"44", align 4 + %"58" = load i64, ptr addrspace(5) %"42", align 8 + %"59" = load i32, ptr addrspace(5) %"44", align 4 + %"67" = inttoptr i64 %"58" to ptr + store i32 %"59", ptr %"67", align 4 + ret void +} + +attributes #0 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="dynamic" "denormal-fp-math-f32"="dynamic" "no-trapping-math"="true" "uniform-work-group-size"="true" } +attributes #1 = { "amdgpu-ieee"="false" "amdgpu-unsafe-fp-atomics"="true" "denormal-fp-math"="preserve-sign" "denormal-fp-math-f32"="preserve-sign" "no-trapping-math"="true" "uniform-work-group-size"="true" } \ No newline at end of file diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index bd5d900..c24ca1a 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -275,6 +275,11 @@ test_ptx!(const_ident, [0u16], [0u64, 0u64]); test_ptx!(cvt_s16_s8, [0x139231C2u32], [0xFFFFFFC2u32]); test_ptx!(cvt_f64_f32, [0.125f32], [0.125f64]); test_ptx!(prmt, [0x70c507d6u32, 0x6fbd4b5cu32], [0x6fbdd65cu32]); +test_ptx!( + prmt_slow, + [0x70c507d6u32, 0x6fbd4b5cu32, 30212], + [0x6fbdd65cu32] +); test_ptx!(activemask, [0u32], [1u32]); test_ptx!(membar, [152731u32], [152731u32]); test_ptx!(shared_unify_extern, [7681u64, 7682u64], [15363u64]); diff --git a/ptx/src/test/spirv_run/prmt_slow.ptx b/ptx/src/test/spirv_run/prmt_slow.ptx new file mode 100644 index 0000000..08668ae --- /dev/null +++ b/ptx/src/test/spirv_run/prmt_slow.ptx @@ -0,0 +1,25 @@ +.version 6.5 +.target sm_30 +.address_size 64 + +.visible .entry prmt_slow( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u32 temp1; + .reg .u32 temp2; + .reg .u32 temp3; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.u32 temp1, [in_addr]; + ld.u32 temp2, [in_addr+4]; + ld.u32 temp3, [in_addr+8]; + prmt.b32 temp2, temp1, temp2, temp3; + st.u32 [out_addr], temp2; + ret; +} diff --git a/ptx_parser/src/ast.rs b/ptx_parser/src/ast.rs index 84d5f57..b1cf959 100644 --- a/ptx_parser/src/ast.rs +++ b/ptx_parser/src/ast.rs @@ -432,15 +432,6 @@ ptx_parser_macros::generate_instruction_type!( }, Prmt { type: Type::Scalar(ScalarType::B32), - data: u16, - arguments: { - dst: T, - src1: T, - src2: T - } - }, - PrmtSlow { - type: Type::Scalar(ScalarType::U32), arguments: { dst: T, src1: T, diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 2c9003b..26ae5e9 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -3671,17 +3671,9 @@ derive_parser!( // prmt.b32{.mode} d, a, b, c; // .mode = { .f4e, .b4e, .rc8, .ecl, .ecr, .rc16 }; prmt.b32 d, a, b, c => { - match c { - ast::ParsedOperand::Imm(ImmediateValue::S64(control)) => ast::Instruction::Prmt { - data: control as u16, - arguments: PrmtArgs { - dst: d, src1: a, src2: b - } - }, - _ => ast::Instruction::PrmtSlow { - arguments: PrmtSlowArgs { - dst: d, src1: a, src2: b, src3: c - } + ast::Instruction::Prmt { + arguments: PrmtArgs { + dst: d, src1: a, src2: b, src3: c } } }