From fcd7a57888cd91abb9849f681c5fe753e8315972 Mon Sep 17 00:00:00 2001 From: NyanCatTW1 <17372086+NyanCatTW1@users.noreply.github.com> Date: Thu, 16 May 2024 06:38:52 +0800 Subject: [PATCH] Fix + improve vprintf implementation (#211) --- ptx/lib/zluda_ptx_impl.bc | Bin 232076 -> 232464 bytes ptx/lib/zluda_ptx_impl.cpp | 56 ++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index 1edcbd5cc43abb641e28b19d37651f905d1e0986..fec881a63b2759bcad291163e4ff3b21393bbd7b 100644 GIT binary patch delta 1748 zcmeB~$~R#K-vkvV*0mee4)HRxonYDgnl~zl>EZd!?g_apjE2oyQ`)ztFmB(P!ZcZ@ zKAw?*L6{{?fSZMZL56{W;fMq0l>h(#|7YKrW5dA67{I|A=#?txz$3ya7{DhQD8S+n zD9D%^z}Xsbf>BjA;1%Oj*?

tkMC;7>~*X^e`@!3Fu;cDihGhs45fC#F#1<@PP5C ztOI{P!xaX>fD4SRvJMPW7zG2ESsXYS7#QjqJ{mAE2r)1)D7l}Q5Xj}i?83;vkdvR6 zUYwC%RKfsq+=IVr91MTf3gkT!596@-$SvJ6YeOgRVV@67TM{pD&TTN`F*(S6p{-H9 zf{BTNL79zJrbR(wMR?r}=XH%NR^9?UAs$QyEV2xW5zImi9EW)wWf~6ZGz+l&WE0A$ z7vg!sS|lKTps`Ovp+KN1!JxBa63a`6LIwwkJdO_xJj$o1XkusMb3!XPbxCqzJ6nf<6k>lVYj z0G<#75XXSs!Q{O=+rRB43@nLaeGaTU7(^kKFL9jUDB{h?Sg*{?_NXVUL6Di9&q;u1 zm1$ssQvml?2%6Cu$&|LBsiNIbVOk@XrrfauQEXZ>DkPYb6&V{0ycaBSY2Yblb7AOA z_|TK^VM4-(DG48DBz%~Y@L@s1hb0LQRwO)Flki|e!h~vYMfKB!cs*@4^Gp;eVPIJ$ zlg7C%k#Et39D^Bo1}m%>oKpEj7zB(I9NIcN9E1-lNFP*CKB%C5P{H`1g7ra#*#{eD zsh+T6V44O~wiBf6fYiYQN(T>U9Xw!k@IbxQ!2?bQA9R6KE9^eVFkAIR8s{dyh7D7h zSs7Y*I3^@2DP3?eIN@e+!o%Q%m%#}igA;xRC(;Z~uxT+n$mvw=$Z1yU@3c|-G~+}v zm~neW&f-nHXUq(4AIM?e3}LWoF(0te`Ml#yvr+#^8@10f&Ll$^E9%cI-o$xE&0zL{ z9Hz|>MjGdqMzMy;JGu^c^vq_9V)!=U#^NOYo)b5crSdr5G4cc@C|EHtNwUZ?EaZ8U zI4zOKENKCgLvzPPHZ2w74xZc{jxX2^W~53m>z!z9*kQVYb%z7Pl5Ry7KaP?atPY19 z*qizO+5GTg_d<~idm{bIt8gvZQ=46L~engcs| z1(~N_NRyN>l3>f?xTsK}B*^P*!j{65;3T=C((s4E1La2k!?tWcB$$5$uw|)SP&lG= zk((K$BE?B^MWRRZh0Yceo=XxF3@31=PFu*+mI!rFR&3MF0RFEqbmvC>9gYr01_mhx z28QT1C5Lw1!-_YLOk1c}kaRkM^~HwN150M#FiK&x%8|5qskV(LN#MniX%BgrN^Te} zX-pB0X0zj{N<8hr1yYcr2-2?0@?t~ki&h5NZ9Lxu795_|D9<$ShEa#h1@36JDI6tC z3=E2f3=GT+3=G^HiVP>%at;bGB(rU|pURXdQ@`mnQ$^N<-8u~`*bba#T9H+-N2g&m z+qu(BH?jiu>NKoldvKbGBfDUqPQymFFQ=I-vLBq#Y1qQH@(fc&cEMSlh8=8w&oIr% zHn^qJu#4^FS*91+3ts3n>}Okcj!7ct!7H7H!))8mF-7EDc&*cLjP39_rjDEiZ*&?? zv0XdIbRtKg{+&+4S+>vTn11AZc(2oNk*)hYlSOXAPo0LVYzxmbW#k_CrPFYoZTESm zj@*DhIt@43E}v&wk$d2;PQz`scjuW-n+abYAOf&K}=;$`QX1j8U Q=|&)M&stKDeYTR7`JatVVbN{ z|Br!zL6{|0M}UEWL4tvS;fMq0l>h(#|7YKrW5dA67{I|A=;h*|$UddSVI}*Na)*QL zA~_C{>>|YuhHO)c95mUc6gyb5O(}5TWSdgy(8w-Q&R}c96u^AMK~R8^RW{%T;}Hi* z0Y*~?hA9jn@X>&QL6CugLCO8Zgg`ErdS+He28QB{{GyVa{JeCqV;=lf<6!u+Rv_<@ zxEY6OqOf$^%m=I)hbIVIGUiHTG>I?ZJ?!^}g@u8EL79zJrbR(wMR?5)=e|Z3D{ldw z5Dz8;7Fh;G31%S%j>Ei;QVj=nngv)Y*@QBLc%HCk35YLf?2}L^5NJv;c-cOwp5>uK zCWC`S9>)U)o@LTk82FlAvYAFOCr(R{SfLQ%Jfl5#fzzj!svB%t67DP7*D@<`6ge<6 zB+g*w*z*1Mhkgadh6#*}MrR}i4o<9Z`zQH3{lol=^=dlM1W6#N>#2pCrT1qbg#P zBYcuP8yO^@DKTmOm-$=#?fgUg<^N^=7yoI3KJV#n1mnW=ynZ^9unQZ{5go^AT+A041zsqeQ0S^(+R!kOk=f9D!qdYJ zB@8V#97YV)nh6bOgi>q*4slFR3Y_XL+j_90k(phQgYD{^GmH@&MT+7OWNqlZfOm?( z8-oqZhxmjh@=o!1Veo=!5&I;$6lPT(Yo0d^8=6y6HnZGzU`@>UWWW=$M)8IEV&ce7*rC&~k!|v6CX4I~yL1}1u$?%~RFR#q zU#DRQ+nh5@GqM%V=rrtNdvS*8MfQYSIt}~Tj-F+b$hmMwr{OT$)w4_yIS20QG#q1l zbe5?jXTm+5hEr@`&oZ6J5vYHm({Pq;(mAFdIS*dyG+bm`b&km*H{i8S!&SBe=a@2b zH@wklxXyO#98*WG!#kaZn{1!XF|Ekm@Ls3kHe2s`rW3ghKXn@JvMoQ)^deW`w@$-- zwu9%HIPy09(P? cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc +// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc // Compile to binary: // /opt/rocm/llvm/bin/clang -x ir -target amdgcn-amd-amdhsa -Xlinker --no-undefined zluda_ptx_impl.bc -mno-wavefrontsize64 -mcpu=gfx1030 // Decompile: @@ -1260,6 +1260,7 @@ extern "C" default: return 0; } + return 2; case 'l': switch (s[1]) { @@ -1289,17 +1290,18 @@ extern "C" case 'X': case 'n': len = 8; - return 2; + break; default: return 0; } + return 3; default: return 0; } + return 2; default: return 0; } - return 1; } __device__ static bool parse_printf_specifier(const char *s, uint8_t &len) @@ -1393,8 +1395,36 @@ extern "C" char c = *(s++); if (c == 0) break; - if (c == '%') + if (c != '%') + continue; + + // %% requires no additional handling + if (*s == '%') { + s++; + continue; + } + + // %s uses __ockl_printf_append_string_n + // https://github.com/ROCm/ROCm-Device-Libs/blob/rocm-5.7.x/ockl/src/services.cl#L343 + if (*s == 's') + { + s++; + const char *value = (const char *)read_valist(valist_ptr, valist_offset, 8); + handle = __ockl_printf_append_string_n(handle, value, strlen_plus_one(value), 0); + continue; + } + + // Keep scanning until we figure out the length of this specifier or if we reach the end of the string + while (*s != 0) { + // "The width is not specified in the format string, but as an additional integer value argument preceding the argument that has to be formatted." + if (*s == '*') { + s++; + uint64_t value = read_valist(valist_ptr, valist_offset, 4); + handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0); + continue; + } + uint8_t len = 0; if (parse_printf_specifier(s, len)) { @@ -1406,16 +1436,22 @@ extern "C" if (specifier_with_length) { s += specifier_with_length; - } - if (len > 0) - { - uint64_t value = read_valist(valist_ptr, valist_offset, len); - handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0); + } else { + // Assume the unknown character is a sub-specifier and move on + s++; + continue; } } + + if (len > 0) + { + uint64_t value = read_valist(valist_ptr, valist_offset, len); + handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0); + } + break; } } - return (uint32_t)__ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1); + __ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1); return 1; }