From fcd7a57888cd91abb9849f681c5fe753e8315972 Mon Sep 17 00:00:00 2001
From: NyanCatTW1 <17372086+NyanCatTW1@users.noreply.github.com>
Date: Thu, 16 May 2024 06:38:52 +0800
Subject: [PATCH] Fix + improve vprintf implementation (#211)
---
ptx/lib/zluda_ptx_impl.bc | Bin 232076 -> 232464 bytes
ptx/lib/zluda_ptx_impl.cpp | 56 ++++++++++++++++++++++++++++++-------
2 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc
index 1edcbd5cc43abb641e28b19d37651f905d1e0986..fec881a63b2759bcad291163e4ff3b21393bbd7b 100644
GIT binary patch
delta 1748
zcmeB~$~R#K-vkvV*0mee4)HRxonYDgnl~zl>EZd!?g_apjE2oyQ`)ztFmB(P!ZcZ@
zKAw?*L6{{?fSZMZL56{W;fMq0l>h(#|7YKrW5dA67{I|A=#?txz$3ya7{DhQD8S+n
zD9D%^z}Xsbf>BjA;1%Oj*?
tkMC;7>~*X^e`@!3Fu;cDihGhs45fC#F#1<@PP5C
ztOI{P!xaX>fD4SRvJMPW7zG2ESsXYS7#QjqJ{mAE2r)1)D7l}Q5Xj}i?83;vkdvR6
zUYwC%RKfsq+=IVr91MTf3gkT!596@-$SvJ6YeOgRVV@67TM{pD&TTN`F*(S6p{-H9
zf{BTNL79zJrbR(wMR?r}=XH%NR^9?UAs$QyEV2xW5zImi9EW)wWf~6ZGz+l&WE0A$
z7vg!sS|lKTps`Ovp+KN1!JxBa63a`6LIwwkJdO_xJj$o1XkusMb3!XPbxCqzJ6nf<6k>lVYj
z0G<#75XXSs!Q{O=+rRB43@nLaeGaTU7(^kKFL9jUDB{h?Sg*{?_NXVUL6Di9&q;u1
zm1$ssQvml?2%6Cu$&|LBsiNIbVOk@XrrfauQEXZ>DkPYb6&V{0ycaBSY2Yblb7AOA
z_|TK^VM4-(DG48DBz%~Y@L@s1hb0LQRwO)Flki|e!h~vYMfKB!cs*@4^Gp;eVPIJ$
zlg7C%k#Et39D^Bo1}m%>oKpEj7zB(I9NIcN9E1-lNFP*CKB%C5P{H`1g7ra#*#{eD
zsh+T6V44O~wiBf6fYiYQN(T>U9Xw!k@IbxQ!2?bQA9R6KE9^eVFkAIR8s{dyh7D7h
zSs7Y*I3^@2DP3?eIN@e+!o%Q%m%#}igA;xRC(;Z~uxT+n$mvw=$Z1yU@3c|-G~+}v
zm~neW&f-nHXUq(4AIM?e3}LWoF(0te`Ml#yvr+#^8@10f&Ll$^E9%cI-o$xE&0zL{
z9Hz|>MjGdqMzMy;JGu^c^vq_9V)!=U#^NOYo)b5crSdr5G4cc@C|EHtNwUZ?EaZ8U
zI4zOKENKCgLvzPPHZ2w74xZc{jxX2^W~53m>z!z9*kQVYb%z7Pl5Ry7KaP?atPY19
z*qizO+5GTg_d<~idm{bIt8gvZQ=46L~engcs|
z1(~N_NRyN>l3>f?xTsK}B*^P*!j{65;3T=C((s4E1La2k!?tWcB$$5$uw|)SP&lG=
zk((K$BE?B^MWRRZh0Yceo=XxF3@31=PFu*+mI!rFR&3MF0RFEqbmvC>9gYr01_mhx
z28QT1C5Lw1!-_YLOk1c}kaRkM^~HwN150M#FiK&x%8|5qskV(LN#MniX%BgrN^Te}
zX-pB0X0zj{N<8hr1yYcr2-2?0@?t~ki&h5NZ9Lxu795_|D9<$ShEa#h1@36JDI6tC
z3=E2f3=GT+3=G^HiVP>%at;bGB(rU|pURXdQ@`mnQ$^N<-8u~`*bba#T9H+-N2g&m
z+qu(BH?jiu>NKoldvKbGBfDUqPQymFFQ=I-vLBq#Y1qQH@(fc&cEMSlh8=8w&oIr%
zHn^qJu#4^FS*91+3ts3n>}Okcj!7ct!7H7H!))8mF-7EDc&*cLjP39_rjDEiZ*&??
zv0XdIbRtKg{+&+4S+>vTn11AZc(2oNk*)hYlSOXAPo0LVYzxmbW#k_CrPFYoZTESm
zj@*DhIt@43E}v&wk$d2;PQz`scjuW-n+abYAOf&K}=;$`QX1j8U
Q=|&)M&stKDeYTR7`JatVVbN{
z|Br!zL6{|0M}UEWL4tvS;fMq0l>h(#|7YKrW5dA67{I|A=;h*|$UddSVI}*Na)*QL
zA~_C{>>|YuhHO)c95mUc6gyb5O(}5TWSdgy(8w-Q&R}c96u^AMK~R8^RW{%T;}Hi*
z0Y*~?hA9jn@X>&QL6CugLCO8Zgg`ErdS+He28QB{{GyVa{JeCqV;=lf<6!u+Rv_<@
zxEY6OqOf$^%m=I)hbIVIGUiHTG>I?ZJ?!^}g@u8EL79zJrbR(wMR?5)=e|Z3D{ldw
z5Dz8;7Fh;G31%S%j>Ei;QVj=nngv)Y*@QBLc%HCk35YLf?2}L^5NJv;c-cOwp5>uK
zCWC`S9>)U)o@LTk82FlAvYAFOCr(R{SfLQ%Jfl5#fzzj!svB%t67DP7*D@<`6ge<6
zB+g*w*z*1Mhkgadh6#*}MrR}i4o<9Z`zQH3{lol=^=dlM1W6#N>#2pCrT1qbg#P
zBYcuP8yO^@DKTmOm-$=#?fgUg<^N^=7yoI3KJV#n1mnW=ynZ^9unQZ{5go^AT+A041zsqeQ0S^(+R!kOk=f9D!qdYJ
zB@8V#97YV)nh6bOgi>q*4slFR3Y_XL+j_90k(phQgYD{^GmH@&MT+7OWNqlZfOm?(
z8-oqZhxmjh@=o!1Veo=!5&I;$6lPT(Yo0d^8=6y6HnZGzU`@>UWWW=$M)8IEV&ce7*rC&~k!|v6CX4I~yL1}1u$?%~RFR#q
zU#DRQ+nh5@GqM%V=rrtNdvS*8MfQYSIt}~Tj-F+b$hmMwr{OT$)w4_yIS20QG#q1l
zbe5?jXTm+5hEr@`&oZ6J5vYHm({Pq;(mAFdIS*dyG+bm`b&km*H{i8S!&SBe=a@2b
zH@wklxXyO#98*WG!#kaZn{1!XF|Ekm@Ls3kHe2s`rW3ghKXn@JvMoQ)^deW`w@$--
zwu9%HIPy09(P? cvt.h && /opt/rocm/llvm/bin/clang -std=c++17 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
+// python3 ./cvt.py > cvt.h && /opt/rocm/llvm/bin/clang -std=c++20 -Xclang -no-opaque-pointers -Wall -Wextra -Wsign-compare -Wconversion -x hip zluda_ptx_impl.cpp -S -emit-llvm --cuda-device-only -nogpulib -O3 -Xclang -fallow-half-arguments-and-returns -o - | sed -e 's/define/define linkonce_odr/g' | sed -e '/@llvm.used/d' | sed -e 's/\"target-cpu\"=\"[^\"]*\"//g' | sed -e 's/\"target-features\"=\"[^\"]*\"//g' | sed -e 's/\"denormal-fp-math-f32\"=\"[^\"]*\"//g' | sed -e 's/!llvm.module.flags = !{!0, !1, !2, !3, !4}/!llvm.module.flags = !{ }/g' | sed -e 's/memory(none)/readnone/g' | sed -e 's/memory(argmem: readwrite, inaccessiblemem: readwrite)/inaccessiblemem_or_argmemonly/g' | sed -e 's/memory(read)/readonly/g' | sed -e 's/memory(argmem: readwrite)/argmemonly/g' | llvm-as-13 -o zluda_ptx_impl.bc && /opt/rocm/llvm/bin/llvm-dis zluda_ptx_impl.bc
// Compile to binary:
// /opt/rocm/llvm/bin/clang -x ir -target amdgcn-amd-amdhsa -Xlinker --no-undefined zluda_ptx_impl.bc -mno-wavefrontsize64 -mcpu=gfx1030
// Decompile:
@@ -1260,6 +1260,7 @@ extern "C"
default:
return 0;
}
+ return 2;
case 'l':
switch (s[1])
{
@@ -1289,17 +1290,18 @@ extern "C"
case 'X':
case 'n':
len = 8;
- return 2;
+ break;
default:
return 0;
}
+ return 3;
default:
return 0;
}
+ return 2;
default:
return 0;
}
- return 1;
}
__device__ static bool parse_printf_specifier(const char *s, uint8_t &len)
@@ -1393,8 +1395,36 @@ extern "C"
char c = *(s++);
if (c == 0)
break;
- if (c == '%')
+ if (c != '%')
+ continue;
+
+ // %% requires no additional handling
+ if (*s == '%')
{
+ s++;
+ continue;
+ }
+
+ // %s uses __ockl_printf_append_string_n
+ // https://github.com/ROCm/ROCm-Device-Libs/blob/rocm-5.7.x/ockl/src/services.cl#L343
+ if (*s == 's')
+ {
+ s++;
+ const char *value = (const char *)read_valist(valist_ptr, valist_offset, 8);
+ handle = __ockl_printf_append_string_n(handle, value, strlen_plus_one(value), 0);
+ continue;
+ }
+
+ // Keep scanning until we figure out the length of this specifier or if we reach the end of the string
+ while (*s != 0) {
+ // "The width is not specified in the format string, but as an additional integer value argument preceding the argument that has to be formatted."
+ if (*s == '*') {
+ s++;
+ uint64_t value = read_valist(valist_ptr, valist_offset, 4);
+ handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0);
+ continue;
+ }
+
uint8_t len = 0;
if (parse_printf_specifier(s, len))
{
@@ -1406,16 +1436,22 @@ extern "C"
if (specifier_with_length)
{
s += specifier_with_length;
- }
- if (len > 0)
- {
- uint64_t value = read_valist(valist_ptr, valist_offset, len);
- handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0);
+ } else {
+ // Assume the unknown character is a sub-specifier and move on
+ s++;
+ continue;
}
}
+
+ if (len > 0)
+ {
+ uint64_t value = read_valist(valist_ptr, valist_offset, len);
+ handle = __ockl_printf_append_args(handle, 1, value, 0, 0, 0, 0, 0, 0, 0);
+ }
+ break;
}
}
- return (uint32_t)__ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1);
+ __ockl_printf_append_args(handle, 0, 0, 0, 0, 0, 0, 0, 0, 1);
return 1;
}