From 5d03261457c6d6ee936ad9e3c4c63e169ce252f3 Mon Sep 17 00:00:00 2001 From: Andrzej Janik Date: Thu, 25 Sep 2025 18:19:10 -0700 Subject: [PATCH] Add atomic loads and stores (#526) And add various smaller fixes across the compiler and runtime --- .github/workflows/pr_master.yml | 20 +--- .github/workflows/push_master.yml | 20 +--- Cargo.lock | 8 +- comgr/src/lib.rs | 6 + cuda_macros/Cargo.toml | 2 +- llvm_zluda/src/lib.cpp | 20 ++++ llvm_zluda/src/lib.rs | 6 + ptx/lib/zluda_ptx_impl.bc | Bin 24896 -> 25144 bytes ptx/lib/zluda_ptx_impl.cpp | 13 +++ ptx/src/pass/llvm/emit.rs | 150 ++++++++++++++++++------- ptx/src/pass/mod.rs | 5 + ptx/src/test/spirv_run/atomics_128.ptx | 24 ++++ ptx/src/test/spirv_run/mod.rs | 6 + ptx_parser/src/lib.rs | 5 +- zluda/Cargo.toml | 2 +- zluda/src/impl/function.rs | 27 +++-- zluda/src/impl/kernel.rs | 4 +- zluda/src/impl/memory.rs | 16 +-- zluda_bindgen/Cargo.toml | 2 +- zluda_common/src/lib.rs | 4 +- zluda_trace/Cargo.toml | 2 +- 21 files changed, 238 insertions(+), 104 deletions(-) create mode 100644 ptx/src/test/spirv_run/atomics_128.ptx diff --git a/.github/workflows/pr_master.yml b/.github/workflows/pr_master.yml index 8787c81..c2ceb6d 100644 --- a/.github/workflows/pr_master.yml +++ b/.github/workflows/pr_master.yml @@ -24,15 +24,7 @@ jobs: name: Build (Linux) runs-on: ubuntu-22.04 steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 with: submodules: true @@ -79,15 +71,7 @@ jobs: outputs: test_package: ${{ steps.upload_artifacts.outputs.artifact-id }} steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/push_master.yml b/.github/workflows/push_master.yml index 378fefe..1fdee90 100644 --- a/.github/workflows/push_master.yml +++ b/.github/workflows/push_master.yml @@ -18,15 +18,7 @@ jobs: permissions: contents: write steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 # fetch-depth and fetch-tags are required to properly tag pre-release builds with: @@ -117,15 +109,7 @@ jobs: outputs: test_package: ${{ steps.upload_artifacts.outputs.artifact-id }} steps: - - uses: jlumbroso/free-disk-space@main - with: - # Removing Android stuff should be enough - android: true - dotnet: false - haskell: false - large-packages: false - docker-images: false - swap-storage: false + - uses: jlumbroso/free-disk-space@v1.3.1 - uses: actions/checkout@v4 with: submodules: true diff --git a/Cargo.lock b/Cargo.lock index cfe4cff..78ed7bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -420,7 +420,7 @@ version = "0.0.0" dependencies = [ "proc-macro2", "quote", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "syn 2.0.89", ] @@ -3706,7 +3706,7 @@ dependencies = [ "paste", "ptx", "ptx_parser", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "serde", "serde_json", "tempfile", @@ -3726,7 +3726,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "syn 2.0.89", ] @@ -3854,7 +3854,7 @@ dependencies = [ "ptx", "ptx_parser", "regex", - "rustc-hash 1.1.0", + "rustc-hash 2.0.0", "unwrap_or", "wchar", "winapi", diff --git a/comgr/src/lib.rs b/comgr/src/lib.rs index 8546203..9e36ab6 100644 --- a/comgr/src/lib.rs +++ b/comgr/src/lib.rs @@ -219,6 +219,12 @@ pub fn compile_bitcode( compile_to_exec.set_isa_name(gcn_arch)?; compile_to_exec.set_language(Language::LlvmIr)?; let common_options = [ + // Uncomment for LLVM debug + //c"-mllvm", + //c"-debug", + // Uncomment to save passes + // c"-mllvm", + // c"-print-before-all", c"-mllvm", c"-ignore-tti-inline-compatible", // c"-mllvm", diff --git a/cuda_macros/Cargo.toml b/cuda_macros/Cargo.toml index cfefc62..aa4e377 100644 --- a/cuda_macros/Cargo.toml +++ b/cuda_macros/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" quote = "1.0" syn = { version = "2.0", features = ["full", "visit-mut", "extra-traits"] } proc-macro2 = "1.0" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" [lib] proc-macro = true diff --git a/llvm_zluda/src/lib.cpp b/llvm_zluda/src/lib.cpp index c8ac2d7..1151330 100644 --- a/llvm_zluda/src/lib.cpp +++ b/llvm_zluda/src/lib.cpp @@ -196,4 +196,24 @@ void LLVMZludaBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering, Name); } +void LLVMZludaSetAtomic( + LLVMValueRef AtomicInst, + LLVMAtomicOrdering Ordering, + char * SSID) +{ + auto inst = unwrap(AtomicInst); + if (LoadInst *LI = dyn_cast(inst)) + { + LI->setAtomic(mapFromLLVMOrdering(Ordering), LI->getContext().getOrInsertSyncScopeID(SSID)); + } + else if (StoreInst *SI = dyn_cast(inst)) + { + SI->setAtomic(mapFromLLVMOrdering(Ordering), SI->getContext().getOrInsertSyncScopeID(SSID)); + } + else + { + llvm_unreachable("Invalid instruction type for LLVMZludaSetAtomic"); + } +} + LLVM_C_EXTERN_C_END \ No newline at end of file diff --git a/llvm_zluda/src/lib.rs b/llvm_zluda/src/lib.rs index 18046a5..37b1d97 100644 --- a/llvm_zluda/src/lib.rs +++ b/llvm_zluda/src/lib.rs @@ -78,4 +78,10 @@ extern "C" { scope: *const i8, Name: *const i8, ) -> LLVMValueRef; + + pub fn LLVMZludaSetAtomic( + AtomicInst: LLVMValueRef, + Ordering: LLVMAtomicOrdering, + SSID: *const i8, + ); } diff --git a/ptx/lib/zluda_ptx_impl.bc b/ptx/lib/zluda_ptx_impl.bc index afc9c2c502e413f473ecd473e5f781656d4a9e4c..fef08533c25992908b3538dc52859f68d213af89 100644 GIT binary patch delta 5772 zcmX?bh;hde#tABn?>DN-aVE1|WqBmkk;Jr+$HlO51>2$-0Wr*h62fe$3Th%(9Yl_` zFw7Q65$^E5y@RvK$VXvOqlOyCT?5At6F85W3$z+Ku4vD!oZ`(Gu6~bVpZb| zg@#569|x^$1+HYtX3m2qo0oBZXJit#-aMZ>oUxvhnVmt6jR6W2nA<|03bXMs`0zkQ z7#!GmR{C(vW=P0_i7*%?cszAJ$HLOU%)qDs5@npg=&WUSNP}U99Ec0T4LlM-6NSte z4_pE9z&Pc=gs$mxI5`;D8JHYEB1{U*^=%=qS^^miY(ac5mNZx*q`8Kvfs+Z$Wnf@9 z(V#fh&r(c)L6Cu^0VK!*a?=H_-3$qnKwL03O7LKfl4N7J%mU^zFfdGDbY6956^{ag z222Fxrk&vuDGUaoFrj(|NrNR-hF;7}2bdZd1Q-|?8bJ14ozdvX#t_U3QwB0CaJGp8 zgAM}+10w?i0|&^go3kVf7z|=ziaX}wYR))*SMrrMGX69p9 z#?8RMP!CoH3idtbY@!T4;$S`l131|K@$*_R%+P}gfvlXHXn>(C#Lxm}99SvHw2u-U z=(et!$z#LApazdvkg~#hNgoLT21&3!sI4BxFRU15n81WUwzj4-u*oy{NW;WH;qPQJ z;{XE(10PHbWSogqfV=>MEKCTb?1BkLGe5&JUYHn2*-9xNm@=^cLFy_@f7mh1FoP)r z+4?rypw@w*1SSLu-lCfielsN8f(scXcwF7a_?40I0w_|z!YKzPXq7a-XKu)32J;~a zNc0EWWp)+^c9;+-LVxn^;IBt>n!ZfKL3W18DC*4lYz!G1ULqS8s?9Db$-u|}&p05* z89hIs&CD?45X=q+kY80~Hyo5?IPeA8eO&99%`BK1X4t{vp8@3Ek7W*Q84L`QCE)r~ z4ou)W!ffQg%rFBk2vRw*(ts_Kf#EV-kRj#3gjZLXO+A zDxU-_K+hmE7a6dLG5DmxodgQ@5)lnv4hC*W8Bou_1ajtIjh;qUhRN`B2=Wk@4l8=L z6piIdPGCHcg5sN9zKxd}7&b5{Fo3eY0?2(^=V5VQObCYiI#>`+o?5$ zQJUdE35vy^!WZ(~VPIHh4EG)=yWil-e!#|HFc&3kV!{PjEf^X&;n@J>O4h?nn)L@5 z8GJI}27t=ZD6vhP49nmt5EMLbbsU5R7&4KSt*R5#;O1a3g^QhNPz?3qK8u#fwdRY> z;^$~U$)(x4289gJya3LBpjg>0W?;#{v;aAgU(M!@lwm;4T3mZs-IkeP|W(ZJxuuiGz`zL-V+Dz^ZH||gWMkYixl-`} zN9FtZj!7_c;K_EDS4&Q}}}z~jWQ;4o9OL<_@*&F^&&FfvMP-k>kT z$oPEnSp!SPDU;a^4Y{5h^hk&}a5OQtOm;FfVthWi)KHai!sOY8`hqQd2iX)2LGjAn z*Cf%xps@M2;Vfp}d2NwXSS))$A}?4cH<=oFPvZZ;07DL21sfz8Hl1aBd)V#+vppsX z$TexmDR%hxfdBUf{(le9(B%K73XTVvJ)bz+VWJ*ZMn)0A)s3jR4Z8DjrQ8fHRkH|P3)66IOm>duR%i#90jK^ z{(RQ(pi!k_k-C$U`X@zsWRwxW^<*}46Z@nGo%1HN&qp(3furFRX1MMr57iGj=^&et z5g-K7oe$N{y>SQoCb%IB90RAYqG^BEsB`8K)R2q-LDrB4Mpl^sLt}<3vzbq_Z!h46 zB8BrD42oO}MYVr_Yd7x+g*y#=CwVy-?=}cs=d}>L*C6tiH=^-=gLp4r z2ji;-shxZ~65lq+e&Bo2_@hCgpI>9s&jyv{{25Ar8q{y_cNqO|(Cidg;l$Oby-VOk z5O1UIYk?n3(v1c?1uZViG@3pVtavQnXgNV>N3d3Xqs>L38_X7s_FIKHTx=R0*NSR9 zscdxalkk|*)989lBEzMx(fzYT2g{5`&ko5IQ)V{$%$Ix-GOICQwWNmV?8cDok{MTL zH-_Jn>|kBl7}+Gf;_3g!=-(>f8YY#8fnhci1A`Frlm|Kt3=E(Wbs;wcxC!XPz`(%B zV9&r10agm3vl$o|61X4&%efgC1Q{3@c5pK=2r)3g8{9B?kgDv>2fY_F)+;hFFdSiI zU;xqo|Ns9lpu)iL6v~IuFmW~}1_n?g5Jtnq<(L@24RaU`66aK5V9|_9!CHFUk~x| z0VYTw!Dx^MkOeoH7#Ktu7#LtQNSs55f#Dq!B*b7eNF1b|pP7L{gn@wpMuWsb7ArF| zFo3$QFd8HdQtw>P%)lVbz`)=Pr9l!P4Pnd-3~CGv3@{ob4$@G=%)kKZw83bQILJYh znIR5>(I9b<`Yp^1;0`H_hKb){W?)bTbqHY$n8bgm1~?5(6Y4Aw4KNy}-iw8SL7jnt z0Y<~b3t1Q#lo%KoU^Gm8G7AF(sMijq>lt7Y8(APB0i!|UAfFv!VE}jWU^GY^qU%9%0h91$gX97j4H5@gkOtKNqe0>z z3tHG17(itcjE0GCfQrLtnD`ww1_n^+2BZIj@*l_`26l)9jD~4YVTU9-7!4D5XJ=s0 zVqjo^(J=8ssCpO;6Q2TA52In?``GIl7*rV;81}J4Xqd!fc8JelG)$bI1L7bU4HGxw zfFv#$4H5@sV>hUJ7!48!c{mfQKC>Rm07-x})Iv4DXplHa!vd%}7!48!sow%s52Hci zAobUw>R~iU9Hjm=R6UHY2T6c5@Nq(X0HZLF((E;xHO2UeB zFN}tX=W;QC2ijmXNF0=?Yq>xku4iC?Fc8^qA{PS#s96f5VH)N^#bGo|d^HyX18BGc zM#IFnb3tlL7!4CY4|OP*2IYU4#0xG+H439a;-Db?1~mvqgTz5T;DgrkFd8N<&kd=x zU^GnJgd5^97!4A)1sUWBH3&w7BtYUp+>o*zM#ID_xgnKG<>bUrcaCq6R?4c)8$xd~ oPQD+;%AJ##mztYcoE@K2GWlGX7=oXkI(c5W*yg_QO`zdc0I-w?+yDRo delta 5592 zcmdmSgz>;3#tABn=QgU!aV85~cMuWgNLA?KbTJh;+VV)MT6`G{3}GS+k_JmIDW)?qEnsY55MW?nXka+e zpeQPpD8k^w2UFz0#`Wf8DctC{_#0BA$0~1~?(_-dm-~{nN804lLQ8q~i9}y52ghBR2bzf>=Sizvc zz{0@5pa8N@G%jHR;{g}AQS~VYCTJZ|0<(!Ws=4w!{8$Z;)5{AtS@gi zH86lxfJDG%WhVwO9!P z7gh{2jKE4jal|7L^f#rRflZ#lM+zngQuohj#sLP7240vL$Z?w_1LOr5WMD!dWoL{z zn)w;jP?V`l`JgK+G5KM~FvA3<6BL_IGYx7T7z)5bVE;=REJ?ch;5S3UC9oKXFiP;) zx{mQHBjW+&;C&U}{GPeNlNqD}gikamPCd_dnVp3Jo;pFnyMuQJy35u|HymVV2u4x% zjn~Goo}u9+vT;|r*d-+y7(XD}+SS9no`)fU39b(0iO!+}Z0QUPlaWo-D(AcLfQKRB z6Uc2~Y?R=!^fII50!9X(M6dve0HyK$d?y%q84`r*Q4K9OV9R7+xD1asQ1)AOgxS=C znPJ8;kX|qb#YnzfkE0aB0T(u~2#An0STgCW1MejUhGmGH3Cf52*;5y=F&IpR`w0|% zhwKuVO&J>MSMq@Lg0WG8$JJj4B_bFMW}+y4V86gbfkBIbg8?MW0rLIC1qN(l3_fXK zeINo9?8YJ*==rExqol@#27%q7!fAGYQfOJ3C{+gg5l{dCd~tk3_clPWgr3+<&t8XIMIsH zY@NxIgoNsC#Wc7%7)(J5!T3aj;#F<#vuMd$D^6?{KL-Q6Kn10OmpTT83}~spO3c8L zfvEwWut0A2=8lwMK+Q|1*08!WGdVCeFw`T8>{#v@j4TI0r6t%QAZK3_Q{qOmb+2wn zF$2p1c&PzOrC$45k1?UzdbOT=A}gxnv?RpOJ>WwtM-@w-Gf6#R6ad#aV0#_dJaPmy zWV9F=Ca{9a(#;2i?O2#PtTsu+Gpc^zTWXM$uz``IsnLkv-5{x93nNF$ z7$34Jad}Y z4+;yK1(}%`91RSPOm31&W$AU^z&P1nT5PhcwDV*QX+g$Elf$HqHb0RTVPl*!`KRIm z#)iqeloS}5H{Vuz!^|2VVZg$$`Hb3BcD|yB03IiX8HbsgC0ZCBY|hs^z{tq4*}y=C zk@4wdUqegAlF99chFnh#dL%>~IGPw!CLc01VthLJtD!2Rz+`D7eZdsIgKUb1ps3^S zYm#VT5ZD}TG>e&cmaFH4!x|lq91XDy43nA6jJ%8Z-#y@iB8RPl4U!C-&N4oAwtLZx z1qI}qG~^UJ{5in?a{~XL188V+y_tgJ&SsB0hix%Y4=W?1h+u1>Lgyxf-p>gKSX28kG56l!L9u5Qyxt2GFOy<*a*`JG6iD$Iwl=t zGcp2%AiDFR+Br9@VBZKgWPxMg6jn6tuN-wwEP@)65g^DK(!j_n6JTh}aAh|0PWEjH z+)$)&o`XS=YoSQjLYa&JgDXsgHk)chtjVG)gAm9Mt>XBU-EZ2aWrbq5?B$$-Kc$9 z;0BXqquxwGiOW)r#)kwW9?LeGe-NAztkGz-QD_IVUVWqORN)uSR*m+(q8v~18y(+? zYfR~CblxW6;nLISdRZcarLWQbg+#}c{zk9=5+_1tH2QT*a){1s44N+Kadl>6=sw8| z*7=R$kEA-D{%wrBtpct&7#J8*c^DY_nHU&^n5PuzFfcHHO4E351_n?S@58_VZp1_| zOy=_8GR$OPU@+i({BMuRkfEQo*_1fxOXAcNAN>R~iU9HhRE3F2oM4HKWv#K0iRz`y{b|NpQ5 z|DQvLfng031A`(1f`(~0%f!F{>V?5*kT}TVhfJV87y|>028n|#W@Khy5C+8ylm>}| z)Qd1PFsLywFu-V#I7qz}sLd?Sz`$U|%)kJmK@uPb1u{b%1fxOXAPq=A3!3=A+D zCO(UqfkBypfdNLt#P>qg!)TcJQ>glS7y~B3!NS0x&cMI`qhaFuEDQ{wek_cJi3hSU zFvu}5Fu-V-cs>gxBw#d19OSVk76t}c1_lNwUC#iL0Qqb_3nWNkG)#Oy)F2oQ5(gP{ zi3JihFd8QQj0F;MFd8HdGM}9l;vg6e66esVXJC+GWnhqJK+qrwkOoVr1{e(z2U+X` zRS%;<;vn_etPlsmXplHaeLYk?j0TB=)XxK%59%|584L{IkN|1e$jSijp2BFDhO?}Y z5Q5P#@wcpyD1gy0ac(w9E`ZS>agg~cY!LMzx*imFAPJC*oY)u`K&1?fhKc7v#bGo| zd=?u61E{Ei(J=9SP;nRy6Mw`8Nu)6PKPdl!EM{eAV9;VH#0PdrT*7FOI4B>pa6r_q@f6^0Y-zwK^mq))xl_xI7t0UsCpO; z5(lY2162>BLE<3w51{H{bUjD{q=A_e;sY2B5(jAz=Y#|)j0TB=)Z0MCVKhuUk&}Tz zn}LA=M#IE=I2jn67#J8}G*rBv;T$Ieg9`%#oQ6s;2ysDDFN}tXYjZJx$K_x&NF0=^ zt+^l`fYC5ql!^C62;`Iy+5C$UqWpXhvfCds^G)zM&7o?Vh(J=8|s6$~i zOney^r1pc+AaPKLZh@Kura}21Bmwfl87@ez4WnV=cex<77L108e**h}fdNLt#Q#Ih wfz$sXm7D-Kq-clHAoaW;i_N(qm5TXf(J*(8r;zrB#^#i;n~a<5Bert_0OX?ORR910 diff --git a/ptx/lib/zluda_ptx_impl.cpp b/ptx/lib/zluda_ptx_impl.cpp index f247f45..c378d78 100644 --- a/ptx/lib/zluda_ptx_impl.cpp +++ b/ptx/lib/zluda_ptx_impl.cpp @@ -17,6 +17,7 @@ typedef _Float16 half16 __attribute__((ext_vector_type(16))); typedef float float8 __attribute__((ext_vector_type(8))); #define FUNC(NAME) __device__ __attribute__((retain)) __zluda_ptx_impl_##NAME +#define FUNC_CALL(NAME) __zluda_ptx_impl_##NAME #define ATTR(NAME) __ZLUDA_PTX_IMPL_ATTRIBUTE_##NAME #define DECLARE_ATTR(TYPE, NAME) \ extern "C" __attribute__((constant)) CONSTANT_SPACE TYPE ATTR(NAME) \ @@ -58,6 +59,18 @@ extern "C" return __lane_id(); } + uint32_t FUNC(sreg_lanemask_lt)() + { + uint32_t lane_idx = FUNC_CALL(sreg_laneid)(); + return (1U << lane_idx) - 1U; + } + + uint32_t FUNC(sreg_lanemask_ge)() + { + uint32_t lane_idx = FUNC_CALL(sreg_laneid)(); + return (~0U) << lane_idx; + } + uint32_t __ockl_bfe_u32(uint32_t, uint32_t, uint32_t) __device__; uint32_t FUNC(bfe_u32)(uint32_t base, uint32_t pos_32, uint32_t len_32) { diff --git a/ptx/src/pass/llvm/emit.rs b/ptx/src/pass/llvm/emit.rs index 0a68f8b..2484440 100644 --- a/ptx/src/pass/llvm/emit.rs +++ b/ptx/src/pass/llvm/emit.rs @@ -539,17 +539,25 @@ impl<'a> MethodEmitContext<'a> { data: ast::LdDetails, arguments: ast::LdArgs, ) -> Result<(), TranslateError> { - if data.qualifier != ast::LdStQualifier::Weak { - todo!() - } let builder = self.builder; - let type_ = get_type(self.context, &data.typ)?; - let ptr = self.resolver.value(arguments.src)?; - self.resolver.with_result(arguments.dst, |dst| { - let load = unsafe { LLVMBuildLoad2(builder, type_, ptr, dst) }; - unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) }; - load - }); + let underlying_type = get_type(self.context, &data.typ)?; + let needs_cast = not_supported_by_atomics(data.qualifier, underlying_type); + let op_type = if needs_cast { + unsafe { LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8) } + } else { + underlying_type + }; + let src = self.resolver.value(arguments.src)?; + let load = unsafe { LLVMBuildLoad2(builder, op_type, src, LLVM_UNNAMED.as_ptr()) }; + apply_qualifier(load, data.qualifier)?; + unsafe { LLVMSetAlignment(load, data.typ.layout().align() as u32) }; + if needs_cast { + self.resolver.with_result(arguments.dst, |dst| unsafe { + LLVMBuildBitCast(builder, load, underlying_type, dst) + }); + } else { + self.resolver.register(arguments.dst, load); + } Ok(()) } @@ -758,11 +766,21 @@ impl<'a> MethodEmitContext<'a> { arguments: ast::StArgs, ) -> Result<(), TranslateError> { let ptr = self.resolver.value(arguments.src1)?; - let value = self.resolver.value(arguments.src2)?; - if data.qualifier != ast::LdStQualifier::Weak { - todo!() + let underlying_type = get_type(self.context, &data.typ)?; + let needs_cast = not_supported_by_atomics(data.qualifier, underlying_type); + let mut value = self.resolver.value(arguments.src2)?; + if needs_cast { + value = unsafe { + LLVMBuildBitCast( + self.builder, + value, + LLVMIntTypeInContext(self.context, data.typ.layout().size() as u32 * 8), + LLVM_UNNAMED.as_ptr(), + ) + }; } let store = unsafe { LLVMBuildStore(self.builder, value, ptr) }; + apply_qualifier(store, data.qualifier)?; unsafe { LLVMSetAlignment(store, data.typ.layout().align() as u32); } @@ -1653,25 +1671,23 @@ impl<'a> MethodEmitContext<'a> { .ok_or_else(|| error_mismatched_type())?, ); let src2 = self.resolver.value(src2)?; - self.resolver.with_result(arguments.dst, |dst| { - let vec = unsafe { - LLVMBuildInsertElement( - self.builder, - LLVMGetPoison(dst_type), - llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), - LLVM_UNNAMED.as_ptr(), - ) - }; - unsafe { - LLVMBuildInsertElement( - self.builder, - vec, - llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), - LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), - dst, - ) - } + let vec = unsafe { + LLVMBuildInsertElement( + self.builder, + LLVMGetPoison(dst_type), + llvm_fn(self.builder, src, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 1, false as i32), + LLVM_UNNAMED.as_ptr(), + ) + }; + self.resolver.with_result(arguments.dst, |dst| unsafe { + LLVMBuildInsertElement( + self.builder, + vec, + llvm_fn(self.builder, src2, packed_type, LLVM_UNNAMED.as_ptr()), + LLVMConstInt(LLVMInt32TypeInContext(self.context), 0, false as i32), + dst, + ) }) } else { self.resolver.with_result(arguments.dst, |dst| unsafe { @@ -2197,7 +2213,7 @@ impl<'a> MethodEmitContext<'a> { Some(&ast::ScalarType::F32.into()), vec![( self.resolver.value(arguments.src)?, - get_scalar_type(self.context, ast::ScalarType::F32.into()), + get_scalar_type(self.context, ast::ScalarType::F32), )], )?; Ok(()) @@ -2236,7 +2252,7 @@ impl<'a> MethodEmitContext<'a> { } fn emit_bar_warp(&mut self) -> Result<(), TranslateError> { - self.emit_intrinsic(c"llvm.amdgcn.barrier.warp", None, None, vec![])?; + self.emit_intrinsic(c"llvm.amdgcn.wave.barrier", None, None, vec![])?; Ok(()) } @@ -2658,14 +2674,14 @@ impl<'a> MethodEmitContext<'a> { let load = unsafe { LLVMBuildLoad2(self.builder, from_type, from, LLVM_UNNAMED.as_ptr()) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(load, cp_size.as_u64() as u32); } let extended = unsafe { LLVMBuildZExt(self.builder, load, to_type, LLVM_UNNAMED.as_ptr()) }; - unsafe { LLVMBuildStore(self.builder, extended, to) }; + let store = unsafe { LLVMBuildStore(self.builder, extended, to) }; unsafe { - LLVMSetAlignment(load, (cp_size.as_u64() as u32) * 8); + LLVMSetAlignment(store, cp_size.as_u64() as u32); } Ok(()) } @@ -2923,6 +2939,61 @@ impl<'a> MethodEmitContext<'a> { */ } +fn not_supported_by_atomics(qualifier: ast::LdStQualifier, underlying_type: *mut LLVMType) -> bool { + // This is not meant to be 100% accurate, just a best-effort guess for atomics + fn is_non_scalar_type(type_: LLVMTypeRef) -> bool { + let kind = unsafe { LLVMGetTypeKind(type_) }; + matches!( + kind, + LLVMTypeKind::LLVMArrayTypeKind + | LLVMTypeKind::LLVMVectorTypeKind + | LLVMTypeKind::LLVMStructTypeKind + ) + } + !matches!(qualifier, ast::LdStQualifier::Weak) && is_non_scalar_type(underlying_type) +} + +fn apply_qualifier( + value: LLVMValueRef, + qualifier: ptx_parser::LdStQualifier, +) -> Result<(), TranslateError> { + match qualifier { + ptx_parser::LdStQualifier::Weak => {} + ptx_parser::LdStQualifier::Volatile => unsafe { + LLVMSetVolatile(value, 1); + // The semantics of volatile operations are equivalent to a relaxed memory operation + // with system-scope but with the following extra implementation-specific constraints... + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(ast::MemScope::Sys)?, + ); + }, + ptx_parser::LdStQualifier::Relaxed(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingMonotonic, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Acquire(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingAcquire, + get_scope(mem_scope)?, + ); + }, + ptx_parser::LdStQualifier::Release(mem_scope) => unsafe { + LLVMZludaSetAtomic( + value, + LLVMAtomicOrdering::LLVMAtomicOrderingRelease, + get_scope(mem_scope)?, + ); + }, + } + Ok(()) +} + fn get_pointer_type<'ctx>( context: LLVMContextRef, to_space: ast::StateSpace, @@ -2936,7 +3007,7 @@ fn get_scope(scope: ast::MemScope) -> Result<*const i8, TranslateError> { ast::MemScope::Cta => c"workgroup-one-as", ast::MemScope::Gpu => c"agent-one-as", ast::MemScope::Sys => c"one-as", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } @@ -2945,8 +3016,9 @@ fn get_scope_membar(scope: ast::MemScope) -> Result<*const i8, TranslateError> { Ok(match scope { ast::MemScope::Cta => c"workgroup", ast::MemScope::Gpu => c"agent", + // Don't change to "system", this is the same as __threadfence_system, AMDPGU LLVM expects "" here ast::MemScope::Sys => c"", - ast::MemScope::Cluster => todo!(), + ast::MemScope::Cluster => return Err(error_todo()), } .as_ptr()) } diff --git a/ptx/src/pass/mod.rs b/ptx/src/pass/mod.rs index b14903d..e4b5b27 100644 --- a/ptx/src/pass/mod.rs +++ b/ptx/src/pass/mod.rs @@ -136,6 +136,7 @@ enum PtxSpecialRegister { Nctaid, Clock, LanemaskLt, + LanemaskGe, Laneid, } @@ -148,6 +149,7 @@ impl PtxSpecialRegister { Self::Nctaid => "%nctaid", Self::Clock => "%clock", Self::LanemaskLt => "%lanemask_lt", + Self::LanemaskGe => "%lanemask_ge", Self::Laneid => "%laneid", } } @@ -170,6 +172,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => ast::ScalarType::U32, PtxSpecialRegister::Clock => ast::ScalarType::U32, PtxSpecialRegister::LanemaskLt => ast::ScalarType::U32, + PtxSpecialRegister::LanemaskGe => ast::ScalarType::U32, PtxSpecialRegister::Laneid => ast::ScalarType::U32, } } @@ -182,6 +185,7 @@ impl PtxSpecialRegister { | PtxSpecialRegister::Nctaid => Some(ast::ScalarType::U8), PtxSpecialRegister::Clock | PtxSpecialRegister::LanemaskLt + | PtxSpecialRegister::LanemaskGe | PtxSpecialRegister::Laneid => None, } } @@ -194,6 +198,7 @@ impl PtxSpecialRegister { PtxSpecialRegister::Nctaid => "sreg_nctaid", PtxSpecialRegister::Clock => "sreg_clock", PtxSpecialRegister::LanemaskLt => "sreg_lanemask_lt", + PtxSpecialRegister::LanemaskGe => "sreg_lanemask_ge", PtxSpecialRegister::Laneid => "sreg_laneid", } } diff --git a/ptx/src/test/spirv_run/atomics_128.ptx b/ptx/src/test/spirv_run/atomics_128.ptx new file mode 100644 index 0000000..147d350 --- /dev/null +++ b/ptx/src/test/spirv_run/atomics_128.ptx @@ -0,0 +1,24 @@ +.version 7.0 +.target sm_80 +.address_size 64 + +.visible .entry atomics_128( + .param .u64 input, + .param .u64 output +) +{ + .reg .u64 in_addr; + .reg .u64 out_addr; + .reg .u64 temp1; + .reg .u64 temp2; + + ld.param.u64 in_addr, [input]; + ld.param.u64 out_addr, [output]; + + ld.acquire.gpu.v2.u64 {temp1, temp2}, [in_addr]; + add.u64 temp1, temp1, 1; + add.u64 temp2, temp2, 1; + st.release.gpu.v2.u64 [out_addr], {temp1, temp2}; + + ret; +} diff --git a/ptx/src/test/spirv_run/mod.rs b/ptx/src/test/spirv_run/mod.rs index c24ca1a..a7f1989 100644 --- a/ptx/src/test/spirv_run/mod.rs +++ b/ptx/src/test/spirv_run/mod.rs @@ -352,6 +352,12 @@ test_ptx!( [613065134u32] ); test_ptx!(param_is_addressable, [0xDEAD], [0u64]); +// TODO: re-enable when we have a patched LLVM +//test_ptx!( +// atomics_128, +// [0xce16728dead1ceb0u64, 0xe7728e3c390b7fb7], +// [0xce16728dead1ceb1u64, 0xe7728e3c390b7fb8] +//); test_ptx!(assertfail); // TODO: not yet supported diff --git a/ptx_parser/src/lib.rs b/ptx_parser/src/lib.rs index 26ae5e9..ead37d4 100644 --- a/ptx_parser/src/lib.rs +++ b/ptx_parser/src/lib.rs @@ -226,8 +226,9 @@ fn int_immediate<'a, 'input>(input: &mut PtxParser<'a, 'input>) -> PResult Ok(ast::ImmediateValue::S64(-x)), + let full_number = format!("-{num}"); + match i64::from_str_radix(&full_number, radix) { + Ok(x) => Ok(ast::ImmediateValue::S64(x)), Err(err) => Err((ast::ImmediateValue::S64(0), PtxError::from(err))), } } else if is_unsigned { diff --git a/zluda/Cargo.toml b/zluda/Cargo.toml index d0a65f4..1060e2b 100644 --- a/zluda/Cargo.toml +++ b/zluda/Cargo.toml @@ -22,7 +22,7 @@ num_enum = "0.4" lz4-sys = "1.9" tempfile = "3" paste = "1.0" -rustc-hash = "1.1" +rustc-hash = "2.0.0" zluda_common = { path = "../zluda_common" } blake3 = "1.8.2" serde = "1.0.219" diff --git a/zluda/src/impl/function.rs b/zluda/src/impl/function.rs index 90afb51..ee1b557 100644 --- a/zluda/src/impl/function.rs +++ b/zluda/src/impl/function.rs @@ -1,22 +1,33 @@ +use cuda_types::cuda::CUfunction_attribute; use hip_runtime_sys::*; +use std::mem; pub(crate) fn get_attribute( pi: &mut i32, - cu_attrib: hipFunction_attribute, + cu_attrib: CUfunction_attribute, func: hipFunction_t, ) -> hipError_t { // TODO: implement HIP_FUNC_ATTRIBUTE_PTX_VERSION // TODO: implement HIP_FUNC_ATTRIBUTE_BINARY_VERSION match cu_attrib { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION - | hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { *pi = 120; return Ok(()); } + CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE => { + *pi = 0; + return Ok(()); + } _ => {} } - unsafe { hipFuncGetAttribute(pi, cu_attrib, func) }?; - if cu_attrib == hipFunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS { + unsafe { hipFuncGetAttribute(pi, mem::transmute(cu_attrib), func) }?; + if cu_attrib == CUfunction_attribute::CU_FUNC_ATTRIBUTE_NUM_REGS { *pi = (*pi).max(1); } Ok(()) @@ -55,12 +66,12 @@ pub(crate) fn launch_kernel( pub(crate) unsafe fn set_attribute( func: hipFunction_t, - attribute: hipFunction_attribute, + attribute: CUfunction_attribute, value: i32, ) -> hipError_t { match attribute { - hipFunction_attribute::HIP_FUNC_ATTRIBUTE_PTX_VERSION - | hipFunction_attribute::HIP_FUNC_ATTRIBUTE_BINARY_VERSION => { + CUfunction_attribute::CU_FUNC_ATTRIBUTE_PTX_VERSION + | CUfunction_attribute::CU_FUNC_ATTRIBUTE_BINARY_VERSION => { return hipError_t::ErrorNotSupported; } _ => {} diff --git a/zluda/src/impl/kernel.rs b/zluda/src/impl/kernel.rs index ab45b04..e4c3404 100644 --- a/zluda/src/impl/kernel.rs +++ b/zluda/src/impl/kernel.rs @@ -1,4 +1,4 @@ -use cuda_types::cuda::CUresult; +use cuda_types::cuda::{CUfunction_attribute, CUresult}; use hip_runtime_sys::*; use crate::r#impl::function; @@ -9,7 +9,7 @@ pub(crate) unsafe fn get_function(func: &mut hipFunction_t, kernel: hipFunction_ } pub(crate) unsafe fn set_attribute( - attrib: hipFunction_attribute, + attrib: CUfunction_attribute, val: ::core::ffi::c_int, kernel: hipFunction_t, _dev: hipDevice_t, diff --git a/zluda/src/impl/memory.rs b/zluda/src/impl/memory.rs index 70395ed..4b33460 100644 --- a/zluda/src/impl/memory.rs +++ b/zluda/src/impl/memory.rs @@ -1,16 +1,18 @@ -use std::ptr; - +use crate::r#impl::{context, driver}; use cuda_types::cuda::{CUerror, CUresult, CUresultConsts}; use hip_runtime_sys::*; +use std::{mem, ptr}; -use crate::r#impl::{context, driver}; - -pub(crate) fn alloc_v2(dptr: &mut hipDeviceptr_t, bytesize: usize) -> CUresult { +pub(crate) unsafe fn alloc_v2(dptr: &mut hipDeviceptr_t, bytesize: usize) -> CUresult { let context = context::get_current_context()?; - unsafe { hipMalloc(ptr::from_mut(dptr).cast(), bytesize) }?; + hipMalloc(ptr::from_mut(dptr).cast(), bytesize)?; add_allocation(dptr.0, bytesize, context)?; + let mut status = mem::zeroed(); + hipStreamIsCapturing(hipStream_t(ptr::null_mut()), &mut status)?; // TODO: parametrize for non-Geekbench - unsafe { hipMemsetD8(*dptr, 0, bytesize) }?; + if status != hipStreamCaptureStatus::hipStreamCaptureStatusNone { + hipMemsetD8(*dptr, 0, bytesize)?; + } Ok(()) } diff --git a/zluda_bindgen/Cargo.toml b/zluda_bindgen/Cargo.toml index 5753307..8e7bb4d 100644 --- a/zluda_bindgen/Cargo.toml +++ b/zluda_bindgen/Cargo.toml @@ -9,6 +9,6 @@ syn = { version = "2.0", features = ["full", "visit-mut"] } proc-macro2 = "1.0.89" quote = "1.0" prettyplease = "0.2.25" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" libloading = "0.8" cuda_types = { path = "../cuda_types" } diff --git a/zluda_common/src/lib.rs b/zluda_common/src/lib.rs index 4f8aef7..4c76ef1 100644 --- a/zluda_common/src/lib.rs +++ b/zluda_common/src/lib.rs @@ -173,12 +173,12 @@ from_cuda_nop!( cublasLtMatmulDescAttributes_t, CUmemAllocationGranularity_flags, CUmemAllocationProp, - CUresult + CUresult, + CUfunction_attribute ); from_cuda_transmute!( CUuuid => hipUUID, CUfunction => hipFunction_t, - CUfunction_attribute => hipFunction_attribute, CUstream => hipStream_t, CUpointer_attribute => hipPointer_attribute, CUdeviceptr_v2 => hipDeviceptr_t, diff --git a/zluda_trace/Cargo.toml b/zluda_trace/Cargo.toml index a6c4120..0925c1a 100644 --- a/zluda_trace/Cargo.toml +++ b/zluda_trace/Cargo.toml @@ -24,7 +24,7 @@ paste = "1.0" cuda_macros = { path = "../cuda_macros" } cuda_types = { path = "../cuda_types" } parking_lot = "0.12.3" -rustc-hash = "1.1.0" +rustc-hash = "2.0.0" cglue = "0.3.5" zstd-safe = { version = "7.2.4", features = ["std"] } unwrap_or = "1.0.1"