diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index aaffd7d00d..e7c28ed8ff 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -608,7 +608,7 @@ public: for (auto &arg : values) { u32 arg_size = align(u32(arg.size() + 1), stack_align); - u32 arg_addr = Memory.MainMem.AllocAlign(arg_size, stack_align); + u32 arg_addr = (u32)Memory.MainMem.AllocAlign(arg_size, stack_align); std::strcpy(vm::get_ptr(arg_addr), arg.c_str()); diff --git a/rpcs3/Emu/Memory/vm_ptr.h b/rpcs3/Emu/Memory/vm_ptr.h index 6f0c65ed0d..0970ad7996 100644 --- a/rpcs3/Emu/Memory/vm_ptr.h +++ b/rpcs3/Emu/Memory/vm_ptr.h @@ -112,7 +112,7 @@ namespace vm __forceinline T* const operator -> () const { - return vm::get_ptr(m_addr); + return vm::get_ptr((u32)m_addr); } _ptr_base operator++ (int) diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp index 8163443e42..483c3d56c6 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp @@ -37,7 +37,7 @@ s64 spursCreateLv2EventQueue(vm::ptr spurs, u32& queue_id, vm::ptrm.wklSysG.pm.set(be_t::make(vm::read32(libsre_rtoc - 0x7EA4))); spurs->m.wklSysG.size = 0x2200; #else - spurs->m.wklSysG.pm.set(be_t::make(0x100)); // wrong 64-bit address + spurs->m.wklInfoSysSrv.pm.set(be_t::make(0x100)); // wrong 64-bit address #endif - spurs->m.wklSysG.data = 0; - spurs->m.wklSysG.copy.write_relaxed(0xff); + spurs->m.wklInfoSysSrv.data = 0; + spurs->m.wklInfoSysSrv.copy.write_relaxed(0xff); u32 sem; for (u32 i = 0; i < 0x10; i++) { @@ -146,7 +146,7 @@ s64 spursInit( assert(!"spu_image_import() failed"); } #else - spurs->m.spuImg.addr = Memory.Alloc(0x40000, 4096); + spurs->m.spuImg.addr = (u32)Memory.Alloc(0x40000, 4096); #endif s32 tgt = SYS_SPU_THREAD_GROUP_TYPE_NORMAL; @@ -208,12 +208,12 @@ s64 spursInit( u64 vRES = 0x20ull << 32; u128 vSET = {}; - if (spurs->m.x72.read_relaxed() & (1 << num)) + if (spurs->m.sysSrvMessage.read_relaxed() & (1 << num)) { SPU.WriteLS8(0x1eb, 0); // var4 if (arg1 == 0 || var1 == 0x20) { - spurs->m.x72._and_not(1 << num); + spurs->m.sysSrvMessage._and_not(1 << num); } } else @@ -224,9 +224,9 @@ s64 spursInit( u128 savedD = SPU.ReadLS128(0x1B0); u128 vRC = u128::add8(u128::minu8(wklReadyCount0, u128::from8p(8)), u128::minu8(wklReadyCount1, u128::from8p(8))); u32 wklFlag = spurs->m.wklFlag.flag.read_relaxed(); - u32 flagRecv = spurs->m.flagRecv.read_relaxed(); + u32 flagRecv = spurs->m.wklFlagReceiver.read_relaxed(); u128 vFM = u128::fromV(g_imm_table.fsmb_table[(wklFlag == 0) && (flagRecv < 16) ? 0x8000 >> flagRecv : 0]); - u128 wklSet1 = u128::fromV(g_imm_table.fsmb_table[spurs->m.wklSet1.read_relaxed()]); + u128 wklSet1 = u128::fromV(g_imm_table.fsmb_table[spurs->m.wklSignal1.read_relaxed()]); u128 vFMS1 = vFM | wklSet1; u128 vFMV1 = u128::fromV(g_imm_table.fsmb_table[(var1 < 16) ? 0x8000 >> var1 : 0]); u32 var5 = SPU.ReadLS32(0x1ec); @@ -270,7 +270,7 @@ s64 spursInit( if (!arg1 || var1 == vNUM) { - spurs->m.wklSet1._and_not(be_t::make((u16)(vNUM < 16 ? 0x8000 >> vNUM : 0))); + spurs->m.wklSignal1._and_not(be_t::make((u16)(vNUM < 16 ? 
0x8000 >> vNUM : 0))); if (vNUM == flagRecv && wklFlag == 0) { spurs->m.wklFlag.flag.write_relaxed(be_t::make(-1)); @@ -305,126 +305,145 @@ s64 spursInit( { LV2_LOCK(0); // TODO: lock-free implementation if possible - const u32 arg1 = SPU.GPR[3]._u32[3]; - u32 var0 = SPU.ReadLS32(0x1d8); - u32 var1 = SPU.ReadLS32(0x1dc); - u128 wklA = vm::read128(spurs.addr() + 0x20); - u128 wklB = vm::read128(spurs.addr() + 0x30); - u128 savedA = SPU.ReadLS128(0x180); - u128 savedB = SPU.ReadLS128(0x190); - u128 vAA = u128::sub8(wklA, savedA); - u128 vBB = u128::sub8(wklB, savedB); - u128 vM1 = {}; if (var1 <= 31) vM1.u8r[var1 & 0xf] = (var1 <= 15) ? 0xf : 0xf0; - u128 vAABB = (arg1 == 0) ? vAA : u128::add8(vAA, u128::andnot(vM1, vBB)); + auto mgmt = vm::get_ptr(SPU.ls_offset); - u32 vNUM = 0x20; - u64 vRES = 0x20ull << 32; - u128 vSET = {}; + // The first and only argument to this function is a boolean that is set to false if the function + // is called by the SPURS kernel and set to true if called by cellSpursModulePollStatus. + // If the first argument is true then the shared data is not updated with the result. + const auto isPoll = SPU.GPR[3]._u32[3]; - if (spurs->m.x72.read_relaxed() & (1 << num)) + // Calculate the contention (number of SPUs used) for each workload + u8 contention[CELL_SPURS_MAX_WORKLOAD2]; + u8 pendingContention[CELL_SPURS_MAX_WORKLOAD2]; + for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++) { - SPU.WriteLS8(0x1eb, 0); // var4 - if (arg1 == 0 || var1 == 0x20) + contention[i] = mgmt->spurs->m.wklCurrentContention[i & 0x0F] - mgmt->wklLocContention[i & 0x0F]; + contention[i] = i < CELL_SPURS_MAX_WORKLOAD ? contention[i] & 0x0F : contention[i] >> 4; + + // If this is a poll request then the number of SPUs pending to context switch is also added to the contention presumably + // to prevent unnecessary jumps to the kernel + if (isPoll) { - spurs->m.x72._and_not(1 << num); + pendingContention[i] = mgmt->spurs->m.wklPendingContention[i] - mgmt->wklLocPendingContention[i]; + pendingContention[i] = i < CELL_SPURS_MAX_WORKLOAD ? pendingContention[i] & 0x0F : pendingContention[i] >> 4; + if (i != mgmt->wklCurrentId) + { + contention[i] += pendingContention[i]; + } + } + } + + u32 wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID; + u32 pollStatus = 0; + + // The system service workload has the highest priority. Select the system service workload if + // the system service message bit for this SPU is set. + if (mgmt->spurs->m.sysSrvMessage.read_relaxed() & (1 << mgmt->spuNum)) + { + // Not sure what this does. Possibly Mark the SPU as in use. 
+ mgmt->x1EB = 0; + if (!isPoll || mgmt->wklCurrentId == 0x20) + { + // Clear the message bit + mgmt->spurs->m.sysSrvMessage.write_relaxed(mgmt->spurs->m.sysSrvMessage.read_relaxed() & ~(1 << mgmt->spuNum)); } } else { - u128 wklReadyCount0 = vm::read128(spurs.addr() + 0x0); - u128 wklReadyCount1 = vm::read128(spurs.addr() + 0x10); - u128 savedC = SPU.ReadLS128(0x1A0); - u128 wklMaxCnt = vm::read128(spurs.addr() + 0x50); - u32 wklFlag = spurs->m.wklFlag.flag.read_relaxed(); - u32 flagRecv = spurs->m.flagRecv.read_relaxed(); - u128 wklSet1 = u128::fromV(g_imm_table.fsmb_table[spurs->m.wklSet1.read_relaxed()]); - u128 wklSet2 = u128::fromV(g_imm_table.fsmb_table[spurs->m.wklSet2.read_relaxed()]); - u128 vABL = vAABB & u128::from8p(0x0f); - u128 vABH = u128::fromV(_mm_srli_epi32((vAABB & u128::from8p(0xf0)).vi, 4)); - u32 var5 = SPU.ReadLS32(0x1ec); - u128 v5L = u128::fromV(g_imm_table.fsmb_table[var5 >> 16]); - u128 v5H = u128::fromV(g_imm_table.fsmb_table[(u16)var5]); - u128 vFML = u128::fromV(g_imm_table.fsmb_table[(wklFlag == 0) && (flagRecv < 16) ? 0x8000 >> flagRecv : 0]); - u128 vFMH = u128::fromV(g_imm_table.fsmb_table[(u16)((wklFlag == 0) && (flagRecv < 32) ? 0x80000000 >> flagRecv : 0)]); - u128 vCL = u128::fromV(_mm_slli_epi32((savedC & u128::from8p(0x0f)).vi, 4)); - u128 vCH = savedC & u128::from8p(0xf0); - u128 vABRL = u128::gtu8(wklReadyCount0, vABL); - u128 vABRH = u128::gtu8(wklReadyCount1, vABH); - u128 vCCL = v5L & u128::gtu8(vCL, {}) & u128::gtu8(wklMaxCnt & u128::from8p(0x0f), vABL) & (wklSet1 | vFML | vABRL); - u128 vCCH = v5H & u128::gtu8(vCH, {}) & u128::gtu8(u128::fromV(_mm_srli_epi32((wklMaxCnt & u128::from8p(0xf0)).vi, 4)), vABH) & (wklSet2 | vFMH | vABRH); - u128 v1H = {}; if (var1 <= 31 && var1 > 15) v1H.u8r[var1 & 0xf] = 4; - u128 v1L = {}; if (var1 <= 15) v1L.u8r[var1] = 4; - u128 vCH1 = (v1H | vCH & u128::from8p(0xFB)) & vCCH; - u128 vCL1 = (v1L | vCL & u128::from8p(0xFB)) & vCCL; - u128 vSTATL = vABRL & u128::from8p(1) | wklSet1 & u128::from8p(2) | vFML & u128::from8p(4); - u128 vSTATH = vABRH & u128::from8p(1) | wklSet2 & u128::from8p(2) | vFMH & u128::from8p(4); + // Calculate the scheduling weight for each workload + u8 maxWeight = 0; + for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++) + { + auto j = i & 0x0F; + u8 x1ECx1EE = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->x1EC & (0x8000 >> j) : mgmt->x1EE & (0x8000 >> j); + u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->priority[j] & 0x0F : mgmt->priority[j] >> 4; + u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->spurs->m.wklMaxContention[j].read_relaxed() & 0x0F : mgmt->spurs->m.wklMaxContention[j].read_relaxed() >> 4; + u8 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->spurs->m.wklSignal1.read_relaxed() & (0x8000 >> j) : mgmt->spurs->m.wklSignal2.read_relaxed() & (0x8000 >> j); + u8 wklFlag = mgmt->spurs->m.wklFlag.flag.read_relaxed() == 0 ? mgmt->spurs->m.wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0; - s32 max = -1; - for (u32 i = 0; i < 0x10; i++) - { - const s32 value = vCL1.u8r[i]; - if (value > max && (vCCL.u8r[i] & 1)) + // For a workload to be considered for scheduling: + // 1. Its priority must be greater than 0 + // 2. The number of SPUs used by it must be less than the max contention for that workload + // 3. The bit in 0x1EC/0x1EE for the workload must be set + // 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e.
readyCount) + // Condition #4 may be overridden using a workload signal or using the workload flag + if (x1ECx1EE && priority > 0 && maxContention > contention[i]) { - vNUM = i; - max = value; - } - } - for (u32 i = 16; i < 0x20; i++) - { - const s32 value = vCH1.u8r[i]; - if (value > max && (vCCH.u8r[i] & 1)) - { - vNUM = i; - max = value; + if (wklFlag || wklSignal || mgmt->spurs->m.wklReadyCount[i].read_relaxed() > contention[i]) + { + // The scheduling weight of the workload is equal to the priority of the workload for the SPU. + // The current workload is given a slightly higher weight presumably to reduce the number of context switches. + u8 weight = priority << 4; + if (mgmt->wklCurrentId == i) + { + weight |= 0x04; + } + + // In case of a tie the lower numbered workload is chosen + if (weight > maxWeight) + { + wklSelectedId = i; + maxWeight = weight; + pollStatus = mgmt->spurs->m.wklReadyCount[i].read_relaxed() > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0; + pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0; + pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0; + } + } } } - if (vNUM < 0x10) - { - vRES = ((u64)vNUM << 32) | vSTATL.u8r[vNUM]; - vSET.u8r[vNUM] = 0x01; - } - else if (vNUM < 0x20) - { - vRES = ((u64)vNUM << 32) | vSTATH.u8r[vNUM & 0xf]; - vSET.u8r[vNUM] = 0x10; - } + // Not sure what this does. Possibly mark the SPU as idle/in use. + mgmt->x1EB = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0; - SPU.WriteLS8(0x1eb, vNUM == 0x20); - - if (!arg1 || var1 == vNUM) + if (!isPoll || wklSelectedId == mgmt->wklCurrentId) { - spurs->m.wklSet1._and_not(be_t::make((u16)(vNUM < 16 ? 0x8000 >> vNUM : 0))); - spurs->m.wklSet2._and_not(be_t::make((u16)(0x80000000 >> vNUM))); - if (vNUM == flagRecv && wklFlag == 0) + // Clear workload signal for the selected workload + mgmt->spurs->m.wklSignal1.write_relaxed(be_t::make(mgmt->spurs->m.wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId))); + mgmt->spurs->m.wklSignal2.write_relaxed(be_t::make(mgmt->spurs->m.wklSignal2.read_relaxed() & ~(0x80000000u >> wklSelectedId))); + + // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s + if (wklSelectedId == mgmt->spurs->m.wklFlagReceiver.read_relaxed()) { - spurs->m.wklFlag.flag.write_relaxed(be_t::make(-1)); + mgmt->spurs->m.wklFlag.flag.write_relaxed(be_t::make(0xFFFFFFFF)); } } } - if (arg1 == 0) + if (!isPoll) { - vm::write128(spurs.addr() + 0x20, u128::add8(vAA, vSET)); // update wklA + // Called by kernel + // Increment the contention for the selected workload + if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) + { + contention[wklSelectedId]++; + } - SPU.WriteLS128(0x180, vSET); // update savedA - SPU.WriteLS32(0x1dc, vNUM); // update var1 + for (auto i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++) + { + mgmt->spurs->m.wklCurrentContention[i] = contention[i] | (contention[i + 0x10] << 4); + mgmt->wklLocContention[i] = 0; + mgmt->wklLocPendingContention[i] = 0; + } + + mgmt->wklLocContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ?
0x10 : 0; + mgmt->wklCurrentId = wklSelectedId; + } + else if (wklSelectedId != mgmt->wklCurrentId) + { + // Not called by kernel but a context switch is required + // Increment the pending contention for the selected workload + for (auto i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++) + { + mgmt->spurs->m.wklPendingContention[i] = pendingContention[i] | (pendingContention[i + 0x10] << 4); + mgmt->wklLocPendingContention[i] = 0; + } + + mgmt->wklLocPendingContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0; } - if (arg1 == 1 && vNUM != var1) - { - vm::write128(spurs.addr() + 0x30, u128::add8(vBB, vSET)); // update wklB - - SPU.WriteLS128(0x190, vSET); // update savedB - } - else - { - vm::write128(spurs.addr() + 0x30, vBB); // update wklB - - SPU.WriteLS128(0x190, {}); // update savedB - } - - return vRES; + u64 result = (u64)wklSelectedId << 32; + result |= pollStatus; + return result; }; //SPU.m_code3_func = [spurs, num](SPUThread& SPU) -> u64 // test //{ @@ -434,10 +453,13 @@ s64 spursInit( // return vRES; //}; - SPU.WriteLS128(0x1c0, u128::from32r(0, spurs.addr(), num, 0x1f)); + SpursKernelMgmtData * mgmt = vm::get_ptr(SPU.ls_offset); + mgmt->spurs = spurs; + mgmt->spuNum = num; + mgmt->dmaTagId = 0x1F; - u32 wid = 0x20; - u32 stat = 0; + u32 wid = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID; + u32 pollStatus = 0; while (true) { if (Emu.IsStopped()) @@ -447,14 +469,14 @@ s64 spursInit( } // get current workload info: - auto& wkl = wid <= 15 ? spurs->m.wklG1[wid] : (wid <= 31 && isSecond ? spurs->m.wklG2[wid & 0xf] : spurs->m.wklSysG); + auto& wkl = wid < CELL_SPURS_MAX_WORKLOAD ? spurs->m.wklInfo1[wid] : (wid < CELL_SPURS_MAX_WORKLOAD2 && isSecond ? spurs->m.wklInfo2[wid & 0xf] : spurs->m.wklInfoSysSrv); - if (SPU.ReadLS64(0x1d0) != wkl.pm.addr()) + if (mgmt->wklCurrentAddr != wkl.pm) { // load executable code: memcpy(vm::get_ptr(SPU.ls_offset + 0xa00), wkl.pm.get_ptr(), wkl.size); - SPU.WriteLS64(0x1d0, wkl.pm.addr()); - SPU.WriteLS32(0x1d8, wkl.copy.read_relaxed()); + mgmt->wklCurrentAddr = wkl.pm; + mgmt->x1D8 = wkl.copy.read_relaxed(); } if (!isSecond) SPU.WriteLS16(0x1e8, 0); @@ -463,7 +485,7 @@ s64 spursInit( SPU.GPR[1]._u32[3] = 0x3FFB0; SPU.GPR[3]._u32[3] = 0x100; SPU.GPR[4]._u64[1] = wkl.data; - SPU.GPR[5]._u32[3] = stat; + SPU.GPR[5]._u32[3] = pollStatus; SPU.FastCall(0xa00); // check status: @@ -481,7 +503,7 @@ s64 spursInit( SPU.GPR[3].clear(); assert(SPU.m_code3_func); u64 res = SPU.m_code3_func(SPU); - stat = (u32)(res); + pollStatus = (u32)(res); wid = (u32)(res >> 32); } @@ -507,8 +529,8 @@ s64 spursInit( assert(!"lwcond_create() failed"); } - spurs->m.flags1 = (flags & SAF_EXIT_IF_NO_WORK ? SF1_EXIT_IF_NO_WORK : 0) | (isSecond ? SF1_IS_SECOND : 0); - spurs->m.flagRecv.write_relaxed(0xff); + spurs->m.flags1 = (flags & SAF_EXIT_IF_NO_WORK ? SF1_EXIT_IF_NO_WORK : 0) | (isSecond ? 
SF1_32_WORKLOADS : 0); + spurs->m.wklFlagReceiver.write_relaxed(0xff); spurs->m.wklFlag.flag.write_relaxed(be_t::make(-1)); spurs->_u8[0xD64] = 0; spurs->_u8[0xD65] = 0; @@ -516,7 +538,7 @@ s64 spursInit( spurs->m.ppuPriority = ppuPriority; u32 queue; - if (s32 res = spursCreateLv2EventQueue(spurs, queue, vm::ptr::make(spurs.addr() + 0xc9), 0x2a, *(u64*)"_spuPrv")) + if (s32 res = (s32)spursCreateLv2EventQueue(spurs, queue, vm::ptr::make(spurs.addr() + 0xc9), 0x2a, *(u64*)"_spuPrv")) { assert(!"spursCreateLv2EventQueue() failed"); } @@ -575,14 +597,14 @@ s64 spursInit( for (u32 i = 0; i < 16; i++) { if (spurs->m.wklStat1[i].read_relaxed() == 2 && - spurs->m.wklG1[i].priority.ToBE() != 0 && - spurs->m.wklMaxCnt[i].read_relaxed() & 0xf + spurs->m.wklInfo1[i].priority.ToBE() != 0 && + spurs->m.wklMaxContention[i].read_relaxed() & 0xf ) { if (spurs->m.wklReadyCount[i].read_relaxed() || - spurs->m.wklSet1.read_relaxed() & (0x8000u >> i) || + spurs->m.wklSignal1.read_relaxed() & (0x8000u >> i) || (spurs->m.wklFlag.flag.read_relaxed() == 0 && - spurs->m.flagRecv.read_relaxed() == (u8)i + spurs->m.wklFlagReceiver.read_relaxed() == (u8)i )) { do_break = true; @@ -590,17 +612,17 @@ s64 spursInit( } } } - if (spurs->m.flags1 & SF1_IS_SECOND) for (u32 i = 0; i < 16; i++) + if (spurs->m.flags1 & SF1_32_WORKLOADS) for (u32 i = 0; i < 16; i++) { if (spurs->m.wklStat2[i].read_relaxed() == 2 && - spurs->m.wklG2[i].priority.ToBE() != 0 && - spurs->m.wklMaxCnt[i].read_relaxed() & 0xf0 + spurs->m.wklInfo2[i].priority.ToBE() != 0 && + spurs->m.wklMaxContention[i].read_relaxed() & 0xf0 ) { if (spurs->m.wklReadyCount[i + 0x10].read_relaxed() || - spurs->m.wklSet2.read_relaxed() & (0x8000u >> i) || + spurs->m.wklSignal2.read_relaxed() & (0x8000u >> i) || (spurs->m.wklFlag.flag.read_relaxed() == 0 && - spurs->m.flagRecv.read_relaxed() == (u8)i + 0x10 + spurs->m.wklFlagReceiver.read_relaxed() == (u8)i + 0x10 )) { do_break = true; @@ -1339,7 +1361,7 @@ s32 spursAddWorkload( } u32 wnum; - const u32 wmax = spurs->m.flags1 & SF1_IS_SECOND ? 0x20u : 0x10u; // TODO: check if can be changed + const u32 wmax = spurs->m.flags1 & SF1_32_WORKLOADS ? 0x20u : 0x10u; // TODO: check if can be changed spurs->m.wklMskA.atomic_op([spurs, wmax, &wnum](be_t& value) { wnum = cntlz32(~(u32)value); // found empty position @@ -1358,15 +1380,15 @@ s32 spursAddWorkload( u32 index = wnum & 0xf; if (wnum <= 15) { - assert((spurs->m.wklA[wnum] & 0xf) == 0); - assert((spurs->m.wklB[wnum] & 0xf) == 0); + assert((spurs->m.wklCurrentContention[wnum] & 0xf) == 0); + assert((spurs->m.wklPendingContention[wnum] & 0xf) == 0); spurs->m.wklStat1[wnum].write_relaxed(1); spurs->m.wklD1[wnum] = 0; spurs->m.wklE1[wnum] = 0; - spurs->m.wklG1[wnum].pm = pm; - spurs->m.wklG1[wnum].data = data; - spurs->m.wklG1[wnum].size = size; - spurs->m.wklG1[wnum].priority = *(be_t*)priorityTable; + spurs->m.wklInfo1[wnum].pm = pm; + spurs->m.wklInfo1[wnum].data = data; + spurs->m.wklInfo1[wnum].size = size; + spurs->m.wklInfo1[wnum].priority = *(be_t*)priorityTable; spurs->m.wklH1[wnum].nameClass = nameClass; spurs->m.wklH1[wnum].nameInstance = nameInstance; memset(spurs->m.wklF1[wnum].unk0, 0, 0x20); // clear struct preserving semaphore id @@ -1377,23 +1399,23 @@ s32 spursAddWorkload( spurs->m.wklF1[wnum].hookArg = hookArg; spurs->m.wklE1[wnum] |= 2; } - if ((spurs->m.flags1 & SF1_IS_SECOND) == 0) + if ((spurs->m.flags1 & SF1_32_WORKLOADS) == 0) { spurs->m.wklReadyCount[wnum + 16].write_relaxed(0); - spurs->m.wklMinCnt[wnum] = minContention > 8 ? 
8 : minContention; + spurs->m.wklMinContention[wnum] = minContention > 8 ? 8 : minContention; } } else { - assert((spurs->m.wklA[index] & 0xf0) == 0); - assert((spurs->m.wklB[index] & 0xf0) == 0); + assert((spurs->m.wklCurrentContention[index] & 0xf0) == 0); + assert((spurs->m.wklPendingContention[index] & 0xf0) == 0); spurs->m.wklStat2[index].write_relaxed(1); spurs->m.wklD2[index] = 0; spurs->m.wklE2[index] = 0; - spurs->m.wklG2[index].pm = pm; - spurs->m.wklG2[index].data = data; - spurs->m.wklG2[index].size = size; - spurs->m.wklG2[index].priority = *(be_t*)priorityTable; + spurs->m.wklInfo2[index].pm = pm; + spurs->m.wklInfo2[index].data = data; + spurs->m.wklInfo2[index].size = size; + spurs->m.wklInfo2[index].priority = *(be_t*)priorityTable; spurs->m.wklH2[index].nameClass = nameClass; spurs->m.wklH2[index].nameInstance = nameInstance; memset(spurs->m.wklF2[index].unk0, 0, 0x20); // clear struct preserving semaphore id @@ -1409,27 +1431,27 @@ s32 spursAddWorkload( if (wnum <= 15) { - spurs->m.wklMaxCnt[wnum].atomic_op([maxContention](u8& v) + spurs->m.wklMaxContention[wnum].atomic_op([maxContention](u8& v) { v &= ~0xf; v |= (maxContention > 8 ? 8 : maxContention); }); - spurs->m.wklSet1._and_not({ be_t::make(0x8000 >> index) }); // clear bit in wklFlag1 + spurs->m.wklSignal1._and_not({ be_t::make(0x8000 >> index) }); // clear bit in wklFlag1 } else { - spurs->m.wklMaxCnt[index].atomic_op([maxContention](u8& v) + spurs->m.wklMaxContention[index].atomic_op([maxContention](u8& v) { v &= ~0xf0; v |= (maxContention > 8 ? 8 : maxContention) << 4; }); - spurs->m.wklSet2._and_not({ be_t::make(0x8000 >> index) }); // clear bit in wklFlag2 + spurs->m.wklSignal2._and_not({ be_t::make(0x8000 >> index) }); // clear bit in wklFlag2 } - spurs->m.flagRecv.compare_and_swap(wnum, 0xff); + spurs->m.wklFlagReceiver.compare_and_swap(wnum, 0xff); u32 res_wkl; - CellSpurs::_sub_str3& wkl = wnum <= 15 ? spurs->m.wklG1[wnum] : spurs->m.wklG2[wnum & 0xf]; + CellSpurs::WorkloadInfo& wkl = wnum <= 15 ? spurs->m.wklInfo1[wnum] : spurs->m.wklInfo2[wnum & 0xf]; spurs->m.wklMskB.atomic_op_sync([spurs, &wkl, wnum, &res_wkl](be_t& v) { const u32 mask = v.ToLE() & ~(0x80000000u >> wnum); @@ -1439,7 +1461,7 @@ s32 spursAddWorkload( { if (mask & m) { - CellSpurs::_sub_str3& current = i <= 15 ? spurs->m.wklG1[i] : spurs->m.wklG2[i & 0xf]; + CellSpurs::WorkloadInfo& current = i <= 15 ? spurs->m.wklInfo1[i] : spurs->m.wklInfo2[i & 0xf]; if (current.pm.addr() == wkl.pm.addr()) { // if a workload with identical policy module found @@ -1461,7 +1483,7 @@ s32 spursAddWorkload( spurs->wklStat(wnum).exchange(2); spurs->m.xBD.exchange(0xff); - spurs->m.x72.exchange(0xff); + spurs->m.sysSrvMessage.exchange(0xff); return CELL_OK; } @@ -1671,7 +1693,7 @@ s64 _cellSpursWorkloadFlagReceiver(vm::ptr spurs, u32 wid, u32 is_set { return CELL_SPURS_POLICY_MODULE_ERROR_ALIGN; } - if (wid >= (spurs->m.flags1 & SF1_IS_SECOND ? 0x20u : 0x10u)) + if (wid >= (spurs->m.flags1 & SF1_32_WORKLOADS ? 
0x20u : 0x10u)) { return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } @@ -1687,14 +1709,14 @@ s64 _cellSpursWorkloadFlagReceiver(vm::ptr spurs, u32 wid, u32 is_set { if (is_set) { - if (spurs->m.flagRecv.read_relaxed() != 0xff) + if (spurs->m.wklFlagReceiver.read_relaxed() != 0xff) { return CELL_SPURS_POLICY_MODULE_ERROR_BUSY; } } else { - if (spurs->m.flagRecv.read_relaxed() != wid) + if (spurs->m.wklFlagReceiver.read_relaxed() != wid) { return CELL_SPURS_POLICY_MODULE_ERROR_PERM; } @@ -1706,7 +1728,7 @@ s64 _cellSpursWorkloadFlagReceiver(vm::ptr spurs, u32 wid, u32 is_set return res; } - spurs->m.flagRecv.atomic_op([wid, is_set](u8& FR) + spurs->m.wklFlagReceiver.atomic_op([wid, is_set](u8& FR) { if (is_set) { @@ -1783,7 +1805,7 @@ s64 cellSpursReadyCountStore(vm::ptr spurs, u32 wid, u32 value) { return CELL_SPURS_POLICY_MODULE_ERROR_ALIGN; } - if (wid >= (spurs->m.flags1 & SF1_IS_SECOND ? 0x20u : 0x10u) || value > 0xff) + if (wid >= (spurs->m.flags1 & SF1_32_WORKLOADS ? 0x20u : 0x10u) || value > 0xff) { return CELL_SPURS_POLICY_MODULE_ERROR_INVAL; } diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpurs.h b/rpcs3/Emu/SysCalls/Modules/cellSpurs.h index 849a597223..61fcc42648 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSpurs.h +++ b/rpcs3/Emu/SysCalls/Modules/cellSpurs.h @@ -94,6 +94,7 @@ enum SPURSKernelInterfaces CELL_SPURS_MAX_SPU = 8, CELL_SPURS_MAX_WORKLOAD = 16, CELL_SPURS_MAX_WORKLOAD2 = 32, + CELL_SPURS_SYS_SERVICE_WORKLOAD_ID = 32, CELL_SPURS_MAX_PRIORITY = 16, CELL_SPURS_NAME_MAX_LENGTH = 15, CELL_SPURS_SIZE = 4096, @@ -162,8 +163,8 @@ enum SpursAttrFlags : u32 enum SpursFlags1 : u8 { SF1_NONE = 0x0, - - SF1_IS_SECOND = 0x40, + + SF1_32_WORKLOADS = 0x40, SF1_EXIT_IF_NO_WORK = 0x80, }; @@ -214,6 +215,12 @@ struct CellSpursWorkloadFlag typedef void(*CellSpursShutdownCompletionEventHook)(vm::ptr, u32 wid, vm::ptr arg); +enum CellSpursModulePollStatus { + CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT = 1, + CELL_SPURS_MODULE_POLL_STATUS_SIGNAL = 2, + CELL_SPURS_MODULE_POLL_STATUS_FLAG = 4 +}; + // Core CellSpurs structures struct CellSpurs { @@ -246,7 +253,7 @@ struct CellSpurs static_assert(sizeof(_sub_str2) == 0x80, "Wrong _sub_str2 size"); - struct _sub_str3 + struct WorkloadInfo { vm::bptr pm; // policy module be_t data; // spu argument @@ -255,7 +262,7 @@ struct CellSpurs be_t priority; }; - static_assert(sizeof(_sub_str3) == 0x20, "Wrong _sub_str3 size"); + static_assert(sizeof(WorkloadInfo) == 0x20, "Wrong WorkloadInfo size"); struct _sub_str4 { @@ -274,29 +281,33 @@ struct CellSpurs // real data struct { - atomic_t wklReadyCount[0x20]; // 0x0 (index = wid) - u8 wklA[0x10]; // 0x20 (packed 4-bit data, index = wid % 16, internal index = wid / 16) - Current contention - u8 wklB[0x10]; // 0x30 (packed 4-bit data, index = wid % 16, internal index = wid / 16) - Pending - u8 wklMinCnt[0x10]; // 0x40 (seems only for first 0..15 wids) - Min contention - atomic_t wklMaxCnt[0x10]; // 0x50 (packed 4-bit data, index = wid % 16, internal index = wid / 16) - Max contention - CellSpursWorkloadFlag wklFlag; // 0x60 - atomic_t wklSet1; // 0x70 (bitset for 0..15 wids) - Workload signal - atomic_t x72; // 0x72 - message - u8 x73; // 0x73 - idling - u8 flags1; // 0x74 - Bit0(MSB)=exit_if_no_work, Bit1=32_workloads - u8 x75; // 0x75 - trace control - u8 nSpus; // 0x76 - Number of SPUs - atomic_t flagRecv; // 0x77 - atomic_t wklSet2; // 0x78 (bitset for 16..32 wids) - Workload signal - u8 x7A[6]; // 0x7A + // The first 0x80 bytes of the CellSpurs structure is shared by all instances of the SPURS kernel in a 
SPURS instance and the PPU. + // The SPURS kernel copies this from main memory to the LS (address 0x100) then runs its scheduling algorithm using this as one + // of the inputs. After selecting a new workload, the SPURS kernel updates this and writes it back to main memory. + // The read-modify-write is performed atomically by the SPURS kernel. + atomic_t wklReadyCount[0x10]; // 0x00 (index = wid) - Number of SPUs requested by each workload + u8 wklCurrentContention[0x10]; // 0x20 (packed 4-bit data, index = wid % 16, internal index = wid / 16) - Number of SPUs used by each workload + u8 wklPendingContention[0x10]; // 0x30 (packed 4-bit data, index = wid % 16, internal index = wid / 16) - Number of SPUs that are pending to context switch to the workload + u8 wklMinContention[0x10]; // 0x40 (seems only for first 0..15 wids) - Min SPUs required for each workload + atomic_t wklMaxContention[0x10]; // 0x50 (packed 4-bit data, index = wid % 16, internal index = wid / 16) - Max SPUs that may be allocated to each workload + CellSpursWorkloadFlag wklFlag; // 0x60 + atomic_t wklSignal1; // 0x70 (bitset for 0..15 wids) + atomic_t sysSrvMessage; // 0x72 + u8 sysSrvIdling; // 0x73 + u8 flags1; // 0x74 Type is SpursFlags1 + u8 sysSrvTraceControl; // 0x75 + u8 nSpus; // 0x76 + atomic_t wklFlagReceiver; // 0x77 + atomic_t wklSignal2; // 0x78 (bitset for 16..32 wids) + u8 x7A[6]; // 0x7A atomic_t wklStat1[0x10]; // 0x80 - Workload state (16*u8) - State enum {non_exitent, preparing, runnable, shutting_down, removable, invalid} u8 wklD1[0x10]; // 0x90 - Workload status (16*u8) u8 wklE1[0x10]; // 0xA0 - Workload event (16*u8) - atomic_t wklMskA; // 0xB0 - Available workloads (32*u1) - atomic_t wklMskB; // 0xB4 - Available module id - u8 xB8[5]; // 0xB8 - 0xBC - exit barrier - atomic_t xBD; // 0xBD - update workload - u8 xBE[2]; // 0xBE - 0xBF - message - terminate + atomic_t wklMskA; // 0xB0 - System service - Available workloads (32*u1) + atomic_t wklMskB; // 0xB4 - System service - Available module id + u8 xB8[5]; // 0xB8 - 0xBC - System service exit barrier + atomic_t xBD; // 0xBD - System service message - update workload + u8 xBE[2]; // 0xBE - 0xBF - System service message - terminate u8 xC0[8]; // 0xC0 - System workload u8 xC8; // 0xC8 - System service - on spu u8 spuPort; // 0xC9 - SPU port for system service @@ -304,7 +315,7 @@ struct CellSpurs u8 xCB; // 0xCB u8 xCC; // 0xCC u8 xCD; // 0xCD - u8 xCE; // 0xCE - message - update trace + u8 xCE; // 0xCE - System service message - update trace u8 xCF; // 0xCF atomic_t wklStat2[0x10]; // 0xD0 - Workload state (16*u8) u8 wklD2[0x10]; // 0xE0 - Workload status (16*u8) @@ -317,8 +328,8 @@ struct CellSpurs be_t unk12; // 0x98C be_t unk13; // 0x990 u8 unknown4[0xB00 - 0x998]; - _sub_str3 wklG1[0x10]; // 0xB00 - _sub_str3 wklSysG; // 0xD00 + WorkloadInfo wklInfo1[0x10]; // 0xB00 + WorkloadInfo wklInfoSysSrv; // 0xD00 be_t ppu0; // 0xD20 be_t ppu1; // 0xD28 be_t spuTG; // 0xD30 - SPU thread group @@ -347,7 +358,7 @@ struct CellSpurs _sub_str4 wklH1[0x10]; // 0xE00 _sub_str2 sub3; // 0xF00 u8 unknown6[0x1000 - 0xF80]; // 0xF80 - Gloabl SPU exception handler 0xF88 - Gloabl SPU exception handlers args - _sub_str3 wklG2[0x10]; // 0x1000 + WorkloadInfo wklInfo2[0x10]; // 0x1000 _sub_str1 wklF2[0x10]; // 0x1200 _sub_str4 wklH2[0x10]; // 0x1A00 } m; @@ -741,5 +752,29 @@ struct CellSpursTaskBinInfo CellSpursTaskLsPattern lsPattern; }; +// The SPURS kernel data store. This resides at 0x00 of the LS.
+struct SpursKernelMgmtData { + u8 unk0[0x100]; // 0x00 + u8 tempArea[0x80]; // 0x100 + u8 wklLocContention[0x10]; // 0x180 + u8 wklLocPendingContention[0x10]; // 0x190 + u8 priority[0x10]; // 0x1A0 + u8 x1B0[0x10]; // 0x1B0 + vm::bptr spurs; // 0x1C0 + be_t spuNum; // 0x1C8 + be_t dmaTagId; // 0x1CC + vm::bptr wklCurrentAddr; // 0x1D0 + be_t x1D8; // 0x1D8 + u32 wklCurrentId; // 0x1DC + be_t yieldToKernelAddr; // 0x1E0 + be_t selectWorkloadAddr; // 0x1E4 + u8 x1E8; // 0x1E8 + u8 x1E9; // 0x1E9 + u8 x1EA; // 0x1EA + u8 x1EB; // 0x1EB - This might be spuIdling + be_t x1EC; // 0x1EC - This might be wklEnable1 + be_t x1EE; // 0x1EE - This might be wklEnable2 +}; + s64 spursAttachLv2EventQueue(vm::ptr spurs, u32 queue, vm::ptr port, s32 isDynamic, bool wasCreated); s64 spursWakeUp(vm::ptr spurs);
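
Reviewer note: the selection loop added to the spursInit kernel callback is dense, so here is a minimal standalone sketch of the scheduling rule the new comments describe (priority in the high nibble as the weight, a small bonus for the current workload, and the three poll-status bits). Everything in it is illustrative: WorkloadState, selectWorkload and the unpacked fields are made-up names for this sketch, not types or functions from cellSpurs.cpp.

// Standalone sketch of the workload selection rule described by the new comments.
// None of these names exist in cellSpurs.cpp; they are illustrative only.
#include <cstdint>

enum PollStatus : uint32_t
{
    POLL_STATUS_READYCOUNT = 1, // cf. CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT
    POLL_STATUS_SIGNAL     = 2, // cf. CELL_SPURS_MODULE_POLL_STATUS_SIGNAL
    POLL_STATUS_FLAG       = 4, // cf. CELL_SPURS_MODULE_POLL_STATUS_FLAG
};

struct WorkloadState                  // hypothetical, already-unpacked per-workload view
{
    uint8_t priority;                 // priority of this workload for the current SPU (0..15)
    uint8_t contention;               // SPUs currently running this workload
    uint8_t maxContention;            // maximum SPUs the workload may occupy
    uint8_t readyCount;               // SPUs requested by the workload
    bool    enabled;                  // the per-SPU 0x1EC/0x1EE bit
    bool    signalled;                // wklSignal1/wklSignal2 bit for this wid
    bool    flagged;                  // workload flag raised and this wid is the receiver
};

// Returns (selectedId << 32) | pollStatus, mirroring the u64 result built at the end of
// the patched kernel callback; sysServiceId stands in for CELL_SPURS_SYS_SERVICE_WORKLOAD_ID.
uint64_t selectWorkload(const WorkloadState* wkl, unsigned count, unsigned currentId,
                        unsigned sysServiceId = 32)
{
    unsigned selected   = sysServiceId;
    uint32_t pollStatus = 0;
    uint8_t  maxWeight  = 0;

    for (unsigned i = 0; i < count; i++)
    {
        const WorkloadState& w = wkl[i];

        // Schedulable only if enabled, priority > 0 and below its contention limit.
        if (!w.enabled || w.priority == 0 || w.contention >= w.maxContention)
        {
            continue;
        }

        // The ready-count requirement can be overridden by a signal or the workload flag.
        if (!(w.flagged || w.signalled || w.readyCount > w.contention))
        {
            continue;
        }

        // Weight = priority in the high nibble; the current workload gets a small bonus.
        uint8_t weight = w.priority << 4;
        if (i == currentId)
        {
            weight |= 0x04;
        }

        if (weight > maxWeight)
        {
            selected    = i;
            maxWeight   = weight;
            pollStatus  = (w.readyCount > w.contention) ? POLL_STATUS_READYCOUNT : 0;
            pollStatus |= w.signalled ? POLL_STATUS_SIGNAL : 0;
            pollStatus |= w.flagged ? POLL_STATUS_FLAG : 0;
        }
    }

    return ((uint64_t)selected << 32) | pollStatus;
}

The strict '>' comparison is what gives the lower-numbered workload the win on equal weight, matching the "In case of a tie the lower numbered workload is chosen" comment in the patch.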
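
The contention arrays touched by the patch (wklCurrentContention / wklPendingContention) pack two workloads per byte: wid 0..15 in the low nibble of byte wid, wid 16..31 in the high nibble of byte wid - 16, which is why the kernel path rebuilds them as contention[i] | (contention[i + 0x10] << 4). A small sketch of that packing convention, with made-up helper names:

#include <cstdint>

// Pack/unpack helpers for the 4-bit-per-workload contention bytes used by
// wklCurrentContention / wklPendingContention. Helper names are illustrative only.
static uint8_t getContention(const uint8_t packed[16], unsigned wid)
{
    const uint8_t b = packed[wid & 0x0F];
    return wid < 16 ? (b & 0x0F) : (b >> 4); // low nibble: wid 0..15, high nibble: wid 16..31
}

static void setContention(uint8_t packed[16], unsigned wid, uint8_t value)
{
    uint8_t& b = packed[wid & 0x0F];
    b = wid < 16 ? ((b & 0xF0) | (value & 0x0F)) : ((b & 0x0F) | (value << 4));
}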
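
SpursKernelMgmtData describes the kernel's local-storage area that the old code accessed through raw ReadLS/WriteLS offsets (0x180, 0x1C0, 0x1D0, ...). The mirror below re-states the documented offsets with plain integer types so they can be checked with static_assert; the field sizes (8-byte big-endian pointers, 2-byte x1EC/x1EE) are assumptions read off the offset comments, not the emulator's real be_t/vm::bptr types.

#include <cstddef>
#include <cstdint>

// Plain-integer mirror of the documented SpursKernelMgmtData layout, for offset checking only.
struct KernelMgmtMirror
{
    uint8_t  unk0[0x100];                   // 0x000
    uint8_t  tempArea[0x80];                // 0x100
    uint8_t  wklLocContention[0x10];        // 0x180
    uint8_t  wklLocPendingContention[0x10]; // 0x190
    uint8_t  priority[0x10];                // 0x1A0
    uint8_t  x1B0[0x10];                    // 0x1B0
    uint64_t spurs;                         // 0x1C0 - assumed 8-byte big-endian pointer
    uint32_t spuNum;                        // 0x1C8
    uint32_t dmaTagId;                      // 0x1CC
    uint64_t wklCurrentAddr;                // 0x1D0 - assumed 8-byte big-endian pointer
    uint32_t x1D8;                          // 0x1D8
    uint32_t wklCurrentId;                  // 0x1DC
    uint32_t yieldToKernelAddr;             // 0x1E0
    uint32_t selectWorkloadAddr;            // 0x1E4
    uint8_t  x1E8, x1E9, x1EA, x1EB;        // 0x1E8 - 0x1EB
    uint16_t x1EC;                          // 0x1EC
    uint16_t x1EE;                          // 0x1EE
};

static_assert(offsetof(KernelMgmtMirror, wklLocContention) == 0x180, "wklLocContention");
static_assert(offsetof(KernelMgmtMirror, spurs)            == 0x1C0, "spurs");
static_assert(offsetof(KernelMgmtMirror, wklCurrentAddr)   == 0x1D0, "wklCurrentAddr");
static_assert(offsetof(KernelMgmtMirror, wklCurrentId)     == 0x1DC, "wklCurrentId");
static_assert(offsetof(KernelMgmtMirror, x1EC)             == 0x1EC, "x1EC");
static_assert(offsetof(KernelMgmtMirror, x1EE)             == 0x1EE, "x1EE");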