From e055bf2692e0df0e55e5d9be379ca33e608a79aa Mon Sep 17 00:00:00 2001 From: capriots <29807355+capriots@users.noreply.github.com> Date: Sun, 8 Dec 2024 16:03:28 +0100 Subject: [PATCH] cellAdec: review fixes --- rpcs3/Emu/Cell/Modules/cellAdec.cpp | 103 ++++++++++++++-------------- rpcs3/Emu/Cell/Modules/cellAdec.h | 4 +- rpcs3/util/simd.hpp | 37 ++++++++++ 3 files changed, 89 insertions(+), 55 deletions(-) diff --git a/rpcs3/Emu/Cell/Modules/cellAdec.cpp b/rpcs3/Emu/Cell/Modules/cellAdec.cpp index cccdee54a0..dfc91c8d2f 100644 --- a/rpcs3/Emu/Cell/Modules/cellAdec.cpp +++ b/rpcs3/Emu/Cell/Modules/cellAdec.cpp @@ -403,8 +403,8 @@ void LpcmDecContext::exec(ppu_thread& ppu) const v128 f32_1 = gv_mulfs(gv_cvts32_tofs(s32_1), 1.f / static_cast(0x80000000u)); const v128 f32_2 = gv_mulfs(gv_cvts32_tofs(s32_2), 1.f / static_cast(0x80000000u)); - *reinterpret_cast(&_output[i]) = gv_to_be32(f32_1); - *reinterpret_cast(&_output[i + 4]) = gv_to_be32(f32_2); + v128::storeu(gv_to_be32(f32_1), &_output[i]); + v128::storeu(gv_to_be32(f32_2), &_output[i + 4]); } for (; i < au_size_s16; i++) @@ -425,7 +425,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) // Convert to float and divide by INT32_MAX + 1 const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); - *reinterpret_cast(&_output[i]) = gv_to_be32(_f32); + v128::storeu(gv_to_be32(_f32), &_output[i]); } for (; i * 3 <= au_size_u8 - 3; i++) @@ -452,9 +452,9 @@ void LpcmDecContext::exec(ppu_thread& ppu) case CELL_ADEC_CH_MONO: for (s32 i = 0; i < sample_num / 2; i += 4) { - const v128 tmp1 = *reinterpret_cast(&_output[i * 2]); - const v128 tmp2 = *reinterpret_cast(&_output[i * 2 + 4]); - *reinterpret_cast(&_output[i]) = gv_shufflefs<0 << 0 | 2 << 2 | 0 << 4 | 2 << 6>(tmp1, tmp2); // Remove every other sample + const v128 tmp1 = v128::loadu(&_output[i * 2]); + const v128 tmp2 = v128::loadu(&_output[i * 2 + 4]); + v128::storeu(gv_shufflefs<0, 2, 0, 2>(tmp1, tmp2), &_output[i]); // Remove every other sample } break; @@ -466,7 +466,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) case CELL_ADEC_CH_3_0: for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 4, i_out += 3) { - const v128 tmp = gv_shuffle32<0 << 0 | 2 << 2 | 1 << 4 | 3 << 6>(*reinterpret_cast(&_output[i_in])); // Swap Front Right and Center + const v128 tmp = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center v128::storeu(tmp, &_output[i_out]); } break; @@ -474,7 +474,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) case CELL_ADEC_CH_2_1: for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 4, i_out += 3) { - v128::storeu(*reinterpret_cast(&_output[i_in]), &_output[i_out]); + v128::storeu(v128::loadu(&_output[i_in]), &_output[i_out]); } break; @@ -490,7 +490,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) case CELL_ADEC_CH_3_2: for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 6, i_out += 5) { - const v128 tmp = gv_shuffle32<0 << 0 | 2 << 2 | 1 << 4 | 3 << 6>(v128::loadu(&_output[i_in])); // Swap Front Right and Center + const v128 tmp = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center v128::storeu(tmp, &_output[i_out]); _output[i_out + 4] = _output[i_in + 4]; } @@ -499,8 +499,8 @@ void LpcmDecContext::exec(ppu_thread& ppu) case CELL_ADEC_CH_3_4: for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 8, i_out += 7) { - const v128 tmp1 = gv_shuffle32<0 << 0 | 2 << 2 | 1 << 4 | 3 << 6>(*reinterpret_cast(&_output[i_in])); // Swap Front Right and Center - const v128 tmp2 = gv_shuffle32<2 << 0 | 0 << 2 | 1 << 4 | 3 << 6>(*reinterpret_cast(&_output[i_in + 4])); // Reorder Rear Left, Rear Right, Side Right -> Side Right, Rear Left, Rear Right + const v128 tmp1 = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center + const v128 tmp2 = gv_shuffle32<2, 0, 1, 3>(v128::loadu(&_output[i_in + 4])); // Reorder Rear Left, Rear Right, Side Right -> Side Right, Rear Left, Rear Right v128::storeu(tmp1, &_output[i_out]); v128::storeu(tmp2, &_output[i_out + 4]); } @@ -509,8 +509,8 @@ void LpcmDecContext::exec(ppu_thread& ppu) case CELL_ADEC_CH_3_4_LFE: for (s32 i = 0; i < sample_num; i += 8) { - const v128 tmp1 = gv_shuffle32<3 << 0 | 2 << 2 | 0 << 4 | 1 << 6>(*reinterpret_cast(&_output[i + 4])); // Reorder Rear Left, Rear Right, Side Right, LFE -> LFE, Side Right, Rear Left, Rear Right - *reinterpret_cast(&_output[i + 4]) = tmp1; + const v128 tmp1 = gv_shuffle32<3, 2, 0, 1>(v128::loadu(&_output[i + 4])); // Reorder Rear Left, Rear Right, Side Right, LFE -> LFE, Side Right, Rear Left, Rear Right + v128::storeu(tmp1, &_output[i + 4]); const u64 tmp2 = std::rotl(read_from_ptr(&_output[i + 3]), 0x20); // Swap Side Left and LFE std::memcpy(&_output[i + 3], &tmp2, sizeof(u64)); } @@ -569,7 +569,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) // Convert to float and divide by INT32_MAX + 1 const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); - *reinterpret_cast(&_output[i_out]) = gv_to_be32(_f32); + v128::storeu(gv_to_be32(_f32), &_output[i_out]); } for (; i_in <= au_size_s16 - 2; i_in += channel_num, i_out += 2) @@ -602,7 +602,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) ? v128::normal_array_t{ -1, 8, 1, 0, -1, 8, 3, 2, -1, 10, 5, 4, -1, 11, 7, 6 } : v128::normal_array_t{ 0, 1, 8, -1, 2, 3, 8, -1, 4, 5, 10, -1, 6, 7, 11, -1 }; - const v128 shuffle_ctrl = channel_num & 1 ? shuffle_ctrl_different_offset : shuffle_ctrl_same_offset; + const v128 shuffle_ctrl = channel_num & 1 ? v128::loadu(&shuffle_ctrl_different_offset) : v128::loadu(&shuffle_ctrl_same_offset); alignas(alignof(v128)) static constexpr auto low_bits_mask_same_offset = std::endian::native == std::endian::little ? v128::normal_array_t{ 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff } @@ -612,7 +612,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) ? v128::normal_array_t{ 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0xf0, 0xff, 0xff } : v128::normal_array_t{ 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0xf0, 0x00 }; - const v128 low_bits_mask = channel_num & 1 ? low_bits_mask_different_offset : low_bits_mask_same_offset; + const v128 low_bits_mask = channel_num & 1 ? v128::loadu(&low_bits_mask_different_offset) : v128::loadu(&low_bits_mask_same_offset); for (s64 i_in = 0, i_out = 0; i_in <= au_size_u8 - low_bits_3_4_offset - (channel_num & 1); i_in += next_samples_offset, i_out += 4) { @@ -634,7 +634,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) // Convert to float and divide by INT32_MAX + 1 const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); - *reinterpret_cast(&_output[i_out]) = gv_to_be32(_f32); + v128::storeu(gv_to_be32(_f32), &_output[i_out]); } break; } @@ -662,7 +662,7 @@ void LpcmDecContext::exec(ppu_thread& ppu) // Convert to float and divide by INT32_MAX + 1 const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); - *reinterpret_cast(&_output[i_out]) = gv_to_be32(_f32); + v128::storeu(gv_to_be32(_f32), &_output[i_out]); } } } @@ -820,16 +820,18 @@ error_code _CellAdecCoreOpOpenExt_lpcm(ppu_thread& ppu, vm::ptr const vm::var queue_mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem06"_u64 } }}; const vm::var cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec03"_u64 } }}; - if (error_code ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_size_mutex), mutex_attr); ret != CELL_OK - || (ret = sys_cond_create(ppu, handle.ptr(&LpcmDecContext::queue_size_cond), handle->queue_size_mutex, cond_attr)) != CELL_OK - || (ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::unk_mutex), mutex_attr)) != CELL_OK - || (ret = sys_cond_create(ppu, handle.ptr(&LpcmDecContext::unk_cond), handle->unk_mutex, cond_attr)) != CELL_OK - || (ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::output_mutex), output_mutex_attr)) != CELL_OK - || (ret = sys_cond_create(ppu, handle.ptr(&LpcmDecContext::output_consumed), handle->output_mutex, cond_attr)) != CELL_OK - || (ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_mutex), queue_mutex_attr)) != CELL_OK - || (ret = handle->release_output(ppu)) != CELL_OK - || (ret = handle->cmd_available.init(ppu, handle.ptr(&LpcmDecContext::cmd_available), 0)) != CELL_OK - || (ret = handle->reserved2.init(ppu, handle.ptr(&LpcmDecContext::reserved2), 0)) != CELL_OK) + error_code ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_size_mutex), mutex_attr); + ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::queue_size_cond), handle->queue_size_mutex, cond_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::unk_mutex), mutex_attr); + ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::unk_cond), handle->unk_mutex, cond_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::output_mutex), output_mutex_attr); + ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::output_consumed), handle->output_mutex, cond_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_mutex), queue_mutex_attr); + ret = ret ? ret : handle->release_output(ppu); + ret = ret ? ret : handle->cmd_available.init(ppu, handle.ptr(&LpcmDecContext::cmd_available), 0); + ret = ret ? ret : handle->reserved2.init(ppu, handle.ptr(&LpcmDecContext::reserved2), 0); + + if (ret != CELL_OK) { return ret; } @@ -841,14 +843,11 @@ error_code _CellAdecCoreOpOpenExt_lpcm(ppu_thread& ppu, vm::ptr const vm::var _name = vm::make_str("HLE LPCM decoder"); const auto entry = g_fxo->get().func_addr(FIND_FUNC(lpcmDecEntry)); - if (error_code ret = ppu_execute<&sys_ppu_thread_create>(ppu, handle.ptr(&LpcmDecContext::thread_id), entry, handle.addr(), +res->ppuThreadPriority, +res->ppuThreadStackSize, SYS_PPU_THREAD_CREATE_JOINABLE, +_name); ret != CELL_OK - || (ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_pop_mutex), mutex_attr)) != CELL_OK - || (ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_push_mutex), mutex_attr)) != CELL_OK) - { - return ret; - } + ret = ppu_execute<&sys_ppu_thread_create>(ppu, handle.ptr(&LpcmDecContext::thread_id), entry, handle.addr(), +res->ppuThreadPriority, +res->ppuThreadStackSize, SYS_PPU_THREAD_CREATE_JOINABLE, +_name); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_pop_mutex), mutex_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_push_mutex), mutex_attr); - return CELL_OK; + return ret; } error_code _CellAdecCoreOpOpen_lpcm(ppu_thread& ppu, vm::ptr handle, vm::ptr notifyAuDone, vm::ptr notifyAuDoneArg, vm::ptr notifyPcmOut, vm::ptr notifyPcmOutArg, @@ -900,27 +899,25 @@ error_code _CellAdecCoreOpClose_lpcm(ppu_thread& ppu, vm::ptr ha } } + error_code ret = sys_mutex_unlock(ppu, handle->queue_size_mutex); + ret = ret ? ret : handle->release_output(ppu); + vm::var thread_ret; + ret = ret ? ret : sys_ppu_thread_join(ppu, static_cast(handle->thread_id), +thread_ret); - if (error_code ret = sys_mutex_unlock(ppu, handle->queue_size_mutex); ret != CELL_OK - || (ret = handle->release_output(ppu)) != CELL_OK - || (ret = sys_ppu_thread_join(ppu, static_cast(handle->thread_id), +thread_ret)) != CELL_OK - || (ret = sys_cond_destroy(ppu, handle->queue_size_cond)) != CELL_OK - || (ret = sys_cond_destroy(ppu, handle->unk_cond)) != CELL_OK - || (ret = sys_cond_destroy(ppu, handle->output_consumed)) != CELL_OK - || (ret = sys_mutex_destroy(ppu, handle->queue_mutex)) != CELL_OK - || (ret = sys_mutex_destroy(ppu, handle->queue_size_mutex)) != CELL_OK - || (ret = sys_mutex_destroy(ppu, handle->unk_mutex)) != CELL_OK - || (ret = sys_mutex_destroy(ppu, handle->output_mutex)) != CELL_OK - || (ret = handle->cmd_available.finalize(ppu)) != CELL_OK - || (ret = handle->reserved2.finalize(ppu)) != CELL_OK - || (ret = sys_mutex_destroy(ppu, handle->spurs_queue_pop_mutex)) != CELL_OK - || (ret = sys_mutex_destroy(ppu, handle->spurs_queue_push_mutex)) != CELL_OK) - { - return ret; - } + ret = ret ? ret : sys_cond_destroy(ppu, handle->queue_size_cond); + ret = ret ? ret : sys_cond_destroy(ppu, handle->unk_cond); + ret = ret ? ret : sys_cond_destroy(ppu, handle->output_consumed); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->queue_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->queue_size_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->unk_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->output_mutex); + ret = ret ? ret : handle->cmd_available.finalize(ppu); + ret = ret ? ret : handle->reserved2.finalize(ppu); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->spurs_queue_pop_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->spurs_queue_push_mutex); - return CELL_OK; + return ret; } error_code _CellAdecCoreOpStartSeq_lpcm(ppu_thread& ppu, vm::ptr handle, vm::ptr lpcmParam) diff --git a/rpcs3/Emu/Cell/Modules/cellAdec.h b/rpcs3/Emu/Cell/Modules/cellAdec.h index b79c07f018..a43c3f0aef 100644 --- a/rpcs3/Emu/Cell/Modules/cellAdec.h +++ b/rpcs3/Emu/Cell/Modules/cellAdec.h @@ -253,7 +253,7 @@ enum CellAdecSampleRate : s32 CELL_ADEC_FS_8kHz, }; -enum CellAdecBitLength : s32 +enum CellAdecBitLength : u32 { CELL_ADEC_BIT_LENGTH_RESERVED1, CELL_ADEC_BIT_LENGTH_16, @@ -762,7 +762,7 @@ public: return {}; } - if (value == 0) + if (value == 0u) { savestate = lpcm_dec_state::waiting_for_cmd_cond_wait; cond_wait: diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index e5d68b53b9..750bb97fd1 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -3192,6 +3192,24 @@ inline v128 gv_shuffle32(const v128& vec) #endif } +// For each index, r = vec[index & 3] +template +inline v128 gv_shuffle32(const v128& vec) +{ +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 }; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + // For the first two 2-bit indices in Control, r = a[index], // for the last two indices, r = b[index] template @@ -3211,6 +3229,25 @@ inline v128 gv_shufflefs(const v128& a, const v128& b) #endif } +// For the first two indices, r = a[index & 3], +// for the last two indices, r = b[index & 3] +template +inline v128 gv_shufflefs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 }; + + return vqtbl2q_s8({ a, b }, idx_vec); +#endif +} + // For each 32-bit element, reverse byte order inline v128 gv_rev32(const v128& vec) {