diff --git a/rpcs3/Emu/Cell/Modules/cellAdec.cpp b/rpcs3/Emu/Cell/Modules/cellAdec.cpp index 43c5f7989b..dfc91c8d2f 100644 --- a/rpcs3/Emu/Cell/Modules/cellAdec.cpp +++ b/rpcs3/Emu/Cell/Modules/cellAdec.cpp @@ -1,11 +1,14 @@ #include "stdafx.h" +#include "Emu/perf_meter.hpp" #include "Emu/Cell/PPUModule.h" #include "Emu/Cell/lv2/sys_sync.h" +#include "Emu/Cell/lv2/sys_ppu_thread.h" #include "Emu/savestate_utils.hpp" +#include "sysPrxForUser.h" #include "cellAdec.h" -#include +#include "util/simd.hpp" #include "util/asm.hpp" LOG_CHANNEL(cellAdec); @@ -225,57 +228,732 @@ void fmt_class_string::format(std::string& out, u64 arg) vm::gvar g_cell_adec_core_ops_lpcm; +void LpcmDecContext::exec(ppu_thread& ppu) +{ + perf_meter<"LPCMDEC"_u64> perf0; + + switch (savestate) + { + case lpcm_dec_state::waiting_for_cmd_mutex_lock: break; + case lpcm_dec_state::waiting_for_cmd_cond_wait: break; + case lpcm_dec_state::waiting_for_output_mutex_lock: goto output_mutex_lock; + case lpcm_dec_state::waiting_for_output_cond_wait: goto output_cond_wait; + case lpcm_dec_state::queue_mutex_lock: goto queue_mutex_lock; + case lpcm_dec_state::executing_cmd: goto execute_cmd; + } + + for (; run_thread; cmd_counter++) + { + cellAdec.trace("Command counter: %llu, waiting for next command...", cmd_counter); + + // Wait for a command to become available + error_occurred |= static_cast(cmd_available.acquire(ppu, savestate) != CELL_OK); + + if (ppu.state & cpu_flag::again) + { + return; + } + + cellAdec.trace("Command available, waiting for output to be consumed..."); + + // Wait for the output to be consumed. + // The output has to be consumed even if the next command is not a decode command + savestate = lpcm_dec_state::waiting_for_output_mutex_lock; + output_mutex_lock: + + error_occurred |= static_cast(sys_mutex_lock(ppu, output_mutex, 0) != CELL_OK); + + if (ppu.state & cpu_flag::again) + { + return; + } + + while (output_locked) + { + savestate = lpcm_dec_state::waiting_for_output_cond_wait; + output_cond_wait: + + ensure(sys_cond_wait(ppu, output_consumed, 0) == CELL_OK); // Error code isn't checked on LLE + + if (ppu.state & cpu_flag::again) + { + return; + } + } + + cellAdec.trace("Output consumed"); + + // Pop command from queue + savestate = lpcm_dec_state::queue_mutex_lock; + queue_mutex_lock: + + ensure(sys_mutex_lock(ppu, queue_mutex, 0) == CELL_OK); // Error code isn't checked on LLE + + if (ppu.state & cpu_flag::again) + { + return; + } + + cmd_queue.pop(cmd); + + ensure(sys_mutex_unlock(ppu, queue_mutex) == CELL_OK); // Error code isn't checked on LLE + + cellAdec.trace("Command type: %d", static_cast(cmd.type.get())); + + savestate = lpcm_dec_state::executing_cmd; + execute_cmd: + + switch (cmd.type) + { + case LpcmDecCmdType::start_seq: + // LLE sends a command to the SPU thread. The SPU thread consumes the command without doing anything, however + error_occurred |= static_cast(sys_mutex_unlock(ppu, output_mutex) != CELL_OK); + break; + + case LpcmDecCmdType::end_seq: + { + // Block savestate creation during callbacks + std::unique_lock savestate_lock{g_fxo->get(), std::try_to_lock}; + + if (!savestate_lock.owns_lock()) + { + ppu.state += cpu_flag::again; + return; + } + + // Doesn't do anything else + notify_seq_done.cbFunc(ppu, notify_seq_done.cbArg); + + error_occurred |= static_cast(sys_mutex_unlock(ppu, output_mutex) != CELL_OK); + break; + } + case LpcmDecCmdType::close: + ensure(sys_mutex_unlock(ppu, output_mutex) == CELL_OK); // Error code isn't checked on LLE + return; + + case LpcmDecCmdType::decode_au: + { + // For 20 and 24-bit samples + const u8* const input_u8 = static_cast(cmd.au_start_addr.get_ptr()); + const s64 au_size_u8 = cmd.au_size; + + // For 16-bit samples + const be_t* const input_s16 = static_cast*>(cmd.au_start_addr.get_ptr()); + const s64 au_size_s16 = static_cast(au_size_u8 / sizeof(s16)); + + be_t* const _output = std::assume_aligned<0x80>(output.get_ptr()); + s64 output_size = cmd.au_size; + + s32 sample_num = static_cast(utils::align(+lpcm_param->audioPayloadSize, 0x10)); + s32 channel_num = 0; + + if (!dvd_packing) + { + switch (lpcm_param->sizeOfWord) + { + case CELL_ADEC_BIT_LENGTH_16: output_size = output_size * 32 / 16; sample_num /= 2; break; + case CELL_ADEC_BIT_LENGTH_20: // Same as below + case CELL_ADEC_BIT_LENGTH_24: output_size = output_size * 32 / 24; sample_num /= 3; break; + default: ; // LLE skips decoding entirely, the output buffer isn't written to, and it outputs whatever was there before + } + + // LPCM streams with an odd number of channels contain an empty dummy channel + switch (lpcm_param->channelNumber) + { + case CELL_ADEC_CH_MONO: channel_num = 1; output_size = output_size * 1 / 2; break; + case CELL_ADEC_CH_STEREO: channel_num = 2; break; + case CELL_ADEC_CH_3_0: // Same as below + case CELL_ADEC_CH_2_1: channel_num = 3; output_size = output_size * 3 / 4; break; + case CELL_ADEC_CH_3_1: // Same as below + case CELL_ADEC_CH_2_2: channel_num = 4; break; + case CELL_ADEC_CH_3_2: channel_num = 5; output_size = output_size * 5 / 6; break; + case CELL_ADEC_CH_3_2_LFE: channel_num = 6; break; + case CELL_ADEC_CH_3_4: channel_num = 7; output_size = output_size * 7 / 8; break; + case CELL_ADEC_CH_3_4_LFE: channel_num = 8; break; + default: ; // Don't do anything, LLE simply skips reordering channels + } + + // LLE doesn't check the output size + ensure(output_size <= LPCM_DEC_OUTPUT_BUFFER_SIZE); + ensure(sample_num * sizeof(f32) <= LPCM_DEC_OUTPUT_BUFFER_SIZE); + + // Convert to float + if (lpcm_param->sizeOfWord == CELL_ADEC_BIT_LENGTH_16) + { + s64 i = 0; + for (; i <= au_size_s16 - 8; i += 8) + { + const v128 s16be = v128::loadu(&input_s16[i]); + + // Convert endianess if necessary and shift left by 16 +#if defined(ARCH_X64) && !defined(__SSSE3__) + const v128 s16le = gv_rol16<8>(s16be); + const v128 s32_1 = gv_unpacklo16(gv_bcst16(0), s16le); + const v128 s32_2 = gv_unpackhi16(gv_bcst16(0), s16le); +#else + const v128 s32_1 = std::endian::native == std::endian::little + ? gv_shuffle8(s16be, v128::normal_array_t{ -1, -1, 1, 0, -1, -1, 3, 2, -1, -1, 5, 4, -1, -1, 7, 6 }) + : gv_unpacklo16(s16be, gv_bcst16(0)); + + const v128 s32_2 = std::endian::native == std::endian::little + ? gv_shuffle8(s16be, v128::normal_array_t{ -1, -1, 9, 8, -1, -1, 11, 10, -1, -1, 13, 12, -1, -1, 15, 14 }) + : gv_unpackhi16(s16be, gv_bcst16(0)); +#endif + // Convert to float and divide by INT32_MAX + 1 + const v128 f32_1 = gv_mulfs(gv_cvts32_tofs(s32_1), 1.f / static_cast(0x80000000u)); + const v128 f32_2 = gv_mulfs(gv_cvts32_tofs(s32_2), 1.f / static_cast(0x80000000u)); + + v128::storeu(gv_to_be32(f32_1), &_output[i]); + v128::storeu(gv_to_be32(f32_2), &_output[i + 4]); + } + + for (; i < au_size_s16; i++) + { + _output[i] = static_cast(input_s16[i]) / 0x8000u; + } + } + else if (lpcm_param->sizeOfWord == CELL_ADEC_BIT_LENGTH_20 || lpcm_param->sizeOfWord == CELL_ADEC_BIT_LENGTH_24) + { + s64 i = 0; + for (; i * 3 <= au_size_u8 - static_cast(sizeof(v128)); i += 4) + { + // Load four samples, convert endianness if necessary and shift left by 8 + const v128 _s32 = std::endian::native == std::endian::little + ? gv_shuffle8(v128::loadu(&input_u8[i * 3]), v128::normal_array_t{ -1, 2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9 }) + : gv_shuffle8(v128::loadu(&input_u8[i * 3]), v128::normal_array_t{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }); + + // Convert to float and divide by INT32_MAX + 1 + const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); + + v128::storeu(gv_to_be32(_f32), &_output[i]); + } + + for (; i * 3 <= au_size_u8 - 3; i++) + { + alignas(alignof(s32)) const u8 s32le[4] = { 0, input_u8[i * 3 + 2], input_u8[i * 3 + 1], input_u8[i * 3] }; + + _output[i] = static_cast(std::bit_cast>(s32le)) / static_cast(0x80000000u); + } + } + + // Reorder channels and remove the dummy channel + + // Input channel order: + // Front Left, Front Right, Center, Side Left, Rear Left, Rear Right, Side Right, LFE + + // Output channel order: + // - up to 3_4: Front Left, Center, Front Right, Side Left, Side Right, Rear Left, Rear Right, LFE + // - 3_4_LFE: Front Left, Front Right, Center, LFE, Side Left, Side Right, Rear Left, Rear Right + + // The following loops can access sample_num % channel_num * sizeof(f32) bytes more than LPCM_DEC_OUTPUT_BUFFER_SIZE (up to 28 bytes). + // This is intended, LLE does something similar. The buffer is much larger than LPCM_DEC_OUTPUT_BUFFER_SIZE (see _CellAdecCoreOpGetMemSize_lpcm()) + switch (lpcm_param->channelNumber) + { + case CELL_ADEC_CH_MONO: + for (s32 i = 0; i < sample_num / 2; i += 4) + { + const v128 tmp1 = v128::loadu(&_output[i * 2]); + const v128 tmp2 = v128::loadu(&_output[i * 2 + 4]); + v128::storeu(gv_shufflefs<0, 2, 0, 2>(tmp1, tmp2), &_output[i]); // Remove every other sample + } + break; + + case CELL_ADEC_CH_STEREO: + case CELL_ADEC_CH_2_2: + // Input order == output order, no need to do anything + break; + + case CELL_ADEC_CH_3_0: + for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 4, i_out += 3) + { + const v128 tmp = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center + v128::storeu(tmp, &_output[i_out]); + } + break; + + case CELL_ADEC_CH_2_1: + for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 4, i_out += 3) + { + v128::storeu(v128::loadu(&_output[i_in]), &_output[i_out]); + } + break; + + case CELL_ADEC_CH_3_1: + case CELL_ADEC_CH_3_2_LFE: + for (s32 i = 0; i < sample_num; i += channel_num) + { + const u64 tmp = std::rotl(read_from_ptr(&_output[i + 1]), 0x20); // Swap Front Right and Center + std::memcpy(&_output[i + 1], &tmp, sizeof(u64)); + } + break; + + case CELL_ADEC_CH_3_2: + for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 6, i_out += 5) + { + const v128 tmp = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center + v128::storeu(tmp, &_output[i_out]); + _output[i_out + 4] = _output[i_in + 4]; + } + break; + + case CELL_ADEC_CH_3_4: + for (s32 i_in = 0, i_out = 0; i_in < sample_num; i_in += 8, i_out += 7) + { + const v128 tmp1 = gv_shuffle32<0, 2, 1, 3>(v128::loadu(&_output[i_in])); // Swap Front Right and Center + const v128 tmp2 = gv_shuffle32<2, 0, 1, 3>(v128::loadu(&_output[i_in + 4])); // Reorder Rear Left, Rear Right, Side Right -> Side Right, Rear Left, Rear Right + v128::storeu(tmp1, &_output[i_out]); + v128::storeu(tmp2, &_output[i_out + 4]); + } + break; + + case CELL_ADEC_CH_3_4_LFE: + for (s32 i = 0; i < sample_num; i += 8) + { + const v128 tmp1 = gv_shuffle32<3, 2, 0, 1>(v128::loadu(&_output[i + 4])); // Reorder Rear Left, Rear Right, Side Right, LFE -> LFE, Side Right, Rear Left, Rear Right + v128::storeu(tmp1, &_output[i + 4]); + const u64 tmp2 = std::rotl(read_from_ptr(&_output[i + 3]), 0x20); // Swap Side Left and LFE + std::memcpy(&_output[i + 3], &tmp2, sizeof(u64)); + } + break; + + default: + ; // Don't do anything + } + } + else + { + switch (lpcm_param->sizeOfWord) + { + case CELL_ADEC_BIT_LENGTH_16: output_size = output_size * 32 / 16; break; + case CELL_ADEC_BIT_LENGTH_20: output_size = output_size * 32 / 20; break; + case CELL_ADEC_BIT_LENGTH_24: output_size = output_size * 32 / 24; break; + default: fmt::throw_exception("Unreachable"); // Parameters get verified in adecSetLpcmDvdParams() + } + + // Only the front left and right channels are decoded, all other channels are ignored + switch (lpcm_param->channelNumber) + { + case CELL_ADEC_LPCM_DVD_CH_MONO: // Set channel_num to two for mono as well + case CELL_ADEC_LPCM_DVD_CH_STEREO: channel_num = 2; break; + case 4: channel_num = 3; output_size = output_size * 2 / 3; break; + case 5: channel_num = 4; output_size = output_size * 2 / 4; break; + case CELL_ADEC_LPCM_DVD_CH_3_2: channel_num = 5; output_size = output_size * 2 / 5; break; + case CELL_ADEC_LPCM_DVD_CH_3_2_LFE: channel_num = 6; output_size = output_size * 2 / 6; break; + case CELL_ADEC_LPCM_DVD_CH_3_4: channel_num = 7; output_size = output_size * 2 / 7; break; + case CELL_ADEC_LPCM_DVD_CH_3_4_LFE: channel_num = 8; output_size = output_size * 2 / 8; break; + default: fmt::throw_exception("Unreachable"); // Parameters get verified in adecSetLpcmDvdParams() + } + + // LLE doesn't check the output size + ensure(output_size <= LPCM_DEC_OUTPUT_BUFFER_SIZE); + + // Convert to float + switch (lpcm_param->sizeOfWord) + { + case CELL_ADEC_BIT_LENGTH_16: + { + s64 i_in = 0; + s64 i_out = 0; + for (; i_in <= au_size_s16 - channel_num - 2; i_in += channel_num * 2, i_out += 4) + { + // Load four samples + const v128 tmp1 = gv_loadu32(&input_s16[i_in]); + const v128 tmp2 = gv_loadu32(&input_s16[i_in + channel_num]); + const v128 s16be = gv_unpacklo32(tmp1, tmp2); + + // Convert endianess if necessary and shift left by 16 + const v128 _s32 = std::endian::native == std::endian::little + ? gv_shuffle8(s16be, v128::normal_array_t{ -1, -1, 1, 0, -1, -1, 3, 2, -1, -1, 5, 4, -1, -1, 7, 6 }) + : gv_unpacklo16(s16be, gv_bcst16(0)); + + // Convert to float and divide by INT32_MAX + 1 + const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); + + v128::storeu(gv_to_be32(_f32), &_output[i_out]); + } + + for (; i_in <= au_size_s16 - 2; i_in += channel_num, i_out += 2) + { + const v128 s16be = gv_loadu32(&input_s16[i_in]); + + const v128 _s32 = std::endian::native == std::endian::little + ? gv_shuffle8(s16be, v128::normal_array_t{ -1, -1, 1, 0, -1, -1, 3, 2, -1, -1, 5, 4, -1, -1, 7, 6 }) + : gv_unpacklo16(s16be, gv_bcst16(0)); + + const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); + + std::memcpy(&_output[i_out], &gv_to_be32(_f32)._u64[0], sizeof(u64)); + } + break; + } + case CELL_ADEC_BIT_LENGTH_20: + { + const s64 high_bytes_3_4_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 5 : channel_num * 2; + const s64 low_bits_1_2_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 4 : channel_num * 4; + const s64 low_bits_3_4_offset = channel_num * 4 + channel_num / 2 - !(channel_num & 1); + const s64 next_samples_offset = channel_num * 5; + + // If channel_num is odd, the low bits of samples three and four are in different bytes + alignas(alignof(v128)) static constexpr auto shuffle_ctrl_same_offset = std::endian::native == std::endian::little + ? v128::normal_array_t{ -1, 8, 1, 0, -1, 8, 3, 2, -1, 11, 5, 4, -1, 11, 7, 6 } + : v128::normal_array_t{ 0, 1, 8, -1, 2, 3, 8, -1, 4, 5, 11, -1, 6, 7, 11, -1 }; + + alignas(alignof(v128)) static constexpr auto shuffle_ctrl_different_offset = std::endian::native == std::endian::little + ? v128::normal_array_t{ -1, 8, 1, 0, -1, 8, 3, 2, -1, 10, 5, 4, -1, 11, 7, 6 } + : v128::normal_array_t{ 0, 1, 8, -1, 2, 3, 8, -1, 4, 5, 10, -1, 6, 7, 11, -1 }; + + const v128 shuffle_ctrl = channel_num & 1 ? v128::loadu(&shuffle_ctrl_different_offset) : v128::loadu(&shuffle_ctrl_same_offset); + + alignas(alignof(v128)) static constexpr auto low_bits_mask_same_offset = std::endian::native == std::endian::little + ? v128::normal_array_t{ 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff } + : v128::normal_array_t{ 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00 }; + + alignas(alignof(v128)) static constexpr auto low_bits_mask_different_offset = std::endian::native == std::endian::little + ? v128::normal_array_t{ 0x00, 0xf0, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0x0f, 0xff, 0xff, 0x00, 0xf0, 0xff, 0xff } + : v128::normal_array_t{ 0xff, 0xff, 0xf0, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0x0f, 0x00, 0xff, 0xff, 0xf0, 0x00 }; + + const v128 low_bits_mask = channel_num & 1 ? v128::loadu(&low_bits_mask_different_offset) : v128::loadu(&low_bits_mask_same_offset); + + for (s64 i_in = 0, i_out = 0; i_in <= au_size_u8 - low_bits_3_4_offset - (channel_num & 1); i_in += next_samples_offset, i_out += 4) + { + // Load all the high and low bits of four samples + const v128 tmp1 = gv_loadu32(&input_u8[i_in]); + const v128 tmp2 = gv_loadu32(&input_u8[i_in + high_bytes_3_4_offset]); + v128 s20be = gv_unpacklo32(tmp1, tmp2); + s20be = gv_insert16<4>(s20be, read_from_ptr(&input_u8[i_in + low_bits_1_2_offset])); + s20be = gv_insert16<5>(s20be, read_from_ptr(&input_u8[i_in + low_bits_3_4_offset])); + + // Reorder bytes to form four 32-bit integer samples + v128 _s32 = gv_shuffle8(s20be, shuffle_ctrl); + + // Set low 12 bits to zero for each sample + _s32 = _s32 & low_bits_mask; + + // LLE is missing a step: each byte that was ANDed with 0x0f would still need to be shifted left by 4 + + // Convert to float and divide by INT32_MAX + 1 + const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); + + v128::storeu(gv_to_be32(_f32), &_output[i_out]); + } + break; + } + case CELL_ADEC_BIT_LENGTH_24: + { + const s64 high_bytes_3_4_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 6 : channel_num * 2; + const s64 low_bytes_1_2_offset = lpcm_param->channelNumber == CELL_ADEC_LPCM_DVD_CH_MONO ? 4 : channel_num * 4; + const s64 low_bytes_3_4_offset = channel_num * 5; + const s64 next_samples_offset = channel_num * 6; + + for (s64 i_in = 0, i_out = 0; i_in <= au_size_u8 - low_bytes_3_4_offset - 2; i_in += next_samples_offset, i_out += 4) + { + // Load all the high and low bytes of four samples + const v128 tmp1 = gv_loadu32(&input_u8[i_in]); + const v128 tmp2 = gv_loadu32(&input_u8[i_in + high_bytes_3_4_offset]); + v128 s24be = gv_unpacklo32(tmp1, tmp2); + s24be = gv_insert16<4>(s24be, read_from_ptr(&input_u8[i_in + low_bytes_1_2_offset])); + s24be = gv_insert16<5>(s24be, read_from_ptr(&input_u8[i_in + low_bytes_3_4_offset])); + + // Reorder bytes to form four 32-bit integer samples + const v128 _s32 = std::endian::native == std::endian::little + ? gv_shuffle8(s24be, v128::normal_array_t{ -1, 8, 1, 0, -1, 9, 3, 2, -1, 10, 5, 4, -1, 11, 7, 6 }) + : gv_shuffle8(s24be, v128::normal_array_t{ 0, 1, 8, -1, 2, 3, 9, -1, 4, 5, 10, -1, 6, 7, 11, -1 }); + + // Convert to float and divide by INT32_MAX + 1 + const v128 _f32 = gv_mulfs(gv_cvts32_tofs(_s32), 1.f / static_cast(0x80000000u)); + + v128::storeu(gv_to_be32(_f32), &_output[i_out]); + } + } + } + } + + // Block savestate creation during callbacks + std::unique_lock savestate_lock{g_fxo->get(), std::try_to_lock}; + + if (!savestate_lock.owns_lock()) + { + ppu.state += cpu_flag::again; + return; + } + + if (error_occurred) + { + notify_error.cbFunc(ppu, CELL_ADEC_ERROR_FATAL, notify_error.cbArg); + } + + notify_au_done.cbFunc(ppu, cmd.pcm_handle, notify_au_done.cbArg); + + output_locked = true; + error_occurred |= static_cast(sys_mutex_unlock(ppu, output_mutex) != CELL_OK); + + const vm::var bsi_info{{ lpcm_param->channelNumber, lpcm_param->sampleRate, static_cast(output_size) }}; + + notify_pcm_out.cbFunc(ppu, cmd.pcm_handle, output, static_cast(output_size), notify_pcm_out.cbArg, vm::make_var>(bsi_info), ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV, error_occurred ? static_cast(CELL_ADEC_ERROR_FATAL) : CELL_OK); + break; + } + default: + fmt::throw_exception("Invalid command"); + } + } +} + +template +error_code LpcmDecContext::send_command(ppu_thread& ppu, auto&&... args) +{ + ppu.state += cpu_flag::wait; + + if (error_code ret = sys_mutex_lock(ppu, queue_size_mutex, 0); ret != CELL_OK) + { + return ret; + } + + if (cmd_queue.full()) + { + ensure(sys_mutex_unlock(ppu, queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE + return CELL_ADEC_ERROR_BUSY; + } + + // LLE copies the parameters directly into the context + if constexpr (type == LpcmDecCmdType::start_seq) + { + *lpcm_param = { args... }; + } + + if (error_code ret = sys_mutex_lock(ppu, queue_mutex, 0); ret != CELL_OK) + { + ensure(sys_mutex_unlock(ppu, queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE + return ret; + } + + cmd_queue.emplace(type, std::forward(args)...); + + if (error_code ret = sys_mutex_unlock(ppu, queue_mutex); ret != CELL_OK + || (ret = cmd_available.release(ppu)) != CELL_OK) + { + ensure(sys_mutex_unlock(ppu, queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE + return ret; + } + + return sys_mutex_unlock(ppu, queue_size_mutex); +} + +inline error_code LpcmDecContext::release_output(ppu_thread& ppu) +{ + if (error_code ret = sys_mutex_lock(ppu, output_mutex, 0); ret != CELL_OK) + { + return ret; + } + + output_locked = false; + + if (error_code ret = sys_cond_signal(ppu, output_consumed); ret != CELL_OK) + { + return ret; // LLE doesn't unlock the mutex + } + + return sys_mutex_unlock(ppu, output_mutex); +} + +void lpcmDecEntry(ppu_thread& ppu, vm::ptr lpcm_dec) +{ + lpcm_dec->exec(ppu); + + if (ppu.state & cpu_flag::again) + { + // For savestates, save argument + ppu.syscall_args[0] = lpcm_dec.addr(); + + return; + } + + ppu_execute<&sys_ppu_thread_exit>(ppu, CELL_OK); +} + error_code _CellAdecCoreOpGetMemSize_lpcm(vm::ptr attr) { - cellAdec.todo("_CellAdecCoreOpGetMemSize_lpcm(attr=*0x%x)", attr); + cellAdec.notice("_CellAdecCoreOpGetMemSize_lpcm(attr=*0x%x)", attr); + + constexpr u32 mem_size = + utils::align(static_cast(sizeof(LpcmDecContext)), 0x80) + + utils::align(static_cast(sizeof(CellAdecParamLpcm)), 0x80) + + 0x100 // Command data for Spurs task + + LPCM_DEC_OUTPUT_BUFFER_SIZE + + 0x2900 // sizeof(CellSpurs) + sizeof(CellSpursTaskset) + + 0x3b400 // Spurs context + + 0x300 // (sizeof(CellSpursQueue) + 0x80 + queue buffer) * 2 + + 0x855; // Unused + + static_assert(mem_size == 0x7ebd5); + + attr->workMemSize = mem_size; return CELL_OK; } -[[noreturn]] error_code _CellAdecCoreOpOpenExt_lpcm(vm::ptr handle, vm::ptr notifyAuDone, vm::ptr notifyAuDoneArg, vm::ptr notifyPcmOut, vm::ptr notifyPcmOutArg, +error_code _CellAdecCoreOpOpenExt_lpcm(ppu_thread& ppu, vm::ptr handle, vm::ptr notifyAuDone, vm::ptr notifyAuDoneArg, vm::ptr notifyPcmOut, vm::ptr notifyPcmOutArg, vm::ptr notifyError, vm::ptr notifyErrorArg, vm::ptr notifySeqDone, vm::ptr notifySeqDoneArg, vm::cptr res, vm::cptr spursRes) { - cellAdec.todo("_CellAdecCoreOpOpenExt_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=0x%x, notifyError=*0x%x, notifyErrorArg=0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=0x%x, res=*0x%x, spursRes=*0x%x)", + cellAdec.notice("_CellAdecCoreOpOpenExt_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=0x%x, notifyError=*0x%x, notifyErrorArg=0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=0x%x, res=*0x%x, spursRes=*0x%x)", handle, notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg, res, spursRes); - fmt::throw_exception("LPCM decoder not implemented, please disable HLE libadec.sprx"); + ensure(!!handle && !!res); // Not checked on LLE + ensure(handle.aligned(0x80)); // LLE doesn't check the alignment or aligns the address itself + ensure(!!notifyAuDone && !!notifyAuDoneArg && !!notifyPcmOut && !!notifyPcmOutArg && !!notifyError && !!notifyErrorArg && !!notifySeqDone && !!notifySeqDoneArg); // These should always be set + + const u32 end_of_context_addr = handle.addr() + utils::align(static_cast(sizeof(LpcmDecContext)), 0x80); + + handle->cmd_queue.front = 0; + handle->cmd_queue.back = 0; + handle->cmd_queue.size = 0; + handle->run_thread = true; + handle->notify_au_done = { notifyAuDone, notifyAuDoneArg }; + handle->notify_pcm_out = { notifyPcmOut, notifyPcmOutArg }; + handle->notify_error = { notifyError, notifyErrorArg }; + handle->notify_seq_done = { notifySeqDone, notifySeqDoneArg }; + handle->output = vm::bptr::make(end_of_context_addr + 0x180); + handle->lpcm_param.set(end_of_context_addr); + handle->error_occurred = false; + + const vm::var mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem04"_u64 } }}; + const vm::var output_mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem05"_u64 } }}; + const vm::var queue_mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem06"_u64 } }}; + const vm::var cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec03"_u64 } }}; + + error_code ret = sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_size_mutex), mutex_attr); + ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::queue_size_cond), handle->queue_size_mutex, cond_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::unk_mutex), mutex_attr); + ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::unk_cond), handle->unk_mutex, cond_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::output_mutex), output_mutex_attr); + ret = ret ? ret : sys_cond_create(ppu, handle.ptr(&LpcmDecContext::output_consumed), handle->output_mutex, cond_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::queue_mutex), queue_mutex_attr); + ret = ret ? ret : handle->release_output(ppu); + ret = ret ? ret : handle->cmd_available.init(ppu, handle.ptr(&LpcmDecContext::cmd_available), 0); + ret = ret ? ret : handle->reserved2.init(ppu, handle.ptr(&LpcmDecContext::reserved2), 0); + + if (ret != CELL_OK) + { + return ret; + } + + // HLE exclusive + handle->savestate = lpcm_dec_state::waiting_for_cmd_mutex_lock; + handle->cmd_counter = 0; + + const vm::var _name = vm::make_str("HLE LPCM decoder"); + const auto entry = g_fxo->get().func_addr(FIND_FUNC(lpcmDecEntry)); + + ret = ppu_execute<&sys_ppu_thread_create>(ppu, handle.ptr(&LpcmDecContext::thread_id), entry, handle.addr(), +res->ppuThreadPriority, +res->ppuThreadStackSize, SYS_PPU_THREAD_CREATE_JOINABLE, +_name); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_pop_mutex), mutex_attr); + ret = ret ? ret : sys_mutex_create(ppu, handle.ptr(&LpcmDecContext::spurs_queue_push_mutex), mutex_attr); + + return ret; } -[[noreturn]] error_code _CellAdecCoreOpOpen_lpcm(vm::ptr handle, vm::ptr notifyAuDone, vm::ptr notifyAuDoneArg, vm::ptr notifyPcmOut, vm::ptr notifyPcmOutArg, +error_code _CellAdecCoreOpOpen_lpcm(ppu_thread& ppu, vm::ptr handle, vm::ptr notifyAuDone, vm::ptr notifyAuDoneArg, vm::ptr notifyPcmOut, vm::ptr notifyPcmOutArg, vm::ptr notifyError, vm::ptr notifyErrorArg, vm::ptr notifySeqDone, vm::ptr notifySeqDoneArg, vm::cptr res) { - cellAdec.todo("_CellAdecCoreOpOpen_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=*0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=*0x%x, notifyError=*0x%x, notifyErrorArg=*0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=*0x%x, res=*0x%x)", + cellAdec.notice("_CellAdecCoreOpOpen_lpcm(handle=*0x%x, notifyAuDone=*0x%x, notifyAuDoneArg=*0x%x, notifyPcmOut=*0x%x, notifyPcmOutArg=*0x%x, notifyError=*0x%x, notifyErrorArg=*0x%x, notifySeqDone=*0x%x, notifySeqDoneArg=*0x%x, res=*0x%x)", handle, notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg, res); - fmt::throw_exception("LPCM decoder not implemented, please disable HLE libadec.sprx"); + return _CellAdecCoreOpOpenExt_lpcm(ppu, handle, notifyAuDone, notifyAuDoneArg, notifyPcmOut, notifyPcmOutArg, notifyError, notifyErrorArg, notifySeqDone, notifySeqDoneArg, res, vm::null); } -error_code _CellAdecCoreOpClose_lpcm(vm::ptr handle) +error_code _CellAdecCoreOpClose_lpcm(ppu_thread& ppu, vm::ptr handle) { - cellAdec.todo("_CellAdecCoreOpClose_lpcm(handle=*0x%x)", handle); + ppu.state += cpu_flag::wait; - return CELL_OK; + cellAdec.notice("_CellAdecCoreOpClose_lpcm(handle=*0x%x)", handle); + + if (error_code ret = sys_mutex_lock(ppu, handle->queue_size_mutex, 0); ret != CELL_OK + || (ret = sys_mutex_lock(ppu, handle->queue_mutex, 0)) != CELL_OK) + { + return ret; + } + + if (handle->cmd_queue.empty()) + { + handle->cmd_queue.emplace(LpcmDecCmdType::close); + + if (error_code ret = sys_mutex_unlock(ppu, handle->queue_mutex); ret != CELL_OK) + { + return ret; // LLE doesn't unlock the queue size mutex + } + + if (error_code ret = handle->cmd_available.release(ppu); ret != CELL_OK) + { + ensure(sys_mutex_unlock(ppu, handle->queue_size_mutex) == CELL_OK); // Error code isn't checked on LLE + return ret; + } + } + else + { + for (auto& cmd : handle->cmd_queue.elements) + { + cmd.type = LpcmDecCmdType::close; + } + + if (error_code ret = sys_mutex_unlock(ppu, handle->queue_mutex); ret != CELL_OK) + { + return ret; // LLE doesn't unlock the queue size mutex + } + } + + error_code ret = sys_mutex_unlock(ppu, handle->queue_size_mutex); + ret = ret ? ret : handle->release_output(ppu); + + vm::var thread_ret; + ret = ret ? ret : sys_ppu_thread_join(ppu, static_cast(handle->thread_id), +thread_ret); + + ret = ret ? ret : sys_cond_destroy(ppu, handle->queue_size_cond); + ret = ret ? ret : sys_cond_destroy(ppu, handle->unk_cond); + ret = ret ? ret : sys_cond_destroy(ppu, handle->output_consumed); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->queue_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->queue_size_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->unk_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->output_mutex); + ret = ret ? ret : handle->cmd_available.finalize(ppu); + ret = ret ? ret : handle->reserved2.finalize(ppu); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->spurs_queue_pop_mutex); + ret = ret ? ret : sys_mutex_destroy(ppu, handle->spurs_queue_push_mutex); + + return ret; } -error_code _CellAdecCoreOpStartSeq_lpcm(vm::ptr handle, vm::ptr lpcmParam) +error_code _CellAdecCoreOpStartSeq_lpcm(ppu_thread& ppu, vm::ptr handle, vm::ptr lpcmParam) { - cellAdec.todo("_CellAdecCoreOpStartSeq_lpcm(handle=*0x%x, lpcmParam=*0x%x)", handle, lpcmParam); + cellAdec.notice("_CellAdecCoreOpStartSeq_lpcm(handle=*0x%x, lpcmParam=*0x%x)", handle, lpcmParam); - return CELL_OK; + ensure(!!handle && !!lpcmParam); // Not checked on LLE + + cellAdec.notice("_CellAdecCoreOpStartSeq_lpcm(): channelNumber=%d, sampleRate=%d, sizeOfWord=%d, audioPayloadSize=0x%x", lpcmParam->channelNumber, lpcmParam->sampleRate, lpcmParam->sizeOfWord, lpcmParam->audioPayloadSize); + + if (lpcmParam->channelNumber >= 0x20u || lpcmParam->sampleRate >= 0x20u || lpcmParam->sizeOfWord >= 0x20u || lpcmParam->audioPayloadSize == 0u) + { + return CELL_ADEC_ERROR_LPCM_ARG; + } + + return handle->send_command(ppu, *lpcmParam); } -error_code _CellAdecCoreOpEndSeq_lpcm(vm::ptr handle) +error_code _CellAdecCoreOpEndSeq_lpcm(ppu_thread& ppu, vm::ptr handle) { - cellAdec.todo("_CellAdecCoreOpEndSeq_lpcm(handle=*0x%x)", handle); + cellAdec.notice("_CellAdecCoreOpEndSeq_lpcm(handle=*0x%x)", handle); - return CELL_OK; + ensure(!!handle); // Not checked on LLE + + return handle->send_command(ppu); } -error_code _CellAdecCoreOpDecodeAu_lpcm(vm::ptr handle, s32 pcmHandle, vm::ptr auInfo) +error_code _CellAdecCoreOpDecodeAu_lpcm(ppu_thread& ppu, vm::ptr handle, s32 pcmHandle, vm::ptr auInfo) { - cellAdec.todo("_CellAdecCoreOpDecodeAu_lpcm(handle=*0x%x, pcmHandle=%d, auInfo=*0x%x)", handle, pcmHandle, auInfo); + cellAdec.trace("_CellAdecCoreOpDecodeAu_lpcm(handle=*0x%x, pcmHandle=%d, auInfo=*0x%x)", handle, pcmHandle, auInfo); - return CELL_OK; + ensure(!!handle && !!auInfo); // Not checked on LLE + + cellAdec.trace("_CellAdecCoreOpDecodeAu_lpcm(): startAddr=*0x%x, size=0x%x, pts=0x%x, userData=0x%016x", auInfo->startAddr, auInfo->size, std::bit_cast>(auInfo->pts), auInfo->userData); + + return handle->send_command(ppu, pcmHandle, *auInfo); } void _CellAdecCoreOpGetVersion_lpcm(vm::ptr> version) @@ -285,18 +963,35 @@ void _CellAdecCoreOpGetVersion_lpcm(vm::ptr> version) *version = 0x20070323; } -error_code _CellAdecCoreOpRealign_lpcm(vm::ptr handle, vm::ptr outBuffer, vm::cptr pcmStartAddr) +error_code _CellAdecCoreOpRealign_lpcm(vm::ptr handle, vm::ptr outBuffer, vm::cptr pcmStartAddr) { - cellAdec.todo("_CellAdecCoreOpRealign_lpcm(handle=*0x%x, outBuffer=*0x%x, pcmStartAddr=*0x%x)", handle, outBuffer, pcmStartAddr); + cellAdec.trace("_CellAdecCoreOpRealign_lpcm(handle=*0x%x, outBuffer=*0x%x, pcmStartAddr=*0x%x)", handle, outBuffer, pcmStartAddr); + + if (!pcmStartAddr) + { + return CELL_ADEC_ERROR_LPCM_ARG; + } + + if (outBuffer) + { + ensure(!!handle); // Not checked on LLE + ensure(vm::check_addr(outBuffer.addr(), vm::page_info_t::page_writable, handle->output_size)); + + std::memcpy(outBuffer.get_ptr(), pcmStartAddr.get_ptr(), handle->output_size); + } return CELL_OK; } -error_code _CellAdecCoreOpReleasePcm_lpcm(vm::ptr handle, s32 pcmHandle, vm::ptr outBuffer) +error_code _CellAdecCoreOpReleasePcm_lpcm(ppu_thread& ppu, vm::ptr handle, s32 pcmHandle, vm::ptr outBuffer) { - cellAdec.todo("_CellAdecCoreOpReleasePcm_lpcm(handle=*0x%x, pcmHandle=%d, outBuffer=*0x%x)", handle, pcmHandle, outBuffer); + ppu.state += cpu_flag::wait; - return CELL_OK; + cellAdec.trace("_CellAdecCoreOpReleasePcm_lpcm(handle=*0x%x, pcmHandle=%d, outBuffer=*0x%x)", handle, pcmHandle, outBuffer); + + ensure(!!handle); // Not checked on LLE + + return handle->release_output(ppu); } s32 _CellAdecCoreOpGetPcmHandleNum_lpcm() @@ -544,8 +1239,8 @@ error_code AdecContext::correct_pts_value(ppu_thread& ppu, s32 pcm_handle, s8 co { switch (correct_pts_type) { - case ADEC_CORRECT_PTS_VALUE_TYPE_LPCM: return 450; - case 1: return 150; + case ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV: return 450; + case ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_DVD: return 150; case ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_48000Hz: return 3840; case ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_44100Hz: return 4180; case ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_32000Hz: return 5760; @@ -800,6 +1495,15 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr type, vm::cptraudioCodecType); + // Block savestate creation during ppu_thread::fast_call() + std::unique_lock savestate_lock{g_fxo->get(), std::try_to_lock}; + + if (!savestate_lock.owns_lock()) + { + ppu.state += cpu_flag::again; + return {}; + } + const s32 pcm_handle_num = core_ops->getPcmHandleNum(ppu); const u32 bitstream_info_size = core_ops->getBsiInfoSize(ppu); @@ -810,11 +1514,11 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr type, vm::cptraudioCodecType == CELL_ADEC_TYPE_LPCM_DVD) { - // TODO + vm::static_ptr_cast(core_handle)->dvd_packing = true; } else if (type->audioCodecType == CELL_ADEC_TYPE_LPCM_PAMF || type->audioCodecType == CELL_ADEC_TYPE_LPCM_BLURAY) { - // TODO + vm::static_ptr_cast(core_handle)->dvd_packing = false; } _this->_this = _this; @@ -862,15 +1566,6 @@ error_code adecOpen(ppu_thread& ppu, vm::ptr type, vm::cptr::make(g_fxo->get().func_addr(FIND_FUNC(adecNotifyError))); const auto notifySeqDone = vm::ptr::make(g_fxo->get().func_addr(FIND_FUNC(adecNotifySeqDone))); - // Block savestate creation during ppu_thread::fast_call() - std::unique_lock savestate_lock{g_fxo->get(), std::try_to_lock}; - - if (!savestate_lock.owns_lock()) - { - ppu.state += cpu_flag::again; - return {}; - } - if (spursRes) { return core_ops->openExt(ppu, _this->core_handle, notifyAuDone, _this, notifyPcmOut, _this, notifyError, _this, notifySeqDone, _this, res, spursRes); @@ -1013,6 +1708,177 @@ error_code cellAdecEndSeq(ppu_thread& ppu, vm::ptr handle) return handle->core_ops->endSeq(ppu, handle->core_handle); } +error_code adecSetLpcmBlurayParams(vm::ptr handle, u64 userData) +{ + const u8 channel_number = static_cast(userData >> 32); + const u8 sample_rate = static_cast(userData >> 40); + const u8 size_of_word = static_cast(userData >> 48); + const u8 unk = static_cast(userData >> 56); + + handle->lpcm_param->channelNumber = channel_number; + handle->lpcm_param->sampleRate = sample_rate; + handle->lpcm_param->sizeOfWord = size_of_word; + + u32 allocated_channels; + + switch (channel_number) + { + case CELL_ADEC_CH_MONO: + case CELL_ADEC_CH_STEREO: + allocated_channels = 2; + break; + + case CELL_ADEC_CH_3_0: + case CELL_ADEC_CH_2_1: + case CELL_ADEC_CH_3_1: + case CELL_ADEC_CH_2_2: + allocated_channels = 4; + break; + + case CELL_ADEC_CH_3_2: + case CELL_ADEC_CH_3_2_LFE: + allocated_channels = 6; + break; + + case CELL_ADEC_CH_3_4: + case CELL_ADEC_CH_3_4_LFE: + allocated_channels = 8; + break; + + default: + return CELL_ADEC_ERROR_FATAL; + } + + u32 samples_per_frame; + + switch (sample_rate) + { + case CELL_ADEC_FS_48kHz: samples_per_frame = 48000 / 200; break; + case CELL_ADEC_FS_96kHz: samples_per_frame = 96000 / 200; break; + case CELL_ADEC_FS_192kHz: samples_per_frame = 192000 / 200; break; + default: return CELL_ADEC_ERROR_FATAL; + } + + u32 allocated_bytes_per_sample; + + switch (size_of_word) + { + case CELL_ADEC_BIT_LENGTH_16: allocated_bytes_per_sample = 2; break; + case CELL_ADEC_BIT_LENGTH_20: // Same as below + case CELL_ADEC_BIT_LENGTH_24: allocated_bytes_per_sample = 3; break; + default: return CELL_ADEC_ERROR_FATAL; + } + + handle->lpcm_param->audioPayloadSize = allocated_bytes_per_sample * allocated_channels * samples_per_frame * unk; + + return CELL_OK; +} + +error_code adecSetLpcmDvdParams(vm::ptr handle, u64 userData) +{ + const u8 channel_layout = static_cast(userData >> 32); + const u8 sample_rate = static_cast(userData >> 40); + const u8 size_of_word = static_cast(userData >> 48); + const u8 unk = static_cast(userData >> 56); + + handle->lpcm_param->channelNumber = channel_layout; + handle->lpcm_param->sampleRate = sample_rate; + handle->lpcm_param->sizeOfWord = size_of_word; + + u32 samples_per_frame; + + switch (sample_rate) + { + case CELL_ADEC_FS_48kHz: samples_per_frame = 48000 / 600; break; + case CELL_ADEC_FS_96kHz: samples_per_frame = 96000 / 600; break; + default: return CELL_ADEC_ERROR_FATAL; + } + + u32 bits_per_sample; + + switch (size_of_word) + { + case CELL_ADEC_BIT_LENGTH_16: bits_per_sample = 16; break; + case CELL_ADEC_BIT_LENGTH_20: bits_per_sample = 20; break; + case CELL_ADEC_BIT_LENGTH_24: bits_per_sample = 24; break; + default: return CELL_ADEC_ERROR_FATAL; + } + + u32 channel_number; + + switch (channel_layout) + { + case CELL_ADEC_LPCM_DVD_CH_MONO: + channel_number = 1; + break; + + case CELL_ADEC_LPCM_DVD_CH_STEREO: + channel_number = 2; + break; + + case 4: + if (sample_rate == CELL_ADEC_FS_96kHz && size_of_word == CELL_ADEC_BIT_LENGTH_24) + { + return CELL_ADEC_ERROR_FATAL; + } + + channel_number = 3; + break; + + case 5: + if (sample_rate == CELL_ADEC_FS_96kHz && size_of_word != CELL_ADEC_BIT_LENGTH_16) + { + return CELL_ADEC_ERROR_FATAL; + } + + channel_number = 4; + break; + + case CELL_ADEC_LPCM_DVD_CH_3_2: + if (sample_rate == CELL_ADEC_FS_96kHz) + { + return CELL_ADEC_ERROR_FATAL; + } + + channel_number = 5; + break; + + case CELL_ADEC_LPCM_DVD_CH_3_2_LFE: + if (sample_rate == CELL_ADEC_FS_96kHz || size_of_word == CELL_ADEC_BIT_LENGTH_24) + { + return CELL_ADEC_ERROR_FATAL; + } + + channel_number = 6; + break; + + case CELL_ADEC_LPCM_DVD_CH_3_4: + if (sample_rate == CELL_ADEC_FS_96kHz || size_of_word != CELL_ADEC_BIT_LENGTH_16) + { + return CELL_ADEC_ERROR_FATAL; + } + + channel_number = 7; + break; + + case CELL_ADEC_LPCM_DVD_CH_3_4_LFE: + if (sample_rate == CELL_ADEC_FS_96kHz || size_of_word != CELL_ADEC_BIT_LENGTH_16) + { + return CELL_ADEC_ERROR_FATAL; + } + + channel_number = 8; + break; + + default: + return CELL_ADEC_ERROR_FATAL; + } + + handle->lpcm_param->audioPayloadSize = bits_per_sample * channel_number * samples_per_frame / 8 * unk; + + return CELL_OK; +} + error_code cellAdecDecodeAu(ppu_thread& ppu, vm::ptr handle, vm::ptr auInfo) { // Block savestate creation during ppu_thread::fast_call() @@ -1052,11 +1918,17 @@ error_code cellAdecDecodeAu(ppu_thread& ppu, vm::ptr handle, vm::pt if (handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_BLURAY) { - // TODO + if (adecSetLpcmBlurayParams(vm::static_ptr_cast(handle->core_handle), auInfo->userData) != CELL_OK) + { + return CELL_ADEC_ERROR_FATAL; + } } else if (handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_DVD) { - // TODO + if (adecSetLpcmDvdParams(vm::static_ptr_cast(handle->core_handle), auInfo->userData) != CELL_OK) + { + return CELL_ADEC_ERROR_FATAL; + } } return handle->core_ops->decodeAu(ppu, handle->core_handle, pcmHandle, auInfo); @@ -1104,7 +1976,7 @@ error_code cellAdecGetPcm(ppu_thread& ppu, vm::ptr handle, vm::ptr< if (handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_PAMF || handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_BLURAY || handle->type.audioCodecType == CELL_ADEC_TYPE_LPCM_DVD) { - // TODO + vm::static_ptr_cast(handle->core_handle)->output_size = pcm_item->size; } if (error_code ret = handle->core_ops->realign(ppu, handle->core_handle, outBuffer, pcm_queue_entry->pcm_item->startAddr); ret != CELL_OK) @@ -1206,6 +2078,8 @@ DECLARE(ppu_module_manager::cellAdec)("cellAdec", []() REG_HIDDEN_FUNC(_CellAdecCoreOpGetPcmHandleNum_lpcm); REG_HIDDEN_FUNC(_CellAdecCoreOpGetBsiInfoSize_lpcm); REG_HIDDEN_FUNC(_CellAdecCoreOpOpenExt_lpcm); + + REG_HIDDEN_FUNC(lpcmDecEntry); }); DECLARE(ppu_module_manager::cell_libac3dec)("cell_libac3dec", [] diff --git a/rpcs3/Emu/Cell/Modules/cellAdec.h b/rpcs3/Emu/Cell/Modules/cellAdec.h index 1cbeca08b8..a43c3f0aef 100644 --- a/rpcs3/Emu/Cell/Modules/cellAdec.h +++ b/rpcs3/Emu/Cell/Modules/cellAdec.h @@ -253,7 +253,7 @@ enum CellAdecSampleRate : s32 CELL_ADEC_FS_8kHz, }; -enum CellAdecBitLength : s32 +enum CellAdecBitLength : u32 { CELL_ADEC_BIT_LENGTH_RESERVED1, CELL_ADEC_BIT_LENGTH_16, @@ -352,8 +352,8 @@ enum AdecCorrectPtsValueType : s8 ADEC_CORRECT_PTS_VALUE_TYPE_UNSPECIFIED = -1, // Adds a fixed amount - ADEC_CORRECT_PTS_VALUE_TYPE_LPCM = 0, - // 1 + ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV = 0, + ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_DVD = 1, // Unused for some reason, the DVD player probably takes care of timestamps itself ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_48000Hz = 2, ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_44100Hz = 3, ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_32000Hz = 4, @@ -562,6 +562,11 @@ public: { ensure(sys_mutex_lock(ppu, mutex, 0) == CELL_OK); // Error code isn't checked on LLE + if (ppu.state & cpu_flag::again) // Savestate was created while waiting on the mutex + { + return {}; + } + if (entries[front].state == 0xff) { ensure(sys_mutex_unlock(ppu, mutex) == CELL_OK); // Error code isn't checked on LLE @@ -648,6 +653,20 @@ static_assert(std::is_standard_layout_v && std::is_trivial_v channelNumber; @@ -664,6 +683,216 @@ struct CellAdecLpcmInfo be_t outputDataSize; }; +// HLE exclusive, for savestates +enum class lpcm_dec_state : u8 +{ + waiting_for_cmd_mutex_lock, + waiting_for_cmd_cond_wait, + waiting_for_output_mutex_lock, + waiting_for_output_cond_wait, + queue_mutex_lock, + executing_cmd +}; + +class LpcmDecSemaphore +{ + be_t value; + be_t mutex; // sys_mutex_t + be_t cond; // sys_cond_t + +public: + error_code init(ppu_thread& ppu, vm::ptr _this, u32 initial_value) + { + value = initial_value; + + const vm::var mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem01"_u64 } }}; + const vm::var cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec01"_u64 } }}; + + if (error_code ret = sys_mutex_create(ppu, _this.ptr(&LpcmDecSemaphore::mutex), mutex_attr); ret != CELL_OK) + { + return ret; + } + + return sys_cond_create(ppu, _this.ptr(&LpcmDecSemaphore::cond), mutex, cond_attr); + } + + error_code finalize(ppu_thread& ppu) const + { + if (error_code ret = sys_cond_destroy(ppu, cond); ret != CELL_OK) + { + return ret; + } + + return sys_mutex_destroy(ppu, mutex); + } + + error_code release(ppu_thread& ppu) + { + if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK) + { + return ret; + } + + value++; + + if (error_code ret = sys_cond_signal(ppu, cond); ret != CELL_OK) + { + return ret; // LLE doesn't unlock the mutex + } + + return sys_mutex_unlock(ppu, mutex); + } + + error_code acquire(ppu_thread& ppu, lpcm_dec_state& savestate) + { + if (savestate == lpcm_dec_state::waiting_for_cmd_cond_wait) + { + goto cond_wait; + } + + savestate = lpcm_dec_state::waiting_for_cmd_mutex_lock; + + if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK) + { + return ret; + } + + if (ppu.state & cpu_flag::again) + { + return {}; + } + + if (value == 0u) + { + savestate = lpcm_dec_state::waiting_for_cmd_cond_wait; + cond_wait: + + if (error_code ret = sys_cond_wait(ppu, cond, 0); ret != CELL_OK) + { + return ret; // LLE doesn't unlock the mutex + } + + if (ppu.state & cpu_flag::again) + { + return {}; + } + } + + value--; + + return sys_mutex_unlock(ppu, mutex); + } +}; + +CHECK_SIZE(LpcmDecSemaphore, 0xc); + +enum class LpcmDecCmdType : u32 +{ + start_seq, + end_seq, + decode_au, + close +}; + +struct LpcmDecCmd +{ + be_t pcm_handle; + vm::bcptr au_start_addr; + be_t au_size; + u32 reserved1[2]; + CellAdecParamLpcm lpcm_param; + be_t type; + u32 reserved2; + + LpcmDecCmd() = default; // cellAdecOpen() + + LpcmDecCmd(LpcmDecCmdType&& type) // End sequence + : type(type) + { + } + + LpcmDecCmd(LpcmDecCmdType&& type, const CellAdecParamLpcm& lpcm_param) // Start sequence + : lpcm_param(lpcm_param), type(type) + { + } + + LpcmDecCmd(LpcmDecCmdType&& type, const s32& pcm_handle, const CellAdecAuInfo& au_info) // Decode au + : pcm_handle(pcm_handle), au_start_addr(au_info.startAddr), au_size(au_info.size), type(type) + { + } +}; + +CHECK_SIZE(LpcmDecCmd, 0x2c); + +struct LpcmDecContext +{ + AdecCmdQueue cmd_queue; + + be_t thread_id; // sys_ppu_thread_t + + be_t queue_size_mutex; // sys_mutex_t + be_t queue_size_cond; // sys_cond_t, unused + be_t unk_mutex; // sys_mutex_t, unused + be_t unk_cond; // sys_cond_t, unused + + be_t run_thread; + + AdecCb notify_au_done; + AdecCb notify_pcm_out; + AdecCb notify_error; + AdecCb notify_seq_done; + + be_t output_locked; + vm::bptr output; + + vm::bptr lpcm_param; + + vm::bcptr spurs_cmd_data; + + // HLE exclusive + lpcm_dec_state savestate; + u64 cmd_counter; // For debugging + + u8 reserved1[24]; // 36 bytes on LLE + + be_t output_mutex; // sys_mutex_t + be_t output_consumed; // sys_cond_t + + LpcmDecSemaphore cmd_available; + LpcmDecSemaphore reserved2; // Unused + + be_t queue_mutex; // sys_mutex_t + + be_t error_occurred; + + u8 spurs_stuff[32]; + + be_t spurs_queue_pop_mutex; + be_t spurs_queue_push_mutex; + + be_t using_existing_spurs_instance; + + be_t dvd_packing; + + be_t output_size; + + LpcmDecCmd cmd; // HLE exclusive, name of Spurs taskset (32 bytes) + CellSpursTaskLsPattern on LLE + + u8 more_spurs_stuff[10]; // 52 bytes on LLE + + void exec(ppu_thread& ppu); + + template + error_code send_command(ppu_thread& ppu, auto&&... args); + + inline error_code release_output(ppu_thread& ppu); +}; + +static_assert(std::is_standard_layout_v); +CHECK_SIZE_ALIGN(LpcmDecContext, 0x1c8, 8); + +constexpr s32 LPCM_DEC_OUTPUT_BUFFER_SIZE = 0x40000; + // CELP Excitation Mode enum CELP_ExcitationMode : s32 { diff --git a/rpcs3/Emu/Cell/lv2/sys_prx.cpp b/rpcs3/Emu/Cell/lv2/sys_prx.cpp index 49b5839cfa..da24edbedb 100644 --- a/rpcs3/Emu/Cell/lv2/sys_prx.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_prx.cpp @@ -35,7 +35,7 @@ extern const std::map g_prx_list { "libaacenc_spurs.sprx", 0 }, { "libac3dec.sprx", 0 }, { "libac3dec2.sprx", 0 }, - { "libadec.sprx", 0 }, + { "libadec.sprx", 1 }, { "libadec2.sprx", 0 }, { "libadec_internal.sprx", 0 }, { "libad_async.sprx", 0 }, diff --git a/rpcs3/util/simd.hpp b/rpcs3/util/simd.hpp index d357c800ab..750bb97fd1 100644 --- a/rpcs3/util/simd.hpp +++ b/rpcs3/util/simd.hpp @@ -21,6 +21,7 @@ #include #endif +#include #include #include #include @@ -1967,6 +1968,15 @@ inline v128 gv_mulfs(const v128& a, const v128& b) #endif } +inline v128 gv_mulfs(const v128& a, f32 b) +{ +#if defined(ARCH_X64) + return _mm_mul_ps(a, _mm_set_ps1(b)); +#elif defined(ARCH_ARM64) + return vmulq_n_f32(a, b); +#endif +} + inline v128 gv_hadds8x2(const v128& a) { #if defined(__SSSE3__) @@ -2979,6 +2989,23 @@ inline v128 gv_rol16(const v128& a, const v128& b) #endif } +// For each 16-bit element, r = rotate a by count +template +inline v128 gv_rol16(const v128& a) +{ + constexpr u8 count = Count & 0xf; +#if defined(ARCH_X64) + return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count)); +#elif defined(ARCH_ARM64) + return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count)); +#else + v128 r; + for (u32 i = 0; i < 8; i++) + r._u16[i] = std::rotl(a._u16[i], count); + return r; +#endif +} + // For each 32-bit element, r = rotate a by b inline v128 gv_rol32(const v128& a, const v128& b) { @@ -2997,15 +3024,16 @@ inline v128 gv_rol32(const v128& a, const v128& b) } // For each 32-bit element, r = rotate a by count -inline v128 gv_rol32(const v128& a, u32 count) +template +inline v128 gv_rol32(const v128& a) { - count %= 32; -#if defined(ARCH_X64) - return _mm_or_epi32(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count)); + constexpr u8 count = Count & 0x1f; +#if defined(__AVX512VL__) + return _mm_rol_epi32(a, count); +#elif defined(ARCH_X64) + return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count)); #elif defined(ARCH_ARM64) - const auto amt1 = vdupq_n_s32(count); - const auto amt2 = vdupq_n_s32(count - 32); - return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2)); + return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count)); #else v128 r; for (u32 i = 0; i < 4; i++) @@ -3107,6 +3135,139 @@ inline auto gv_shuffle_right(A&& a) FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward(a), Count); } +// Load 32-bit integer into the first element of a new vector, set other elements to zero +inline v128 gv_loadu32(const void* ptr) +{ +#if defined(ARCH_X64) + return _mm_loadu_si32(ptr); +#elif defined(ARCH_ARM64) + return vld1q_lane_u32(static_cast(ptr), vdupq_n_u32(0), 0); +#endif +} + +// Load 16-bit integer into an existing vector at the position specified by Index +template +inline v128 gv_insert16(const v128& vec, u16 value) +{ +#if defined(ARCH_X64) + return _mm_insert_epi16(vec, value, Index); +#elif defined(ARCH_ARM64) + return vsetq_lane_u16(value, vec, Index & 0x7); +#endif +} + +// For each 8-bit element, +// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl], +// else if ctrl < 0 then r = 0 +inline v128 gv_shuffle8(const v128& vec, const v128& ctrl) +{ + AUDIT(std::ranges::none_of(ctrl._chars, [](s8 i){ return i >= static_cast(sizeof(v128)); }), "All indices must be in the range [0, 15] or negative, since PSHUFB and TBL behave differently otherwise"); +#if defined(__SSSE3__) + return _mm_shuffle_epi8(vec, ctrl); +#elif defined(ARCH_ARM64) + return vqtbl1q_s8(vec, ctrl); +#else + v128 r; + for (s32 i = 0; i < 16; i++) + r._s8[i] = ctrl._s8[i] < 0 ? 0 : vec._s8[ctrl._s8[i] & 0xf]; + return r; +#endif +} + +// For each 2-bit index in Control, r = vec[index] +template +inline v128 gv_shuffle32(const v128& vec) +{ +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, Control); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Control & 3) * sizeof(s32); + constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); + constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32); + constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 }; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + +// For each index, r = vec[index & 3] +template +inline v128 gv_shuffle32(const v128& vec) +{ +#if defined(ARCH_X64) + return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32); + + constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 }; + + return vqtbl1q_s8(vec, idx_vec); +#endif +} + +// For the first two 2-bit indices in Control, r = a[index], +// for the last two indices, r = b[index] +template +inline v128 gv_shufflefs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, Control); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Control & 3) * sizeof(s32); + constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32); + constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 }; + + return vqtbl2q_s8({ a, b }, idx_vec); +#endif +} + +// For the first two indices, r = a[index & 3], +// for the last two indices, r = b[index & 3] +template +inline v128 gv_shufflefs(const v128& a, const v128& b) +{ +#if defined(ARCH_X64) + return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6); +#elif defined(ARCH_ARM64) + constexpr u8 idx0 = (Index0 & 3) * sizeof(s32); + constexpr u8 idx1 = (Index1 & 3) * sizeof(s32); + constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128); + constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128); + + constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 }; + + return vqtbl2q_s8({ a, b }, idx_vec); +#endif +} + +// For each 32-bit element, reverse byte order +inline v128 gv_rev32(const v128& vec) +{ +#if defined(__SSSE3__) + return _mm_shuffle_epi8(vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12)); +#elif defined(ARCH_ARM64) + return vrev32q_u8(vec); +#else + return gv_rol32<16>(gv_rol16<8>(vec)); +#endif +} + +// For each 32-bit element, convert between big-endian and native-endian +inline v128 gv_to_be32(const v128& vec) +{ + if constexpr (std::endian::native == std::endian::little) + return gv_rev32(vec); + return vec; +} + #if defined(__clang__) #pragma clang diagnostic pop #elif defined(__GNUC__)