cellAdec implementation part 2: LPCM decoder (#16381)

* cellAdec: savestate fixup

* simd.hpp: add some intrinsics

* cellAdec implementation part 2: LPCM decoder

* cellAdec: set to HLE by default

* cellAdec: review fixes

---------

Co-authored-by: Elad <18193363+elad335@users.noreply.github.com>
This commit is contained in:
capriots 2024-12-18 19:21:56 +01:00 committed by GitHub
parent e18ae5abd6
commit 9d4ff13c2b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 1316 additions and 52 deletions

File diff suppressed because it is too large Load diff

View file

@ -253,7 +253,7 @@ enum CellAdecSampleRate : s32
CELL_ADEC_FS_8kHz,
};
enum CellAdecBitLength : s32
enum CellAdecBitLength : u32
{
CELL_ADEC_BIT_LENGTH_RESERVED1,
CELL_ADEC_BIT_LENGTH_16,
@ -352,8 +352,8 @@ enum AdecCorrectPtsValueType : s8
ADEC_CORRECT_PTS_VALUE_TYPE_UNSPECIFIED = -1,
// Adds a fixed amount
ADEC_CORRECT_PTS_VALUE_TYPE_LPCM = 0,
// 1
ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV = 0,
ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_DVD = 1, // Unused for some reason, the DVD player probably takes care of timestamps itself
ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_48000Hz = 2,
ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_44100Hz = 3,
ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_32000Hz = 4,
@ -562,6 +562,11 @@ public:
{
ensure(sys_mutex_lock(ppu, mutex, 0) == CELL_OK); // Error code isn't checked on LLE
if (ppu.state & cpu_flag::again) // Savestate was created while waiting on the mutex
{
return {};
}
if (entries[front].state == 0xff)
{
ensure(sys_mutex_unlock(ppu, mutex) == CELL_OK); // Error code isn't checked on LLE
@ -648,6 +653,20 @@ static_assert(std::is_standard_layout_v<AdecContext> && std::is_trivial_v<AdecCo
CHECK_SIZE_ALIGN(AdecContext, 0x530, 8);
// DVD LPCM channel configuration codes used by the LPCM decoder.
// Values are made explicit so the numbering is obvious at a glance;
// they are identical to the original implicit sequence 0..9.
enum : u32
{
	CELL_ADEC_LPCM_DVD_CH_RESERVED1 = 0,
	CELL_ADEC_LPCM_DVD_CH_MONO = 1,
	CELL_ADEC_LPCM_DVD_CH_RESERVED2 = 2,
	CELL_ADEC_LPCM_DVD_CH_STEREO = 3,
	CELL_ADEC_LPCM_DVD_CH_UNK1 = 4, // Either 3 front or 2 front + 1 surround
	CELL_ADEC_LPCM_DVD_CH_UNK2 = 5, // Either 3 front + 1 surround or 2 front + 2 surround
	CELL_ADEC_LPCM_DVD_CH_3_2 = 6,
	CELL_ADEC_LPCM_DVD_CH_3_2_LFE = 7,
	CELL_ADEC_LPCM_DVD_CH_3_4 = 8,
	CELL_ADEC_LPCM_DVD_CH_3_4_LFE = 9,
};
struct CellAdecParamLpcm
{
be_t<u32> channelNumber;
@ -664,6 +683,216 @@ struct CellAdecLpcmInfo
be_t<u32> outputDataSize;
};
// HLE exclusive, for savestates: records at which blocking point the decoder
// thread was suspended so it can resume there after a savestate load.
// Values are explicit because this enum is stored in savestates — the
// numbering must never change. Identical to the original implicit 0..5.
enum class lpcm_dec_state : u8
{
	waiting_for_cmd_mutex_lock = 0,
	waiting_for_cmd_cond_wait = 1,
	waiting_for_output_mutex_lock = 2,
	waiting_for_output_cond_wait = 3,
	queue_mutex_lock = 4,
	executing_cmd = 5
};
// Counting semaphore built from an lv2 mutex and condition variable.
// Layout matches the synchronization object used by LLE libadec (0xc bytes, see CHECK_SIZE below).
class LpcmDecSemaphore
{
	be_t<u32> value; // Current count
	be_t<u32> mutex; // sys_mutex_t
	be_t<u32> cond;  // sys_cond_t

public:
	// Creates the mutex and condition variable. Attribute names "_adem01"/"_adec01" match LLE.
	error_code init(ppu_thread& ppu, vm::ptr<LpcmDecSemaphore> _this, u32 initial_value)
	{
		value = initial_value;

		const vm::var<sys_mutex_attribute_t> mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem01"_u64 } }};
		const vm::var<sys_cond_attribute_t> cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec01"_u64 } }};

		if (error_code ret = sys_mutex_create(ppu, _this.ptr(&LpcmDecSemaphore::mutex), mutex_attr); ret != CELL_OK)
		{
			return ret;
		}

		return sys_cond_create(ppu, _this.ptr(&LpcmDecSemaphore::cond), mutex, cond_attr);
	}

	// Destroys the condition variable first, then the mutex.
	error_code finalize(ppu_thread& ppu) const
	{
		if (error_code ret = sys_cond_destroy(ppu, cond); ret != CELL_OK)
		{
			return ret;
		}

		return sys_mutex_destroy(ppu, mutex);
	}

	// Increments the count and wakes one waiter.
	error_code release(ppu_thread& ppu)
	{
		if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK)
		{
			return ret;
		}

		value++;

		if (error_code ret = sys_cond_signal(ppu, cond); ret != CELL_OK)
		{
			return ret; // LLE doesn't unlock the mutex
		}

		return sys_mutex_unlock(ppu, mutex);
	}

	// Decrements the count, blocking while it is zero.
	// savestate records which wait the thread was suspended in: when resuming a
	// savestate that was created during the cond wait, the mutex is already held
	// by this thread, so execution jumps straight back to sys_cond_wait (the goto).
	// After each blocking syscall, cpu_flag::again means a savestate was created
	// while waiting and the caller must unwind without side effects.
	error_code acquire(ppu_thread& ppu, lpcm_dec_state& savestate)
	{
		if (savestate == lpcm_dec_state::waiting_for_cmd_cond_wait)
		{
			goto cond_wait;
		}

		savestate = lpcm_dec_state::waiting_for_cmd_mutex_lock;

		if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK)
		{
			return ret;
		}

		if (ppu.state & cpu_flag::again) // Savestate was created while waiting on the mutex
		{
			return {};
		}

		if (value == 0u)
		{
			savestate = lpcm_dec_state::waiting_for_cmd_cond_wait;

		cond_wait:
			if (error_code ret = sys_cond_wait(ppu, cond, 0); ret != CELL_OK)
			{
				return ret; // LLE doesn't unlock the mutex
			}

			if (ppu.state & cpu_flag::again) // Savestate was created while waiting on the cond
			{
				return {};
			}
		}

		value--;

		return sys_mutex_unlock(ppu, mutex);
	}
};
CHECK_SIZE(LpcmDecSemaphore, 0xc);
// Commands accepted by the LPCM decoder thread.
// Explicit values preserve the original implicit numbering 0..3.
enum class LpcmDecCmdType : u32
{
	start_seq = 0,
	end_seq = 1,
	decode_au = 2,
	close = 3
};
struct LpcmDecCmd
{
be_t<s32> pcm_handle;
vm::bcptr<void> au_start_addr;
be_t<u32> au_size;
u32 reserved1[2];
CellAdecParamLpcm lpcm_param;
be_t<LpcmDecCmdType> type;
u32 reserved2;
LpcmDecCmd() = default; // cellAdecOpen()
LpcmDecCmd(LpcmDecCmdType&& type) // End sequence
: type(type)
{
}
LpcmDecCmd(LpcmDecCmdType&& type, const CellAdecParamLpcm& lpcm_param) // Start sequence
: lpcm_param(lpcm_param), type(type)
{
}
LpcmDecCmd(LpcmDecCmdType&& type, const s32& pcm_handle, const CellAdecAuInfo& au_info) // Decode au
: pcm_handle(pcm_handle), au_start_addr(au_info.startAddr), au_size(au_info.size), type(type)
{
}
};
CHECK_SIZE(LpcmDecCmd, 0x2c);
// Per-instance state shared between the cellAdec API and the LPCM decoder thread.
// The field layout (reserved areas included) mirrors LLE libadec; the total size
// and alignment are locked by the CHECK_SIZE_ALIGN below, so members must not be
// reordered or resized.
struct LpcmDecContext
{
	AdecCmdQueue<LpcmDecCmd> cmd_queue;

	be_t<u64> thread_id; // sys_ppu_thread_t

	be_t<u32> queue_size_mutex; // sys_mutex_t
	be_t<u32> queue_size_cond;  // sys_cond_t, unused
	be_t<u32> unk_mutex;        // sys_mutex_t, unused
	be_t<u32> unk_cond;         // sys_cond_t, unused

	be_t<u32> run_thread; // Decoder thread keeps running while this is set

	AdecCb<AdecNotifyAuDone> notify_au_done;
	AdecCb<AdecNotifyPcmOut> notify_pcm_out;
	AdecCb<AdecNotifyError> notify_error;
	AdecCb<AdecNotifySeqDone> notify_seq_done;

	be_t<u32> output_locked;
	vm::bptr<f32> output;

	vm::bptr<CellAdecParamLpcm> lpcm_param;

	vm::bcptr<void> spurs_cmd_data;

	// HLE exclusive
	lpcm_dec_state savestate;
	u64 cmd_counter; // For debugging

	u8 reserved1[24]; // 36 bytes on LLE

	be_t<u32> output_mutex;    // sys_mutex_t
	be_t<u32> output_consumed; // sys_cond_t

	LpcmDecSemaphore cmd_available;
	LpcmDecSemaphore reserved2; // Unused

	be_t<u32> queue_mutex; // sys_mutex_t

	be_t<u32> error_occurred;

	u8 spurs_stuff[32];

	be_t<u32> spurs_queue_pop_mutex;
	be_t<u32> spurs_queue_push_mutex;

	be_t<u32> using_existing_spurs_instance;

	be_t<u32> dvd_packing;

	be_t<u32> output_size;

	LpcmDecCmd cmd; // HLE exclusive, name of Spurs taskset (32 bytes) + CellSpursTaskLsPattern on LLE

	u8 more_spurs_stuff[10]; // 52 bytes on LLE

	// Decoder thread main loop (defined elsewhere)
	void exec(ppu_thread& ppu);

	// Enqueues a command of the given type for the decoder thread
	template <LpcmDecCmdType type>
	error_code send_command(ppu_thread& ppu, auto&&... args);

	// Releases the output buffer back to the decoder thread
	inline error_code release_output(ppu_thread& ppu);
};
static_assert(std::is_standard_layout_v<LpcmDecContext>);
CHECK_SIZE_ALIGN(LpcmDecContext, 0x1c8, 8);
constexpr s32 LPCM_DEC_OUTPUT_BUFFER_SIZE = 0x40000;
// CELP Excitation Mode
enum CELP_ExcitationMode : s32
{

View file

@ -35,7 +35,7 @@ extern const std::map<std::string_view, int> g_prx_list
{ "libaacenc_spurs.sprx", 0 },
{ "libac3dec.sprx", 0 },
{ "libac3dec2.sprx", 0 },
{ "libadec.sprx", 0 },
{ "libadec.sprx", 1 },
{ "libadec2.sprx", 0 },
{ "libadec_internal.sprx", 0 },
{ "libad_async.sprx", 0 },

View file

@ -21,6 +21,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <bit>
#include <cmath>
#include <cstring>
#include <math.h>
#include <cfenv>
@ -1967,6 +1968,15 @@ inline v128 gv_mulfs(const v128& a, const v128& b)
#endif
}
// For each 32-bit float element, r = a * b (b is broadcast to all lanes)
inline v128 gv_mulfs(const v128& a, f32 b)
{
#if defined(ARCH_X64)
	return _mm_mul_ps(a, _mm_set_ps1(b));
#elif defined(ARCH_ARM64)
	return vmulq_n_f32(a, b);
#else
	// Generic fallback, consistent with the other gv_* helpers in this file
	v128 r;
	for (u32 i = 0; i < 4; i++)
		r._f[i] = a._f[i] * b;
	return r;
#endif
}
inline v128 gv_hadds8x2(const v128& a)
{
#if defined(__SSSE3__)
@ -2979,6 +2989,23 @@ inline v128 gv_rol16(const v128& a, const v128& b)
#endif
}
// For each 16-bit element, r = rotate a by count
template <u8 Count>
inline v128 gv_rol16(const v128& a)
{
	// Only the low 4 bits of the rotate amount are meaningful for 16-bit lanes
	constexpr u8 count = Count & 0xf;
#if defined(ARCH_X64)
	// count == 0: the right shift by 16 yields zero, so the OR returns a unchanged
	return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count));
#elif defined(ARCH_ARM64)
	// vshrq_n_u16 accepts immediates 1..16, so "16 - count" stays valid when count == 0
	return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count));
#else
	v128 r;
	for (u32 i = 0; i < 8; i++)
		r._u16[i] = std::rotl(a._u16[i], count);
	return r;
#endif
}
// For each 32-bit element, r = rotate a by b
inline v128 gv_rol32(const v128& a, const v128& b)
{
@ -2997,15 +3024,16 @@ inline v128 gv_rol32(const v128& a, const v128& b)
}
// For each 32-bit element, r = rotate a by count
inline v128 gv_rol32(const v128& a, u32 count)
template <u8 Count>
inline v128 gv_rol32(const v128& a)
{
count %= 32;
#if defined(ARCH_X64)
return _mm_or_epi32(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
constexpr u8 count = Count & 0x1f;
#if defined(__AVX512VL__)
return _mm_rol_epi32(a, count);
#elif defined(ARCH_X64)
return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
#elif defined(ARCH_ARM64)
const auto amt1 = vdupq_n_s32(count);
const auto amt2 = vdupq_n_s32(count - 32);
return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count));
#else
v128 r;
for (u32 i = 0; i < 4; i++)
@ -3107,6 +3135,139 @@ inline auto gv_shuffle_right(A&& a)
FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
}
// Load 32-bit integer into the first element of a new vector, set other elements to zero
inline v128 gv_loadu32(const void* ptr)
{
#if defined(ARCH_X64)
	return _mm_loadu_si32(ptr);
#elif defined(ARCH_ARM64)
	return vld1q_lane_u32(static_cast<const u32*>(ptr), vdupq_n_u32(0), 0);
#else
	// Generic fallback: zero the vector, then copy 4 bytes into the first lane.
	// memcpy sidesteps alignment and strict-aliasing issues of a raw u32 load.
	v128 r{};
	std::memcpy(&r._u32[0], ptr, sizeof(u32));
	return r;
#endif
}
// Insert a 16-bit integer into an existing vector at the position specified by Index
inline v128 gv_insert16(const v128& vec, u16 value)
{
#if defined(ARCH_X64)
	return _mm_insert_epi16(vec, value, Index);
#elif defined(ARCH_ARM64)
	return vsetq_lane_u16(value, vec, Index & 0x7);
#else
	// Generic fallback; Index is masked like on the other paths (PINSRW uses imm8[2:0])
	v128 r = vec;
	r._u16[Index & 0x7] = value;
	return r;
#endif
}
// For each 8-bit element,
// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl],
// else if ctrl < 0 then r = 0
inline v128 gv_shuffle8(const v128& vec, const v128& ctrl)
{
	// Indices 16..127 are rejected up front: PSHUFB only inspects bit 7 and the low
	// 4 bits of each control byte, while TBL zeroes lanes for any out-of-range index
	AUDIT(std::ranges::none_of(ctrl._chars, [](s8 i){ return i >= static_cast<s8>(sizeof(v128)); }), "All indices must be in the range [0, 15] or negative, since PSHUFB and TBL behave differently otherwise");
#if defined(__SSSE3__)
	return _mm_shuffle_epi8(vec, ctrl);
#elif defined(ARCH_ARM64)
	return vqtbl1q_s8(vec, ctrl);
#else
	// Generic fallback: negative control byte selects zero, otherwise low 4 bits index vec
	v128 r;
	for (s32 i = 0; i < 16; i++)
		r._s8[i] = ctrl._s8[i] < 0 ? 0 : vec._s8[ctrl._s8[i] & 0xf];
	return r;
#endif
}
// For each 2-bit index in Control, r = vec[index]
template <u8 Control>
inline v128 gv_shuffle32(const v128& vec)
{
#if defined(ARCH_X64)
	return _mm_shuffle_epi32(vec, Control);
#elif defined(ARCH_ARM64)
	// Expand each 2-bit lane index into four consecutive byte indices for TBL
	constexpr u8 idx0 = (Control & 3) * sizeof(s32);
	constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32);
	constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32);
	constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32);
	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
	return vqtbl1q_s8(vec, idx_vec);
#else
	// Generic fallback, mirrors PSHUFD semantics
	v128 r;
	for (u32 i = 0; i < 4; i++)
		r._u32[i] = vec._u32[(Control >> (i * 2)) & 3];
	return r;
#endif
}
// For each index, r = vec[index & 3]
template <u8 Index0, u8 Index1, u8 Index2, u8 Index3>
inline v128 gv_shuffle32(const v128& vec)
{
#if defined(ARCH_X64)
	return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6);
#elif defined(ARCH_ARM64)
	// Expand each lane index into four consecutive byte indices for TBL
	constexpr u8 idx0 = (Index0 & 3) * sizeof(s32);
	constexpr u8 idx1 = (Index1 & 3) * sizeof(s32);
	constexpr u8 idx2 = (Index2 & 3) * sizeof(s32);
	constexpr u8 idx3 = (Index3 & 3) * sizeof(s32);
	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
	return vqtbl1q_s8(vec, idx_vec);
#else
	// Generic fallback
	v128 r;
	r._u32[0] = vec._u32[Index0 & 3];
	r._u32[1] = vec._u32[Index1 & 3];
	r._u32[2] = vec._u32[Index2 & 3];
	r._u32[3] = vec._u32[Index3 & 3];
	return r;
#endif
}
// For the first two 2-bit indices in Control, r = a[index],
// for the last two indices, r = b[index]
template <u8 Control>
inline v128 gv_shufflefs(const v128& a, const v128& b)
{
#if defined(ARCH_X64)
	return _mm_shuffle_ps(a, b, Control);
#elif defined(ARCH_ARM64)
	// idx2/idx3 are offset by 16 to select bytes from the second table register (b)
	constexpr u8 idx0 = (Control & 3) * sizeof(s32);
	constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32);
	constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128);
	constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128);
	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
	return vqtbl2q_s8({ a, b }, idx_vec);
#else
	// Generic fallback, mirrors SHUFPS semantics
	v128 r;
	r._u32[0] = a._u32[Control & 3];
	r._u32[1] = a._u32[(Control >> 2) & 3];
	r._u32[2] = b._u32[(Control >> 4) & 3];
	r._u32[3] = b._u32[(Control >> 6) & 3];
	return r;
#endif
}
// For the first two indices, r = a[index & 3],
// for the last two indices, r = b[index & 3]
template <u8 Index0, u8 Index1, u8 Index2, u8 Index3>
inline v128 gv_shufflefs(const v128& a, const v128& b)
{
#if defined(ARCH_X64)
	return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6);
#elif defined(ARCH_ARM64)
	// idx2/idx3 are offset by 16 to select bytes from the second table register (b)
	constexpr u8 idx0 = (Index0 & 3) * sizeof(s32);
	constexpr u8 idx1 = (Index1 & 3) * sizeof(s32);
	constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128);
	constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128);
	constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
	return vqtbl2q_s8({ a, b }, idx_vec);
#else
	// Generic fallback
	v128 r;
	r._u32[0] = a._u32[Index0 & 3];
	r._u32[1] = a._u32[Index1 & 3];
	r._u32[2] = b._u32[Index2 & 3];
	r._u32[3] = b._u32[Index3 & 3];
	return r;
#endif
}
// For each 32-bit element, reverse byte order
inline v128 gv_rev32(const v128& vec)
{
#if defined(__SSSE3__)
	return _mm_shuffle_epi8(vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12));
#elif defined(ARCH_ARM64)
	return vrev32q_u8(vec);
#else
	// Rotating each 16-bit lane by 8 swaps the bytes within it; rotating each
	// 32-bit lane by 16 then swaps the two halves — together a full byte reversal
	return gv_rol32<16>(gv_rol16<8>(vec));
#endif
}
// For each 32-bit element, convert between big-endian and native-endian.
// On big-endian hosts this is the identity; on little-endian hosts the bytes
// of every 32-bit lane are reversed. The branch is resolved at compile time.
inline v128 gv_to_be32(const v128& vec)
{
	if constexpr (std::endian::native == std::endian::big)
	{
		return vec;
	}
	else
	{
		return gv_rev32(vec);
	}
}
#if defined(__clang__)
#pragma clang diagnostic pop
#elif defined(__GNUC__)