diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/AX.cpp b/Source/Core/Core/HW/DSPHLE/UCodes/AX.cpp index f82abf5d23..63d75ccc17 100644 --- a/Source/Core/Core/HW/DSPHLE/UCodes/AX.cpp +++ b/Source/Core/Core/HW/DSPHLE/UCodes/AX.cpp @@ -478,9 +478,9 @@ void AXUCode::ProcessPBList(u32 pb_addr) { ApplyUpdatesForMs(curr_ms, pb, pb.updates.num_updates, updates); - ProcessVoice(static_cast(m_accelerator.get()), pb, buffers, spms, - ConvertMixerControl(pb.mixer_control), - m_coeffs_checksum ? m_coeffs.data() : nullptr, false); + ProcessVoice(static_cast(m_accelerator.get()), pb, buffers, + ConvertMixerControl(pb.mixer_control), + m_coeffs_checksum ? m_coeffs.data() : nullptr, false); // Forward the buffers for (auto& ptr : buffers.ptrs) diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/AX.h b/Source/Core/Core/HW/DSPHLE/UCodes/AX.h index 138db3aaef..f35ccf4601 100644 --- a/Source/Core/Core/HW/DSPHLE/UCodes/AX.h +++ b/Source/Core/Core/HW/DSPHLE/UCodes/AX.h @@ -88,15 +88,15 @@ protected: static constexpr u32 MAIL_CMDLIST_MASK = 0xFFFF0000; // 32 * 5 because 32 samples per millisecond, for max 5 milliseconds. - int m_samples_main_left[32 * 5]{}; - int m_samples_main_right[32 * 5]{}; - int m_samples_main_surround[32 * 5]{}; - int m_samples_auxA_left[32 * 5]{}; - int m_samples_auxA_right[32 * 5]{}; - int m_samples_auxA_surround[32 * 5]{}; - int m_samples_auxB_left[32 * 5]{}; - int m_samples_auxB_right[32 * 5]{}; - int m_samples_auxB_surround[32 * 5]{}; + alignas(32) int m_samples_main_left[32 * 5]{}; + alignas(32) int m_samples_main_right[32 * 5]{}; + alignas(32) int m_samples_main_surround[32 * 5]{}; + alignas(32) int m_samples_auxA_left[32 * 5]{}; + alignas(32) int m_samples_auxA_right[32 * 5]{}; + alignas(32) int m_samples_auxA_surround[32 * 5]{}; + alignas(32) int m_samples_auxB_left[32 * 5]{}; + alignas(32) int m_samples_auxB_right[32 * 5]{}; + alignas(32) int m_samples_auxB_surround[32 * 5]{}; u16 m_cmdlist[512]{}; u32 m_cmdlist_size = 0; diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/AXStructs.h b/Source/Core/Core/HW/DSPHLE/UCodes/AXStructs.h index 701d692e1c..c61fd5f7de 100644 --- a/Source/Core/Core/HW/DSPHLE/UCodes/AXStructs.h +++ b/Source/Core/Core/HW/DSPHLE/UCodes/AXStructs.h @@ -10,7 +10,7 @@ namespace DSP::HLE struct VolumeData { u16 volume; - u16 volume_delta; + s16 volume_delta; }; struct PBMixer @@ -198,7 +198,7 @@ struct PBLowPassFilter u16 b0; }; -struct AXPB +struct alignas(32) AXPB { u16 next_pb_hi; u16 next_pb_lo; @@ -255,7 +255,7 @@ union PBInfImpulseResponseWM PBBiquadFilter biquad; }; -struct AXPBWii +struct alignas(32) AXPBWii { u16 next_pb_hi; u16 next_pb_lo; diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/AXVoice.h b/Source/Core/Core/HW/DSPHLE/UCodes/AXVoice.h index 87c569a6e4..89b648afad 100644 --- a/Source/Core/Core/HW/DSPHLE/UCodes/AXVoice.h +++ b/Source/Core/Core/HW/DSPHLE/UCodes/AXVoice.h @@ -355,10 +355,11 @@ s16 ClampS16(s64 sample) } // Add samples to an output buffer, with optional volume ramping. -void MixAdd(int* out, const s16* input, u32 count, VolumeData* vd, s16* dpop, bool ramp) +template +static void MixAdd(int* out, const s16* input, VolumeData* vd, s16* dpop, bool ramp) { - u16& volume = vd->volume; - u16 volume_delta = vd->volume_delta; + u16 volume = vd->volume; + s16 volume_delta = vd->volume_delta; // If volume ramping is disabled, set volume_delta to 0. That way, the // mixing loop can avoid testing if volume ramping is enabled at each step, @@ -366,18 +367,61 @@ void MixAdd(int* out, const s16* input, u32 count, VolumeData* vd, s16* dpop, bo if (!ramp) volume_delta = 0; - for (u32 i = 0; i < count; ++i) +#ifdef __AVX2__ + if constexpr ((count & 15) == 0) { - s64 sample = input[i]; - sample *= volume; - sample >>= 15; - s16 sample16 = ClampS16((s32)sample); + out = std::assume_aligned<32>(out); + input = std::assume_aligned<32>(input); - out[i] += sample16; - volume += volume_delta; + auto vol = _mm256_set1_epi16(volume); + const auto delta = _mm256_set1_epi16(volume_delta); - *dpop = sample16; + // Vectorize the volume. + const auto iota = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + vol = _mm256_add_epi16(vol, _mm256_mullo_epi16(delta, iota)); + + // Each loop iteration processes 16 samples. + const auto delta16 = _mm256_slli_epi16(delta, 4); + + for (u32 i = 0; i < count; i += 16) + { + const auto val = *(__m256i*)&input[i]; + const auto dst = (__m256i*)&out[i]; + + // mulhrs is signed * signed but we need signed * unsigned, + // so drop the top bit and adjust the product if it was set. + const auto mul = _mm256_mulhrs_epi16(val, _mm256_and_si256(vol, _mm256_set1_epi16(0x7FFF))); + const auto add = _mm256_adds_epi16(mul, _mm256_and_si256(val, _mm256_srai_epi16(vol, 15))); + + // Sign-extend to 32-bit. + const auto lo = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(add, 0)); + const auto hi = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(add, 1)); + + // Add to the output. + dst[0] = _mm256_add_epi32(dst[0], lo); + dst[1] = _mm256_add_epi32(dst[1], hi); + + // Update the volume for the next iteration. + vol = _mm256_add_epi16(vol, delta16); + } } + else +#endif + { + for (u32 i = 0; i < count; ++i) + { + s32 sample = input[i]; + sample *= volume; + sample >>= 15; + s16 sample16 = ClampS16(sample); + + out[i] += (s16)sample16; + volume += volume_delta; + } + } + + vd->volume += volume_delta * count; + *dpop = out[count - 1]; } // Execute a low pass filter on the samples using one history value. @@ -418,7 +462,8 @@ static void BiquadFilter(s16* samples, u32 count, PBBiquadFilter& f) // Process 1ms of audio (for AX GC) or 3ms of audio (for AX Wii) from a PB and // mix it to the output buffers. -void ProcessVoice(HLEAccelerator* accelerator, PB_TYPE& pb, const AXBuffers& buffers, u16 count, +template +void ProcessVoice(HLEAccelerator* accelerator, PB_TYPE& pb, const AXBuffers& buffers, AXMixControl mctrl, const s16* coeffs, bool new_filter) { // If the voice is not running, nothing to do. @@ -426,7 +471,7 @@ void ProcessVoice(HLEAccelerator* accelerator, PB_TYPE& pb, const AXBuffers& buf return; // Read input samples, performing sample rate conversion if needed. - s16 samples[MAX_SAMPLES_PER_FRAME]; + alignas(32) s16 samples[MAX_SAMPLES_PER_FRAME]; GetInputSamples(accelerator, pb, samples, count, coeffs); // Apply a global volume ramp using the volume envelope parameters. @@ -465,67 +510,67 @@ void ProcessVoice(HLEAccelerator* accelerator, PB_TYPE& pb, const AXBuffers& buf if (MIX_ON(MAIN_L)) { - MixAdd(buffers.main_left, samples, count, &pb.mixer.main_left, &pb.dpop.main_left, - RAMP_ON(MAIN_L)); + MixAdd(buffers.main_left, samples, &pb.mixer.main_left, &pb.dpop.main_left, + RAMP_ON(MAIN_L)); } if (MIX_ON(MAIN_R)) { - MixAdd(buffers.main_right, samples, count, &pb.mixer.main_right, &pb.dpop.main_right, - RAMP_ON(MAIN_R)); + MixAdd(buffers.main_right, samples, &pb.mixer.main_right, &pb.dpop.main_right, + RAMP_ON(MAIN_R)); } if (MIX_ON(MAIN_S)) { - MixAdd(buffers.main_surround, samples, count, &pb.mixer.main_surround, &pb.dpop.main_surround, - RAMP_ON(MAIN_S)); + MixAdd(buffers.main_surround, samples, &pb.mixer.main_surround, &pb.dpop.main_surround, + RAMP_ON(MAIN_S)); } if (MIX_ON(AUXA_L)) { - MixAdd(buffers.auxA_left, samples, count, &pb.mixer.auxA_left, &pb.dpop.auxA_left, - RAMP_ON(AUXA_L)); + MixAdd(buffers.auxA_left, samples, &pb.mixer.auxA_left, &pb.dpop.auxA_left, + RAMP_ON(AUXA_L)); } if (MIX_ON(AUXA_R)) { - MixAdd(buffers.auxA_right, samples, count, &pb.mixer.auxA_right, &pb.dpop.auxA_right, - RAMP_ON(AUXA_R)); + MixAdd(buffers.auxA_right, samples, &pb.mixer.auxA_right, &pb.dpop.auxA_right, + RAMP_ON(AUXA_R)); } if (MIX_ON(AUXA_S)) { - MixAdd(buffers.auxA_surround, samples, count, &pb.mixer.auxA_surround, &pb.dpop.auxA_surround, - RAMP_ON(AUXA_S)); + MixAdd(buffers.auxA_surround, samples, &pb.mixer.auxA_surround, &pb.dpop.auxA_surround, + RAMP_ON(AUXA_S)); } if (MIX_ON(AUXB_L)) { - MixAdd(buffers.auxB_left, samples, count, &pb.mixer.auxB_left, &pb.dpop.auxB_left, - RAMP_ON(AUXB_L)); + MixAdd(buffers.auxB_left, samples, &pb.mixer.auxB_left, &pb.dpop.auxB_left, + RAMP_ON(AUXB_L)); } if (MIX_ON(AUXB_R)) { - MixAdd(buffers.auxB_right, samples, count, &pb.mixer.auxB_right, &pb.dpop.auxB_right, - RAMP_ON(AUXB_R)); + MixAdd(buffers.auxB_right, samples, &pb.mixer.auxB_right, &pb.dpop.auxB_right, + RAMP_ON(AUXB_R)); } if (MIX_ON(AUXB_S)) { - MixAdd(buffers.auxB_surround, samples, count, &pb.mixer.auxB_surround, &pb.dpop.auxB_surround, - RAMP_ON(AUXB_S)); + MixAdd(buffers.auxB_surround, samples, &pb.mixer.auxB_surround, &pb.dpop.auxB_surround, + RAMP_ON(AUXB_S)); } #ifdef AX_WII if (MIX_ON(AUXC_L)) { - MixAdd(buffers.auxC_left, samples, count, &pb.mixer.auxC_left, &pb.dpop.auxC_left, - RAMP_ON(AUXC_L)); + MixAdd(buffers.auxC_left, samples, &pb.mixer.auxC_left, &pb.dpop.auxC_left, + RAMP_ON(AUXC_L)); } if (MIX_ON(AUXC_R)) { - MixAdd(buffers.auxC_right, samples, count, &pb.mixer.auxC_right, &pb.dpop.auxC_right, - RAMP_ON(AUXC_R)); + MixAdd(buffers.auxC_right, samples, &pb.mixer.auxC_right, &pb.dpop.auxC_right, + RAMP_ON(AUXC_R)); } if (MIX_ON(AUXC_S)) { - MixAdd(buffers.auxC_surround, samples, count, &pb.mixer.auxC_surround, &pb.dpop.auxC_surround, - RAMP_ON(AUXC_S)); + MixAdd(buffers.auxC_surround, samples, &pb.mixer.auxC_surround, &pb.dpop.auxC_surround, + RAMP_ON(AUXC_S)); } #endif @@ -559,10 +604,10 @@ void ProcessVoice(HLEAccelerator* accelerator, PB_TYPE& pb, const AXBuffers& buf } // Old AXWii versions process ms per ms. - u16 wm_count = count == 96 ? 18 : 6; + constexpr u16 wm_count = count == 96 ? 18 : 6; // Interpolate at most 18 samples from the 96 samples we read before. - s16 wm_samples[18]; + alignas(32) s16 wm_samples[18]; // We use ratio 0x55555 == (5 * 65536 + 21845) / 65536 == 5.3333 which // is the nearest we can get to 96/18 @@ -576,29 +621,29 @@ void ProcessVoice(HLEAccelerator* accelerator, PB_TYPE& pb, const AXBuffers& buf #define WMCHAN_MIX_RAMP(n) (0 != ((pb.remote_mixer_control >> (2 * n)) & 2)) if (WMCHAN_MIX_ON(0)) - MixAdd(buffers.wm_main0, wm_samples, wm_count, &pb.remote_mixer.main0, &pb.remote_dpop.main0, - WMCHAN_MIX_RAMP(0)); + MixAdd(buffers.wm_main0, wm_samples, &pb.remote_mixer.main0, &pb.remote_dpop.main0, + WMCHAN_MIX_RAMP(0)); if (WMCHAN_MIX_ON(1)) - MixAdd(buffers.wm_aux0, wm_samples, wm_count, &pb.remote_mixer.aux0, &pb.remote_dpop.aux0, - WMCHAN_MIX_RAMP(1)); + MixAdd(buffers.wm_aux0, wm_samples, &pb.remote_mixer.aux0, &pb.remote_dpop.aux0, + WMCHAN_MIX_RAMP(1)); if (WMCHAN_MIX_ON(2)) - MixAdd(buffers.wm_main1, wm_samples, wm_count, &pb.remote_mixer.main1, &pb.remote_dpop.main1, - WMCHAN_MIX_RAMP(2)); + MixAdd(buffers.wm_main1, wm_samples, &pb.remote_mixer.main1, &pb.remote_dpop.main1, + WMCHAN_MIX_RAMP(2)); if (WMCHAN_MIX_ON(3)) - MixAdd(buffers.wm_aux1, wm_samples, wm_count, &pb.remote_mixer.aux1, &pb.remote_dpop.aux1, - WMCHAN_MIX_RAMP(3)); + MixAdd(buffers.wm_aux1, wm_samples, &pb.remote_mixer.aux1, &pb.remote_dpop.aux1, + WMCHAN_MIX_RAMP(3)); if (WMCHAN_MIX_ON(4)) - MixAdd(buffers.wm_main2, wm_samples, wm_count, &pb.remote_mixer.main2, &pb.remote_dpop.main2, - WMCHAN_MIX_RAMP(4)); + MixAdd(buffers.wm_main2, wm_samples, &pb.remote_mixer.main2, &pb.remote_dpop.main2, + WMCHAN_MIX_RAMP(4)); if (WMCHAN_MIX_ON(5)) - MixAdd(buffers.wm_aux2, wm_samples, wm_count, &pb.remote_mixer.aux2, &pb.remote_dpop.aux2, - WMCHAN_MIX_RAMP(5)); + MixAdd(buffers.wm_aux2, wm_samples, &pb.remote_mixer.aux2, &pb.remote_dpop.aux2, + WMCHAN_MIX_RAMP(5)); if (WMCHAN_MIX_ON(6)) - MixAdd(buffers.wm_main3, wm_samples, wm_count, &pb.remote_mixer.main3, &pb.remote_dpop.main3, - WMCHAN_MIX_RAMP(6)); + MixAdd(buffers.wm_main3, wm_samples, &pb.remote_mixer.main3, &pb.remote_dpop.main3, + WMCHAN_MIX_RAMP(6)); if (WMCHAN_MIX_ON(7)) - MixAdd(buffers.wm_aux3, wm_samples, wm_count, &pb.remote_mixer.aux3, &pb.remote_dpop.aux3, - WMCHAN_MIX_RAMP(7)); + MixAdd(buffers.wm_aux3, wm_samples, &pb.remote_mixer.aux3, &pb.remote_dpop.aux3, + WMCHAN_MIX_RAMP(7)); } #undef WMCHAN_MIX_RAMP #undef WMCHAN_MIX_ON diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.cpp b/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.cpp index 04dee941ca..d57fdb4d41 100644 --- a/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.cpp +++ b/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.cpp @@ -459,9 +459,9 @@ void AXWiiUCode::ProcessPBList(u32 pb_addr) for (int curr_ms = 0; curr_ms < 3; ++curr_ms) { ApplyUpdatesForMs(curr_ms, pb, pb.updates.num_updates, updates); - ProcessVoice(static_cast(m_accelerator.get()), pb, buffers, spms, - ConvertMixerControl(HILO_TO_32(pb.mixer_control)), - m_coeffs_checksum ? m_coeffs.data() : nullptr, m_new_filter); + ProcessVoice(static_cast(m_accelerator.get()), pb, buffers, + ConvertMixerControl(HILO_TO_32(pb.mixer_control)), + m_coeffs_checksum ? m_coeffs.data() : nullptr, m_new_filter); // Forward the buffers for (auto& ptr : buffers.regular_ptrs) @@ -472,9 +472,9 @@ void AXWiiUCode::ProcessPBList(u32 pb_addr) } else { - ProcessVoice(static_cast(m_accelerator.get()), pb, buffers, 96, - ConvertMixerControl(HILO_TO_32(pb.mixer_control)), - m_coeffs_checksum ? m_coeffs.data() : nullptr, m_new_filter); + ProcessVoice<96>(static_cast(m_accelerator.get()), pb, buffers, + ConvertMixerControl(HILO_TO_32(pb.mixer_control)), + m_coeffs_checksum ? m_coeffs.data() : nullptr, m_new_filter); } WritePB(memory, pb_addr, pb); diff --git a/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.h b/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.h index b3c43b8012..cfa9bd63a7 100644 --- a/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.h +++ b/Source/Core/Core/HW/DSPHLE/UCodes/AXWii.h @@ -21,19 +21,19 @@ public: protected: // Additional AUX buffers - int m_samples_auxC_left[32 * 3]{}; - int m_samples_auxC_right[32 * 3]{}; - int m_samples_auxC_surround[32 * 3]{}; + alignas(32) int m_samples_auxC_left[32 * 3]{}; + alignas(32) int m_samples_auxC_right[32 * 3]{}; + alignas(32) int m_samples_auxC_surround[32 * 3]{}; // Wiimote buffers - int m_samples_wm0[6 * 3]{}; - int m_samples_aux0[6 * 3]{}; - int m_samples_wm1[6 * 3]{}; - int m_samples_aux1[6 * 3]{}; - int m_samples_wm2[6 * 3]{}; - int m_samples_aux2[6 * 3]{}; - int m_samples_wm3[6 * 3]{}; - int m_samples_aux3[6 * 3]{}; + alignas(32) int m_samples_wm0[6 * 3]{}; + alignas(32) int m_samples_aux0[6 * 3]{}; + alignas(32) int m_samples_wm1[6 * 3]{}; + alignas(32) int m_samples_aux1[6 * 3]{}; + alignas(32) int m_samples_wm2[6 * 3]{}; + alignas(32) int m_samples_aux2[6 * 3]{}; + alignas(32) int m_samples_wm3[6 * 3]{}; + alignas(32) int m_samples_aux3[6 * 3]{}; // Are we implementing an old version of AXWii which still has updates? bool m_old_axwii = false;