Merge branch 'master' into SPU2

This commit is contained in:
Elad 2024-12-19 17:08:37 +02:00 committed by GitHub
commit 21bc3e8200
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
42 changed files with 1672 additions and 150 deletions

2
3rdparty/7zip/7zip vendored

@ -1 +1 @@
Subproject commit e008ce3976c087bfd21344af8f00a23cf69d4174
Subproject commit e5431fa6f5505e385c6f9367260717e9c47dc2ee

2
3rdparty/FAudio vendored

@ -1 +1 @@
Subproject commit 74d45e615c2e7510c7e0f2ccb91dc6d7ccae4bec
Subproject commit b7c2e109ea86b82109244c9c4569ce9ad0c884df

@ -1 +1 @@
Subproject commit d3875f333fb6abe2f39d82caca329414871ae53b
Subproject commit 90191edd20bb877c5cbddfdac7ec0fe49ad93727

2
3rdparty/curl/curl vendored

@ -1 +1 @@
Subproject commit b1ef0e1a01c0bb6ee5367bd9c186a603bde3615a
Subproject commit 75a2079d5c28debb2eaa848ca9430f1fe0d7844c

@ -1 +1 @@
Subproject commit c98c4fbff6d8f3016a3ce6685bf8f43433c3efcc
Subproject commit 9c821dc21ccbd69b2bda421fdb35cb4ae2da8f5e

View file

@ -506,6 +506,7 @@ target_sources(rpcs3_emu PRIVATE
RSX/GL/OpenGL.cpp
RSX/GL/upscalers/fsr1/fsr_pass.cpp
RSX/GSRender.cpp
RSX/Host/MM.cpp
RSX/Host/RSXDMAWriter.cpp
RSX/Null/NullGSRender.cpp
RSX/NV47/FW/draw_call.cpp

File diff suppressed because it is too large Load diff

View file

@ -253,7 +253,7 @@ enum CellAdecSampleRate : s32
CELL_ADEC_FS_8kHz,
};
enum CellAdecBitLength : s32
enum CellAdecBitLength : u32
{
CELL_ADEC_BIT_LENGTH_RESERVED1,
CELL_ADEC_BIT_LENGTH_16,
@ -352,8 +352,8 @@ enum AdecCorrectPtsValueType : s8
ADEC_CORRECT_PTS_VALUE_TYPE_UNSPECIFIED = -1,
// Adds a fixed amount
ADEC_CORRECT_PTS_VALUE_TYPE_LPCM = 0,
// 1
ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_HDMV = 0,
ADEC_CORRECT_PTS_VALUE_TYPE_LPCM_DVD = 1, // Unused for some reason, the DVD player probably takes care of timestamps itself
ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_48000Hz = 2,
ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_44100Hz = 3,
ADEC_CORRECT_PTS_VALUE_TYPE_ATRACX_32000Hz = 4,
@ -562,6 +562,11 @@ public:
{
ensure(sys_mutex_lock(ppu, mutex, 0) == CELL_OK); // Error code isn't checked on LLE
if (ppu.state & cpu_flag::again) // Savestate was created while waiting on the mutex
{
return {};
}
if (entries[front].state == 0xff)
{
ensure(sys_mutex_unlock(ppu, mutex) == CELL_OK); // Error code isn't checked on LLE
@ -648,6 +653,20 @@ static_assert(std::is_standard_layout_v<AdecContext> && std::is_trivial_v<AdecCo
CHECK_SIZE_ALIGN(AdecContext, 0x530, 8);
enum : u32
{
CELL_ADEC_LPCM_DVD_CH_RESERVED1,
CELL_ADEC_LPCM_DVD_CH_MONO,
CELL_ADEC_LPCM_DVD_CH_RESERVED2,
CELL_ADEC_LPCM_DVD_CH_STEREO,
CELL_ADEC_LPCM_DVD_CH_UNK1, // Either 3 front or 2 front + 1 surround
CELL_ADEC_LPCM_DVD_CH_UNK2, // Either 3 front + 1 surround or 2 front + 2 surround
CELL_ADEC_LPCM_DVD_CH_3_2,
CELL_ADEC_LPCM_DVD_CH_3_2_LFE,
CELL_ADEC_LPCM_DVD_CH_3_4,
CELL_ADEC_LPCM_DVD_CH_3_4_LFE,
};
struct CellAdecParamLpcm
{
be_t<u32> channelNumber;
@ -664,6 +683,216 @@ struct CellAdecLpcmInfo
be_t<u32> outputDataSize;
};
// HLE exclusive, for savestates
enum class lpcm_dec_state : u8
{
waiting_for_cmd_mutex_lock,
waiting_for_cmd_cond_wait,
waiting_for_output_mutex_lock,
waiting_for_output_cond_wait,
queue_mutex_lock,
executing_cmd
};
class LpcmDecSemaphore
{
be_t<u32> value;
be_t<u32> mutex; // sys_mutex_t
be_t<u32> cond; // sys_cond_t
public:
error_code init(ppu_thread& ppu, vm::ptr<LpcmDecSemaphore> _this, u32 initial_value)
{
value = initial_value;
const vm::var<sys_mutex_attribute_t> mutex_attr{{ SYS_SYNC_PRIORITY, SYS_SYNC_NOT_RECURSIVE, SYS_SYNC_NOT_PROCESS_SHARED, SYS_SYNC_NOT_ADAPTIVE, 0, 0, 0, { "_adem01"_u64 } }};
const vm::var<sys_cond_attribute_t> cond_attr{{ SYS_SYNC_NOT_PROCESS_SHARED, 0, 0, { "_adec01"_u64 } }};
if (error_code ret = sys_mutex_create(ppu, _this.ptr(&LpcmDecSemaphore::mutex), mutex_attr); ret != CELL_OK)
{
return ret;
}
return sys_cond_create(ppu, _this.ptr(&LpcmDecSemaphore::cond), mutex, cond_attr);
}
error_code finalize(ppu_thread& ppu) const
{
if (error_code ret = sys_cond_destroy(ppu, cond); ret != CELL_OK)
{
return ret;
}
return sys_mutex_destroy(ppu, mutex);
}
error_code release(ppu_thread& ppu)
{
if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK)
{
return ret;
}
value++;
if (error_code ret = sys_cond_signal(ppu, cond); ret != CELL_OK)
{
return ret; // LLE doesn't unlock the mutex
}
return sys_mutex_unlock(ppu, mutex);
}
error_code acquire(ppu_thread& ppu, lpcm_dec_state& savestate)
{
if (savestate == lpcm_dec_state::waiting_for_cmd_cond_wait)
{
goto cond_wait;
}
savestate = lpcm_dec_state::waiting_for_cmd_mutex_lock;
if (error_code ret = sys_mutex_lock(ppu, mutex, 0); ret != CELL_OK)
{
return ret;
}
if (ppu.state & cpu_flag::again)
{
return {};
}
if (value == 0u)
{
savestate = lpcm_dec_state::waiting_for_cmd_cond_wait;
cond_wait:
if (error_code ret = sys_cond_wait(ppu, cond, 0); ret != CELL_OK)
{
return ret; // LLE doesn't unlock the mutex
}
if (ppu.state & cpu_flag::again)
{
return {};
}
}
value--;
return sys_mutex_unlock(ppu, mutex);
}
};
CHECK_SIZE(LpcmDecSemaphore, 0xc);
enum class LpcmDecCmdType : u32
{
start_seq,
end_seq,
decode_au,
close
};
struct LpcmDecCmd
{
be_t<s32> pcm_handle;
vm::bcptr<void> au_start_addr;
be_t<u32> au_size;
u32 reserved1[2];
CellAdecParamLpcm lpcm_param;
be_t<LpcmDecCmdType> type;
u32 reserved2;
LpcmDecCmd() = default; // cellAdecOpen()
LpcmDecCmd(LpcmDecCmdType&& type) // End sequence
: type(type)
{
}
LpcmDecCmd(LpcmDecCmdType&& type, const CellAdecParamLpcm& lpcm_param) // Start sequence
: lpcm_param(lpcm_param), type(type)
{
}
LpcmDecCmd(LpcmDecCmdType&& type, const s32& pcm_handle, const CellAdecAuInfo& au_info) // Decode au
: pcm_handle(pcm_handle), au_start_addr(au_info.startAddr), au_size(au_info.size), type(type)
{
}
};
CHECK_SIZE(LpcmDecCmd, 0x2c);
struct LpcmDecContext
{
AdecCmdQueue<LpcmDecCmd> cmd_queue;
be_t<u64> thread_id; // sys_ppu_thread_t
be_t<u32> queue_size_mutex; // sys_mutex_t
be_t<u32> queue_size_cond; // sys_cond_t, unused
be_t<u32> unk_mutex; // sys_mutex_t, unused
be_t<u32> unk_cond; // sys_cond_t, unused
be_t<u32> run_thread;
AdecCb<AdecNotifyAuDone> notify_au_done;
AdecCb<AdecNotifyPcmOut> notify_pcm_out;
AdecCb<AdecNotifyError> notify_error;
AdecCb<AdecNotifySeqDone> notify_seq_done;
be_t<u32> output_locked;
vm::bptr<f32> output;
vm::bptr<CellAdecParamLpcm> lpcm_param;
vm::bcptr<void> spurs_cmd_data;
// HLE exclusive
lpcm_dec_state savestate;
u64 cmd_counter; // For debugging
u8 reserved1[24]; // 36 bytes on LLE
be_t<u32> output_mutex; // sys_mutex_t
be_t<u32> output_consumed; // sys_cond_t
LpcmDecSemaphore cmd_available;
LpcmDecSemaphore reserved2; // Unused
be_t<u32> queue_mutex; // sys_mutex_t
be_t<u32> error_occurred;
u8 spurs_stuff[32];
be_t<u32> spurs_queue_pop_mutex;
be_t<u32> spurs_queue_push_mutex;
be_t<u32> using_existing_spurs_instance;
be_t<u32> dvd_packing;
be_t<u32> output_size;
LpcmDecCmd cmd; // HLE exclusive, name of Spurs taskset (32 bytes) + CellSpursTaskLsPattern on LLE
u8 more_spurs_stuff[10]; // 52 bytes on LLE
void exec(ppu_thread& ppu);
template <LpcmDecCmdType type>
error_code send_command(ppu_thread& ppu, auto&&... args);
inline error_code release_output(ppu_thread& ppu);
};
static_assert(std::is_standard_layout_v<LpcmDecContext>);
CHECK_SIZE_ALIGN(LpcmDecContext, 0x1c8, 8);
constexpr s32 LPCM_DEC_OUTPUT_BUFFER_SIZE = 0x40000;
// CELP Excitation Mode
enum CELP_ExcitationMode : s32
{

View file

@ -784,26 +784,26 @@ s32 cellCameraIsAttached(s32 dev_num)
if (g_cfg.io.camera == camera_handler::null)
{
return false;
return 0;
}
auto& g_camera = g_fxo->get<camera_thread>();
if (!g_camera.init)
{
return false;
return 0;
}
if (!check_dev_num(dev_num))
{
return false;
return 0;
}
vm::var<s32> type;
if (cellCameraGetType(dev_num, type) != CELL_OK)
{
return false;
return 0;
}
std::lock_guard lock(g_camera.mutex);
@ -821,12 +821,12 @@ s32 cellCameraIsAttached(s32 dev_num)
}
}
return is_attached;
return is_attached ? 1 : 0;
}
s32 cellCameraIsOpen(s32 dev_num)
{
cellCamera.notice("cellCameraIsOpen(dev_num=%d)", dev_num);
cellCamera.trace("cellCameraIsOpen(dev_num=%d)", dev_num);
if (g_cfg.io.camera == camera_handler::null)
{
@ -852,7 +852,7 @@ s32 cellCameraIsOpen(s32 dev_num)
s32 cellCameraIsStarted(s32 dev_num)
{
cellCamera.notice("cellCameraIsStarted(dev_num=%d)", dev_num);
cellCamera.trace("cellCameraIsStarted(dev_num=%d)", dev_num);
if (g_cfg.io.camera == camera_handler::null)
{

View file

@ -956,9 +956,9 @@ static inline void pos_to_gem_state(u32 gem_num, gem_config::gem_controller& con
static constexpr f32 PI = 3.14159265f;
const auto degree_to_rad = [](f32 degree) -> f32 { return degree * PI / 180.0f; };
static constexpr f32 CONE = 10.0f / 2.0f;
const f32 roll = -degree_to_rad((image_y - half_height) / half_height * CONE); // This is actually the pitch
const f32 pitch = -degree_to_rad((image_x - half_width) / half_width * CONE); // This is actually the yaw
const f32 max_angle_per_side = g_cfg.io.fake_move_rotation_cone / 2.0f;
const f32 roll = -degree_to_rad((image_y - half_height) / half_height * max_angle_per_side); // This is actually the pitch
const f32 pitch = -degree_to_rad((image_x - half_width) / half_width * max_angle_per_side); // This is actually the yaw
const f32 yaw = degree_to_rad(0.0f);
const f32 cr = std::cos(roll * 0.5f);
const f32 sr = std::sin(roll * 0.5f);

View file

@ -333,6 +333,10 @@ void lv2_socket_p2p::close()
auto& nc = g_fxo->get<p2p_context>();
{
std::lock_guard lock(nc.list_p2p_ports_mutex);
if (!nc.list_p2p_ports.contains(port))
return;
auto& p2p_port = ::at32(nc.list_p2p_ports, port);
{
std::lock_guard lock(p2p_port.bound_p2p_vports_mutex);

View file

@ -35,7 +35,7 @@ extern const std::map<std::string_view, int> g_prx_list
{ "libaacenc_spurs.sprx", 0 },
{ "libac3dec.sprx", 0 },
{ "libac3dec2.sprx", 0 },
{ "libadec.sprx", 0 },
{ "libadec.sprx", 1 },
{ "libadec2.sprx", 0 },
{ "libadec_internal.sprx", 0 },
{ "libad_async.sprx", 0 },

View file

@ -7,10 +7,10 @@ class null_camera_handler final : public camera_handler_base
public:
null_camera_handler() : camera_handler_base() {}
void open_camera() override { m_state = camera_handler_state::open; }
void close_camera() override { m_state = camera_handler_state::closed; }
void start_camera() override { m_state = camera_handler_state::running; }
void stop_camera() override { m_state = camera_handler_state::open; }
void open_camera() override { set_state(camera_handler_state::open); }
void close_camera() override { set_state(camera_handler_state::closed); }
void start_camera() override { set_state(camera_handler_state::running); }
void stop_camera() override { set_state(camera_handler_state::open); }
void set_format(s32 format, u32 bytesize) override
{
@ -45,6 +45,6 @@ public:
height = 0;
frame_number = 0;
bytes_read = 0;
return m_state;
return get_state();
}
};

View file

@ -30,22 +30,29 @@ public:
virtual u64 frame_number() const = 0; // Convenience function to check if there's a new frame.
virtual camera_handler_state get_image(u8* buf, u64 size, u32& width, u32& height, u64& frame_number, u64& bytes_read) = 0;
camera_handler_state get_state() const { return m_state.load(); };
camera_handler_state get_state() const { return m_state.load(); }
void set_state(camera_handler_state state) { m_state = m_state_expected = state; }
bool mirrored() const { return m_mirrored; };
s32 format() const { return m_format; };
u32 bytesize() const { return m_bytesize; };
u32 width() const { return m_width; };
u32 height() const { return m_height; };
u32 frame_rate() const { return m_frame_rate; };
camera_handler_state get_expected_state() const { return m_state_expected.load(); }
void set_expected_state(camera_handler_state state) { m_state_expected = state; }
bool mirrored() const { return m_mirrored; }
s32 format() const { return m_format; }
u32 bytesize() const { return m_bytesize; }
u32 width() const { return m_width; }
u32 height() const { return m_height; }
u32 frame_rate() const { return m_frame_rate; }
protected:
std::mutex m_mutex;
atomic_t<camera_handler_state> m_state = camera_handler_state::closed;
bool m_mirrored = false;
s32 m_format = 2; // CELL_CAMERA_RAW8
u32 m_bytesize = 0;
u32 m_width = 640;
u32 m_height = 480;
u32 m_frame_rate = 30;
private:
atomic_t<camera_handler_state> m_state = camera_handler_state::closed;
atomic_t<camera_handler_state> m_state_expected = camera_handler_state::closed;
};

View file

@ -5,6 +5,7 @@
#include "TextureUtils.h"
#include "Emu/Memory/vm.h"
#include "Emu/RSX/Host/MM.h"
#include "util/vm.hpp"
#include <list>
@ -29,8 +30,7 @@ namespace rsx
{
ensure(range.is_page_range());
//rsx_log.error("memory_protect(0x%x, 0x%x, %x)", static_cast<u32>(range.start), static_cast<u32>(range.length()), static_cast<u32>(prot));
utils::memory_protect(vm::base(range.start), range.length(), prot);
rsx::mm_protect(vm::base(range.start), range.length(), prot);
#ifdef TEXTURE_CACHE_DEBUG
tex_cache_checker.set_protection(range, prot);

View file

@ -7,6 +7,7 @@
#include "Emu/Memory/vm_locking.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/Host/MM.h"
#include "Emu/RSX/Host/RSXDMAWriter.h"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
@ -1082,6 +1083,8 @@ void GLGSRender::patch_transform_constants(rsx::context* ctx, u32 index, u32 cou
bool GLGSRender::on_access_violation(u32 address, bool is_writing)
{
rsx::mm_flush(address);
const bool can_flush = is_current_thread();
const rsx::invalidation_cause cause = is_writing
? (can_flush ? rsx::invalidation_cause::write : rsx::invalidation_cause::deferred_write)

110
rpcs3/Emu/RSX/Host/MM.cpp Normal file
View file

@ -0,0 +1,110 @@
#include "stdafx.h"
#include "MM.h"
#include <Emu/RSX/Common/simple_array.hpp>
#include <Emu/RSX/RSXOffload.h>
#include <Emu/Memory/vm.h>
#include <Emu/IdManager.h>
#include <Emu/system_config.h>
#include <Utilities/address_range.h>
#include <Utilities/mutex.h>
namespace rsx
{
rsx::simple_array<MM_block> g_deferred_mprotect_queue;
shared_mutex g_mprotect_queue_lock;
void mm_flush_mprotect_queue_internal()
{
for (const auto& block : g_deferred_mprotect_queue)
{
utils::memory_protect(reinterpret_cast<void*>(block.start), block.length, block.prot);
}
g_deferred_mprotect_queue.clear();
}
void mm_defer_mprotect_internal(u64 start, u64 length, utils::protection prot)
{
// We could stack and merge requests here, but that is more trouble than it is truly worth.
// A fresh call to memory_protect only takes a few nanoseconds of setup overhead, it is not worth the risk of hanging because of conflicts.
g_deferred_mprotect_queue.push_back({ start, length, prot });
}
void mm_protect(void* ptr, u64 length, utils::protection prot)
{
if (g_cfg.video.disable_async_host_memory_manager)
{
utils::memory_protect(ptr, length, prot);
return;
}
// Naive merge. Eventually it makes more sense to do conflict resolution, but it's not as important.
const auto start = reinterpret_cast<u64>(ptr);
const auto end = start + length;
std::lock_guard lock(g_mprotect_queue_lock);
if (prot == utils::protection::rw || prot == utils::protection::wx)
{
// Basically an unlock op. Flush if any overlap is detected
for (const auto& block : g_deferred_mprotect_queue)
{
if (block.overlaps(start, end))
{
mm_flush_mprotect_queue_internal();
break;
}
}
utils::memory_protect(ptr, length, prot);
return;
}
// No, Ro, etc.
mm_defer_mprotect_internal(start, length, prot);
}
void mm_flush()
{
std::lock_guard lock(g_mprotect_queue_lock);
mm_flush_mprotect_queue_internal();
}
void mm_flush(u32 vm_address)
{
std::lock_guard lock(g_mprotect_queue_lock);
if (g_deferred_mprotect_queue.empty())
{
return;
}
const auto addr = reinterpret_cast<u64>(vm::base(vm_address));
for (const auto& block : g_deferred_mprotect_queue)
{
if (block.overlaps(addr))
{
mm_flush_mprotect_queue_internal();
return;
}
}
}
void mm_flush_lazy()
{
if (!g_cfg.video.multithreaded_rsx)
{
mm_flush();
return;
}
std::lock_guard lock(g_mprotect_queue_lock);
if (g_deferred_mprotect_queue.empty())
{
return;
}
auto& rsxdma = g_fxo->get<rsx::dma_manager>();
rsxdma.backend_ctrl(mm_backend_ctrl::cmd_mm_flush, nullptr);
}
}

40
rpcs3/Emu/RSX/Host/MM.h Normal file
View file

@ -0,0 +1,40 @@
#pragma once
#include <util/types.hpp>
#include <util/vm.hpp>
namespace rsx
{
struct MM_block
{
u64 start;
u64 length;
utils::protection prot;
inline bool overlaps(u64 start, u64 end) const
{
// [Start, End] is not a proper closed range, there is an off-by-one by design.
// FIXME: Use address_range64
const u64 this_end = this->start + this->length;
return (this->start < end && start < this_end);
}
inline bool overlaps(u64 addr) const
{
// [Start, End] is not a proper closed range, there is an off-by-one by design.
// FIXME: Use address_range64
const u64 this_end = this->start + this->length;
return (addr >= start && addr < this_end);
}
};
enum mm_backend_ctrl : u32
{
cmd_mm_flush = 0x81000000,
};
void mm_protect(void* start, u64 length, utils::protection prot);
void mm_flush_lazy();
void mm_flush(u32 vm_address);
void mm_flush();
}

View file

@ -7,6 +7,9 @@
namespace rsx
{
void mm_flush_lazy();
void mm_flush();
namespace util
{
template <bool FlushDMA, bool FlushPipe>
@ -24,17 +27,24 @@ namespace rsx
return;
}
if constexpr (FlushDMA)
if constexpr (FlushDMA || FlushPipe)
{
// If the backend handled the request, this call will basically be a NOP
g_fxo->get<rsx::dma_manager>().sync();
}
// Release op must be acoompanied by MM flush.
// FlushPipe implicitly does a MM flush but FlushDMA does not. Trigger the flush here
rsx::mm_flush();
if constexpr (FlushPipe)
{
// Manually flush the pipeline.
// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
RSX(ctx)->sync();
if constexpr (FlushDMA)
{
// If the backend handled the request, this call will basically be a NOP
g_fxo->get<rsx::dma_manager>().sync();
}
if constexpr (FlushPipe)
{
// Manually flush the pipeline.
// It is possible to stream report writes using the host GPU, but that generates too much submit traffic.
RSX(ctx)->sync();
}
}
if (handled)

View file

@ -83,6 +83,7 @@ namespace rsx
add_dropdown(&g_cfg.io.pad_mode, localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_MODE);
add_unsigned_slider(&g_cfg.io.pad_sleep, localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_SLEEP, " µs", 100);
add_unsigned_slider(&g_cfg.io.fake_move_rotation_cone, localized_string_id::HOME_MENU_SETTINGS_INPUT_FAKE_MOVE_ROTATION_CONE, "°", 1);
apply_layout();
}

View file

@ -9,6 +9,7 @@
#include "Common/time.hpp"
#include "Core/RSXReservationLock.hpp"
#include "Core/RSXEngLock.hpp"
#include "Host/MM.h"
#include "Host/RSXDMAWriter.h"
#include "NV47/HW/context.h"
#include "Program/GLSLCommon.h"
@ -2603,8 +2604,14 @@ namespace rsx
rsx_log.error("Depth texture bound to pipeline with unexpected format 0x%X", format);
}
}
else if (!backend_config.supports_hw_renormalization)
else if (!backend_config.supports_hw_renormalization /* &&
tex.min_filter() == rsx::texture_minify_filter::nearest &&
tex.mag_filter() == rsx::texture_magnify_filter::nearest*/)
{
// FIXME: This check should only apply to point-sampled textures. However, it severely regresses some games (id tech 5).
// This is because even when filtering is active, the error from the PS3 texture expansion still applies.
// A proper fix is to expand these formats into BGRA8 when high texture precision is required. That requires different GUI settings and inflation shaders, so it will be handled separately.
switch (format)
{
case CELL_GCM_TEXTURE_A1R5G5B5:
@ -3175,6 +3182,8 @@ namespace rsx
{
m_eng_interrupt_mask.clear(rsx::pipe_flush_interrupt);
mm_flush();
if (zcull_ctrl->has_pending())
{
zcull_ctrl->sync(this);
@ -3627,10 +3636,25 @@ namespace rsx
on_invalidate_memory_range(m_invalidated_memory_range, rsx::invalidation_cause::read);
}
// Host sync
rsx::mm_flush();
on_invalidate_memory_range(m_invalidated_memory_range, rsx::invalidation_cause::unmap);
m_invalidated_memory_range.invalidate();
}
void thread::renderctl(u32 request_code, void* args)
{
switch (request_code)
{
case rsx::mm_backend_ctrl::cmd_mm_flush:
rsx::mm_flush();
break;
default:
fmt::throw_exception("Unknown backend request: 0x%x", request_code);
}
}
//Pause/cont wrappers for FIFO ctrl. Never call this from rsx thread itself!
void thread::pause()
{
@ -3696,6 +3720,9 @@ namespace rsx
{
bool pause_emulator = false;
// MM sync. This is a pre-emptive operation, so we can use a deferred request.
rsx::mm_flush_lazy();
// Marks the end of a frame scope GPU-side
if (g_user_asked_for_frame_capture.exchange(false) && !capture_current_frame)
{

View file

@ -404,7 +404,7 @@ namespace rsx
virtual void notify_tile_unbound(u32 /*tile*/) {}
// control
virtual void renderctl(u32 /*request_code*/, void* /*args*/) {}
virtual void renderctl(u32 request_code, void* args);
// zcull
void notify_zcull_info_changed();

View file

@ -9,7 +9,7 @@ namespace vk
enum // callback commands
{
rctrl_queue_submit = 0x80000000,
rctrl_run_gc = 0x80000001
rctrl_run_gc = 0x80000001,
};
struct submit_packet

View file

@ -15,6 +15,7 @@
#include "vkutils/scratch.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/Host/MM.h"
#include "Emu/RSX/Host/RSXDMAWriter.h"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
#include "Emu/Memory/vm_locking.h"
@ -1010,6 +1011,8 @@ VKGSRender::~VKGSRender()
bool VKGSRender::on_access_violation(u32 address, bool is_writing)
{
rsx::mm_flush(address);
vk::texture_cache::thrashed_set result;
{
const rsx::invalidation_cause cause = is_writing ? rsx::invalidation_cause::deferred_write : rsx::invalidation_cause::deferred_read;
@ -2460,6 +2463,9 @@ void VKGSRender::close_and_submit_command_buffer(vk::fence* pFence, VkSemaphore
{
ensure(!m_queue_status.test_and_set(flush_queue_state::flushing));
// Host MM sync before executing anything on the GPU
rsx::mm_flush();
// Workaround for deadlock occuring during RSX offloader fault
// TODO: Restructure command submission infrastructure to avoid this condition
const bool sync_success = g_fxo->get<rsx::dma_manager>().sync();
@ -2824,7 +2830,7 @@ void VKGSRender::renderctl(u32 request_code, void* args)
break;
}
default:
fmt::throw_exception("Unhandled request code 0x%x", request_code);
rsx::thread::renderctl(request_code, args);
}
}

View file

@ -223,6 +223,7 @@ enum class localized_string_id
HOME_MENU_SETTINGS_INPUT_CAMERA_FLIP,
HOME_MENU_SETTINGS_INPUT_PAD_MODE,
HOME_MENU_SETTINGS_INPUT_PAD_SLEEP,
HOME_MENU_SETTINGS_INPUT_FAKE_MOVE_ROTATION_CONE,
HOME_MENU_SETTINGS_ADVANCED,
HOME_MENU_SETTINGS_ADVANCED_PREFERRED_SPU_THREADS,
HOME_MENU_SETTINGS_ADVANCED_MAX_CPU_PREEMPTIONS,

View file

@ -91,8 +91,8 @@ scoped_progress_dialog::scoped_progress_dialog(std::string text) noexcept
scoped_progress_dialog& scoped_progress_dialog::operator=(std::string text) noexcept
{
// Exchange text atomically
g_progr_text_queue[m_text_index].exchange(make_single_value(std::move(text)));
// Set text atomically
g_progr_text_queue[m_text_index].store(make_single_value(std::move(text)));
return *this;
}

View file

@ -178,6 +178,7 @@ struct cfg_root : cfg::node
cfg::_bool decr_memory_layout{ this, "DECR memory layout", false}; // Force enable increased allowed main memory range as DECR console
cfg::_bool host_label_synchronization{ this, "Allow Host GPU Labels", false };
cfg::_bool disable_msl_fast_math{ this, "Disable MSL Fast Math", false };
cfg::_bool disable_async_host_memory_manager{ this, "Disable Asynchronous Memory Manager", false, true };
cfg::_enum<output_scaling_mode> output_scaling{ this, "Output Scaling Mode", output_scaling_mode::bilinear, true };
struct node_vk : cfg::node
@ -282,6 +283,7 @@ struct cfg_root : cfg::node
cfg::string midi_devices{this, "Emulated Midi devices", "ßßß@@@ßßß@@@ßßß@@@"};
cfg::_bool load_sdl_mappings{ this, "Load SDL GameController Mappings", true };
cfg::_bool debug_overlay{ this, "IO Debug overlay", false, true };
cfg::uint<1, 180> fake_move_rotation_cone{ this, "Fake Move Rotation Cone", 10, true };
} io{ this };

View file

@ -104,6 +104,7 @@
<ClCompile Include="Emu\perf_monitor.cpp" />
<ClCompile Include="Emu\RSX\Common\texture_cache.cpp" />
<ClCompile Include="Emu\RSX\Core\RSXContext.cpp" />
<ClCompile Include="Emu\RSX\Host\MM.cpp" />
<ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp" />
<ClCompile Include="Emu\RSX\NV47\FW\draw_call.cpp" />
<ClCompile Include="Emu\RSX\NV47\FW\reg_context.cpp" />
@ -621,6 +622,7 @@
<ClInclude Include="Emu\RSX\Core\RSXDisplay.h" />
<ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp" />
<ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h" />
<ClInclude Include="Emu\RSX\Host\MM.h" />
<ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h" />
<ClInclude Include="Emu\RSX\NV47\FW\draw_call.hpp" />
<ClInclude Include="Emu\RSX\NV47\FW\draw_call.inc.h" />

View file

@ -1312,6 +1312,9 @@
<ClCompile Include="Emu\RSX\Host\RSXDMAWriter.cpp">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClCompile>
<ClCompile Include="Emu\RSX\Host\MM.cpp">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Crypto\aes.h">
@ -2644,6 +2647,9 @@
<ClInclude Include="Emu\RSX\Host\RSXDMAWriter.h">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClInclude>
<ClInclude Include="Emu\RSX\Host\MM.h">
<Filter>Emu\GPU\RSX\Host Mini-Driver</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">

View file

@ -103,6 +103,7 @@ enum class emu_settings_type
DisableMSLFastMath,
OutputScalingMode,
ForceHwMSAAResolve,
DisableAsyncHostMM,
// Performance Overlay
PerfOverlayEnabled,
@ -294,6 +295,7 @@ inline static const std::map<emu_settings_type, cfg_location> settings_location
{ emu_settings_type::DisableMSLFastMath, { "Video", "Disable MSL Fast Math"}},
{ emu_settings_type::OutputScalingMode, { "Video", "Output Scaling Mode"}},
{ emu_settings_type::ForceHwMSAAResolve, { "Video", "Force Hardware MSAA Resolve"}},
{ emu_settings_type::DisableAsyncHostMM, { "Video", "Disable Asynchronous Memory Manager"}},
// Vulkan
{ emu_settings_type::VulkanAsyncTextureUploads, { "Video", "Vulkan", "Asynchronous Texture Streaming 2"}},

View file

@ -79,10 +79,14 @@ flow_layout::~flow_layout()
void flow_layout::clear()
{
// We can't use a ranged loop here, since deleting the widget will call takeAt on the layout. So let's also use takeAt.
while (QLayoutItem* item = takeAt(0))
{
delete item->widget();
delete item;
if (item)
{
delete item->widget();
delete item;
}
}
m_item_list.clear();
m_positions.clear();
@ -185,8 +189,8 @@ int flow_layout::doLayout(const QRect& rect, bool testOnly) const
int x = effectiveRect.x();
int y = effectiveRect.y();
int lineHeight = 0;
int rows = 0;
int cols = 0;
int row_count = 0;
int col_count = 0;
if (m_dynamic_spacing)
{
@ -259,8 +263,8 @@ int flow_layout::doLayout(const QRect& rect, bool testOnly) const
pos.row = row;
pos.col = col++;
rows = std::max(rows, pos.row + 1);
cols = std::max(cols, pos.col + 1);
row_count = std::max(row_count, pos.row + 1);
col_count = std::max(col_count, pos.col + 1);
if (!testOnly)
item->setGeometry(QRect(QPoint(x, y), item->sizeHint()));
@ -269,8 +273,8 @@ int flow_layout::doLayout(const QRect& rect, bool testOnly) const
lineHeight = qMax(lineHeight, item->sizeHint().height());
}
m_rows = rows;
m_cols = cols;
m_rows = row_count;
m_cols = col_count;
return y + lineHeight - rect.y() + bottom;
}

View file

@ -11,16 +11,11 @@ struct gui_save
gui_save()
{
key = "";
name = "";
def = QVariant();
}
gui_save(const QString& k, const QString& n, const QVariant& d)
: key(k), name(n), def(d)
{
key = k;
name = n;
def = d;
}
bool operator==(const gui_save& rhs) const noexcept

View file

@ -244,6 +244,7 @@ private:
case localized_string_id::HOME_MENU_SETTINGS_INPUT_CAMERA_FLIP: return tr("Camera Flip", "Input");
case localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_MODE: return tr("Pad Handler Mode", "Input");
case localized_string_id::HOME_MENU_SETTINGS_INPUT_PAD_SLEEP: return tr("Pad Handler Sleep", "Input");
case localized_string_id::HOME_MENU_SETTINGS_INPUT_FAKE_MOVE_ROTATION_CONE: return tr("Fake PS Move Rotation Cone", "Input");
case localized_string_id::HOME_MENU_SETTINGS_ADVANCED: return tr("Advanced");
case localized_string_id::HOME_MENU_SETTINGS_ADVANCED_PREFERRED_SPU_THREADS: return tr("Preferred SPU Threads", "Advanced");
case localized_string_id::HOME_MENU_SETTINGS_ADVANCED_MAX_CPU_PREEMPTIONS: return tr("Max Power Saving CPU-Preemptions", "Advanced");

View file

@ -47,6 +47,7 @@ void qt_camera_handler::set_camera(const QCameraDevice& camera_info)
{
if (camera_info.isNull())
{
set_expected_state(camera_handler_state::closed);
reset();
return;
}
@ -57,9 +58,9 @@ void qt_camera_handler::set_camera(const QCameraDevice& camera_info)
camera_log.success("Using camera: id=\"%s\", description=\"%s\", front_facing=%d", camera_info.id().toStdString(), camera_info.description(), front_facing);
// Create camera and video surface
m_media_capture_session.reset(new QMediaCaptureSession(nullptr));
m_video_sink.reset(new qt_camera_video_sink(front_facing, nullptr));
m_camera.reset(new QCamera(camera_info));
m_media_capture_session = std::make_unique<QMediaCaptureSession>(nullptr);
m_video_sink = std::make_unique<qt_camera_video_sink>(front_facing, nullptr);
m_camera = std::make_unique<QCamera>(camera_info);
connect(m_camera.get(), &QCamera::activeChanged, this, &qt_camera_handler::handle_camera_active);
connect(m_camera.get(), &QCamera::errorOccurred, this, &qt_camera_handler::handle_camera_error);
@ -76,14 +77,37 @@ void qt_camera_handler::handle_camera_active(bool is_active)
{
camera_log.notice("Camera active status changed to %d", is_active);
if (is_active)
// Check if the camera does what it's supposed to do.
const camera_handler_state expected_state = get_expected_state();
switch (expected_state)
{
m_state = camera_handler_state::running;
}
else
case camera_handler_state::closed:
case camera_handler_state::open:
{
m_state = camera_handler_state::closed;
if (is_active)
{
// This is not supposed to happen and indicates an unexpected QCamera issue
camera_log.error("Camera started unexpectedly");
set_state(camera_handler_state::running);
return;
}
break;
}
case camera_handler_state::running:
{
if (!is_active)
{
// This is not supposed to happen and indicates an unexpected QCamera issue
camera_log.error("Camera stopped unexpectedly");
set_state(camera_handler_state::open);
return;
}
break;
}
}
set_state(expected_state);
}
void qt_camera_handler::handle_camera_error(QCamera::Error error, const QString& errorString)
@ -100,7 +124,11 @@ void qt_camera_handler::open_camera()
{
camera_log.notice("Switching camera from %s to %s", m_camera_id, camera_id);
camera_log.notice("Stopping old camera...");
if (m_camera) m_camera->stop();
if (m_camera)
{
set_expected_state(camera_handler_state::open);
m_camera->stop();
}
m_camera_id = camera_id;
}
@ -129,7 +157,7 @@ void qt_camera_handler::open_camera()
{
if (m_camera_id.empty()) camera_log.notice("Camera disabled");
else camera_log.error("No camera found");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return;
}
@ -148,7 +176,7 @@ void qt_camera_handler::open_camera()
// Update camera and view finder settings
update_camera_settings();
m_state = camera_handler_state::open;
set_state(camera_handler_state::open);
}
void qt_camera_handler::close_camera()
@ -159,11 +187,12 @@ void qt_camera_handler::close_camera()
{
if (m_camera_id.empty()) camera_log.notice("Camera disabled");
else camera_log.error("No camera found");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return;
}
// Unload/close camera
set_expected_state(camera_handler_state::closed);
m_camera->stop();
}
@ -175,7 +204,7 @@ void qt_camera_handler::start_camera()
{
if (m_camera_id.empty()) camera_log.notice("Camera disabled");
else camera_log.error("No camera found");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return;
}
@ -206,6 +235,7 @@ void qt_camera_handler::start_camera()
#endif
// Start camera. We will start receiving frames now.
set_expected_state(camera_handler_state::running);
m_camera->start();
}
@ -217,7 +247,7 @@ void qt_camera_handler::stop_camera()
{
if (m_camera_id.empty()) camera_log.notice("Camera disabled");
else camera_log.error("No camera found");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return;
}
@ -228,6 +258,7 @@ void qt_camera_handler::stop_camera()
}
// Stop camera. The camera will still be drawing power.
set_expected_state(camera_handler_state::open);
m_camera->stop();
}
@ -284,26 +315,26 @@ camera_handler_base::camera_handler_state qt_camera_handler::get_image(u8* buf,
m_camera_id != camera_id)
{
camera_log.notice("Switching cameras");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return camera_handler_state::closed;
}
if (m_camera_id.empty())
{
camera_log.notice("Camera disabled");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return camera_handler_state::closed;
}
if (!m_camera || !m_video_sink)
{
camera_log.fatal("Error: camera invalid");
m_state = camera_handler_state::closed;
set_state(camera_handler_state::closed);
return camera_handler_state::closed;
}
// Backup current state. State may change through events.
const camera_handler_state current_state = m_state;
const camera_handler_state current_state = get_state();
if (current_state == camera_handler_state::running)
{

View file

@ -17,8 +17,6 @@ public:
qt_camera_handler();
virtual ~qt_camera_handler();
void set_camera(const QCameraDevice& camera_info);
void open_camera() override;
void close_camera() override;
void start_camera() override;
@ -31,11 +29,12 @@ public:
camera_handler_state get_image(u8* buf, u64 size, u32& width, u32& height, u64& frame_number, u64& bytes_read) override;
private:
void set_camera(const QCameraDevice& camera_info);
void reset();
void update_camera_settings();
std::string m_camera_id;
std::shared_ptr<QCamera> m_camera;
std::unique_ptr<QCamera> m_camera;
std::unique_ptr<QMediaCaptureSession> m_media_capture_session;
std::unique_ptr<qt_camera_video_sink> m_video_sink;

View file

@ -94,7 +94,7 @@ void remove_item(QComboBox* box, int data_value, int def_value)
extern const std::map<std::string_view, int> g_prx_list;
settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, const int& tab_index, QWidget* parent, const GameInfo* game, bool create_cfg_from_global_cfg)
settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, int tab_index, QWidget* parent, const GameInfo* game, bool create_cfg_from_global_cfg)
: QDialog(parent)
, m_tab_index(tab_index)
, ui(new Ui::settings_dialog)
@ -1593,6 +1593,9 @@ settings_dialog::settings_dialog(std::shared_ptr<gui_settings> gui_settings, std
ui->disableMslFastMath->setVisible(false);
#endif
m_emu_settings->EnhanceCheckBox(ui->disableAsyncHostMM, emu_settings_type::DisableAsyncHostMM);
SubscribeTooltip(ui->disableAsyncHostMM, tooltips.settings.disable_async_host_mm);
// Comboboxes
m_emu_settings->EnhanceComboBox(ui->maxSPURSThreads, emu_settings_type::MaxSPURSThreads, true);
@ -2604,14 +2607,11 @@ void settings_dialog::ApplyStylesheet(bool reset)
}
}
int settings_dialog::exec()
void settings_dialog::open()
{
// singleShot Hack to fix following bug:
// If we use setCurrentIndex now we will miraculously see a resize of the dialog as soon as we
// switch to the cpu tab after conjuring the settings_dialog with another tab opened first.
// Weirdly enough this won't happen if we change the tab order so that anything else is at index 0.
ui->tab_widget_settings->setCurrentIndex(0);
QTimer::singleShot(0, [this]{ ui->tab_widget_settings->setCurrentIndex(m_tab_index); });
QDialog::open();
ui->tab_widget_settings->setCurrentIndex(m_tab_index);
// Open a dialog if your config file contained invalid entries
QTimer::singleShot(10, [this]
@ -2637,8 +2637,6 @@ int settings_dialog::exec()
}
}
});
return QDialog::exec();
}
void settings_dialog::SubscribeDescription(QLabel* description)

View file

@ -21,9 +21,9 @@ class settings_dialog : public QDialog
Q_OBJECT
public:
explicit settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, const int& tab_index = 0, QWidget* parent = nullptr, const GameInfo* game = nullptr, bool create_cfg_from_global_cfg = true);
explicit settings_dialog(std::shared_ptr<gui_settings> gui_settings, std::shared_ptr<emu_settings> emu_settings, int tab_index = 0, QWidget* parent = nullptr, const GameInfo* game = nullptr, bool create_cfg_from_global_cfg = true);
~settings_dialog();
int exec() override;
void open() override;
Q_SIGNALS:
void GuiStylesheetRequest();
void GuiRepaintRequest();

View file

@ -2695,6 +2695,13 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="disableAsyncHostMM">
<property name="text">
<string>Disable Asynchronous Memory Manager</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>

View file

@ -40,6 +40,7 @@ public:
const QString allow_host_labels = tr("Allows the host GPU to synchronize with CELL directly. This incurs a performance penalty, but exposes the true state of GPU objects to the guest CPU. Can help eliminate visual noise and glitching at the cost of performance. Use with caution.");
const QString force_hw_MSAA = tr("Forces MSAA to use the host GPU's resolve capabilities for all sampling operations.\nThis option incurs a performance penalty as well as the risk of visual artifacts but can yield crisper visuals when MSAA is enabled.");
const QString disable_vertex_cache = tr("Disables the vertex cache.\nMight resolve missing or flickering graphics output.\nMay degrade performance.");
const QString disable_async_host_mm = tr("Force host memory management calls to be inlined instead of handled asynchronously.\nThis can cause severe performance degradation and stuttering in some games.\nThis option is only needed by developers to debug problems with texture cache memory protection.");
const QString zcull_operation_mode = tr("Changes ZCULL report synchronization behaviour. Experiment to find the best option for your game. Approximate mode is recommended for most games.\n· Precise is the most accurate to PS3 behaviour. Required for accurate visuals in some titles such as Demon's Souls and The Darkness.\n· Approximate is a much faster way to generate occlusion data which may not always match what the PS3 would generate. Works well with most PS3 games.\n· Relaxed changes the synchronization method completely and can greatly improve performance in some games or completely break others.");
const QString max_spurs_threads = tr("Limits the maximum number of SPURS threads in each thread group.\nMay improve performance in some cases, especially on systems with limited number of hardware threads.\nLimiting the number of threads is likely to cause crashes; it's recommended to keep this at the default value.");
const QString sleep_timers_accuracy = tr("Changes the sleep period accuracy.\n'As Host' uses default accuracy of the underlying operating system, while 'All Timers' attempts to improve it.\n'Usleep Only' limits the adjustments to usleep syscall only.\nCan affect performance in unexpected ways.");

View file

@ -14,9 +14,9 @@ class user_account
public:
explicit user_account(const std::string& user_id = "00000001");
std::string GetUserId() const { return m_user_id; }
std::string GetUserDir() const { return m_user_dir; }
std::string GetUsername() const { return m_username; }
const std::string& GetUserId() const { return m_user_id; }
const std::string& GetUserDir() const { return m_user_dir; }
const std::string& GetUsername() const { return m_username; }
static std::map<u32, user_account> GetUserAccounts(const std::string& base_dir);

View file

@ -21,6 +21,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <cmath>
#include <math.h>
#include <cfenv>
@ -1967,6 +1968,15 @@ inline v128 gv_mulfs(const v128& a, const v128& b)
#endif
}
inline v128 gv_mulfs(const v128& a, f32 b)
{
#if defined(ARCH_X64)
return _mm_mul_ps(a, _mm_set_ps1(b));
#elif defined(ARCH_ARM64)
return vmulq_n_f32(a, b);
#endif
}
inline v128 gv_hadds8x2(const v128& a)
{
#if defined(__SSSE3__)
@ -2979,6 +2989,23 @@ inline v128 gv_rol16(const v128& a, const v128& b)
#endif
}
// For each 16-bit element, r = rotate a by count
template <u8 Count>
inline v128 gv_rol16(const v128& a)
{
constexpr u8 count = Count & 0xf;
#if defined(ARCH_X64)
return _mm_or_si128(_mm_srli_epi16(a, 16 - count), _mm_slli_epi16(a, count));
#elif defined(ARCH_ARM64)
return vorrq_u16(vshrq_n_u16(a, 16 - count), vshlq_n_u16(a, count));
#else
v128 r;
for (u32 i = 0; i < 8; i++)
r._u16[i] = std::rotl(a._u16[i], count);
return r;
#endif
}
// For each 32-bit element, r = rotate a by b
inline v128 gv_rol32(const v128& a, const v128& b)
{
@ -2997,15 +3024,16 @@ inline v128 gv_rol32(const v128& a, const v128& b)
}
// For each 32-bit element, r = rotate a by count
inline v128 gv_rol32(const v128& a, u32 count)
template <u8 Count>
inline v128 gv_rol32(const v128& a)
{
count %= 32;
#if defined(ARCH_X64)
return _mm_or_epi32(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
constexpr u8 count = Count & 0x1f;
#if defined(__AVX512VL__)
return _mm_rol_epi32(a, count);
#elif defined(ARCH_X64)
return _mm_or_si128(_mm_srli_epi32(a, 32 - count), _mm_slli_epi32(a, count));
#elif defined(ARCH_ARM64)
const auto amt1 = vdupq_n_s32(count);
const auto amt2 = vdupq_n_s32(count - 32);
return vorrq_u32(vshlq_u32(a, amt1), vshlq_u32(a, amt2));
return vorrq_u32(vshrq_n_u32(a, 32 - count), vshlq_n_u32(a, count));
#else
v128 r;
for (u32 i = 0; i < 4; i++)
@ -3107,6 +3135,139 @@ inline auto gv_shuffle_right(A&& a)
FOR_X64(unary_op, kIdPsrldq, kIdVpsrldq, std::forward<A>(a), Count);
}
// Load 32-bit integer into the first element of a new vector, set other elements to zero
inline v128 gv_loadu32(const void* ptr)
{
#if defined(ARCH_X64)
return _mm_loadu_si32(ptr);
#elif defined(ARCH_ARM64)
return vld1q_lane_u32(static_cast<const u32*>(ptr), vdupq_n_u32(0), 0);
#endif
}
// Load 16-bit integer into an existing vector at the position specified by Index
template <u8 Index>
inline v128 gv_insert16(const v128& vec, u16 value)
{
#if defined(ARCH_X64)
return _mm_insert_epi16(vec, value, Index);
#elif defined(ARCH_ARM64)
return vsetq_lane_u16(value, vec, Index & 0x7);
#endif
}
// For each 8-bit element,
// if ctrl >= 0 && ctrl < 16 then r = vec[ctrl],
// else if ctrl < 0 then r = 0
inline v128 gv_shuffle8(const v128& vec, const v128& ctrl)
{
AUDIT(std::ranges::none_of(ctrl._chars, [](s8 i){ return i >= static_cast<s8>(sizeof(v128)); }), "All indices must be in the range [0, 15] or negative, since PSHUFB and TBL behave differently otherwise");
#if defined(__SSSE3__)
return _mm_shuffle_epi8(vec, ctrl);
#elif defined(ARCH_ARM64)
return vqtbl1q_s8(vec, ctrl);
#else
v128 r;
for (s32 i = 0; i < 16; i++)
r._s8[i] = ctrl._s8[i] < 0 ? 0 : vec._s8[ctrl._s8[i] & 0xf];
return r;
#endif
}
// For each 2-bit index in Control, r = vec[index]
template <u8 Control>
inline v128 gv_shuffle32(const v128& vec)
{
#if defined(ARCH_X64)
return _mm_shuffle_epi32(vec, Control);
#elif defined(ARCH_ARM64)
constexpr u8 idx0 = (Control & 3) * sizeof(s32);
constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32);
constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32);
constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32);
constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
return vqtbl1q_s8(vec, idx_vec);
#endif
}
// For each index, r = vec[index & 3]
template <u8 Index0, u8 Index1, u8 Index2, u8 Index3>
inline v128 gv_shuffle32(const v128& vec)
{
#if defined(ARCH_X64)
return _mm_shuffle_epi32(vec, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6);
#elif defined(ARCH_ARM64)
constexpr u8 idx0 = (Index0 & 3) * sizeof(s32);
constexpr u8 idx1 = (Index1 & 3) * sizeof(s32);
constexpr u8 idx2 = (Index2 & 3) * sizeof(s32);
constexpr u8 idx3 = (Index3 & 3) * sizeof(s32);
constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
return vqtbl1q_s8(vec, idx_vec);
#endif
}
// For the first two 2-bit indices in Control, r = a[index],
// for the last two indices, r = b[index]
template <u8 Control>
inline v128 gv_shufflefs(const v128& a, const v128& b)
{
#if defined(ARCH_X64)
return _mm_shuffle_ps(a, b, Control);
#elif defined(ARCH_ARM64)
constexpr u8 idx0 = (Control & 3) * sizeof(s32);
constexpr u8 idx1 = (Control >> 2 & 3) * sizeof(s32);
constexpr u8 idx2 = (Control >> 4 & 3) * sizeof(s32) + sizeof(v128);
constexpr u8 idx3 = (Control >> 6 & 3) * sizeof(s32) + sizeof(v128);
constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
return vqtbl2q_s8({ a, b }, idx_vec);
#endif
}
// For the first two indices, r = a[index & 3],
// for the last two indices, r = b[index & 3]
template <u8 Index0, u8 Index1, u8 Index2, u8 Index3>
inline v128 gv_shufflefs(const v128& a, const v128& b)
{
#if defined(ARCH_X64)
return _mm_shuffle_ps(a, b, (Index0 & 3) | (Index1 & 3) << 2 | (Index2 & 3) << 4 | (Index3 & 3) << 6);
#elif defined(ARCH_ARM64)
constexpr u8 idx0 = (Index0 & 3) * sizeof(s32);
constexpr u8 idx1 = (Index1 & 3) * sizeof(s32);
constexpr u8 idx2 = (Index2 & 3) * sizeof(s32) + sizeof(v128);
constexpr u8 idx3 = (Index3 & 3) * sizeof(s32) + sizeof(v128);
constexpr uint8x16_t idx_vec = { idx0, idx0 + 1, idx0 + 2, idx0 + 3, idx1, idx1 + 1, idx1 + 2, idx1 + 3, idx2, idx2 + 1, idx2 + 2, idx2 + 3, idx3, idx3 + 1, idx3 + 2, idx3 + 3 };
return vqtbl2q_s8({ a, b }, idx_vec);
#endif
}
// For each 32-bit element, reverse byte order
inline v128 gv_rev32(const v128& vec)
{
#if defined(__SSSE3__)
return _mm_shuffle_epi8(vec, _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12));
#elif defined(ARCH_ARM64)
return vrev32q_u8(vec);
#else
return gv_rol32<16>(gv_rol16<8>(vec));
#endif
}
// For each 32-bit element, convert between big-endian and native-endian
inline v128 gv_to_be32(const v128& vec)
{
if constexpr (std::endian::native == std::endian::little)
return gv_rev32(vec);
return vec;
}
#if defined(__clang__)
#pragma clang diagnostic pop
#elif defined(__GNUC__)

View file

@ -989,14 +989,14 @@ template <typename To, typename From> requires (std::is_integral_v<decltype(std:
constexpr bool is_from_signed = std::is_signed_v<CommonFrom>;
constexpr bool is_to_signed = std::is_signed_v<CommonTo>;
constexpr auto from_mask = is_from_signed > is_to_signed ? UnFrom{umax} >> 1 : UnFrom{umax};
constexpr auto to_mask = is_to_signed > is_from_signed ? UnTo{umax} >> 1 : UnTo{umax};
constexpr auto from_mask = (is_from_signed && !is_to_signed) ? UnFrom{umax} >> 1 : UnFrom{umax};
constexpr auto to_mask = (is_to_signed && !is_from_signed) ? UnTo{umax} >> 1 : UnTo{umax};
constexpr auto mask = ~(from_mask & to_mask);
// Signed to unsigned always require test
// Otherwise, this is bit-wise narrowing or conversion between types of different signedness of the same size
if constexpr (is_from_signed > is_to_signed || to_mask < from_mask)
if constexpr ((is_from_signed && !is_to_signed) || to_mask < from_mask)
{
// Try to optimize test if both are of the same signedness
if (is_from_signed != is_to_signed ? !!(value & mask) : static_cast<CommonTo>(value) != value) [[unlikely]]