diff --git a/rpcs3/Emu/Audio/audio_resampler.cpp b/rpcs3/Emu/Audio/audio_resampler.cpp
index c2d318db40..b029172dfa 100644
--- a/rpcs3/Emu/Audio/audio_resampler.cpp
+++ b/rpcs3/Emu/Audio/audio_resampler.cpp
@@ -33,8 +33,7 @@ void audio_resampler::put_samples(const f32* buf, u32 sample_cnt)
std::pair audio_resampler::get_samples(u32 sample_cnt)
{
- f32 *const buf = resampler.bufBegin();
- return std::make_pair(buf, resampler.receiveSamples(sample_cnt));
+ return std::make_pair(resampler.bufBegin(), resampler.receiveSamples(sample_cnt));
}
u32 audio_resampler::samples_available() const
diff --git a/rpcs3/Emu/Cell/Modules/cellAudio.cpp b/rpcs3/Emu/Cell/Modules/cellAudio.cpp
index c46b09b8a5..f9083c8fc4 100644
--- a/rpcs3/Emu/Cell/Modules/cellAudio.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellAudio.cpp
@@ -5,6 +5,7 @@
#include "Emu/Cell/lv2/sys_process.h"
#include "Emu/Cell/lv2/sys_event.h"
#include "cellAudio.h"
+#include "util/video_provider.h"
#include
@@ -69,7 +70,7 @@ void cell_audio_config::reset(bool backend_changed)
const AudioFreq freq = AudioFreq::FREQ_48K;
const AudioSampleSize sample_size = raw.convert_to_s16 ? AudioSampleSize::S16 : AudioSampleSize::FLOAT;
- const auto [req_ch_cnt, downmix] = AudioBackend::get_channel_count_and_downmixer(0); // CELL_AUDIO_OUT_PRIMARY
+ const auto& [req_ch_cnt, downmix] = AudioBackend::get_channel_count_and_downmixer(0); // CELL_AUDIO_OUT_PRIMARY
f64 cb_frame_len = 0.0;
u32 ch_cnt = 2;
@@ -276,16 +277,23 @@ void audio_ringbuffer::process_resampled_data()
{
if (!cfg.time_stretching_enabled) return;
- const auto [buffer, samples] = resampler.get_samples(static_cast(cb_ringbuf.get_free_size() / (cfg.audio_sample_size * static_cast(cfg.backend_ch_cnt))));
+ const auto& [buffer, samples] = resampler.get_samples(static_cast(cb_ringbuf.get_free_size() / (cfg.audio_sample_size * static_cast(cfg.backend_ch_cnt))));
commit_data(buffer, samples);
}
void audio_ringbuffer::commit_data(f32* buf, u32 sample_cnt)
{
- sample_cnt *= cfg.audio_channels;
+ const u32 sample_cnt_in = sample_cnt * cfg.audio_channels;
+ const u32 sample_cnt_out = sample_cnt * static_cast(cfg.backend_ch_cnt);
// Dump audio if enabled
- m_dump.WriteData(buf, sample_cnt * static_cast(AudioSampleSize::FLOAT));
+ m_dump.WriteData(buf, sample_cnt_in * static_cast(AudioSampleSize::FLOAT));
+
+ // Record audio if enabled
+ if (utils::video_provider& provider = g_fxo->get(); provider.can_consume_sample())
+ {
+ provider.present_samples(reinterpret_cast(buf), sample_cnt, static_cast(cfg.audio_channels));
+ }
if (cfg.backend_ch_cnt < AudioChannelCnt{cfg.audio_channels})
{
@@ -293,11 +301,11 @@ void audio_ringbuffer::commit_data(f32* buf, u32 sample_cnt)
{
if (cfg.backend_ch_cnt == AudioChannelCnt::SURROUND_5_1)
{
- AudioBackend::downmix(sample_cnt, buf, buf);
+ AudioBackend::downmix(sample_cnt_in, buf, buf);
}
else if (cfg.backend_ch_cnt == AudioChannelCnt::STEREO)
{
- AudioBackend::downmix(sample_cnt, buf, buf);
+ AudioBackend::downmix(sample_cnt_in, buf, buf);
}
else
{
@@ -308,7 +316,7 @@ void audio_ringbuffer::commit_data(f32* buf, u32 sample_cnt)
{
if (cfg.backend_ch_cnt == AudioChannelCnt::STEREO)
{
- AudioBackend::downmix(sample_cnt, buf, buf);
+ AudioBackend::downmix(sample_cnt_in, buf, buf);
}
else
{
@@ -321,8 +329,6 @@ void audio_ringbuffer::commit_data(f32* buf, u32 sample_cnt)
}
}
- const u32 sample_cnt_out = sample_cnt / cfg.audio_channels * static_cast(cfg.backend_ch_cnt);
-
if (cfg.backend->get_convert_to_s16())
{
AudioBackend::convert_to_s16(sample_cnt_out, buf, buf);
diff --git a/rpcs3/Emu/Cell/Modules/cellRec.cpp b/rpcs3/Emu/Cell/Modules/cellRec.cpp
index 53165ef6a5..ebdeffb49f 100644
--- a/rpcs3/Emu/Cell/Modules/cellRec.cpp
+++ b/rpcs3/Emu/Cell/Modules/cellRec.cpp
@@ -140,34 +140,26 @@ struct rec_param
constexpr u32 rec_framerate = 30; // Always 30 fps
-class rec_image_sink : public utils::image_sink
+class rec_video_sink : public utils::video_sink
{
public:
- rec_image_sink() : utils::image_sink()
+ rec_video_sink() : utils::video_sink()
{
m_framerate = rec_framerate;
+ m_sample_rate = 44100; // TODO
}
void stop(bool flush = true) override
{
- cellRec.notice("Stopping image sink. flush=%d", flush);
+ cellRec.notice("Stopping video sink. flush=%d", flush);
std::lock_guard lock(m_mtx);
m_flush = flush;
m_frames_to_encode.clear();
+ m_samples_to_encode.clear();
has_error = false;
}
- void add_frame(std::vector& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms) override
- {
- std::lock_guard lock(m_mtx);
-
- if (m_flush)
- return;
-
- m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
- }
-
encoder_frame get_frame()
{
std::lock_guard lock(m_mtx);
@@ -196,7 +188,7 @@ struct rec_info
vm::bptr video_input_buffer{}; // Used by the game to inject a frame right before it would render a frame to the screen.
vm::bptr audio_input_buffer{}; // Used by the game to inject audio: 2-channel interleaved (left-right) * 256 samples * sizeof(f32) at 48000 kHz
- std::vector video_ringbuffer;
+ std::vector video_ringbuffer;
std::vector audio_ringbuffer;
usz video_ring_pos = 0;
usz video_ring_frame_count = 0;
@@ -209,9 +201,9 @@ struct rec_info
return pos;
}
- std::shared_ptr image_sink;
+ std::shared_ptr video_sink;
std::shared_ptr encoder;
- std::unique_ptr>> image_provider_thread;
+ std::unique_ptr>> video_provider_thread;
atomic_t paused = false;
s64 last_pts = -1;
@@ -240,9 +232,9 @@ struct rec_info
void set_video_params(s32 video_format);
void set_audio_params(s32 audio_format);
- void start_image_provider();
- void pause_image_provider();
- void stop_image_provider(bool flush);
+ void start_video_provider();
+ void pause_video_provider();
+ void stop_video_provider(bool flush);
};
void rec_info::set_video_params(s32 video_format)
@@ -507,29 +499,29 @@ void rec_info::set_audio_params(s32 audio_format)
cellRec.notice("set_audio_params: audio_format=0x%x, audio_codec_id=%d, sample_rate=%d, audio_bps=%d", audio_format, audio_codec_id, sample_rate, audio_bps);
}
-void rec_info::start_image_provider()
+void rec_info::start_video_provider()
{
const bool was_paused = paused.exchange(false);
utils::video_provider& video_provider = g_fxo->get();
- if (image_provider_thread && was_paused)
+ if (video_provider_thread && was_paused)
{
// Resume
const u64 pause_time_end = get_system_time();
ensure(pause_time_end > pause_time_start);
pause_time_total += (pause_time_end - pause_time_start);
video_provider.set_pause_time(pause_time_total / 1000);
- cellRec.notice("Resuming image provider.");
+ cellRec.notice("Resuming video provider.");
return;
}
- cellRec.notice("Starting image provider.");
+ cellRec.notice("Starting video provider.");
recording_time_start = get_system_time();
pause_time_total = 0;
video_provider.set_pause_time(0);
- image_provider_thread = std::make_unique>>("cellRec Image Provider", [this]()
+ video_provider_thread = std::make_unique>>("cellRec video provider", [this]()
{
const bool use_internal_audio = param.audio_input == CELL_REC_PARAM_AUDIO_INPUT_DISABLE || param.audio_input_mix_vol < 100;
const bool use_external_audio = param.audio_input != CELL_REC_PARAM_AUDIO_INPUT_DISABLE && param.audio_input_mix_vol > 0;
@@ -537,7 +529,7 @@ void rec_info::start_image_provider()
const bool use_ring_buffer = param.ring_sec > 0;
const usz frame_size = input_format.pitch * input_format.height;
- cellRec.notice("image_provider_thread: use_ring_buffer=%d, video_ringbuffer_size=%d, audio_ringbuffer_size=%d, ring_sec=%d, frame_size=%d, use_external_video=%d, use_external_audio=%d, use_internal_audio=%d", use_ring_buffer, video_ringbuffer.size(), audio_ringbuffer.size(), param.ring_sec, frame_size, use_external_video, use_external_audio, use_internal_audio);
+ cellRec.notice("video_provider_thread: use_ring_buffer=%d, video_ringbuffer_size=%d, audio_ringbuffer_size=%d, ring_sec=%d, frame_size=%d, use_external_video=%d, use_external_audio=%d, use_internal_audio=%d", use_ring_buffer, video_ringbuffer.size(), audio_ringbuffer.size(), param.ring_sec, frame_size, use_external_video, use_external_audio, use_internal_audio);
while (thread_ctrl::state() != thread_state::aborting && encoder)
{
@@ -575,7 +567,7 @@ void rec_info::start_image_provider()
{
if (use_ring_buffer)
{
- utils::image_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
+ utils::video_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
frame_data.pts = pts;
frame_data.width = input_format.width;
frame_data.height = input_format.height;
@@ -595,14 +587,14 @@ void rec_info::start_image_provider()
last_pts = pts;
}
}
- else if (use_ring_buffer && image_sink)
+ else if (use_ring_buffer && video_sink)
{
- utils::image_sink::encoder_frame frame = image_sink->get_frame();
+ utils::video_sink::encoder_frame frame = video_sink->get_frame();
if (const s64 pts = encoder->get_pts(frame.timestamp_ms); pts > last_pts && frame.data.size() > 0)
{
ensure(frame.data.size() == frame_size);
- utils::image_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
+ utils::video_sink::encoder_frame& frame_data = video_ringbuffer[next_video_ring_pos()];
frame_data = std::move(frame);
frame_data.pts = pts;
last_pts = pts;
@@ -635,34 +627,34 @@ void rec_info::start_image_provider()
}
// Update recording time
- recording_time_total = encoder->get_timestamp_ms(encoder->last_pts());
+ recording_time_total = encoder->get_timestamp_ms(encoder->last_video_pts());
thread_ctrl::wait_for(100);
}
});
}
-void rec_info::pause_image_provider()
+void rec_info::pause_video_provider()
{
-	cellRec.notice("Pausing image provider.");
+	cellRec.notice("Pausing video provider.");
- if (image_provider_thread)
+ if (video_provider_thread)
{
paused = true;
pause_time_start = get_system_time();
}
}
-void rec_info::stop_image_provider(bool flush)
+void rec_info::stop_video_provider(bool flush)
{
- cellRec.notice("Stopping image provider.");
+ cellRec.notice("Stopping video provider.");
- if (image_provider_thread)
+ if (video_provider_thread)
{
- auto& thread = *image_provider_thread;
+ auto& thread = *video_provider_thread;
thread = thread_state::aborting;
thread();
- image_provider_thread.reset();
+ video_provider_thread.reset();
}
if (flush && param.ring_sec > 0 && !video_ringbuffer.empty())
@@ -680,7 +672,7 @@ void rec_info::stop_image_provider(bool flush)
for (usz i = 0; i < frame_count; i++)
{
const usz pos = (start_offset + i) % video_ringbuffer.size();
- utils::image_sink::encoder_frame& frame_data = video_ringbuffer[pos];
+ utils::video_sink::encoder_frame& frame_data = video_ringbuffer[pos];
encoder->add_frame(frame_data.data, frame_data.pitch, frame_data.width, frame_data.height, frame_data.av_pixel_format, encoder->get_timestamp_ms(frame_data.pts - start_pts));
// TODO: add audio data to encoder
@@ -1073,7 +1065,7 @@ error_code cellRecOpen(vm::cptr pDirName, vm::cptr pFileName, vm::cp
rec.audio_ringbuffer.resize(audio_ring_buffer_size);
rec.audio_ring_step = audio_size_per_sample;
rec.video_ringbuffer.resize(video_ring_buffer_size, {});
- rec.image_sink = std::make_shared();
+ rec.video_sink = std::make_shared();
}
rec.encoder = std::make_shared();
@@ -1082,6 +1074,7 @@ error_code cellRecOpen(vm::cptr pDirName, vm::cptr pFileName, vm::cp
rec.encoder->set_video_bitrate(rec.video_bps);
rec.encoder->set_video_codec(rec.video_codec_id);
rec.encoder->set_sample_rate(rec.sample_rate);
+ rec.encoder->set_audio_channels(rec.channels);
rec.encoder->set_audio_bitrate(rec.audio_bps);
rec.encoder->set_audio_codec(rec.audio_codec_id);
rec.encoder->set_output_format(rec.output_format);
@@ -1114,12 +1107,12 @@ error_code cellRecClose(s32 isDiscard)
if (isDiscard)
{
// No need to flush
- rec.stop_image_provider(false);
+ rec.stop_video_provider(false);
rec.encoder->stop(false);
- if (rec.image_sink)
+ if (rec.video_sink)
{
- rec.image_sink->stop(false);
+ rec.video_sink->stop(false);
}
if (fs::is_file(rec.param.filename))
@@ -1135,18 +1128,18 @@ error_code cellRecClose(s32 isDiscard)
else
{
// Flush to make sure we encode all remaining frames
- rec.stop_image_provider(true);
+ rec.stop_video_provider(true);
rec.encoder->stop(true);
- rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_pts());
+ rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_video_pts());
- if (rec.image_sink)
+ if (rec.video_sink)
{
- rec.image_sink->stop(true);
+ rec.video_sink->stop(true);
}
const s64 start_pts = rec.encoder->get_pts(rec.param.scene_metadata.start_time);
const s64 end_pts = rec.encoder->get_pts(rec.param.scene_metadata.end_time);
- const s64 last_pts = rec.encoder->last_pts();
+ const s64 last_pts = rec.encoder->last_video_pts();
is_valid_range = start_pts >= 0 && end_pts <= last_pts;
}
@@ -1157,7 +1150,7 @@ error_code cellRecClose(s32 isDiscard)
g_fxo->need();
utils::video_provider& video_provider = g_fxo->get();
- // Release the image sink if it was used
+ // Release the video sink if it was used
if (rec.param.video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE)
{
const recording_mode old_mode = g_recording_mode.exchange(recording_mode::stopped);
@@ -1167,15 +1160,15 @@ error_code cellRecClose(s32 isDiscard)
cellRec.error("cellRecClose: Unexpected recording mode %s found while stopping video capture.", old_mode);
}
- if (!video_provider.set_image_sink(nullptr, recording_mode::cell))
+ if (!video_provider.set_video_sink(nullptr, recording_mode::cell))
{
- cellRec.error("cellRecClose failed to release image sink");
+ cellRec.error("cellRecClose failed to release video sink");
}
}
rec.param = {};
rec.encoder.reset();
- rec.image_sink.reset();
+ rec.video_sink.reset();
rec.audio_ringbuffer.clear();
rec.video_ringbuffer.clear();
rec.state = rec_state::closed;
@@ -1207,7 +1200,7 @@ error_code cellRecStop()
sysutil_register_cb([&rec](ppu_thread& ppu) -> s32
{
- // Disable image sink if it was used
+ // Disable video sink if it was used
if (rec.param.video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE)
{
const recording_mode old_mode = g_recording_mode.exchange(recording_mode::stopped);
@@ -1219,12 +1212,12 @@ error_code cellRecStop()
}
// cellRecStop actually just pauses the recording
- rec.pause_image_provider();
+ rec.pause_video_provider();
ensure(!!rec.encoder);
rec.encoder->pause(true);
- rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_pts());
+ rec.recording_time_total = rec.encoder->get_timestamp_ms(rec.encoder->last_video_pts());
rec.state = rec_state::stopped;
rec.cb(ppu, CELL_REC_STATUS_STOP, CELL_OK, rec.cbUserData);
@@ -1254,15 +1247,15 @@ error_code cellRecStart()
g_fxo->need();
utils::video_provider& video_provider = g_fxo->get();
- // Setup an image sink if it is needed
+ // Setup a video sink if it is needed
if (rec.param.video_input == CELL_REC_PARAM_VIDEO_INPUT_DISABLE)
{
if (rec.param.ring_sec <= 0)
{
// Regular recording
- if (!video_provider.set_image_sink(rec.encoder, recording_mode::cell))
+ if (!video_provider.set_video_sink(rec.encoder, recording_mode::cell))
{
- cellRec.error("Failed to set image sink");
+ cellRec.error("Failed to set video sink");
rec.cb(ppu, CELL_REC_STATUS_ERR, CELL_REC_ERROR_FATAL, rec.cbUserData);
return CELL_OK;
}
@@ -1270,9 +1263,9 @@ error_code cellRecStart()
else
{
// Ringbuffer recording
- if (!video_provider.set_image_sink(rec.image_sink, recording_mode::cell))
+ if (!video_provider.set_video_sink(rec.video_sink, recording_mode::cell))
{
- cellRec.error("Failed to set image sink");
+ cellRec.error("Failed to set video sink");
rec.cb(ppu, CELL_REC_STATUS_ERR, CELL_REC_ERROR_FATAL, rec.cbUserData);
return CELL_OK;
}
@@ -1287,7 +1280,7 @@ error_code cellRecStart()
g_recording_mode = recording_mode::stopped;
}
- rec.start_image_provider();
+ rec.start_video_provider();
if (rec.encoder->has_error)
{
diff --git a/rpcs3/Emu/Io/recording_config.h b/rpcs3/Emu/Io/recording_config.h
index f1e2e58242..e08e73acfe 100644
--- a/rpcs3/Emu/Io/recording_config.h
+++ b/rpcs3/Emu/Io/recording_config.h
@@ -8,14 +8,31 @@ struct cfg_recording final : cfg::node
bool load();
void save() const;
- cfg::uint<0, 60> framerate{this, "Framerate", 30};
- cfg::uint<0, 7680> width{this, "Width", 1280};
- cfg::uint<0, 4320> height{this, "Height", 720};
- cfg::uint<0, 192> pixel_format{this, "AVPixelFormat", 0}; // AVPixelFormat::AV_PIX_FMT_YUV420P
- cfg::uint<0, 32813> video_codec{this, "AVCodecID", 12}; // AVCodecID::AV_CODEC_ID_MPEG4
- cfg::uint<0, 25000000> video_bps{this, "Video Bitrate", 4000000};
- cfg::uint<0, 5> max_b_frames{this, "Max B-Frames", 2};
- cfg::uint<0, 20> gop_size{this, "Group of Pictures Size", 12};
+ struct node_video : cfg::node
+ {
+ node_video(cfg::node* _this) : cfg::node(_this, "Video") {}
+
+ cfg::uint<0, 60> framerate{this, "Framerate", 30};
+ cfg::uint<0, 7680> width{this, "Width", 1280};
+ cfg::uint<0, 4320> height{this, "Height", 720};
+ cfg::uint<0, 192> pixel_format{this, "AVPixelFormat", 0}; // AVPixelFormat::AV_PIX_FMT_YUV420P
+ cfg::uint<0, 0xFFFF> video_codec{this, "AVCodecID", 12}; // AVCodecID::AV_CODEC_ID_MPEG4
+ cfg::uint<0, 25000000> video_bps{this, "Video Bitrate", 4000000};
+ cfg::uint<0, 5> max_b_frames{this, "Max B-Frames", 2};
+ cfg::uint<0, 20> gop_size{this, "Group of Pictures Size", 12};
+
+ } video{ this };
+
+ struct node_audio : cfg::node
+ {
+ node_audio(cfg::node* _this) : cfg::node(_this, "Audio") {}
+
+ cfg::uint<0x10000, 0x17000> audio_codec{this, "AVCodecID", 86019}; // AVCodecID::AV_CODEC_ID_AC3
+ cfg::uint<0, 8> channels{this, "Channels", 2};
+ cfg::uint<0, 25000000> audio_bps{this, "Audio Bitrate", 320000};
+ cfg::uint<0, 25000000> sample_rate{this, "Sample Rate", 48000};
+
+ } audio{ this };
const std::string path;
};
diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index cc7c60f51d..0d7a9a74ea 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -618,7 +618,7 @@
-
+
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index 5a11ef535f..447430681e 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -2275,7 +2275,7 @@
Utilities
-
+
Utilities
diff --git a/rpcs3/rpcs3qt/gs_frame.cpp b/rpcs3/rpcs3qt/gs_frame.cpp
index f14213cb52..31b81518e5 100644
--- a/rpcs3/rpcs3qt/gs_frame.cpp
+++ b/rpcs3/rpcs3qt/gs_frame.cpp
@@ -12,6 +12,7 @@
#include "Emu/IdManager.h"
#include "Emu/Cell/Modules/cellScreenshot.h"
#include "Emu/Cell/Modules/cellVideoOut.h"
+#include "Emu/Cell/Modules/cellAudio.h"
#include "Emu/RSX/rsx_utils.h"
#include "Emu/RSX/Overlays/overlay_message.h"
#include "Emu/Io/recording_config.h"
@@ -445,9 +446,9 @@ void gs_frame::toggle_recording()
{
m_video_encoder->stop();
- if (!video_provider.set_image_sink(nullptr, recording_mode::rpcs3))
+ if (!video_provider.set_video_sink(nullptr, recording_mode::rpcs3))
{
- gui_log.warning("The video provider could not release the image sink. A sink with higher priority must have been set.");
+ gui_log.warning("The video provider could not release the video sink. A sink with higher priority must have been set.");
}
// Play a sound
@@ -489,21 +490,23 @@ void gs_frame::toggle_recording()
video_path += "recording_" + date_time::current_time_narrow<'_'>() + ".mp4";
utils::video_encoder::frame_format output_format{};
- output_format.av_pixel_format = static_cast(g_cfg_recording.pixel_format.get());
- output_format.width = g_cfg_recording.width;
- output_format.height = g_cfg_recording.height;
- output_format.pitch = g_cfg_recording.width * 4;
+ output_format.av_pixel_format = static_cast(g_cfg_recording.video.pixel_format.get());
+ output_format.width = g_cfg_recording.video.width;
+ output_format.height = g_cfg_recording.video.height;
+ output_format.pitch = g_cfg_recording.video.width * 4;
m_video_encoder->set_path(video_path);
- m_video_encoder->set_framerate(g_cfg_recording.framerate);
- m_video_encoder->set_video_bitrate(g_cfg_recording.video_bps);
- m_video_encoder->set_video_codec(g_cfg_recording.video_codec);
- m_video_encoder->set_max_b_frames(g_cfg_recording.max_b_frames);
- m_video_encoder->set_gop_size(g_cfg_recording.gop_size);
+ m_video_encoder->set_framerate(g_cfg_recording.video.framerate);
+ m_video_encoder->set_video_bitrate(g_cfg_recording.video.video_bps);
+ m_video_encoder->set_video_codec(g_cfg_recording.video.video_codec);
+ m_video_encoder->set_max_b_frames(g_cfg_recording.video.max_b_frames);
+ m_video_encoder->set_gop_size(g_cfg_recording.video.gop_size);
m_video_encoder->set_output_format(output_format);
- m_video_encoder->set_sample_rate(0); // TODO
- m_video_encoder->set_audio_bitrate(0); // TODO
- m_video_encoder->set_audio_codec(0); // TODO
+ m_video_encoder->set_sample_rate(g_cfg_recording.audio.sample_rate);
+ //m_video_encoder->set_audio_channels(static_cast(g_fxo->get().cfg.backend_ch_cnt));
+ m_video_encoder->set_audio_channels(static_cast(g_fxo->get().cfg.audio_channels));
+ m_video_encoder->set_audio_bitrate(g_cfg_recording.audio.audio_bps);
+ m_video_encoder->set_audio_codec(g_cfg_recording.audio.audio_codec);
m_video_encoder->encode();
if (m_video_encoder->has_error)
@@ -513,9 +516,9 @@ void gs_frame::toggle_recording()
return;
}
- if (!video_provider.set_image_sink(m_video_encoder, recording_mode::rpcs3))
+ if (!video_provider.set_video_sink(m_video_encoder, recording_mode::rpcs3))
{
- gui_log.warning("The video provider could not set the image sink. A sink with higher priority must have been set.");
+ gui_log.warning("The video provider could not set the video sink. A sink with higher priority must have been set.");
rsx::overlays::queue_message(tr("Recording not possible").toStdString());
m_video_encoder->stop();
return;
diff --git a/rpcs3/util/image_sink.h b/rpcs3/util/image_sink.h
deleted file mode 100644
index 3c23eca514..0000000000
--- a/rpcs3/util/image_sink.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#pragma once
-
-#include "util/types.hpp"
-#include "util/atomic.hpp"
-#include "Utilities/mutex.h"
-
-#include
-#include
-
-namespace utils
-{
- class image_sink
- {
- public:
- image_sink() = default;
-
- virtual void stop(bool flush = true) = 0;
- virtual void add_frame(std::vector& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms) = 0;
-
- s64 get_pts(usz timestamp_ms) const
- {
- return static_cast(std::round((timestamp_ms * m_framerate) / 1000.f));
- }
-
- usz get_timestamp_ms(s64 pts) const
- {
- return static_cast(std::round((pts * 1000) / static_cast(m_framerate)));
- }
-
- atomic_t has_error{false};
-
- struct encoder_frame
- {
- encoder_frame() = default;
- encoder_frame(usz timestamp_ms, u32 pitch, u32 width, u32 height, s32 av_pixel_format, std::vector&& data)
- : timestamp_ms(timestamp_ms), pitch(pitch), width(width), height(height), av_pixel_format(av_pixel_format), data(std::move(data))
- {}
-
- s64 pts = -1; // Optional
- usz timestamp_ms = 0;
- u32 pitch = 0;
- u32 width = 0;
- u32 height = 0;
- s32 av_pixel_format = 0; // NOTE: Make sure this is a valid AVPixelFormat
- std::vector data;
- };
-
- protected:
- shared_mutex m_mtx;
- std::deque m_frames_to_encode;
- atomic_t m_flush = false;
- u32 m_framerate = 0;
- };
-}
diff --git a/rpcs3/util/media_utils.cpp b/rpcs3/util/media_utils.cpp
index 3dddd5b0f8..cc6207d369 100644
--- a/rpcs3/util/media_utils.cpp
+++ b/rpcs3/util/media_utils.cpp
@@ -32,6 +32,28 @@ LOG_CHANNEL(media_log, "Media");
namespace utils
{
+ template
+ static inline void write_byteswapped(const u8* src, u8* dst)
+ {
+ *reinterpret_cast(dst) = *reinterpret_cast*>(src);
+ }
+
+ template
+ static inline void copy_samples(const u8* src, u8* dst, usz sample_count, bool swap_endianness)
+ {
+ if (swap_endianness)
+ {
+ for (usz i = 0; i < sample_count; i++)
+ {
+ write_byteswapped(src + i * sizeof(T), dst + i * sizeof(T));
+ }
+ }
+ else
+ {
+ std::memcpy(dst, src, sample_count * sizeof(T));
+ }
+ }
+
template <>
std::string media_info::get_metadata(const std::string& key, const std::string& def) const
{
@@ -204,11 +226,19 @@ namespace utils
struct scoped_av
{
- AVFormatContext* format = nullptr;
- const AVCodec* codec = nullptr;
- AVCodecContext* context = nullptr;
- AVFrame* frame = nullptr;
- AVStream* stream = nullptr;
+ struct ctx
+ {
+ const AVCodec* codec = nullptr;
+ AVCodecContext* context = nullptr;
+ AVStream* stream = nullptr;
+ AVPacket* packet = nullptr;
+ AVFrame* frame = nullptr;
+ };
+
+ ctx audio{};
+ ctx video{};
+
+ AVFormatContext* format_context = nullptr;
SwrContext* swr = nullptr;
SwsContext* sws = nullptr;
std::function kill_callback = nullptr;
@@ -216,21 +246,38 @@ namespace utils
~scoped_av()
{
// Clean up
- if (frame)
+ if (audio.frame)
{
- av_frame_unref(frame);
- av_frame_free(&frame);
+ av_frame_unref(audio.frame);
+ av_frame_free(&audio.frame);
+ }
+ if (video.frame)
+ {
+ av_frame_unref(video.frame);
+ av_frame_free(&video.frame);
+ }
+ if (audio.packet)
+ {
+ av_packet_unref(audio.packet);
+ av_packet_free(&audio.packet);
+ }
+ if (video.packet)
+ {
+ av_packet_unref(video.packet);
+ av_packet_free(&video.packet);
}
if (swr)
swr_free(&swr);
if (sws)
sws_freeContext(sws);
- if (context)
- avcodec_close(context);
+ if (audio.context)
+ avcodec_close(audio.context);
+ if (video.context)
+ avcodec_close(video.context);
// AVCodec is managed by libavformat, no need to free it
// see: https://stackoverflow.com/a/18047320
- if (format)
- avformat_free_context(format);
+ if (format_context)
+ avformat_free_context(format_context);
//if (stream)
// av_free(stream);
if (kill_callback)
@@ -238,6 +285,53 @@ namespace utils
}
};
+ // check that a given sample format is supported by the encoder
+ static bool check_sample_fmt(const AVCodec* codec, enum AVSampleFormat sample_fmt)
+ {
+ for (const AVSampleFormat* p = codec->sample_fmts; p && *p != AV_SAMPLE_FMT_NONE; p++)
+ {
+ if (*p == sample_fmt)
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // just pick the highest supported samplerate
+ static int select_sample_rate(const AVCodec* codec)
+ {
+ if (!codec->supported_samplerates)
+ return 44100;
+
+ int best_samplerate = 0;
+ for (const int* samplerate = codec->supported_samplerates; samplerate && *samplerate != 0; samplerate++)
+ {
+ if (!best_samplerate || abs(44100 - *samplerate) < abs(44100 - best_samplerate))
+ {
+ best_samplerate = *samplerate;
+ }
+ }
+ return best_samplerate;
+ }
+
+ // select layout with the highest channel count
+ static const AVChannelLayout* select_channel_layout(const AVCodec* codec, int channels)
+ {
+ constexpr AVChannelLayout empty_ch_layout = {};
+
+ for (const AVChannelLayout* ch_layout = codec->ch_layouts;
+ ch_layout && memcmp(ch_layout, &empty_ch_layout, sizeof(AVChannelLayout)) != 0;
+ ch_layout++)
+ {
+ if (ch_layout->nb_channels == channels)
+ {
+ return ch_layout;
+ }
+ }
+ return nullptr;
+ }
+
audio_decoder::audio_decoder()
{
}
@@ -295,14 +389,14 @@ namespace utils
scoped_av av;
// Get format from audio file
- av.format = avformat_alloc_context();
- if (int err = avformat_open_input(&av.format, path.c_str(), nullptr, nullptr); err < 0)
+ av.format_context = avformat_alloc_context();
+ if (int err = avformat_open_input(&av.format_context, path.c_str(), nullptr, nullptr); err < 0)
{
media_log.error("audio_decoder: Could not open file '%s'. Error: %d='%s'", path, err, av_error_to_string(err));
has_error = true;
return;
}
- if (int err = avformat_find_stream_info(av.format, nullptr); err < 0)
+ if (int err = avformat_find_stream_info(av.format_context, nullptr); err < 0)
{
media_log.error("audio_decoder: Could not retrieve stream info from file '%s'. Error: %d='%s'", path, err, av_error_to_string(err));
has_error = true;
@@ -312,11 +406,11 @@ namespace utils
// Find the first audio stream
AVStream* stream = nullptr;
unsigned int stream_index;
- for (stream_index = 0; stream_index < av.format->nb_streams; stream_index++)
+ for (stream_index = 0; stream_index < av.format_context->nb_streams; stream_index++)
{
- if (av.format->streams[stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
+ if (av.format_context->streams[stream_index]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
{
- stream = av.format->streams[stream_index];
+ stream = av.format_context->streams[stream_index];
break;
}
}
@@ -328,8 +422,8 @@ namespace utils
}
// Find decoder
- av.codec = avcodec_find_decoder(stream->codecpar->codec_id);
- if (!av.codec)
+ av.audio.codec = avcodec_find_decoder(stream->codecpar->codec_id);
+ if (!av.audio.codec)
{
media_log.error("audio_decoder: Failed to find decoder for stream #%u in file '%s'", stream_index, path);
has_error = true;
@@ -337,8 +431,8 @@ namespace utils
}
// Allocate context
- av.context = avcodec_alloc_context3(av.codec);
- if (!av.context)
+ av.audio.context = avcodec_alloc_context3(av.audio.codec);
+ if (!av.audio.context)
{
media_log.error("audio_decoder: Failed to allocate context for stream #%u in file '%s'", stream_index, path);
has_error = true;
@@ -346,7 +440,7 @@ namespace utils
}
// Open decoder
- if (int err = avcodec_open2(av.context, av.codec, nullptr); err < 0)
+ if (int err = avcodec_open2(av.audio.context, av.audio.codec, nullptr); err < 0)
{
media_log.error("audio_decoder: Failed to open decoder for stream #%u in file '%s'. Error: %d='%s'", stream_index, path, err, av_error_to_string(err));
has_error = true;
@@ -389,8 +483,8 @@ namespace utils
}
// Prepare to read data
- av.frame = av_frame_alloc();
- if (!av.frame)
+ av.audio.frame = av_frame_alloc();
+ if (!av.audio.frame)
{
media_log.error("audio_decoder: Error allocating the frame");
has_error = true;
@@ -403,9 +497,9 @@ namespace utils
std::unique_ptr packet_(packet);
// Iterate through frames
- while (thread_ctrl::state() != thread_state::aborting && av_read_frame(av.format, packet) >= 0)
+ while (thread_ctrl::state() != thread_state::aborting && av_read_frame(av.format_context, packet) >= 0)
{
- if (int err = avcodec_send_packet(av.context, packet); err < 0)
+ if (int err = avcodec_send_packet(av.audio.context, packet); err < 0)
{
media_log.error("audio_decoder: Queuing error: %d='%s'", err, av_error_to_string(err));
has_error = true;
@@ -414,7 +508,7 @@ namespace utils
while (thread_ctrl::state() != thread_state::aborting)
{
- if (int err = avcodec_receive_frame(av.context, av.frame); err < 0)
+ if (int err = avcodec_receive_frame(av.audio.context, av.audio.frame); err < 0)
{
if (err == AVERROR(EAGAIN) || err == averror_eof)
break;
@@ -427,7 +521,7 @@ namespace utils
// Resample frames
u8* buffer;
const int align = 1;
- const int buffer_size = av_samples_alloc(&buffer, nullptr, dst_channels, av.frame->nb_samples, dst_format, align);
+ const int buffer_size = av_samples_alloc(&buffer, nullptr, dst_channels, av.audio.frame->nb_samples, dst_format, align);
if (buffer_size < 0)
{
media_log.error("audio_decoder: Error allocating buffer: %d='%s'", buffer_size, av_error_to_string(buffer_size));
@@ -435,7 +529,7 @@ namespace utils
return;
}
- const int frame_count = swr_convert(av.swr, &buffer, av.frame->nb_samples, const_cast(av.frame->data), av.frame->nb_samples);
+ const int frame_count = swr_convert(av.swr, &buffer, av.audio.frame->nb_samples, const_cast(av.audio.frame->data), av.audio.frame->nb_samples);
if (frame_count < 0)
{
media_log.error("audio_decoder: Error converting frame: %d='%s'", frame_count, av_error_to_string(frame_count));
@@ -450,25 +544,10 @@ namespace utils
std::scoped_lock lock(m_mtx);
data.resize(m_size + buffer_size);
- if (m_swap_endianness)
- {
- // The format is float 32bit per channel.
- const auto write_byteswapped = [](const void* src, void* dst) -> void
- {
- *static_cast(dst) = *static_cast*>(src);
- };
+ // The format is float 32bit per channel.
+ copy_samples(buffer, &data[m_size], buffer_size / sizeof(f32), m_swap_endianness);
- for (size_t i = 0; i < (buffer_size - sizeof(f32)); i += sizeof(f32))
- {
- write_byteswapped(buffer + i, data.data() + m_size + i);
- }
- }
- else
- {
- memcpy(&data[m_size], buffer, buffer_size);
- }
-
- const s64 timestamp_ms = stream->time_base.den ? (1000 * av.frame->best_effort_timestamp * stream->time_base.num) / stream->time_base.den : 0;
+ const s64 timestamp_ms = stream->time_base.den ? (1000 * av.audio.frame->best_effort_timestamp * stream->time_base.num) / stream->time_base.den : 0;
timestamps_ms.push_back({m_size, timestamp_ms});
m_size += buffer_size;
}
@@ -476,7 +555,7 @@ namespace utils
if (buffer)
av_free(buffer);
- media_log.notice("audio_decoder: decoded frame_count=%d buffer_size=%d timestamp_us=%d", frame_count, buffer_size, av.frame->best_effort_timestamp);
+ media_log.notice("audio_decoder: decoded frame_count=%d buffer_size=%d timestamp_us=%d", frame_count, buffer_size, av.audio.frame->best_effort_timestamp);
}
}
};
@@ -535,7 +614,7 @@ namespace utils
}
video_encoder::video_encoder()
- : utils::image_sink()
+ : utils::video_sink()
{
}
@@ -549,9 +628,9 @@ namespace utils
return m_path;
}
- s64 video_encoder::last_pts() const
+ s64 video_encoder::last_video_pts() const
{
- return m_last_pts;
+ return m_last_video_pts;
}
void video_encoder::set_path(const std::string& path)
@@ -594,6 +673,11 @@ namespace utils
m_sample_rate = sample_rate;
}
+ void video_encoder::set_audio_channels(u32 channels)
+ {
+ m_channels = channels;
+ }
+
void video_encoder::set_audio_bitrate(u32 bitrate)
{
m_audio_bitrate_bps = bitrate;
@@ -604,16 +688,6 @@ namespace utils
m_audio_codec_id = codec_id;
}
- void video_encoder::add_frame(std::vector& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms)
- {
- // Do not allow new frames while flushing
- if (m_flush)
- return;
-
- std::lock_guard lock(m_mtx);
- m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
- }
-
void video_encoder::pause(bool flush)
{
if (m_thread)
@@ -658,6 +732,7 @@ namespace utils
std::lock_guard lock(m_mtx);
m_frames_to_encode.clear();
+ m_samples_to_encode.clear();
has_error = false;
m_flush = false;
m_paused = false;
@@ -675,7 +750,8 @@ namespace utils
return;
}
- m_last_pts = 0;
+ m_last_audio_pts = 0;
+ m_last_video_pts = 0;
stop();
@@ -692,7 +768,21 @@ namespace utils
{
m_running = true;
- // TODO: audio encoding
+ av_log_set_callback([](void* avcl, int level, const char* fmt, va_list vl) -> void
+ {
+ constexpr int line_size = 1024;
+ char line[line_size]{};
+ int print_prefix = 1;
+
+ if (int err = av_log_format_line2(avcl, level, fmt, vl, line, line_size, &print_prefix); err < 0)
+ {
+ media_log.error("av_log: av_log_format_line2 failed. Error: %d='%s'", err, av_error_to_string(err));
+ return;
+ }
+
+ media_log.error("av_log: %s", line);
+ });
+ av_log_set_level(AV_LOG_TRACE);
// Reset variables at all costs
scoped_av av;
@@ -702,38 +792,38 @@ namespace utils
m_running = false;
};
- const AVPixelFormat out_format = static_cast(m_out_format.av_pixel_format);
- const char* av_output_format = nullptr;
-
- const auto find_format = [&](const AVCodec* codec) -> const char*
+ // Let's list the encoders first
+ std::vector audio_codecs;
+ std::vector video_codecs;
+ void* opaque = nullptr;
+ while (const AVCodec* codec = av_codec_iterate(&opaque))
{
- if (!codec)
- return nullptr;
+ if (codec->type == AVMediaType::AVMEDIA_TYPE_AUDIO)
+ {
+ media_log.notice("video_encoder: Found audio codec %d = %s", static_cast(codec->id), codec->name);
+ audio_codecs.push_back(codec);
+ }
+ else if (codec->type == AVMediaType::AVMEDIA_TYPE_VIDEO)
+ {
+ media_log.notice("video_encoder: Found video codec %d = %s", static_cast(codec->id), codec->name);
+ video_codecs.push_back(codec);
+ }
+ }
+ const AVPixelFormat out_pix_format = static_cast(m_out_format.av_pixel_format);
+
+ const auto find_format = [&](AVCodecID video_codec, AVCodecID audio_codec) -> const AVOutputFormat*
+ {
// Try to find a preferable output format
std::vector oformats;
void* opaque = nullptr;
for (const AVOutputFormat* oformat = av_muxer_iterate(&opaque); !!oformat; oformat = av_muxer_iterate(&opaque))
{
- if (avformat_query_codec(oformat, codec->id, FF_COMPLIANCE_STRICT) == 1)
+ if (avformat_query_codec(oformat, video_codec, FF_COMPLIANCE_STRICT) == 1 &&
+ avformat_query_codec(oformat, audio_codec, FF_COMPLIANCE_STRICT) == 1)
{
- media_log.notice("video_encoder: Found output format '%s'", oformat->name);
-
- switch (codec->id)
- {
- case AV_CODEC_ID_MPEG4:
- if (strcmp(oformat->name, "avi") == 0)
- return oformat->name;
- break;
- case AV_CODEC_ID_H264:
- case AV_CODEC_ID_MJPEG:
- // TODO
- break;
- default:
- break;
- }
-
+ media_log.notice("video_encoder: Found output format '%s' (video_codec=%d, audio_codec=%d)", oformat->name, static_cast(video_codec), static_cast(audio_codec));
oformats.push_back(oformat);
}
}
@@ -742,168 +832,294 @@ namespace utils
if (!oformats.empty() && oformats.front())
{
const AVOutputFormat* oformat = oformats.front();
- media_log.notice("video_encoder: Falling back to output format '%s'", oformat->name);
- return oformat->name;
+ media_log.notice("video_encoder: Falling back to output format '%s' (video_codec=%d, audio_codec=%d)", oformat->name, static_cast(video_codec), static_cast(audio_codec));
+ return oformat;
}
return nullptr;
};
- AVCodecID used_codec = static_cast(m_video_codec_id);
+ const AVOutputFormat* out_format = find_format(static_cast(m_video_codec_id), static_cast(m_audio_codec_id));
- // Find specified codec first
- if (const AVCodec* encoder = avcodec_find_encoder(used_codec); !!encoder)
+ if (out_format)
{
- media_log.success("video_encoder: Found requested video_codec %d = %s", static_cast(used_codec), encoder->name);
- av_output_format = find_format(encoder);
-
- if (av_output_format)
- {
- media_log.success("video_encoder: Found requested output format '%s'", av_output_format);
- }
- else
- {
- media_log.error("video_encoder: Could not find a format for the requested video_codec %d = %s", static_cast(used_codec), encoder->name);
- }
+ media_log.success("video_encoder: Found requested output format '%s'", out_format->name);
}
else
{
- media_log.error("video_encoder: Could not find requested video_codec %d", static_cast(used_codec));
- }
+ media_log.error("video_encoder: Could not find a format for the requested video_codec %d and audio_codec %d", m_video_codec_id, m_audio_codec_id);
- // Fallback to some other codec
- if (!av_output_format)
- {
- void* opaque = nullptr;
- for (const AVCodec* codec = av_codec_iterate(&opaque); !!codec; codec = av_codec_iterate(&opaque))
+ // Fallback to some other codec
+ for (const AVCodec* video_codec : video_codecs)
{
- if (av_codec_is_encoder(codec))
+ for (const AVCodec* audio_codec : audio_codecs)
{
- media_log.notice("video_encoder: Found video_codec %d = %s", static_cast(codec->id), codec->name);
- av_output_format = find_format(codec);
+ out_format = find_format(video_codec->id, audio_codec->id);
- if (av_output_format)
+ if (out_format)
{
- media_log.success("video_encoder: Found fallback output format '%s'", av_output_format);
+ media_log.success("video_encoder: Found fallback output format '%s'", out_format->name);
break;
}
}
+
+ if (out_format)
+ {
+ break;
+ }
}
}
- if (!av_output_format)
+ if (!out_format)
{
media_log.error("video_encoder: Could not find any output format");
has_error = true;
return;
}
- if (int err = avformat_alloc_output_context2(&av.format, nullptr, av_output_format, path.c_str()); err < 0)
+ if (int err = avformat_alloc_output_context2(&av.format_context, out_format, nullptr, nullptr); err < 0)
{
- media_log.error("video_encoder: avformat_alloc_output_context2 failed. Error: %d='%s'", err, av_error_to_string(err));
+ media_log.error("video_encoder: avformat_alloc_output_context2 for '%s' failed. Error: %d='%s'", out_format->name, err, av_error_to_string(err));
has_error = true;
return;
}
- if (!av.format)
+ if (!av.format_context)
{
media_log.error("video_encoder: avformat_alloc_output_context2 failed");
has_error = true;
return;
}
- if (!(av.codec = avcodec_find_encoder(av.format->oformat->video_codec)))
+ const auto create_context = [this, &av](AVCodecID codec_id, bool is_video) -> bool
+ {
+ const std::string type = is_video ? "video" : "audio";
+ scoped_av::ctx& ctx = is_video ? av.video : av.audio;
+
+ if (is_video)
+ {
+ if (!(ctx.codec = avcodec_find_encoder(av.format_context->oformat->video_codec)))
+ {
+ media_log.error("video_encoder: avcodec_find_encoder for video failed. video_codec=%d", static_cast(av.format_context->oformat->video_codec));
+ return false;
+ }
+ }
+ else
+ {
+ if (!(ctx.codec = avcodec_find_encoder(av.format_context->oformat->audio_codec)))
+ {
+ media_log.error("video_encoder: avcodec_find_encoder for audio failed. audio_codec=%d", static_cast(av.format_context->oformat->audio_codec));
+ return false;
+ }
+ }
+
+ if (!(ctx.stream = avformat_new_stream(av.format_context, nullptr)))
+ {
+ media_log.error("video_encoder: avformat_new_stream for %s failed", type);
+ return false;
+ }
+
+ ctx.stream->id = is_video ? 0 : 1;
+
+ if (!(ctx.context = avcodec_alloc_context3(ctx.codec)))
+ {
+ media_log.error("video_encoder: avcodec_alloc_context3 for %s failed", type);
+ return false;
+ }
+
+ if (av.format_context->oformat->flags & AVFMT_GLOBALHEADER)
+ {
+ ctx.context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+ }
+
+ return true;
+ };
+
+ if (!create_context(static_cast(m_video_codec_id), true))
{
- media_log.error("video_encoder: avcodec_find_encoder failed");
has_error = true;
return;
}
- if (!(av.stream = avformat_new_stream(av.format, nullptr)))
+ if (!create_context(static_cast(m_audio_codec_id), false))
{
- media_log.error("video_encoder: avformat_new_stream failed");
has_error = true;
return;
}
- av.stream->id = static_cast(av.format->nb_streams - 1);
+ media_log.error("video_encoder: using audio_codec = %d", static_cast(av.format_context->oformat->audio_codec));
+ media_log.error("video_encoder: using sample_rate = %d", m_sample_rate);
+ media_log.error("video_encoder: using audio_bitrate = %d", m_audio_bitrate_bps);
+ media_log.error("video_encoder: using audio channels = %d", m_channels);
+ media_log.error("video_encoder: using video_codec = %d", static_cast(av.format_context->oformat->video_codec));
+ media_log.error("video_encoder: using video_bitrate = %d", m_video_bitrate_bps);
+ media_log.error("video_encoder: using out width = %d", m_out_format.width);
+ media_log.error("video_encoder: using out height = %d", m_out_format.height);
+ media_log.error("video_encoder: using framerate = %d", m_framerate);
+ media_log.error("video_encoder: using gop_size = %d", m_gop_size);
+ media_log.error("video_encoder: using max_b_frames = %d", m_max_b_frames);
- if (!(av.context = avcodec_alloc_context3(av.codec)))
+ // select audio parameters supported by the encoder
+ if (av.audio.context)
{
- media_log.error("video_encoder: avcodec_alloc_context3 failed");
- has_error = true;
- return;
+ if (const AVChannelLayout* ch_layout = select_channel_layout(av.audio.codec, m_channels))
+ {
+ if (int err = av_channel_layout_copy(&av.audio.context->ch_layout, ch_layout); err != 0)
+ {
+ media_log.error("video_encoder: av_channel_layout_copy failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+ }
+ else
+ {
+ media_log.error("video_encoder: select_channel_layout returned nullptr");
+ has_error = true;
+ return;
+ }
+
+ m_sample_rate = select_sample_rate(av.audio.codec);
+
+ av.audio.context->codec_id = av.format_context->oformat->audio_codec;
+ av.audio.context->codec_type = AVMEDIA_TYPE_AUDIO;
+ av.audio.context->bit_rate = m_audio_bitrate_bps;
+ av.audio.context->sample_rate = m_sample_rate;
+ av.audio.context->time_base = {.num = 1, .den = av.audio.context->sample_rate};
+ av.audio.context->sample_fmt = AV_SAMPLE_FMT_FLTP; // AV_SAMPLE_FMT_FLT is not supported in regular AC3
+ av.audio.stream->time_base = av.audio.context->time_base;
+
+ // check that the encoder supports the format
+ if (!check_sample_fmt(av.audio.codec, av.audio.context->sample_fmt))
+ {
+ media_log.error("video_encoder: Audio encoder does not support sample format %s", av_get_sample_fmt_name(av.audio.context->sample_fmt));
+ has_error = true;
+ return;
+ }
+
+ if (int err = avcodec_open2(av.audio.context, av.audio.codec, nullptr); err != 0)
+ {
+ media_log.error("video_encoder: avcodec_open2 for audio failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ if (!(av.audio.packet = av_packet_alloc()))
+ {
+ media_log.error("video_encoder: av_packet_alloc for audio packet failed");
+ has_error = true;
+ return;
+ }
+
+ if (!(av.audio.frame = av_frame_alloc()))
+ {
+ media_log.error("video_encoder: av_frame_alloc for audio frame failed");
+ has_error = true;
+ return;
+ }
+
+ av.audio.frame->format = AV_SAMPLE_FMT_FLTP;
+ av.audio.frame->nb_samples = av.audio.context->frame_size;
+
+ if (int err = av_channel_layout_copy(&av.audio.frame->ch_layout, &av.audio.context->ch_layout); err < 0)
+ {
+ media_log.error("video_encoder: av_channel_layout_copy for audio frame failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ if (int err = av_frame_get_buffer(av.audio.frame, 0); err < 0)
+ {
+ media_log.error("video_encoder: av_frame_get_buffer for audio frame failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ if (int err = avcodec_parameters_from_context(av.audio.stream->codecpar, av.audio.context); err < 0)
+ {
+ media_log.error("video_encoder: avcodec_parameters_from_context for audio failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
}
- media_log.notice("video_encoder: using video_codec = %d", static_cast(av.format->oformat->video_codec));
- media_log.notice("video_encoder: using video_bitrate = %d", m_video_bitrate_bps);
- media_log.notice("video_encoder: using out width = %d", m_out_format.width);
- media_log.notice("video_encoder: using out height = %d", m_out_format.height);
- media_log.notice("video_encoder: using framerate = %d", m_framerate);
- media_log.notice("video_encoder: using gop_size = %d", m_gop_size);
- media_log.notice("video_encoder: using max_b_frames = %d", m_max_b_frames);
-
- av.context->codec_id = av.format->oformat->video_codec;
- av.context->bit_rate = m_video_bitrate_bps;
- av.context->width = static_cast(m_out_format.width);
- av.context->height = static_cast(m_out_format.height);
- av.context->time_base = {.num = 1, .den = static_cast(m_framerate)};
- av.context->framerate = {.num = static_cast(m_framerate), .den = 1};
- av.context->pix_fmt = out_format;
- av.context->gop_size = m_gop_size;
- av.context->max_b_frames = m_max_b_frames;
-
- if (av.format->oformat->flags & AVFMT_GLOBALHEADER)
+ // select video parameters supported by the encoder
+ if (av.video.context)
{
- av.context->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
+ av.video.context->codec_id = av.format_context->oformat->video_codec;
+ av.video.context->codec_type = AVMEDIA_TYPE_VIDEO;
+ av.video.context->frame_number = 0;
+ av.video.context->bit_rate = m_video_bitrate_bps;
+ av.video.context->width = static_cast(m_out_format.width);
+ av.video.context->height = static_cast(m_out_format.height);
+ av.video.context->time_base = {.num = 1, .den = static_cast(m_framerate)};
+ av.video.context->framerate = {.num = static_cast(m_framerate), .den = 1};
+ av.video.context->pix_fmt = out_pix_format;
+ av.video.context->gop_size = m_gop_size;
+ av.video.context->max_b_frames = m_max_b_frames;
+ av.video.stream->time_base = av.video.context->time_base;
+
+ if (int err = avcodec_open2(av.video.context, av.video.codec, nullptr); err != 0)
+ {
+ media_log.error("video_encoder: avcodec_open2 for video failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ if (!(av.video.packet = av_packet_alloc()))
+ {
+ media_log.error("video_encoder: av_packet_alloc for video packet failed");
+ has_error = true;
+ return;
+ }
+
+ if (!(av.video.frame = av_frame_alloc()))
+ {
+ media_log.error("video_encoder: av_frame_alloc for video frame failed");
+ has_error = true;
+ return;
+ }
+
+ av.video.frame->format = av.video.context->pix_fmt;
+ av.video.frame->width = av.video.context->width;
+ av.video.frame->height = av.video.context->height;
+
+ if (int err = av_frame_get_buffer(av.video.frame, 0); err < 0)
+ {
+ media_log.error("video_encoder: av_frame_get_buffer for video frame failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ if (int err = avcodec_parameters_from_context(av.video.stream->codecpar, av.video.context); err < 0)
+ {
+ media_log.error("video_encoder: avcodec_parameters_from_context for video failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
}
- if (int err = avcodec_open2(av.context, av.codec, nullptr); err != 0)
+ media_log.error("video_encoder: av_dump_format");
+ for (u32 i = 0; i < av.format_context->nb_streams; i++)
{
- media_log.error("video_encoder: avcodec_open2 failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- return;
+ av_dump_format(av.format_context, i, path.c_str(), 1);
}
- if (!(av.frame = av_frame_alloc()))
+ // open the output file, if needed
+ if (!(av.format_context->flags & AVFMT_NOFILE))
{
- media_log.error("video_encoder: av_frame_alloc failed");
- has_error = true;
- return;
+ if (int err = avio_open(&av.format_context->pb, path.c_str(), AVIO_FLAG_WRITE); err != 0)
+ {
+ media_log.error("video_encoder: avio_open failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
}
- av.frame->format = av.context->pix_fmt;
- av.frame->width = av.context->width;
- av.frame->height = av.context->height;
-
- if (int err = av_frame_get_buffer(av.frame, 32); err < 0)
- {
- media_log.error("video_encoder: av_frame_get_buffer failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- return;
- }
-
- if (int err = avcodec_parameters_from_context(av.stream->codecpar, av.context); err < 0)
- {
- media_log.error("video_encoder: avcodec_parameters_from_context failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- return;
- }
-
- av_dump_format(av.format, 0, path.c_str(), 1);
-
- if (int err = avio_open(&av.format->pb, path.c_str(), AVIO_FLAG_WRITE); err != 0)
- {
- media_log.error("video_encoder: avio_open failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- return;
- }
-
- if (int err = avformat_write_header(av.format, nullptr); err < 0)
+ if (int err = avformat_write_header(av.format_context, nullptr); err < 0)
{
media_log.error("video_encoder: avformat_write_header failed. Error: %d='%s'", err, av_error_to_string(err));
- if (int err = avio_close(av.format->pb); err != 0)
+ if (int err = avio_close(av.format_context->pb); err != 0)
{
media_log.error("video_encoder: avio_close failed. Error: %d='%s'", err, av_error_to_string(err));
}
@@ -912,21 +1128,11 @@ namespace utils
return;
}
- const auto flush = [&]()
+ const auto flush = [&](scoped_av::ctx& ctx)
{
- while ((thread_ctrl::state() != thread_state::aborting || m_flush) && !has_error)
+ while ((thread_ctrl::state() != thread_state::aborting || m_flush) && !has_error && ctx.context)
{
- AVPacket* packet = av_packet_alloc();
- std::unique_ptr packet_(packet);
-
- if (!packet)
- {
- media_log.error("video_encoder: av_packet_alloc failed");
- has_error = true;
- return;
- }
-
- if (int err = avcodec_receive_packet(av.context, packet); err < 0)
+ if (int err = avcodec_receive_packet(ctx.context, ctx.packet); err < 0)
{
if (err == AVERROR(EAGAIN) || err == averror_eof)
break;
@@ -936,133 +1142,361 @@ namespace utils
return;
}
- av_packet_rescale_ts(packet, av.context->time_base, av.stream->time_base);
- packet->stream_index = av.stream->index;
+ av_packet_rescale_ts(ctx.packet, ctx.context->time_base, ctx.stream->time_base);
+ ctx.packet->stream_index = ctx.stream->index;
- if (int err = av_interleaved_write_frame(av.format, packet); err < 0)
+ if (int err = av_write_frame(av.format_context, ctx.packet); err < 0)
{
- media_log.error("video_encoder: av_interleaved_write_frame failed. Error: %d='%s'", err, av_error_to_string(err));
+ media_log.error("video_encoder: av_write_frame failed. Error: %d='%s'", err, av_error_to_string(err));
has_error = true;
return;
}
}
};
- s64 last_pts = -1;
+ u32 audio_sample_remainder = 0;
+ s64 last_audio_pts = -1;
+ s64 last_audio_frame_pts = 0;
+ s64 last_video_pts = -1;
+
+ // Allocate audio buffer for our audio frame
+ std::vector audio_samples;
+ u32 audio_samples_sample_count = 0;
+ const bool sample_fmt_is_planar = av.audio.context && av_sample_fmt_is_planar(av.audio.context->sample_fmt) != 0;
+ const int sample_fmt_bytes = av.audio.context ? av_get_bytes_per_sample(av.audio.context->sample_fmt) : 0;
+ ensure(sample_fmt_bytes == sizeof(f32)); // We only support FLT or FLTP for now
+
+ if (av.audio.frame)
+ {
+ audio_samples.resize(av.audio.frame->nb_samples * av.audio.frame->ch_layout.nb_channels * sizeof(f32));
+ last_audio_frame_pts -= av.audio.frame->nb_samples;
+ }
+
+ encoder_sample last_samples;
+ u32 leftover_sample_count = 0;
while ((thread_ctrl::state() != thread_state::aborting || m_flush) && !has_error)
{
+ // Fetch video frame
encoder_frame frame_data;
+ bool got_frame = false;
{
m_mtx.lock();
if (m_frames_to_encode.empty())
{
m_mtx.unlock();
+ }
+ else
+ {
+ frame_data = std::move(m_frames_to_encode.front());
+ m_frames_to_encode.pop_front();
+ m_mtx.unlock();
- if (m_flush)
+ // Calculate presentation timestamp.
+ const s64 pts = get_pts(frame_data.timestamp_ms);
+
+ // We need to skip this frame if it has the same timestamp.
+ if (pts <= last_video_pts)
{
- m_flush = false;
+ media_log.trace("video_encoder: skipping frame. last_pts=%d, pts=%d", last_video_pts, pts);
+ }
+ else if (av.video.context)
+ {
+ media_log.trace("video_encoder: adding new frame. timestamp=%d", frame_data.timestamp_ms);
- if (!m_paused)
+ got_frame = true;
+
+ if (int err = av_frame_make_writable(av.video.frame); err < 0)
{
- // We only stop the thread after a flush if we are not paused
+ media_log.error("video_encoder: av_frame_make_writable failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
break;
}
+
+ u8* in_data[4]{};
+ int in_line[4]{};
+
+ const AVPixelFormat in_format = static_cast(frame_data.av_pixel_format);
+
+ if (int ret = av_image_fill_linesizes(in_line, in_format, frame_data.width); ret < 0)
+ {
+ fmt::throw_exception("video_encoder: av_image_fill_linesizes failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
+ }
+
+ if (int ret = av_image_fill_pointers(in_data, in_format, frame_data.height, frame_data.data.data(), in_line); ret < 0)
+ {
+ fmt::throw_exception("video_encoder: av_image_fill_pointers failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
+ }
+
+ // Update the context in case the frame format has changed
+ av.sws = sws_getCachedContext(av.sws, frame_data.width, frame_data.height, in_format,
+ av.video.context->width, av.video.context->height, out_pix_format, SWS_BICUBIC, nullptr, nullptr, nullptr);
+ if (!av.sws)
+ {
+ media_log.error("video_encoder: sws_getCachedContext failed");
+ has_error = true;
+ break;
+ }
+
+ if (int err = sws_scale(av.sws, in_data, in_line, 0, frame_data.height, av.video.frame->data, av.video.frame->linesize); err < 0)
+ {
+ media_log.error("video_encoder: sws_scale failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ break;
+ }
+
+ av.video.frame->pts = pts;
+
+ if (int err = avcodec_send_frame(av.video.context, av.video.frame); err < 0)
+ {
+ media_log.error("video_encoder: avcodec_send_frame for video failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ break;
+ }
+
+ flush(av.video);
+
+ last_video_pts = av.video.frame->pts;
+ m_last_video_pts = last_video_pts;
+ }
+ }
+ }
+
+ // Fetch audio sample
+ encoder_sample sample_data;
+ bool got_sample = false;
+ {
+ m_audio_mtx.lock();
+
+ if (m_samples_to_encode.empty())
+ {
+ m_audio_mtx.unlock();
+ }
+ else
+ {
+ sample_data = std::move(m_samples_to_encode.front());
+ m_samples_to_encode.pop_front();
+ m_audio_mtx.unlock();
+
+ if (sample_data.channels != av.audio.frame->ch_layout.nb_channels)
+ {
+ fmt::throw_exception("video_encoder: Audio sample channel count %d does not match frame channel count %d", sample_data.channels, av.audio.frame->ch_layout.nb_channels);
}
- // We only actually pause after we process all frames
- const u64 sleeptime = m_paused ? 10000 : 1;
- thread_ctrl::wait_for(sleeptime);
- continue;
+ // Calculate presentation timestamp.
+ const s64 pts = get_audio_pts(sample_data.timestamp_us);
+
+ // We need to skip this frame if it has the same timestamp.
+ if (pts <= last_audio_pts)
+ {
+ media_log.error("video_encoder: skipping sample. last_pts=%d, pts=%d", last_audio_pts, pts);
+ }
+ else if (av.audio.context)
+ {
+ media_log.trace("video_encoder: adding new sample. timestamp_us=%d", sample_data.timestamp_us);
+
+ static constexpr bool swap_endianness = false;
+
+ const auto send_frame = [&]()
+ {
+ if (audio_samples_sample_count < static_cast(av.audio.frame->nb_samples))
+ {
+ return;
+ }
+
+ audio_samples_sample_count = 0;
+ got_sample = true;
+
+ if (int err = av_frame_make_writable(av.audio.frame); err < 0)
+ {
+ media_log.error("video_encoder: av_frame_make_writable failed. Error: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ if (sample_fmt_is_planar)
+ {
+ const int channels = av.audio.frame->ch_layout.nb_channels;
+ const int samples = av.audio.frame->nb_samples;
+
+ for (int ch = 0; ch < channels; ch++)
+ {
+ f32* dst = reinterpret_cast(av.audio.frame->data[ch]);
+
+ for (int sample = 0; sample < samples; sample++)
+ {
+ dst[sample] = *reinterpret_cast(&audio_samples[(sample * channels + ch) * sizeof(f32)]);
+ }
+ }
+ }
+ else
+ {
+ std::memcpy(av.audio.frame->data[0], audio_samples.data(), audio_samples.size());
+ }
+
+ av.audio.frame->pts = last_audio_frame_pts + av.audio.frame->nb_samples;
+
+ if (int err = avcodec_send_frame(av.audio.context, av.audio.frame); err < 0)
+ {
+ media_log.error("video_encoder: avcodec_send_frame failed: %d='%s'", err, av_error_to_string(err));
+ has_error = true;
+ return;
+ }
+
+ flush(av.audio);
+
+ last_audio_frame_pts = av.audio.frame->pts;
+ };
+
+ const auto add_encoder_sample = [&](bool add_new_sample, u32 silence_to_add = 0)
+ {
+ const auto update_last_pts = [&](u32 samples_to_add)
+ {
+ const u32 sample_count = audio_sample_remainder + samples_to_add;
+ const u32 pts_to_add = sample_count / m_samples_per_block;
+ audio_sample_remainder = sample_count % m_samples_per_block;
+ last_audio_pts += pts_to_add;
+ };
+
+ // Copy as many old samples to our audio frame as possible
+ if (leftover_sample_count > 0)
+ {
+ const u32 samples_to_add = std::min(leftover_sample_count, av.audio.frame->nb_samples - audio_samples_sample_count);
+
+ if (samples_to_add > 0)
+ {
+ const u8* src = &last_samples.data[(last_samples.sample_count - leftover_sample_count) * last_samples.channels * sizeof(f32)];
+ u8* dst = &audio_samples[audio_samples_sample_count * last_samples.channels * sizeof(f32)];
+ copy_samples(src, dst, samples_to_add * last_samples.channels, swap_endianness);
+ audio_samples_sample_count += samples_to_add;
+ leftover_sample_count -= samples_to_add;
+ update_last_pts(samples_to_add);
+ }
+
+ if (samples_to_add < leftover_sample_count)
+ {
+ media_log.error("video_encoder: audio frame buffer is already filled entirely by last sample package...");
+ }
+ }
+ else if (silence_to_add > 0)
+ {
+ const u32 samples_to_add = std::min(silence_to_add, av.audio.frame->nb_samples - audio_samples_sample_count);
+
+ if (samples_to_add > 0)
+ {
+ u8* dst = &audio_samples[audio_samples_sample_count * av.audio.frame->ch_layout.nb_channels * sizeof(f32)];
+ std::memset(dst, 0, samples_to_add * sample_data.channels * sizeof(f32));
+ audio_samples_sample_count += samples_to_add;
+ update_last_pts(samples_to_add);
+ }
+ }
+ else if (add_new_sample)
+ {
+ // Copy as many new samples to our audio frame as possible
+ const u32 samples_to_add = std::min(sample_data.sample_count, av.audio.frame->nb_samples - audio_samples_sample_count);
+
+ if (samples_to_add > 0)
+ {
+ const u8* src = sample_data.data.data();
+ u8* dst = &audio_samples[audio_samples_sample_count * sample_data.channels * sizeof(f32)];
+ copy_samples(src, dst, samples_to_add * sample_data.channels, swap_endianness);
+ audio_samples_sample_count += samples_to_add;
+ update_last_pts(samples_to_add);
+ }
+
+ if (samples_to_add < sample_data.sample_count)
+ {
+ // Save this sample package for the next loop if it wasn't fully used.
+ leftover_sample_count = sample_data.sample_count - samples_to_add;
+ }
+ else
+ {
+ // Mark this sample package as fully used.
+ leftover_sample_count = 0;
+ }
+
+ last_samples = std::move(sample_data);
+ }
+
+ send_frame();
+ };
+
+ for (u32 sample = 0; !has_error;)
+ {
+ if (leftover_sample_count > 0)
+ {
+ // Add leftover samples
+ add_encoder_sample(false);
+ }
+ else if (pts > (last_audio_pts + 1))
+ {
+ // Add silence to fill the gap
+ const u32 silence_to_add = pts - (last_audio_pts + 1);
+ add_encoder_sample(false, silence_to_add);
+ }
+ else if (sample == 0)
+ {
+ // Add new samples
+ add_encoder_sample(true);
+ sample++;
+ }
+ else
+ {
+ break;
+ }
+ }
+
+ m_last_audio_pts = last_audio_pts;
+ }
+ }
+ }
+
+ if (!got_frame && !got_sample)
+ {
+ if (m_flush)
+ {
+ m_flush = false;
+
+ if (!m_paused)
+ {
+ // We only stop the thread after a flush if we are not paused
+ break;
+ }
}
- frame_data = std::move(m_frames_to_encode.front());
- m_frames_to_encode.pop_front();
-
- m_mtx.unlock();
-
- media_log.trace("video_encoder: adding new frame. timestamp=%d", frame_data.timestamp_ms);
- }
-
- // Calculate presentation timestamp.
- const s64 pts = get_pts(frame_data.timestamp_ms);
-
- // We need to skip this frame if it has the same timestamp.
- if (pts <= last_pts)
- {
- media_log.notice("video_encoder: skipping frame. last_pts=%d, pts=%d", last_pts, pts);
+ // We only actually pause after we process all frames
+ const u64 sleeptime_us = m_paused ? 10000 : 1;
+ thread_ctrl::wait_for(sleeptime_us);
continue;
}
-
- if (int err = av_frame_make_writable(av.frame); err < 0)
- {
- media_log.error("video_encoder: av_frame_make_writable failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- break;
- }
-
- u8* in_data[4]{};
- int in_line[4]{};
-
- const AVPixelFormat in_format = static_cast(frame_data.av_pixel_format);
-
- if (int ret = av_image_fill_linesizes(in_line, in_format, frame_data.width); ret < 0)
- {
- fmt::throw_exception("video_encoder: av_image_fill_linesizes failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
- }
-
- if (int ret = av_image_fill_pointers(in_data, in_format, frame_data.height, frame_data.data.data(), in_line); ret < 0)
- {
- fmt::throw_exception("video_encoder: av_image_fill_pointers failed (ret=0x%x): %s", ret, utils::av_error_to_string(ret));
- }
-
- // Update the context in case the frame format has changed
- av.sws = sws_getCachedContext(av.sws, frame_data.width, frame_data.height, in_format,
- av.context->width, av.context->height, out_format, SWS_BICUBIC, nullptr, nullptr, nullptr);
- if (!av.sws)
- {
- media_log.error("video_encoder: sws_getCachedContext failed");
- has_error = true;
- break;
- }
-
- if (int err = sws_scale(av.sws, in_data, in_line, 0, frame_data.height, av.frame->data, av.frame->linesize); err < 0)
- {
- media_log.error("video_encoder: sws_scale failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- break;
- }
-
- av.frame->pts = pts;
-
- if (int err = avcodec_send_frame(av.context, av.frame); err < 0)
- {
- media_log.error("video_encoder: avcodec_send_frame failed. Error: %d='%s'", err, av_error_to_string(err));
- has_error = true;
- break;
- }
-
- flush();
-
- last_pts = av.frame->pts;
-
- m_last_pts = last_pts;
}
- if (int err = avcodec_send_frame(av.context, nullptr); err != 0)
+ if (av.video.context)
{
- media_log.error("video_encoder: final avcodec_send_frame failed. Error: %d='%s'", err, av_error_to_string(err));
+ if (int err = avcodec_send_frame(av.video.context, nullptr); err != 0)
+ {
+ media_log.error("video_encoder: final avcodec_send_frame for video failed. Error: %d='%s'", err, av_error_to_string(err));
+ }
}
- flush();
+ if (av.audio.context)
+ {
+ if (int err = avcodec_send_frame(av.audio.context, nullptr); err != 0)
+ {
+ media_log.error("video_encoder: final avcodec_send_frame for audio failed. Error: %d='%s'", err, av_error_to_string(err));
+ }
+ }
- if (int err = av_write_trailer(av.format); err != 0)
+ flush(av.video);
+ flush(av.audio);
+
+ if (int err = av_write_trailer(av.format_context); err != 0)
{
media_log.error("video_encoder: av_write_trailer failed. Error: %d='%s'", err, av_error_to_string(err));
}
- if (int err = avio_close(av.format->pb); err != 0)
+ if (int err = avio_close(av.format_context->pb); err != 0)
{
media_log.error("video_encoder: avio_close failed. Error: %d='%s'", err, av_error_to_string(err));
}
diff --git a/rpcs3/util/media_utils.h b/rpcs3/util/media_utils.h
index 2718a80617..8bbea8c7ee 100644
--- a/rpcs3/util/media_utils.h
+++ b/rpcs3/util/media_utils.h
@@ -88,7 +88,7 @@ namespace utils
std::unique_ptr<named_thread<std::function<void()>>> m_thread;
};
- class video_encoder : public utils::image_sink
+ class video_encoder : public utils::video_sink
{
public:
video_encoder();
@@ -108,7 +108,7 @@ namespace utils
};
std::string path() const;
- s64 last_pts() const;
+ s64 last_video_pts() const;
void set_path(const std::string& path);
void set_framerate(u32 framerate);
@@ -118,16 +118,17 @@ namespace utils
void set_max_b_frames(s32 max_b_frames);
void set_gop_size(s32 gop_size);
void set_sample_rate(u32 sample_rate);
+ void set_audio_channels(u32 channels);
void set_audio_bitrate(u32 bitrate);
void set_audio_codec(s32 codec_id);
- void add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms) override;
void pause(bool flush = true);
void stop(bool flush = true) override;
void encode();
private:
std::string m_path;
- s64 m_last_pts = 0;
+ s64 m_last_audio_pts = 0;
+ s64 m_last_video_pts = 0;
// Thread control
std::unique_ptr<named_thread<std::function<void()>>> m_thread;
@@ -136,14 +137,14 @@ namespace utils
// Video parameters
u32 m_video_bitrate_bps = 0;
- s32 m_video_codec_id = 12; // AV_CODEC_ID_MPEG4;
+ s32 m_video_codec_id = 12; // AV_CODEC_ID_MPEG4
s32 m_max_b_frames = 2;
s32 m_gop_size = 12;
frame_format m_out_format{};
// Audio parameters
- u32 m_sample_rate = 48000;
- u32 m_audio_bitrate_bps = 96000;
- s32 m_audio_codec_id = 86018; // AV_CODEC_ID_AAC
+ u32 m_channels = 2;
+ u32 m_audio_bitrate_bps = 320000;
+ s32 m_audio_codec_id = 86019; // AV_CODEC_ID_AC3
};
}
diff --git a/rpcs3/util/video_provider.cpp b/rpcs3/util/video_provider.cpp
index d919137733..d86da5ecf1 100644
--- a/rpcs3/util/video_provider.cpp
+++ b/rpcs3/util/video_provider.cpp
@@ -34,37 +34,37 @@ namespace utils
g_recording_mode = recording_mode::stopped;
}
- bool video_provider::set_image_sink(std::shared_ptr<image_sink> sink, recording_mode type)
+ bool video_provider::set_video_sink(std::shared_ptr<video_sink> sink, recording_mode type)
{
- media_log.notice("video_provider: setting new image sink. sink=%d, type=%s", !!sink, type);
+ media_log.notice("video_provider: setting new video sink. sink=%d, type=%s", !!sink, type);
if (type == recording_mode::stopped)
{
// Prevent misuse. type is supposed to be a valid state.
- media_log.error("video_provider: cannot set image sink with type %s", type);
+ media_log.error("video_provider: cannot set video sink with type %s", type);
return false;
}
std::lock_guard lock(m_mutex);
- if (m_image_sink)
+ if (m_video_sink)
{
// cell has preference
if (m_type == recording_mode::cell && m_type != type)
{
- media_log.warning("video_provider: cannot set image sink with type %s if type %s is active", type, m_type);
+ media_log.warning("video_provider: cannot set video sink with type %s if type %s is active", type, m_type);
return false;
}
- if (m_type != type || m_image_sink != sink)
+ if (m_type != type || m_video_sink != sink)
{
- media_log.warning("video_provider: stopping current image sink of type %s", m_type);
- m_image_sink->stop();
+ media_log.warning("video_provider: stopping current video sink of type %s", m_type);
+ m_video_sink->stop();
}
}
m_type = sink ? type : recording_mode::stopped;
- m_image_sink = sink;
+ m_video_sink = sink;
if (m_type == recording_mode::stopped)
{
@@ -84,19 +84,17 @@ namespace utils
{
std::lock_guard lock(m_mutex);
- if (!m_image_sink)
+ if (!m_video_sink)
return false;
const usz timestamp_ms = std::chrono::duration_cast<std::chrono::milliseconds>(steady_clock::now() - m_encoder_start).count() - m_pause_time_ms;
- const s64 pts = m_image_sink->get_pts(timestamp_ms);
- return pts > m_last_pts_incoming;
+ const s64 pts = m_video_sink->get_pts(timestamp_ms);
+ return pts > m_last_video_pts_incoming;
}
- void video_provider::present_frame(std::vector<u8>& data, u32 pitch, u32 width, u32 height, bool is_bgra)
+ recording_mode video_provider::check_state()
{
- std::lock_guard lock(m_mutex);
-
- if (!m_image_sink || m_image_sink->has_error)
+ if (!m_video_sink || m_video_sink->has_error)
{
g_recording_mode = recording_mode::stopped;
rsx::overlays::queue_message(localized_string_id::RECORDING_ABORTED);
@@ -105,33 +103,86 @@ namespace utils
if (g_recording_mode == recording_mode::stopped)
{
m_active = false;
- return;
+ return g_recording_mode;
}
if (!m_active.exchange(true))
{
m_current_encoder_frame = 0;
- m_last_pts_incoming = -1;
+ m_last_video_pts_incoming = -1;
+ m_last_audio_pts_incoming = -1;
}
- if (m_current_encoder_frame == 0)
+ if (m_current_encoder_frame == 0 && m_current_encoder_sample == 0)
{
m_encoder_start = steady_clock::now();
}
- // Calculate presentation timestamp.
- const usz timestamp_ms = std::chrono::duration_cast<std::chrono::milliseconds>(steady_clock::now() - m_encoder_start).count() - m_pause_time_ms;
- const s64 pts = m_image_sink->get_pts(timestamp_ms);
+ return g_recording_mode;
+ }
- // We can just skip this frame if it has the same timestamp.
- if (pts <= m_last_pts_incoming)
+ void video_provider::present_frame(std::vector<u8>& data, u32 pitch, u32 width, u32 height, bool is_bgra)
+ {
+ std::lock_guard lock(m_mutex);
+
+ if (check_state() == recording_mode::stopped)
{
return;
}
- m_last_pts_incoming = pts;
+ // Calculate presentation timestamp.
+ const usz timestamp_ms = std::chrono::duration_cast<std::chrono::milliseconds>(steady_clock::now() - m_encoder_start).count() - m_pause_time_ms;
+ const s64 pts = m_video_sink->get_pts(timestamp_ms);
+ // We can just skip this frame if it has the same timestamp.
+ if (pts <= m_last_video_pts_incoming)
+ {
+ return;
+ }
+
+ m_last_video_pts_incoming = pts;
m_current_encoder_frame++;
- m_image_sink->add_frame(data, pitch, width, height, is_bgra ? AVPixelFormat::AV_PIX_FMT_BGRA : AVPixelFormat::AV_PIX_FMT_RGBA, timestamp_ms);
+ m_video_sink->add_frame(data, pitch, width, height, is_bgra ? AVPixelFormat::AV_PIX_FMT_BGRA : AVPixelFormat::AV_PIX_FMT_RGBA, timestamp_ms);
+ }
+
+ bool video_provider::can_consume_sample()
+ {
+ std::lock_guard lock(m_mutex);
+
+ if (!m_video_sink)
+ return false;
+
+ const usz timestamp_us = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - m_encoder_start).count() - (m_pause_time_ms * 1000ull);
+ const s64 pts = m_video_sink->get_audio_pts(timestamp_us);
+ return pts > m_last_audio_pts_incoming;
+ }
+
+ void video_provider::present_samples(u8* buf, u32 sample_count, u16 channels)
+ {
+ if (!buf || !sample_count || !channels)
+ {
+ return;
+ }
+
+ std::lock_guard lock(m_mutex);
+
+ if (check_state() == recording_mode::stopped)
+ {
+ return;
+ }
+
+ // Calculate presentation timestamp.
+ const usz timestamp_us = std::chrono::duration_cast<std::chrono::microseconds>(steady_clock::now() - m_encoder_start).count() - (m_pause_time_ms * 1000ull);
+ const s64 pts = m_video_sink->get_audio_pts(timestamp_us);
+
+ // We can just skip this sample if it has the same timestamp.
+ if (pts <= m_last_audio_pts_incoming)
+ {
+ return;
+ }
+
+ m_last_audio_pts_incoming = pts;
+ m_current_encoder_sample += sample_count;
+ m_video_sink->add_audio_samples(buf, sample_count, channels, timestamp_us);
}
}
diff --git a/rpcs3/util/video_provider.h b/rpcs3/util/video_provider.h
index 31a051a112..93955ab571 100644
--- a/rpcs3/util/video_provider.h
+++ b/rpcs3/util/video_provider.h
@@ -1,6 +1,6 @@
#pragma once
-#include "image_sink.h"
+#include "video_sink.h"
enum class recording_mode
{
@@ -17,19 +17,27 @@ namespace utils
video_provider() = default;
~video_provider();
- bool set_image_sink(std::shared_ptr<image_sink> sink, recording_mode type);
+ bool set_video_sink(std::shared_ptr<video_sink> sink, recording_mode type);
void set_pause_time(usz pause_time_ms);
+
bool can_consume_frame();
void present_frame(std::vector<u8>& data, u32 pitch, u32 width, u32 height, bool is_bgra);
+ bool can_consume_sample();
+ void present_samples(u8* buf, u32 sample_count, u16 channels);
+
private:
+ recording_mode check_state();
+
recording_mode m_type = recording_mode::stopped;
- std::shared_ptr<image_sink> m_image_sink;
+ std::shared_ptr<video_sink> m_video_sink;
shared_mutex m_mutex{};
atomic_t<bool> m_active{false};
atomic_t<u32> m_current_encoder_frame{0};
+ atomic_t<u32> m_current_encoder_sample{0};
steady_clock::time_point m_encoder_start{};
- s64 m_last_pts_incoming = -1;
+ s64 m_last_video_pts_incoming = -1;
+ s64 m_last_audio_pts_incoming = -1;
usz m_pause_time_ms = 0;
};
diff --git a/rpcs3/util/video_sink.h b/rpcs3/util/video_sink.h
new file mode 100644
index 0000000000..d3d69c82d1
--- /dev/null
+++ b/rpcs3/util/video_sink.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include "util/types.hpp"
+#include "util/atomic.hpp"
+#include "Utilities/mutex.h"
+
+#include <cmath>
+#include <deque>
+
+namespace utils
+{
+	// Common base for sinks that consume the emulator's video frames and audio samples
+	// (e.g. the ffmpeg video encoder). Producers queue data; the sink drains it asynchronously.
+	class video_sink
+	{
+	public:
+		video_sink() = default;
+
+		virtual void stop(bool flush = true) = 0;
+
+		// Queue one video frame for encoding. The frame buffer is moved into the queue.
+		void add_frame(std::vector<u8>& frame, u32 pitch, u32 width, u32 height, s32 pixel_format, usz timestamp_ms)
+		{
+			// Do not allow new frames while flushing
+			if (m_flush)
+				return;
+
+			std::lock_guard lock(m_mtx);
+			m_frames_to_encode.emplace_back(timestamp_ms, pitch, width, height, pixel_format, std::move(frame));
+		}
+
+		// Queue interleaved f32 audio samples for encoding. The buffer contents are copied.
+		void add_audio_samples(u8* buf, u32 sample_count, u16 channels, usz timestamp_us)
+		{
+			// Do not allow new samples while flushing
+			if (m_flush || !buf || !sample_count || !channels)
+				return;
+
+			std::vector<u8> sample(buf, buf + sample_count * channels * sizeof(f32));
+			std::lock_guard lock(m_audio_mtx);
+			m_samples_to_encode.emplace_back(timestamp_us, sample_count, channels, std::move(sample));
+		}
+
+		// Video pts for a given timestamp: one pts tick per frame at m_framerate.
+		s64 get_pts(usz timestamp_ms) const
+		{
+			return static_cast<s64>(std::round((timestamp_ms * m_framerate) / 1000.f));
+		}
+
+		// Audio pts for a given timestamp: one pts tick per block of m_samples_per_block samples.
+		s64 get_audio_pts(usz timestamp_us) const
+		{
+			static constexpr f32 us_per_sec = 1000000.0f;
+			const f32 us_per_block = us_per_sec / (m_sample_rate / static_cast<f32>(m_samples_per_block));
+			return static_cast<s64>(std::ceil(timestamp_us / us_per_block));
+		}
+
+		// Inverse of get_pts.
+		usz get_timestamp_ms(s64 pts) const
+		{
+			return static_cast<usz>(std::round((pts * 1000) / static_cast<f32>(m_framerate)));
+		}
+
+		// Inverse of get_audio_pts. NOTE(review): audio pts counts blocks of m_samples_per_block
+		// samples, so the conversion must scale by us-per-block, not by 1000/sample_rate.
+		usz get_audio_timestamp_us(s64 pts) const
+		{
+			return static_cast<usz>(std::round(pts * (1000000.f * m_samples_per_block) / static_cast<f32>(m_sample_rate)));
+		}
+
+		atomic_t<bool> has_error{false};
+
+		struct encoder_frame
+		{
+			encoder_frame() = default;
+			encoder_frame(usz timestamp_ms, u32 pitch, u32 width, u32 height, s32 av_pixel_format, std::vector<u8>&& data)
+				: timestamp_ms(timestamp_ms), pitch(pitch), width(width), height(height), av_pixel_format(av_pixel_format), data(std::move(data))
+			{}
+
+			s64 pts = -1; // Optional
+			usz timestamp_ms = 0;
+			u32 pitch = 0;
+			u32 width = 0;
+			u32 height = 0;
+			s32 av_pixel_format = 0; // NOTE: Make sure this is a valid AVPixelFormat
+			std::vector<u8> data;
+		};
+
+		struct encoder_sample
+		{
+			encoder_sample() = default;
+			encoder_sample(usz timestamp_us, u32 sample_count, u16 channels, std::vector<u8>&& data)
+				: timestamp_us(timestamp_us), sample_count(sample_count), channels(channels), data(std::move(data))
+			{
+			}
+
+			usz timestamp_us = 0;
+			u32 sample_count = 0;
+			u16 channels = 0;
+			std::vector<u8> data;
+		};
+
+	protected:
+		shared_mutex m_mtx;
+		std::deque<encoder_frame> m_frames_to_encode;
+		shared_mutex m_audio_mtx;
+		std::deque<encoder_sample> m_samples_to_encode;
+		atomic_t<bool> m_flush = false;
+		u32 m_framerate = 30;
+		u32 m_sample_rate = 48000;
+		static constexpr u32 m_samples_per_block = 256;
+	};
+}