Add audio player

Play the decoded audio using SDL. The audio player frame sink receives the audio frames, resamples them and writes them to a byte buffer (introduced by this commit). On SDL audio callback (from an internal SDL thread), samples are copied from this byte buffer to the SDL audio buffer. The byte buffer is protected by the SDL_AudioDeviceLock(), but it has been designed so that the producer and the consumer may write and read in parallel, provided that they don't access the same slices of the ring buffer. PR #3757 <https://github.com/Genymobile/scrcpy/pull/3757> Co-authored-by: Simon Chan <1330321+yume-chan@users.noreply.github.com>
2025-04-22 20:45:01 +00:00 · 2023-03-03 00:43:20 +01:00 · 2023-03-03 00:43:20 +01:00 · 5105824d59
commit 5105824d59
parent 1fc03c73b1
8 changed files with 519 additions and 5 deletions
--- a/BUILD.md
+++ b/BUILD.md
@ -15,7 +15,7 @@ First, you need to install the required packages:
 sudo apt install ffmpeg libsdl2-2.0-0 adb wget \
                 gcc git pkg-config meson ninja-build libsdl2-dev \
                 libavcodec-dev libavdevice-dev libavformat-dev libavutil-dev \
-                 libusb-1.0-0 libusb-1.0-0-dev
+                 libswresample-dev libusb-1.0-0 libusb-1.0-0-dev
 ```

 Then clone the repo and execute the installation script
@ -94,7 +94,7 @@ sudo apt install ffmpeg libsdl2-2.0-0 adb libusb-1.0-0
 # client build dependencies
 sudo apt install gcc git pkg-config meson ninja-build libsdl2-dev \
                 libavcodec-dev libavdevice-dev libavformat-dev libavutil-dev \
-                 libusb-1.0-0-dev
+                 libswresample-dev libusb-1.0-0-dev

 # server build dependencies
 sudo apt install openjdk-11-jdk
--- a/app/meson.build
+++ b/app/meson.build
@ -4,6 +4,7 @@ src = [
    'src/adb/adb_device.c',
    'src/adb/adb_parser.c',
    'src/adb/adb_tunnel.c',
+    'src/audio_player.c',
    'src/cli.c',
    'src/clock.c',
    'src/compat.c',
@ -30,6 +31,7 @@ src = [
    'src/version.c',
    'src/video_buffer.c',
    'src/util/acksync.c',
+    'src/util/average.c',
    'src/util/bytebuf.c',
    'src/util/file.c',
    'src/util/intmap.c',
@ -100,6 +102,7 @@ if not crossbuild_windows
        dependency('libavformat', version: '>= 57.33'),
        dependency('libavcodec', version: '>= 57.37'),
        dependency('libavutil'),
+        dependency('libswresample'),
        dependency('sdl2', version: '>= 2.0.5'),
    ]

@ -135,6 +138,7 @@ else
            cc.find_library('avcodec-60', dirs: ffmpeg_bin_dir),
            cc.find_library('avformat-60', dirs: ffmpeg_bin_dir),
            cc.find_library('avutil-58', dirs: ffmpeg_bin_dir),
+            cc.find_library('swresample-4', dirs: ffmpeg_bin_dir),
        ],
        include_directories: include_directories(ffmpeg_include_dir)
    )
--- a/app/src/audio_player.c
+++ b/app/src/audio_player.c
@ -0,0 +1,363 @@
+#include "audio_player.h"
+
+#include <libavutil/opt.h>
+
+#include "util/log.h"
+
+// When SC_AUDIO_PLAYER_NDEBUG is defined, the LOGD() traces guarded by
+// #ifndef SC_AUDIO_PLAYER_NDEBUG in this file (emitted on every SDL callback
+// and on every pushed frame) are compiled out.
+#define SC_AUDIO_PLAYER_NDEBUG // comment to debug
+
+/** Downcast frame_sink to sc_audio_player */
+#define DOWNCAST(SINK) container_of(SINK, struct sc_audio_player, frame_sink)
+
+#define SC_AV_SAMPLE_FMT AV_SAMPLE_FMT_FLT
+#define SC_SDL_SAMPLE_FMT AUDIO_F32
+
+#define SC_AUDIO_OUTPUT_BUFFER_SAMPLES 480 // 10ms at 48000Hz
+
+// The target number of buffered samples between the producer and the consumer.
+// This value is directly used for compensation.
+#define SC_TARGET_BUFFERED_SAMPLES (3 * SC_AUDIO_OUTPUT_BUFFER_SAMPLES)
+
+// Use a ring-buffer of 1 second (at 48000Hz) between the producer and the
+// consumer. It is too big, but it guarantees that the producer and the
+// consumer will be able to access it in parallel without locking.
+#define SC_BYTEBUF_SIZE_IN_SAMPLES 48000
+
+// Convert a size in bytes to a number of samples (a sample covers all
+// channels). The size must be a whole number of samples.
+static inline size_t
+bytes_to_samples(struct sc_audio_player *ap, size_t bytes) {
+    // Size in bytes of one sample for all channels
+    size_t sample_size = ap->nb_channels * ap->out_bytes_per_sample;
+    assert(bytes % sample_size == 0);
+    return bytes / sample_size;
+}
+
+// Convert a number of samples (a sample covers all channels) to a size in
+// bytes.
+static inline size_t
+samples_to_bytes(struct sc_audio_player *ap, size_t samples) {
+    size_t values = samples * ap->nb_channels;
+    return values * ap->out_bytes_per_sample;
+}
+
+// SDL audio callback: fill "stream" with len_int bytes taken from the byte
+// buffer, completed with silence if not enough data is available.
+void
+sc_audio_player_sdl_callback(void *userdata, uint8_t *stream, int len_int) {
+    // SDL calls this function with the lock used by SDL_AudioDeviceLock()
+    // held, so the bytebuf is protected
+    struct sc_audio_player *ap = userdata;
+
+    assert(len_int > 0);
+    size_t requested = len_int;
+
+#ifndef SC_AUDIO_PLAYER_NDEBUG
+    LOGD("[Audio] SDL callback requests %" SC_PRIsizet " samples",
+         bytes_to_samples(ap, requested));
+#endif
+
+    // Copy as many buffered bytes as possible
+    size_t available = sc_bytebuf_read_available(&ap->buf);
+    size_t copied = MIN(available, requested);
+    if (copied > 0) {
+        sc_bytebuf_read(&ap->buf, stream, copied);
+    }
+
+    size_t missing = requested - copied;
+    if (missing > 0) {
+        // Not enough data: complete the output with silence
+#ifndef SC_AUDIO_PLAYER_NDEBUG
+        LOGD("[Audio] Buffer underflow, inserting silence: %" SC_PRIsizet
+             " samples", bytes_to_samples(ap, missing));
+#endif
+        memset(stream + copied, 0, missing);
+        // Silence inserted before the first frame is received is not an
+        // underflow
+        if (ap->received) {
+            ap->underflow += bytes_to_samples(ap, missing);
+        }
+    }
+
+    // Remember when the audio output consumed samples for the last time
+    ap->last_consumed = sc_tick_now();
+}
+
+/**
+ * Get the resampling target buffer, growing it if necessary
+ *
+ * Return a buffer able to hold at least min_samples samples (for all
+ * channels), or NULL if the buffer could not be reallocated. On failure, the
+ * previous buffer is left untouched (it is still referenced by ap->swr_buf).
+ */
+static uint8_t *
+sc_audio_player_get_swr_buf(struct sc_audio_player *ap, size_t min_samples) {
+    size_t min_buf_size = samples_to_bytes(ap, min_samples);
+    // Reallocate only if the current buffer is too small
+    if (min_buf_size > ap->swr_buf_alloc_size) {
+        // Request some extra space (4096 bytes) in addition to the minimum
+        size_t new_size = min_buf_size + 4096;
+        uint8_t *buf = realloc(ap->swr_buf, new_size);
+        if (!buf) {
+            LOG_OOM();
+            // Could not realloc to the requested size
+            return NULL;
+        }
+        ap->swr_buf = buf;
+        ap->swr_buf_alloc_size = new_size;
+    }
+
+    return ap->swr_buf;
+}
+
+/**
+ * Open the audio player frame sink
+ *
+ * Open the SDL audio device, create and configure the resampling context,
+ * allocate the byte buffer and the resampling target buffer, then start the
+ * audio device.
+ *
+ * Return true on success. On error, the resources created so far are released
+ * and false is returned.
+ */
+static bool
+sc_audio_player_frame_sink_open(struct sc_frame_sink *sink,
+                                const AVCodecContext *ctx) {
+    struct sc_audio_player *ap = DOWNCAST(sink);
+
+    // Request the sample rate and channel count of the decoder, with the
+    // player output sample format
+    SDL_AudioSpec desired = {
+        .freq = ctx->sample_rate,
+        .format = SC_SDL_SAMPLE_FMT,
+        .channels = ctx->ch_layout.nb_channels,
+        .samples = SC_AUDIO_OUTPUT_BUFFER_SAMPLES,
+        .callback = sc_audio_player_sdl_callback,
+        .userdata = ap,
+    };
+    SDL_AudioSpec obtained;
+
+    // Default device (NULL), for playback (iscapture=0), with no allowed
+    // changes (last argument is 0)
+    ap->device = SDL_OpenAudioDevice(NULL, 0, &desired, &obtained, 0);
+    if (!ap->device) {
+        LOGE("Could not open audio device: %s", SDL_GetError());
+        return false;
+    }
+
+    SwrContext *swr_ctx = swr_alloc();
+    if (!swr_ctx) {
+        LOG_OOM();
+        goto error_close_audio_device;
+    }
+    ap->swr_ctx = swr_ctx;
+
+    assert(ctx->sample_rate > 0);
+    assert(ctx->ch_layout.nb_channels > 0);
+    assert(!av_sample_fmt_is_planar(SC_AV_SAMPLE_FMT));
+    int out_bytes_per_sample = av_get_bytes_per_sample(SC_AV_SAMPLE_FMT);
+    assert(out_bytes_per_sample > 0);
+
+    // The channel layout and the sample rate are the same for input and
+    // output, only the sample format is converted
+    av_opt_set_chlayout(swr_ctx, "in_chlayout", &ctx->ch_layout, 0);
+    av_opt_set_chlayout(swr_ctx, "out_chlayout", &ctx->ch_layout, 0);
+
+    av_opt_set_int(swr_ctx, "in_sample_rate", ctx->sample_rate, 0);
+    av_opt_set_int(swr_ctx, "out_sample_rate", ctx->sample_rate, 0);
+
+    av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", ctx->sample_fmt, 0);
+    av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", SC_AV_SAMPLE_FMT, 0);
+
+    int ret = swr_init(swr_ctx);
+    if (ret) {
+        LOGE("Failed to initialize the resampling context");
+        goto error_free_swr_ctx;
+    }
+
+    // These fields must be set before calling samples_to_bytes()
+    ap->sample_rate = ctx->sample_rate;
+    ap->nb_channels = ctx->ch_layout.nb_channels;
+    ap->out_bytes_per_sample = out_bytes_per_sample;
+
+    size_t bytebuf_size = samples_to_bytes(ap, SC_BYTEBUF_SIZE_IN_SAMPLES);
+
+    bool ok = sc_bytebuf_init(&ap->buf, bytebuf_size);
+    if (!ok) {
+        goto error_free_swr_ctx;
+    }
+
+    // Initial size of the resampling target buffer (it may be reallocated
+    // later by sc_audio_player_get_swr_buf())
+    size_t initial_swr_buf_size = samples_to_bytes(ap, 4096);
+    ap->swr_buf = malloc(initial_swr_buf_size);
+    if (!ap->swr_buf) {
+        LOG_OOM();
+        goto error_destroy_bytebuf;
+    }
+    ap->swr_buf_alloc_size = initial_swr_buf_size;
+
+    ap->previous_write_avail = sc_bytebuf_write_available(&ap->buf);
+
+    sc_average_init(&ap->avg_buffering, 8);
+    ap->samples_since_resync = 0;
+
+    ap->last_consumed = 0;
+    ap->underflow = 0;
+    ap->received = 0;
+
+    // Unpause the device: SDL starts calling the audio callback
+    SDL_PauseAudioDevice(ap->device, 0);
+
+    return true;
+
+    // Cleanup labels, in reverse order of initialization
+error_destroy_bytebuf:
+    sc_bytebuf_destroy(&ap->buf);
+error_free_swr_ctx:
+    swr_free(&ap->swr_ctx);
+error_close_audio_device:
+    SDL_CloseAudioDevice(ap->device);
+
+    return false;
+}
+
+/**
+ * Close the audio player frame sink
+ *
+ * Pause and close the SDL audio device, then release the resources allocated
+ * by sc_audio_player_frame_sink_open().
+ */
+static void
+sc_audio_player_frame_sink_close(struct sc_frame_sink *sink) {
+    struct sc_audio_player *ap = DOWNCAST(sink);
+
+    // The device is paused and closed before the buffers used by the SDL
+    // callback are released
+    assert(ap->device);
+    SDL_PauseAudioDevice(ap->device, 1);
+    SDL_CloseAudioDevice(ap->device);
+
+    free(ap->swr_buf);
+    sc_bytebuf_destroy(&ap->buf);
+    swr_free(&ap->swr_ctx);
+}
+
+/**
+ * Push a decoded audio frame to the audio player
+ *
+ * The frame is resampled to the output sample format, then the resulting
+ * samples are written to the byte buffer read by the SDL audio callback. The
+ * average buffering level is updated on every frame, and is used about once
+ * per second to adjust the resampler compensation.
+ *
+ * Return false if the resampling buffer could not be allocated or if the
+ * resampling failed, true otherwise.
+ */
+static bool
+sc_audio_player_frame_sink_push(struct sc_frame_sink *sink,
+                                const AVFrame *frame) {
+    struct sc_audio_player *ap = DOWNCAST(sink);
+
+    SwrContext *swr_ctx = ap->swr_ctx;
+
+    int64_t delay = swr_get_delay(swr_ctx, ap->sample_rate);
+    // No need to av_rescale_rnd(), input and output sample rates are the same
+    // Add more space (256) for clock compensation
+    int dst_nb_samples = delay + frame->nb_samples + 256;
+
+    uint8_t *swr_buf = sc_audio_player_get_swr_buf(ap, dst_nb_samples);
+    if (!swr_buf) {
+        return false;
+    }
+
+    int ret = swr_convert(swr_ctx, &swr_buf, dst_nb_samples,
+                          (const uint8_t **) frame->data, frame->nb_samples);
+    if (ret < 0) {
+        LOGE("Resampling failed: %d", ret);
+        return false;
+    }
+
+    // swr_convert() returns the number of samples which would have been
+    // written if the buffer was big enough.
+    size_t samples_written = MIN(ret, dst_nb_samples);
+    size_t swr_buf_size = samples_to_bytes(ap, samples_written);
+#ifndef SC_AUDIO_PLAYER_NDEBUG
+    LOGD("[Audio] %" SC_PRIsizet " samples written to buffer", samples_written);
+#endif
+
+    // Since this function is the only writer, the current available space is
+    // at least the previous available space. In practice, it should almost
+    // always be possible to write without lock.
+    bool lockless_write = swr_buf_size <= ap->previous_write_avail;
+    if (lockless_write) {
+        sc_bytebuf_prepare_write(&ap->buf, swr_buf, swr_buf_size);
+    }
+
+    SDL_LockAudioDevice(ap->device);
+
+    // The consumer requests audio samples blocks (e.g. 480 samples).
+    // Convert the duration since the last consumption into samples.
+    size_t extrapolated = 0;
+    if (ap->last_consumed) {
+        sc_tick now = sc_tick_now();
+        assert(now >= ap->last_consumed);
+        extrapolated = (now - ap->last_consumed) * ap->sample_rate
+                                                 / SC_TICK_FREQ;
+    }
+
+    size_t read_avail = sc_bytebuf_read_available(&ap->buf);
+
+    // The consumer may not increase underflow value if there are still samples
+    // available
+    assert(read_avail == 0 || ap->underflow == 0);
+
+    size_t buffered_samples = bytes_to_samples(ap, read_avail);
+    // Underflow caused silence samples in excess (so it adds buffering).
+    // Extrapolated samples must be considered consumed for smoothing (so it
+    // removes buffering).
+    float buffering = (float) buffered_samples + ap->underflow - extrapolated;
+    sc_average_push(&ap->avg_buffering, buffering);
+
+#ifndef SC_AUDIO_PLAYER_NDEBUG
+    LOGD("[Audio] buffered_samples=%" SC_PRIsizet
+                " underflow=%" SC_PRIsizet
+                " extrapolated=%" SC_PRIsizet
+                " buffering=%f avg_buffering=%f",
+         buffered_samples, ap->underflow, extrapolated, buffering,
+         sc_average_get(&ap->avg_buffering));
+#endif
+
+    if (lockless_write) {
+        sc_bytebuf_commit_write(&ap->buf, swr_buf_size);
+    } else {
+        // Take care to keep full samples
+        size_t align = ap->nb_channels * ap->out_bytes_per_sample;
+        size_t write_avail =
+            sc_bytebuf_write_available(&ap->buf) / align * align;
+        if (swr_buf_size > write_avail) {
+            // Skip old samples
+            size_t cap = sc_bytebuf_capacity(&ap->buf) / align * align;
+            if (swr_buf_size > cap) {
+                // Ignore the first bytes in swr_buf
+                swr_buf += swr_buf_size - cap;
+                swr_buf_size = cap;
+            }
+            // After capping to the capacity, the size to write may be equal
+            // to the available space (in that case, there is nothing to skip)
+            assert(swr_buf_size >= write_avail);
+            if (swr_buf_size > write_avail) {
+                sc_bytebuf_skip(&ap->buf, swr_buf_size - write_avail);
+            }
+        }
+        sc_bytebuf_write(&ap->buf, swr_buf, swr_buf_size);
+    }
+
+    // On buffer underflow, typically because a packet is late, silence is
+    // inserted. In that case, the late samples must be ignored when they
+    // arrive, otherwise they will delay playback.
+    //
+    // As an improvement, instead of naively skipping the silence duration, we
+    // can absorb it if it helps clock compensation.
+    if (ap->underflow) {
+        // The average buffering may be negative, and converting a negative
+        // float to an unsigned type is undefined: clamp it to 0 first
+        float avg_buffering = sc_average_get(&ap->avg_buffering);
+        size_t avg = avg_buffering > 0 ? (size_t) avg_buffering : 0;
+        if (avg < SC_TARGET_BUFFERED_SAMPLES) {
+            // The buffering is below the target: up to "diff" silence samples
+            // may be kept to get closer to the target level
+            size_t diff = SC_TARGET_BUFFERED_SAMPLES - avg;
+            if (ap->underflow > diff) {
+                // Partially absorb underflow for clock compensation (only keep
+                // the diff with the target buffering level).
+                ap->underflow -= diff;
+            } else {
+                // Totally absorb underflow for clock compensation
+                ap->underflow = 0;
+            }
+
+            // NOTE(review): buffered_samples was measured before this frame
+            // was written, and the remaining underflow is not decremented by
+            // the skipped samples; confirm this is intended with respect to
+            // the assertion on (read_avail, underflow) above.
+            size_t skip_samples = MIN(ap->underflow, buffered_samples);
+            if (skip_samples) {
+                size_t skip_bytes = samples_to_bytes(ap, skip_samples);
+                sc_bytebuf_skip(&ap->buf, skip_bytes);
+                read_avail -= skip_bytes;
+#ifndef SC_AUDIO_PLAYER_NDEBUG
+                LOGD("[Audio] Skipping %" SC_PRIsizet " samples", skip_samples);
+#endif
+            }
+        } else {
+            // Totally absorb underflow for clock compensation
+            ap->underflow = 0;
+        }
+    }
+
+    ap->previous_write_avail = sc_bytebuf_write_available(&ap->buf);
+    ap->received = true;
+
+    SDL_UnlockAudioDevice(ap->device);
+
+    ap->samples_since_resync += samples_written;
+    if (ap->samples_since_resync >= ap->sample_rate) {
+        // Resync every second
+        ap->samples_since_resync = 0;
+
+        float avg = sc_average_get(&ap->avg_buffering);
+        int diff = SC_TARGET_BUFFERED_SAMPLES - avg;
+#ifndef SC_AUDIO_PLAYER_NDEBUG
+        LOGD("[Audio] Average buffering=%f, compensation %d", avg, diff);
+#endif
+        // Compensate the diff over 3 seconds (but will be recomputed after
+        // 1 second)
+        int ret = swr_set_compensation(swr_ctx, diff, 3 * ap->sample_rate);
+        if (ret < 0) {
+            LOGW("Resampling compensation failed: %d", ret);
+            // not fatal
+        }
+    }
+
+    return true;
+}
+
+// Initialize the audio player: install the frame sink operations (open, close
+// and push) implemented in this file.
+void
+sc_audio_player_init(struct sc_audio_player *ap) {
+    static const struct sc_frame_sink_ops frame_sink_ops = {
+        .open = sc_audio_player_frame_sink_open,
+        .close = sc_audio_player_frame_sink_close,
+        .push = sc_audio_player_frame_sink_push,
+    };
+
+    ap->frame_sink.ops = &frame_sink_ops;
+}
--- a/app/src/audio_player.h
+++ b/app/src/audio_player.h
@ -0,0 +1,61 @@
+#ifndef SC_AUDIO_PLAYER_H
+#define SC_AUDIO_PLAYER_H
+
+#include "common.h"
+
+#include <stdbool.h>
+#include "trait/frame_sink.h"
+#include <util/average.h>
+#include <util/bytebuf.h>
+#include <util/thread.h>
+
+#include <libavformat/avformat.h>
+#include <libswresample/swresample.h>
+#include <SDL2/SDL.h>
+
+// Audio player: a frame sink which resamples the received audio frames and
+// plays them using an SDL audio device.
+struct sc_audio_player {
+    // Frame sink trait (its ops are set by sc_audio_player_init())
+    struct sc_frame_sink frame_sink;
+
+    // SDL audio device, opened when the frame sink is opened
+    SDL_AudioDeviceID device;
+
+    // protected by SDL_AudioDeviceLock()
+    struct sc_bytebuf buf;
+    // Write space available in buf after the last pushed frame
+    size_t previous_write_avail;
+
+    // Resampling context, converting frames to the output sample format
+    struct SwrContext *swr_ctx;
+
+    // The sample rate is the same for input and output
+    unsigned sample_rate;
+    // The number of channels is the same for input and output
+    unsigned nb_channels;
+    // The number of bytes per sample for a single channel
+    unsigned out_bytes_per_sample;
+
+    // Target buffer for resampling
+    uint8_t *swr_buf;
+    size_t swr_buf_alloc_size;
+
+    // Number of buffered samples (may be negative on underflow)
+    struct sc_average avg_buffering;
+    // Count the number of samples to trigger a compensation update regularly
+    size_t samples_since_resync;
+
+    // The last date a sample has been consumed by the audio output
+    sc_tick last_consumed;
+
+    // Number of silence samples inserted to be compensated
+    size_t underflow;
+    // Set to true once a first frame has been pushed (silence inserted before
+    // that is not counted as underflow)
+    bool received;
+
+    // NOTE(review): cbs and cbs_userdata are neither set nor used in the
+    // visible part of this change -- confirm they are needed
+    const struct sc_audio_player_callbacks *cbs;
+    void *cbs_userdata;
+};
+
+// Callbacks of the audio player.
+// NOTE(review): on_ended is not invoked anywhere in the visible part of this
+// change -- confirm when it is expected to be called.
+struct sc_audio_player_callbacks {
+    void (*on_ended)(struct sc_audio_player *ap, bool success, void *userdata);
+};
+
+void
+sc_audio_player_init(struct sc_audio_player *ap);
+
+#endif
--- a/app/src/decoder.c
+++ b/app/src/decoder.c
@ -2,6 +2,7 @@

 #include <libavcodec/avcodec.h>
 #include <libavformat/avformat.h>
+#include <libavutil/channel_layout.h>

 #include "events.h"
 #include "video_buffer.h"
@ -50,6 +51,11 @@ sc_decoder_open(struct sc_decoder *decoder, const AVCodec *codec) {
    if (codec->type == AVMEDIA_TYPE_VIDEO) {
        // Hardcoded video properties
        decoder->codec_ctx->pix_fmt = AV_PIX_FMT_YUV420P;
+    } else {
+        // Hardcoded audio properties
+        decoder->codec_ctx->ch_layout =
+            (AVChannelLayout) AV_CHANNEL_LAYOUT_STEREO;
+        decoder->codec_ctx->sample_rate = 48000;
    }

    if (avcodec_open2(decoder->codec_ctx, codec, NULL) < 0) {
--- a/app/src/scrcpy.c
+++ b/app/src/scrcpy.c
@ -13,6 +13,7 @@
 # include <windows.h>
 #endif

+#include "audio_player.h"
 #include "controller.h"
 #include "decoder.h"
 #include "demuxer.h"
@ -40,6 +41,7 @@
 struct scrcpy {
    struct sc_server server;
    struct sc_screen screen;
+    struct sc_audio_player audio_player;
    struct sc_demuxer video_demuxer;
    struct sc_demuxer audio_demuxer;
    struct sc_decoder video_decoder;
@ -383,9 +385,16 @@ scrcpy(struct scrcpy_options *options) {
    }

    // Initialize SDL video in addition if display is enabled
-    if (options->display && SDL_Init(SDL_INIT_VIDEO)) {
-        LOGE("Could not initialize SDL: %s", SDL_GetError());
-        goto end;
+    if (options->display) {
+        if (SDL_Init(SDL_INIT_VIDEO)) {
+            LOGE("Could not initialize SDL video: %s", SDL_GetError());
+            goto end;
+        }
+
+        if (options->audio && SDL_Init(SDL_INIT_AUDIO)) {
+            LOGE("Could not initialize SDL audio: %s", SDL_GetError());
+            goto end;
+        }
    }

    sdl_configure(options->display, options->disable_screensaver);
@ -663,6 +672,11 @@ aoa_hid_end:
        screen_initialized = true;

        sc_decoder_add_sink(&s->video_decoder, &s->screen.frame_sink);
+
+        if (options->audio) {
+            sc_audio_player_init(&s->audio_player);
+            sc_decoder_add_sink(&s->audio_decoder, &s->audio_player.frame_sink);
+        }
    }

 #ifdef HAVE_V4L2
--- a/app/src/util/average.c
+++ b/app/src/util/average.c
@ -0,0 +1,26 @@
+#include "average.h"
+
+#include <assert.h>
+
+// Initialize the average with the given target range; no value is pushed yet.
+void
+sc_average_init(struct sc_average *avg, unsigned range) {
+    avg->count = 0;
+    avg->avg = 0;
+    avg->range = range;
+}
+
+// Update the average with a new value. Until "range" values have been pushed,
+// the weight of the new value is 1/count instead of 1/range.
+void
+sc_average_push(struct sc_average *avg, float value) {
+    if (avg->count < avg->range) {
+        avg->count++;
+    }
+
+    assert(avg->count);
+    unsigned n = avg->count;
+    avg->avg = ((n - 1) * avg->avg + value) / n;
+}
+
+// Return the current average (at least one value must have been pushed).
+float
+sc_average_get(struct sc_average *avg) {
+    assert(avg->count > 0);
+    float current = avg->avg;
+    return current;
+}
--- a/app/src/util/average.h
+++ b/app/src/util/average.h
@ -0,0 +1,40 @@
+#ifndef SC_AVERAGE
+#define SC_AVERAGE
+
+#include "common.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+// State of a "rolling" average, updated by sc_average_push().
+struct sc_average {
+    // Current average value
+    float avg;
+
+    // Target range, to update the average as follows:
+    //     avg = ((range - 1) * avg + new_value) / range
+    unsigned range;
+
+    // Number of values pushed when less than range (count <= range).
+    // The purpose is to handle the first (range - 1) values properly.
+    unsigned count;
+};
+
+void
+sc_average_init(struct sc_average *avg, unsigned range);
+
+/**
+ * Push a new value to update the "rolling" average
+ */
+void
+sc_average_push(struct sc_average *avg, float value);
+
+/**
+ * Get the current average value
+ *
+ * It is an error to call this function if sc_average_push() has not been
+ * called at least once.
+ */
+float
+sc_average_get(struct sc_average *avg);
+
+#endif