はじめに

AACデータを再生する方法を記載します。
OSはWindows 10です。開発環境にはVisual Studio 2015を使いました。

データフローは次の通りです。
TS file -> demux -> (AAC) -> decode -> (fltp PCM) -> (s16 PCM) -> Speaker out

フローのStepを説明します。

FFmpegでTS fileからAACデータを取り出します(demux)。
FFmpegでAACデータをdecodeしてfltp PCMデータにします。
FFmpegでfltp PCMデータをs16 PCMデータに変換します。
XAudio2でs16 PCMデータをスピーカーに出力します(Speaker out)。

PCMフォーマット

PCMのフォーマットを説明します。

FFmpegでAACをデコードするとAV_SAMPLE_FMT_FLTP(fltp)になります。
1sampleが32bit-floatのplanar形式です。

LRが別々のエリア;L1,L2,...,Ln:R1,R2,...,Rn

XAudio2に渡すPCMはAV_SAMPLE_FMT_S16(s16)にする必要があります。
1sampleが16bit-integerのinterleaved形式です。

LRが交互に現れる;L1,R1,L2,R2,...,Ln,Rn

fltpをs16に変換する

swresampleを使ってfltpをs16に変換します。

(snip)
// Create and configure the resampler the first time through (while swr is still NULL).
if (swr == NULL) {
    swr = swr_alloc();
    if (swr == NULL) {
        printf("swr_alloc error.\n");
        break;
    }
    // Input and output use the frame's own channel layout and sample rate,
    // so only the sample format differs between the two sides.
    av_opt_set_int(swr, "in_channel_layout", frame->channel_layout, 0);
    av_opt_set_int(swr, "out_channel_layout", frame->channel_layout, 0);
    av_opt_set_int(swr, "in_sample_rate", frame->sample_rate, 0);
    av_opt_set_int(swr, "out_sample_rate", frame->sample_rate, 0);
    // Input format is whatever the frame reports; output is fixed to s16.
    av_opt_set_sample_fmt(swr, "in_sample_fmt", (AVSampleFormat)frame->format, 0);
    av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
    ret = swr_init(swr);
    if (ret < 0) {
        printf("swr_init error ret=%08x.\n", AVERROR(ret));
        break;
    }
    // Output buffer size in bytes: samples * channels * 2 bytes per sample.
    // NOTE(review): sized from this first frame only; presumably later frames
    // never carry more samples than this one -- confirm.
    int buf_size = frame->nb_samples*frame->channels * 2; /* the 2 means S16 */
    swr_buf = new BYTE[buf_size];
    swr_buf_len = buf_size;
}

// Convert the frame's samples into swr_buf; a negative return value is an error.
ret = swr_convert(swr, &swr_buf, frame->nb_samples, (const uint8_t**)frame->extended_data, frame->nb_samples);
if (ret < 0) {
    printf("swr_convert error ret=%08x.\n", AVERROR(ret));
}
(snip)

sample code

Win32 Console アプリケーションを作成します。
FFmpegのAPIを使う～ビルド環境構築を参照してください。
[構成プロパティ]-[リンカー]-[入力]-[追加の依存ファイル]に次を追加します。

avcodec.lib
avdevice.lib
avfilter.lib
avformat.lib
avutil.lib
postproc.lib
swresample.lib
swscale.lib
xaudio2.lib

TS file "test.mpg"をFFmpegでdemuxしてAACデータを取り出します。
取り出したAACデータをデコードしてfltpにします。
fltpをs16に変換します。s16をXaudio2でSpeaker outします。

#include "stdafx.h"
#include <xaudio2.h>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/opt.h>
#include <libswresample/swresample.h>
}

class VoiceCallback : public IXAudio2VoiceCallback
{
public:
    HANDLE event;
    VoiceCallback() : event(CreateEvent(NULL, FALSE, FALSE, NULL)) {}
    ~VoiceCallback() { CloseHandle(event); }
    void STDMETHODCALLTYPE OnStreamEnd() {}
    void STDMETHODCALLTYPE OnVoiceProcessingPassEnd() {}
    void STDMETHODCALLTYPE OnVoiceProcessingPassStart(UINT32 samples) {}
    void STDMETHODCALLTYPE OnBufferEnd(void * context) { SetEvent(event); }
    void STDMETHODCALLTYPE OnBufferStart(void * context) {}
    void STDMETHODCALLTYPE OnLoopEnd(void * context) {}
    void STDMETHODCALLTYPE OnVoiceError(void * context, HRESULT Error) {}
};

int main()
{
    int ret = -1;

    ret = CoInitializeEx(NULL, COINIT_MULTITHREADED);
    if (FAILED(ret)) {
        printf("error CoInitializeEx ret=%d\n", ret);
        return -1;
    }

    IXAudio2 *audio = NULL;
    ret = XAudio2Create(&audio);
    if (FAILED(ret)) {
        printf("error XAudio2Create ret=%d\n", ret);
        return -1;
    }

    IXAudio2MasteringVoice *master = NULL;
    ret = audio->CreateMasteringVoice(&master);
    if (FAILED(ret)) {
        printf("error CreateMasteringVoice ret=%d\n", ret);
        return -1;
    }

    av_register_all();

    const char *filename = "test.mpg";
    AVFormatContext *format_context = NULL;
    ret = avformat_open_input(&format_context, filename, NULL, NULL);
    if (ret < 0) {
        printf("cannot open file. filename=%s, ret=%08x\n", filename, AVERROR(ret));
        return -1;
    }

    ret = avformat_find_stream_info(format_context, NULL);
    if (ret < 0) {
        printf("avformat_find_stream_info error. ret=%08x\n", AVERROR(ret));
        return -1;
    }

    AVStream *audio_stream = NULL;
    for (unsigned int i = 0; i < format_context->nb_streams; i++) {
        if (format_context->streams[i]->codecpar->codec_type == AVMediaType::AVMEDIA_TYPE_AUDIO) {
            audio_stream = format_context->streams[i];
            break;
        }
    }
    if (audio_stream == NULL) {
        printf("stream not found\n");
        return -1;
    }

    AVCodec *codec = avcodec_find_decoder(audio_stream->codecpar->codec_id);
    if (codec == NULL) {
        printf("avcodec_find_decoder codec not found. codec_id=%d\n", audio_stream->codecpar->codec_id);
        return -1;
    }

    AVCodecContext *codec_context = avcodec_alloc_context3(codec);
    if (codec_context == NULL) {
        printf("avcodec_alloc_context3 error.\n");
        return -1;
    }

    ret = avcodec_open2(codec_context, codec, NULL);
    if (ret < 0) {
        printf("avcodec_open2 error. ret=%08x\n", AVERROR(ret));
        return -1;
    }

    AVFrame *frame = av_frame_alloc();
    AVPacket packet;
    int frame_number = 0;
    SwrContext *swr = NULL;
    BYTE* swr_buf = 0;
    int swr_buf_len = 0;
    IXAudio2SourceVoice *voice = NULL;
    bool first_submit = true;
    BYTE** buf = NULL;
    int buf_cnt = 0;
    VoiceCallback callback;
    WAVEFORMATEX format = { 0 };

    while (1) {
        // read ES
        if ((ret = av_read_frame(format_context, &packet)) < 0) {
            printf("av_read_frame eof or error. ret=%08x\n", AVERROR(ret));
            break; // eof or error
        }
        if (packet.stream_index == audio_stream->index) {
            // decode ES
            if ((ret = avcodec_send_packet(codec_context, &packet)) < 0) {
                printf("avcodec_send_packet error. ret=%08x\n", AVERROR(ret));
            }
            if ((ret = avcodec_receive_frame(codec_context, frame)) < 0) {
                if (ret != AVERROR(EAGAIN)) {
                    printf("avcodec_receive_frame error. ret=%08x\n", AVERROR(ret));
                    break;
                }
            }
            else {
                printf("frame_number=%d\n", ++frame_number);
                if (swr == NULL) {
                    swr = swr_alloc();
                    if (swr == NULL) {
                        printf("swr_alloc error.\n");
                        break;
                    }
                    av_opt_set_int(swr, "in_channel_layout", frame->channel_layout, 0);
                    av_opt_set_int(swr, "out_channel_layout", frame->channel_layout, 0);
                    av_opt_set_int(swr, "in_sample_rate", frame->sample_rate, 0);
                    av_opt_set_int(swr, "out_sample_rate", frame->sample_rate, 0);
                    av_opt_set_sample_fmt(swr, "in_sample_fmt", (AVSampleFormat)frame->format, 0);
                    av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
                    ret = swr_init(swr);
                    if (ret < 0) {
                        printf("swr_init error ret=%08x.\n", AVERROR(ret));
                        break;
                    }
                    int buf_size = frame->nb_samples*frame->channels * 2; /* the 2 means S16 */
                    swr_buf = new BYTE[buf_size];
                    swr_buf_len = buf_size;
                }

                ret = swr_convert(swr, &swr_buf, frame->nb_samples, (const uint8_t**)frame->extended_data, frame->nb_samples);
                if (ret < 0) {
                    printf("swr_convert error ret=%08x.\n", AVERROR(ret));
                }
                if (voice == NULL) {
                    format.wFormatTag = WAVE_FORMAT_PCM;
                    format.nChannels = frame->channels;
                    format.wBitsPerSample = 16;
                    format.nSamplesPerSec = frame->sample_rate;
                    format.nBlockAlign = format.wBitsPerSample / 8 * format.nChannels;
                    format.nAvgBytesPerSec = format.nSamplesPerSec * format.nBlockAlign;
                    ret = audio->CreateSourceVoice(
                        &voice,
                        &format,
                        0,                          // UINT32 Flags = 0,
                        XAUDIO2_DEFAULT_FREQ_RATIO, // float MaxFrequencyRatio = XAUDIO2_DEFAULT_FREQ_RATIO,
                        &callback                   // IXAudio2VoiceCallback *pCallback = NULL,
                    );
                    if (FAILED(ret)) {
                        printf("error CreateSourceVoice ret=%d\n", ret);
                    }
                    voice->Start();
                }
                if (buf == NULL) {
                    buf = new BYTE*[2];
                    buf[0] = new BYTE[swr_buf_len];
                    buf[1] = new BYTE[swr_buf_len];
                }
                memcpy(buf[buf_cnt], swr_buf, swr_buf_len);
                XAUDIO2_BUFFER buffer = { 0 };
                buffer.AudioBytes = swr_buf_len;
                buffer.pAudioData = buf[buf_cnt];
                ret = voice->SubmitSourceBuffer(&buffer);
                if (FAILED(ret)) {
                    printf("error SubmitSourceBuffer ret=%d\n", ret);
                }

                if (first_submit) {
                    first_submit = false;
                }
                else {
                    if (WaitForSingleObject(callback.event, INFINITE) != WAIT_OBJECT_0) {
                        printf("error WaitForSingleObject\n");
                    }
                }
                if (2 <= ++buf_cnt)
                    buf_cnt = 0;
            }
        }
        else {
            // does not audio ES.
        }
        av_packet_unref(&packet);
    }

    if (WaitForSingleObject(callback.event, INFINITE) != WAIT_OBJECT_0) {
        printf("error WaitForSingleObject\n");
    }

    if (voice)
        voice->Stop();

    getchar(); // wait user input

    // omit cleanup for simplify. (free/release etc...)

    return 0;
}

references

Xaudio2を使う
 FFmpegのAPIを使う～ビルド環境構築
 FFmpegのAPIを使う～AACデコーダ出力フォーマットfltpを調べてみる

FFmpegのAPIを使う～AACデータを再生する

はじめに

PCMフォーマット

fltpをs16に変換する

sample code

references