How to use sonic to perform speed doubling on ffmpeg AVFrame type audio. #51

Open
ZzMLvzZ-792998470 opened this issue Jun 30, 2024 · 0 comments

Comments

@ZzMLvzZ-792998470

Hello, I am trying to perform a 2x speed-up on the audio data decoded by ffmpeg. My initial idea was to simply drop samples to achieve the speed-up, but the resulting pitch change caused me a lot of trouble, so I chose to use sonic for the speed-up instead. However, the extraction and processing of the audio data does not behave the way I expected.
For example, for dual-channel audio in the FLTP sample format, my approach is to write the data into sonic channel by channel (in channel order), process it, and then read it back out. But I'm not sure whether I should write all the data from both channels before reading anything back, or read back after every per-channel write. Thank you very much for your advice!
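
For reference, here is a minimal sketch of the interleaved variant I am considering, assuming that sonic expects interleaved 16-bit samples and that sonicWriteShortToStream / sonicSamplesAvailable / sonicReadShortFromStream all count samples per channel; the helper interleaveFltpToS16 is purely illustrative and not part of my real code:

#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative helper: convert one planar-float (FLTP) AVFrame into an
// interleaved 16-bit buffer, since sonic streams interleaved shorts.
static void interleaveFltpToS16(const AVFrame *in, std::vector<short> &out) {
    out.resize((size_t)in->nb_samples * in->channels);
    for (int i = 0; i < in->nb_samples; i++) {
        for (int ch = 0; ch < in->channels; ch++) {
            float s = ((const float *)in->data[ch])[i];
            s = std::max(-1.0f, std::min(1.0f, s));                  // clamp to [-1, 1]
            out[(size_t)i * in->channels + ch] = (short)std::lrintf(s * 32767.0f);
        }
    }
}

// Write the whole frame (both channels at once), then read back whatever
// sonic has finished processing so far.
std::vector<short> interleaved;
interleaveFltpToS16(inFrame, interleaved);
sonicWriteShortToStream(_sonic, interleaved.data(), inFrame->nb_samples);

int available = sonicSamplesAvailable(_sonic);                       // per-channel sample count
std::vector<short> processed((size_t)available * inFrame->channels);
int readSamples = sonicReadShortFromStream(_sonic, processed.data(), available);

With this arrangement each frame is written once and read once, instead of reading back after every per-channel write. Is this the intended usage, or is the per-channel write/read in my code below also acceptable?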

int Transcoder::add2TranscodeBuffer(AVFrame *inFrame, int sep, float speed) {
    int64_t inChannelLayout = inFrame->channel_layout;
    int inNbSamples = inFrame->nb_samples;
    int inSampleRate = inFrame->sample_rate;
    int inChannels = inFrame->channels;
    int inFormat = inFrame->format;

    int ret = 0;
    if (!swrContext) {
        // Allocate the resampler context
        swrContext = swr_alloc();
        // Configure the resampler with the output and input layout/format/rate
        swr_alloc_set_opts(swrContext, outChannelLayout, (AVSampleFormat) outFormat, outSampleRate,
                           av_get_default_channel_layout(inChannels), (AVSampleFormat) inFormat, inSampleRate, 0, 0);
        // Initialize the resampler
        swr_init(swrContext);
        int fifoSize = swr_get_out_samples(swrContext, 0);
        av_log(nullptr, AV_LOG_INFO, "fifoSize:%i\n", fifoSize);

        // If the sample rates match, use outNbSamples as the output size
        if (inSampleRate == outSampleRate){
            audioOutSize = outNbSamples;
        } else { // e.g. 48000 -> 44100: only once the buffer holds 1115 input samples can 1024 output samples be produced
            audioOutSize = ceil(1.0 * inNbSamples * inSampleRate / outSampleRate);
            if (inSampleRate < outSampleRate) {
                audioOutSize += outNbSamples;
            }
        }
    }

    // Compute the number of frames to pad; comfort-noise frames and ordinary frames are handled separately
    int silenceNum = sep & 0b01111111;
    bool isComfortNoise = (sep & 0b10000000) == 0b10000000;
    if(isComfortNoise) silenceNum++;

    while (silenceNum > 0) {
        if(!frame) frame = av_frame_alloc();
        frame->sample_rate = inFrame->sample_rate;
        frame->channels = inFrame->channels;
        frame->format = inFrame->format;
        frame->nb_samples = inFrame->nb_samples;
        frame->channel_layout = inFrame->channel_layout;

        av_frame_get_buffer(frame, 0);

        // Fill the frame with silence
        av_samples_set_silence(frame->data, 0, frame->nb_samples, frame->channels, (AVSampleFormat)frame->format);

        ret = swr_convert(swrContext, nullptr, 0, (const uint8_t **)frame->data, inNbSamples);
        if (ret < 0) {
            av_log(nullptr, AV_LOG_ERROR, "swr_convert failed:%i\n", ret);
            return ret;
        }
        silenceNum--;
    }


    // Buffer the frame:
    // push the frame to be resampled into the resampler's FIFO
    if(speed != 1.0f) {
        if (_speed != speed) {
            _speed = speed;
            if(_sonic) {
                sonicDestroyStream(_sonic);
                _sonic = nullptr;
            }
        }

        if (!_sonic) {
            _sonic = sonicCreateStream(inSampleRate, inChannels);
        }

        sonicSetSpeed(_sonic, _speed);
        sonicSetPitch(_sonic, 1);

        int sonicBuffSize;
        int out_nb_samples = 0;
        uint8_t** outData = new uint8_t*[8];
        // Write audio data to Sonic
        int time_write = 0;
        for (int channel = 0; channel < inChannels; channel++) {
            time_write++;
            int numsProcess = sonicWriteShortToStream(_sonic, (short *) inFrame->data[channel], inNbSamples);
            av_log(nullptr, AV_LOG_INFO, "write_into_sonic, time_write:%i\n", time_write);
            sonicBuffSize = sonicSamplesAvailable(_sonic);
            out_nb_samples += sonicBuffSize;
            auto data = new short[sonicBuffSize];
            int readSamples = sonicReadShortFromStream(_sonic, data, sonicBuffSize);
            int dataSize = 0;
            if(readSamples > 0) {
                dataSize = av_samples_get_buffer_size(nullptr, inChannels, readSamples, (AVSampleFormat)inFrame->format, 0);
            }
            outData[channel] = new uint8_t[dataSize];
            memcpy(outData[channel], (uint8_t *)data, dataSize);
            delete[] data;
        }

        ret = swr_convert(swrContext, nullptr, 0, (const uint8_t **)outData, out_nb_samples);
        if (ret < 0) {
            av_log(nullptr, AV_LOG_ERROR, "swr_convert failed: %i\n", ret);
        }
        for (int channel = 0; channel < inChannels; channel++) {
            delete[] outData[channel];
        }
        delete[] outData;
        return ret;
    } else {
        if (!isComfortNoise || sep == 0) {
            ret = swr_convert(swrContext, nullptr, 0, (const uint8_t **) inFrame->data, inNbSamples);
            if (ret < 0) {
                av_log(nullptr, AV_LOG_ERROR, "swr_convert failed:%i\n", ret);
            }
        }
    }

    return ret;
    // Alternative 1: separate the write and read phases, splitting the available samples evenly across the channels
//        int sonicBuffSize = 0;
//        // Write audio data to Sonic
//        int time_write = 0;
//        for (int channel = 0; channel < inChannels; channel++) {
//            time_write++;
//            int numsProcess = sonicWriteShortToStream(_sonic, (short *)inFrame->data[channel], inNbSamples);
//            av_log(nullptr, AV_LOG_INFO, "write_into_sonic, time_write:%i\n", time_write);
//            if (numsProcess > 0) {
//                sonicBuffSize = sonicSamplesAvailable(_sonic);
//                av_log(nullptr, AV_LOG_INFO, "write_into_sonic, sonicBufferSize:%i\n", sonicBuffSize);
//            }
//        }
//
//        if (sonicBuffSize > 0) {
//            int time_read = 0;
//            uint8_t **outSampleData = new uint8_t *[8];
//            int out_nb_samples = sonicBuffSize / inChannels;
//            for (int channel = 0; channel < inChannels; channel++) {
//                time_read++;
//                auto outData = new short[sonicBuffSize];
//                int readSamples = sonicReadShortFromStream(_sonic, outData, sonicBuffSize / inChannels);
//
//                av_log(nullptr, AV_LOG_INFO, "sonic::read, %i\n", time_read);
//                int dataSize = av_samples_get_buffer_size(nullptr, inChannels, readSamples, (AVSampleFormat)inFrame->format, 0);
//
//                outSampleData[channel] = new uint8_t[dataSize];
//                memcpy(outSampleData[channel], (uint8_t *)outData, dataSize);
//                delete[] outData;
//            }
//
//            auto data1 = outSampleData[0];
//            auto data2 = outSampleData[1];
//
//            av_log(nullptr, AV_LOG_INFO, "sonic::swr_convert, sample:%i\n", out_nb_samples);
//            ret = swr_convert(swrContext, nullptr, 0, (const uint8_t **)outSampleData, out_nb_samples);
//            if (ret < 0) {
//                av_log(nullptr, AV_LOG_ERROR, "swr_convert failed: %i\n", ret);
//            }
//            for (int channel = 0; channel < inChannels; channel++) {
//                delete[] outSampleData[channel];
//            }
//            delete[] outSampleData;
//        }
//        return ret;
}