I am working on capturing audio and streaming it to an RTMP server in real time. I work on macOS (in Xcode), so I use the AVFoundation framework to capture the audio sample buffers. For encoding and streaming I need to use the ffmpeg API and the libfaac encoder, so the output format must be AAC (to support stream playback on iOS devices).
And I ran into this problem: the audio capture device (in my case a Logitech camera) gives me sample buffers of 512 LPCM samples, and I can choose an input sample rate of 16000, 24000, 36000 or 48000 Hz. When I feed these 512 samples to the AAC encoder (configured for the matching sample rate), the sound plays slowly and stutters (as if there were a pause of silence after each frame).
I figured out (maybe I'm wrong) that the libfaac encoder only accepts audio frames with 1024 samples. When I set the input sample rate to 24000 Hz and resample the input buffer to 48000 Hz before encoding, I get exactly 1024 samples, and after encoding those 1024 samples to AAC I hear correct sound output. But my webcam produces 512 samples per buffer regardless of the input sample rate, while the output sample rate has to be 48000 Hz. So I have to resample in any case, and after resampling I will not get exactly 1024 samples per buffer.
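As far as I can tell, the encoder itself reports this requirement: after avcodec_open2() the AVCodecContext's frame_size field holds the number of samples the encoder expects per frame, and for AAC it is 1024. A minimal check, assuming c is the audio stream's codec context from the Init() code below:

    ret = avcodec_open2(c, audio_codec, NULL);
    if (ret >= 0)
        printf("encoder expects %d samples per frame\n", c->frame_size);
    /* frame_size == 0 would mean the encoder accepts frames of any size;
       the AAC encoders fix it at 1024 samples */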
Is there a way to solve this problem with the ffmpeg API?
I would be grateful for any help.
PS: I assume that I could accumulate the resampled buffers until the sample count reaches 1024 and only then encode, but since this is a live stream there will be problems with the resulting timestamps and with the other input devices, so this solution does not work.
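To make clear what I mean by accumulating, here is a rough sketch using an AVAudioFifo (the names fifo, next_pts and encode_accumulated are mine, and the FIFO would have to be created once in Init() with av_audio_fifo_alloc()):

    #include <libavutil/audio_fifo.h>

    static AVAudioFifo *fifo;    /* av_audio_fifo_alloc(c->sample_fmt, c->channels, 1) */
    static int64_t next_pts = 0; /* running PTS, counted in samples */

    void encode_accumulated(AVCodecContext *c, uint8_t **converted, int out_samples)
    {
        /* store whatever swr_convert() produced (may be more or less than 1024) */
        av_audio_fifo_write(fifo, (void **)converted, out_samples);

        /* pull exactly frame_size (1024) samples at a time for the encoder */
        while (av_audio_fifo_size(fifo) >= c->frame_size) {
            AVFrame *f = av_frame_alloc();
            f->nb_samples     = c->frame_size;
            f->format         = c->sample_fmt;
            f->channel_layout = c->channel_layout;
            f->sample_rate    = c->sample_rate;
            av_frame_get_buffer(f, 0);
            av_audio_fifo_read(fifo, (void **)f->data, c->frame_size);

            /* deriving pts from the running sample count keeps the timestamps
               monotonic even though the capture buffers do not line up with
               the 1024-sample frames */
            f->pts = next_pts;
            next_pts += c->frame_size;

            /* ... avcodec_encode_audio2(c, &pkt, f, &got_packet) as in the
               code below, then rescale pkt.pts/dts to the stream time base */
            av_frame_free(&f);
        }
    }

But exactly this timestamp handling is what I am unsure about for a live stream with several input devices.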
The current problem grew out of the one described in this [question]: How to populate AVFrame audio (ffmpeg) with data obtained from CMSampleBufferRef (AVFoundation)?
Here is the code with the audio codec configuration (there is also a video stream, but the video works fine):
static AVFrame *aframe;
static AVFrame *frame;
AVOutputFormat *fmt;
AVFormatContext *oc;
AVStream *audio_st, *video_st;

Init()
{
    AVCodec *audio_codec, *video_codec;
    int ret;

    avcodec_register_all();
    av_register_all();
    avformat_network_init();

    avformat_alloc_output_context2(&oc, NULL, "flv", filename);
    fmt = oc->oformat;
    oc->oformat->video_codec = AV_CODEC_ID_H264;
    oc->oformat->audio_codec = AV_CODEC_ID_AAC;

    video_st = NULL;
    audio_st = NULL;
    if (fmt->video_codec != AV_CODEC_ID_NONE) {
And here is the resampling and audio encoding:
if (mType == kCMMediaType_Audio) {
    CMSampleTimingInfo timing_info;
    CMSampleBufferGetSampleTimingInfo(sampleBuffer, 0, &timing_info);

    double pts = 0;
    double dts = 0;
    AVCodecContext *c;
    AVPacket pkt = { 0 }; // data and size must be 0
    int got_packet, ret;

    av_init_packet(&pkt);
    c = audio_st->codec;

    // Pull the raw LPCM samples out of the CMSampleBuffer
    CMItemCount numSamples = CMSampleBufferGetNumSamples(sampleBuffer);
    NSUInteger channelIndex = 0;
    CMBlockBufferRef audioBlockBuffer = CMSampleBufferGetDataBuffer(sampleBuffer);
    size_t audioBlockBufferOffset = (channelIndex * numSamples * sizeof(SInt16));
    size_t lengthAtOffset = 0;
    size_t totalLength = 0;
    SInt16 *samples = NULL;
    CMBlockBufferGetDataPointer(audioBlockBuffer, audioBlockBufferOffset,
                                &lengthAtOffset, &totalLength, (char **)(&samples));

    const AudioStreamBasicDescription *audioDescription =
        CMAudioFormatDescriptionGetStreamBasicDescription(
            CMSampleBufferGetFormatDescription(sampleBuffer));

    // A new resampler is created here (and freed below) for every capture buffer
    SwrContext *swr = swr_alloc();
    int in_smprt = (int)audioDescription->mSampleRate;
    av_opt_set_int(swr, "in_channel_layout", AV_CH_LAYOUT_MONO, 0);
    av_opt_set_int(swr, "out_channel_layout", audio_st->codec->channel_layout, 0);
    av_opt_set_int(swr, "in_channel_count", audioDescription->mChannelsPerFrame, 0);
    av_opt_set_int(swr, "out_channel_count", audio_st->codec->channels, 0);
    av_opt_set_int(swr, "in_sample_rate", audioDescription->mSampleRate, 0);
    av_opt_set_int(swr, "out_sample_rate", audio_st->codec->sample_rate, 0);
    av_opt_set_sample_fmt(swr, "in_sample_fmt", AV_SAMPLE_FMT_S16, 0);
    av_opt_set_sample_fmt(swr, "out_sample_fmt", audio_st->codec->sample_fmt, 0);
    swr_init(swr);

    uint8_t **input = NULL;
    int src_linesize;
    int in_samples = (int)numSamples;
    ret = av_samples_alloc_array_and_samples(&input, &src_linesize,
                                             audioDescription->mChannelsPerFrame,
                                             in_samples, AV_SAMPLE_FMT_S16P, 0);
    *input = (uint8_t *)samples; // point the input plane at the captured samples

    // Upper bound on the number of resampled output samples
    uint8_t *output = NULL;
    int out_samples = av_rescale_rnd(swr_get_delay(swr, in_smprt) + in_samples,
                                     (int)audio_st->codec->sample_rate, in_smprt,
                                     AV_ROUND_UP);
    av_samples_alloc(&output, NULL, audio_st->codec->channels, out_samples,
                     audio_st->codec->sample_fmt, 0);

    in_samples = (int)numSamples;
    out_samples = swr_convert(swr, &output, out_samples,
                              (const uint8_t **)input, in_samples);

    // Hand the resampled buffer to the encoder frame
    aframe->nb_samples = (int)out_samples;
    ret = avcodec_fill_audio_frame(aframe, audio_st->codec->channels,
                                   audio_st->codec->sample_fmt, (uint8_t *)output,
                                   (int)out_samples *
                                       av_get_bytes_per_sample(audio_st->codec->sample_fmt) *
                                       audio_st->codec->channels,
                                   1);
    aframe->channel_layout = audio_st->codec->channel_layout;
    aframe->channels = audio_st->codec->channels;
    aframe->sample_rate = audio_st->codec->sample_rate;

    // Convert the capture timestamp to the codec time base
    if (timing_info.presentationTimeStamp.timescale != 0)
        pts = (double)timing_info.presentationTimeStamp.value /
              timing_info.presentationTimeStamp.timescale;
    aframe->pts = pts * audio_st->time_base.den;
    aframe->pts = av_rescale_q(aframe->pts, audio_st->time_base,
                               audio_st->codec->time_base);

    ret = avcodec_encode_audio2(c, &pkt, aframe, &got_packet);
    if (ret < 0) {
        fprintf(stderr, "Error encoding audio frame: %s\n", av_err2str(ret));
        exit(1);
    }

    swr_free(&swr);

    if (got_packet) {
        pkt.stream_index = audio_st->index;
        pkt.pts = av_rescale_q(pkt.pts, audio_st->codec->time_base, audio_st->time_base);
        pkt.dts = av_rescale_q(pkt.dts, audio_st->codec->time_base, audio_st->time_base);

        // Write the compressed frame to the media file
        ret = av_interleaved_write_frame(oc, &pkt);
        if (ret != 0) {
            fprintf(stderr, "Error while writing audio frame: %s\n", av_err2str(ret));
            exit(1);
        }
    }
}
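A related sketch: if the SwrContext were created once in Init() and reused, swr_get_delay() could carry leftover samples across the 512-sample capture buffers instead of them being dropped when swr is freed each time (swr_ctx, in_sample_rate and max_out_samples are hypothetical names):

    static SwrContext *swr_ctx; /* one resampler for the whole session */

    /* in Init(), after the audio codec is opened: */
    swr_ctx = swr_alloc_set_opts(NULL,
                                 audio_st->codec->channel_layout,  /* out */
                                 audio_st->codec->sample_fmt,
                                 audio_st->codec->sample_rate,
                                 AV_CH_LAYOUT_MONO,                /* in  */
                                 AV_SAMPLE_FMT_S16,
                                 in_sample_rate,
                                 0, NULL);
    swr_init(swr_ctx);

    /* per captured buffer: convert, then push the result into the FIFO
       from the sketch above instead of encoding it directly */
    int converted = swr_convert(swr_ctx, &output, max_out_samples,
                                (const uint8_t **)input, (int)numSamples);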