I'm adding some new features to the audio player in my app KeyStage. One of the features I want to add is transposition. I found a real-time pitch-shifter unit in AVAudioEngine, but it outputs the signal with roughly 0.1 seconds of latency, which renders it useless for my purposes.
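For reference, here is roughly how I had the real-time node set up. This is only a minimal sketch of what I mean; I'm assuming AVAudioUnitTimePitch (the pitch-shifting node AVAudioEngine provides), and the player and file names are placeholders:

import AVFoundation

// Sketch of the real-time approach: route a player node through AVAudioUnitTimePitch.
// The `pitch` property is specified in cents (100 cents = 1 semitone).
let engine = AVAudioEngine()
let player = AVAudioPlayerNode()
let timePitch = AVAudioUnitTimePitch()
timePitch.pitch = 200 // shift up by a whole tone

engine.attach(player)
engine.attach(timePitch)
engine.connect(player, to: timePitch, format: nil)
engine.connect(timePitch, to: engine.mainMixerNode, format: nil)

// try engine.start()
// player.scheduleFile(someAVAudioFile, at: nil)
// player.play()

This plays fine, but whatever the player produces reaches the output roughly 0.1 seconds late, which is the latency I mentioned.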
I don't know anything about digital signal processing (I hope to learn it at some point when I have time). Still, after some web searching, I found that real-time pitch-shifting algorithms generally introduce a similar amount of latency, so real-time pitch-shifting is off the table for me.
Therefore I need to change the pitch of an entire audio file (or buffer) in non-real time, in Swift or Objective-C. In other words, I need a function that takes an AVAudioFile (or an AVAudioPCMBuffer) and a pitch-shift amount in cents and outputs the pitch-shifted AVAudioFile (or AVAudioPCMBuffer), possibly through a completion handler or a delegate callback. I searched online for such a project or library but couldn't find any. I wish I could write such a function myself, but as I said, I don't know any DSP.
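Something along these lines is what I have in mind. This is only a sketch of the interface I'd like, not an existing API; the names are placeholders:

import AVFoundation

// Hypothetical interface: shift a whole file (or buffer) by a number of cents,
// offline, and hand the result back through a completion handler.
func pitchShifted(file: AVAudioFile,
                  cents: Float,
                  completion: @escaping (AVAudioPCMBuffer?) -> Void) {
    // ... non-real-time pitch shifting would happen here ...
}

func pitchShifted(buffer: AVAudioPCMBuffer,
                  cents: Float,
                  completion: @escaping (AVAudioPCMBuffer?) -> Void) {
    // ... same idea, starting from an in-memory buffer ...
}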
I'd appreciate any help with this. Is there a third-party library that can do this in Objective-C (or Swift)? Or, if the algorithm is simple, can someone point me in the right direction? I even tried ChatGPT, but the code it generates is full of errors.
Here is what ChatGPT gave me:
import Foundation
import Accelerate

func pitchShiftUsingFFT(audioFile: AVAudioFile, semitones: Float) -> AVAudioPCMBuffer? {
    // Load input audio file
    guard let inputFormat = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: audioFile.fileFormat.sampleRate, channels: 1, interleaved: false),
          let inputFile = try? AVAudioFile(forReading: audioFile.url, commonFormat: .pcmFormatFloat32, interleaved: false) else {
        print("Failed to load input audio file")
        return nil
    }

    // Calculate pitch shift factor and ratio
    let pitchShiftFactor = pow(2.0, semitones/12.0)
    let pitchShiftRatio = Float(inputFormat.sampleRate) / (pitchShiftFactor * Float(inputFormat.sampleRate))

    // Allocate memory for input and output audio buffers
    let inputBufferSize = AVAudioFrameCount(inputFile.length)
    let outputBufferSize = AVAudioFrameCount(Float(inputBufferSize) * pitchShiftFactor)
    let inputBuffer = AVAudioPCMBuffer(pcmFormat: inputFormat, frameCapacity: inputBufferSize)
    let outputBuffer = AVAudioPCMBuffer(pcmFormat: inputFormat, frameCapacity: outputBufferSize)

    // Read input audio file into input buffer
    guard let inputAudioBuffer = inputBuffer?.floatChannelData else {
        print("Failed to allocate memory for input buffer")
        return nil
    }
    do {
        try inputFile.read(into: inputBuffer!)
    } catch {
        print("Failed to read input audio file")
        return nil
    }

    // Allocate memory for FFT buffers
    let fftSize = vDSP_Length(ceil(log2(Float(inputBufferSize))))
    let fftSetup = vDSP_create_fftsetup(fftSize, Int32(kFFTRadix2))
    let fftInputBuffer = UnsafeMutablePointer<Float>(calloc(inputBufferSize, MemoryLayout<Float>.size))
    let fftOutputBuffer = UnsafeMutablePointer<DSPComplex>(calloc(inputBufferSize/2, MemoryLayout<DSPComplex>.size))

    // Apply pitch shift using FFT
    for channel in 0..<inputFormat.channelCount {
        // Initialize variables
        var phase = DSPFloat()
        let phaseIncrement = 2.0 * Float.pi * pitchShiftRatio
        var fftOffset = vDSP_Length(0)

        // Copy input audio data into FFT input buffer
        vDSP_vflt32(inputAudioBuffer[channel], 1, fftInputBuffer, 1, inputBufferSize)

        // Perform FFT on input data
        vDSP_ctoz(UnsafePointer<DSPFloat>(fftInputBuffer), 2, fftOutputBuffer, 1, inputBufferSize/2)
        vDSP_fft_zrip(fftSetup, fftOutputBuffer, 1, fftSize, FFTDirection(FFT_FORWARD))
        vDSP_ztoc(fftOutputBuffer, 1, UnsafeMutablePointer<DSPFloat>(fftInputBuffer), 2, inputBufferSize/2)

        // Apply phase shift to FFT output
        let phaseShift = DSPFloat(phaseIncrement * Float(fftOffset))
        for i in 0..<inputBufferSize/2 {
            let fftReal = fftInputBuffer[i*2]
            let fftImag = fftInputBuffer[i*2+1]
            let complex = DSPComplex(real: fftReal * cos(phase) - fftImag * sin(phase),
                                     imaginary: fftReal * sin(phase) + fftImag * cos(phase))
            fftOutputBuffer[i] = complex
            phase += phaseShift
            if phase > Float.pi {
                phase -= 2.0 * Float.pi
            }
        }

        // Perform inverse FFT on phase shifted output
        vDSP_ctoz(fftOutputBuffer, 1, fftOutputBuffer, 1, inputBufferSize/2)
        vDSP_fft_zrip(fftSetup, fftOutputBuffer, 1, fftSize, FFTDirection(FFT_INVERSE))
        vDSP_ztoc(fftOutputBuffer, 1, UnsafeMutablePointer<DSPFloat>(fftInputBuffer), 2, inputBufferSize/2)

        // Overlap-add FFT output to output audio buffer
        var outputOffset = vDSP_Length(0)
        let hopSize = vDSP_Length(Float(grainSize) * pitchShiftRatio)
        let overlapSize = vDSP_Length(Float(overlap) * pitchShiftRatio)
        while outputOffset < outputBufferSize {
            let outputRemaining = outputBufferSize - outputOffset
            let inputRemaining = inputBufferSize - fftOffset
            let frameCount = min(inputRemaining, outputRemaining)
            let overlapCount = min(frameCount, overlapSize)

            // Add overlapped frames to output audio buffer
            vDSP_vadd(inputAudioBuffer[channel]+fftOffset, 1,
                      outputBuffer!.floatChannelData[channel]+outputOffset, 1,
                      outputBuffer!.floatChannelData[channel]+outputOffset, 1,
                      overlapCount)

            // Add remaining frames to output audio buffer
            vDSP_vadd(fftInputBuffer+fftOffset, 1,
                      outputBuffer!.floatChannelData[channel]+outputOffset+overlapCount, 1,
                      outputBuffer!.floatChannelData[channel]+outputOffset+overlapCount, 1,
                      frameCount-overlapCount)

            // Update buffer offsets
            outputOffset += frameCount
            fftOffset += hopSize
        }
    }

    // Clean up FFT buffers and setup
    vDSP_destroy_fftsetup(fftSetup)
    free(fftInputBuffer)
    free(fftOutputBuffer)

    return outputBuffer
}