I'm getting chopping noises and incorrect results from my phase vocalizer in jupyter notebook

23 Views Asked by At

Im tryng to write a python class with methods that let me edit audio, the method in question is an attempt to pitch shift with a phase vocalizer, i know theres a librosa vocalizer and many others, but i would like to do it myself. the vocalizer works when there is zero pitch shit, but once i try to shift it it becomes choppy and weird. Im trying to use the method from Jentgent, seen here https://github.com/JentGent/pitch-shift/blob/main/audios.ipynb.

import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write
import scipy.signal
import IPython
from IPython.display import Audio


class Audio_object:
    def __init__(self, file, mono=True):
        self.samplerate, self.data = wavfile.read(file)
        if mono and self.data.ndim > 1:
            self.data = np.mean(self.data, axis=1).astype(self.data.dtype)

        
    def writefile(self, name):
        filename = name + ".wav"
        write(filename, self.samplerate, self.data.astype(np.int16))

    def stft(self, window='hann', seg_ratio= 0.1, overlap=0.5):
        nperseg = self.samplerate*seg_ratio
        f, t, Zxx = scipy.signal.stft(self.data, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=overlap)
        return f, t, Zxx

    def inverse_stft(self, Zxx, window='hann',seg_ratio= 0.1, overlap = 0.5):
        nperseg = self.samplerate*seg_ratio
        _, x_rec = scipy.signal.istft(Zxx, fs=self.samplerate, window=window, nperseg=nperseg, noverlap=overlap)
        return x_rec

    def circle(self, name):
        filename = name + ".wav"
        _, _, Zxx = self.stft()
        x_rec = self.inverse_stft(Zxx)
        write(filename, self.samplerate, x_rec.astype(np.int16))

    def plot(self, data,title):
        plt.plot(data)
        plt.xlabel("Sample Index")
        plt.ylabel("Amplitude")
        plt.title(title)

    
    def plot_fft(self):
        f, _, Zxx = self.stft()
        avg_spectrum = np.mean(np.abs(Zxx), axis=1) # taking the mean of the magnitude across the spectrum,
        plt.figure(figsize=(10, 5))                 # this is not done for further use, only for visual representation
        plt.plot(f, avg_spectrum)
        plt.title('Average FFT Magnitude Spectrum')
        plt.xlabel('Frequency [Hz]')
        plt.ylabel('Magnitude')
        plt.grid(True)
        plt.show()

    def plot_spectrogram(self, f, t, Zxx):
        plt.figure(figsize=(10, 5))
        plt.pcolormesh(t, f, np.abs(Zxx), shading='gouraud')
        plt.title('Spectrogram')
        plt.xlabel('Time [s]')
        plt.ylabel('Frequency [Hz]')
        plt.ylim(0,3500) 
        plt.show()


    @staticmethod
    def interpolate_time(idxs, arr):
        start = (idxs + 0.5).astype(int)
        frac = (idxs - start)[None, None, :]
        shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
        return arr[:, start] * (1 - frac) + shifted_arr[:, :, start] * frac
    
    @staticmethod
    def interpolate_time(idxs, arr):
        start = np.minimum((idxs + 0.5).astype(int), arr.shape[1]-1)
        frac = (idxs - start)[None, None, :]
        shifted_arr = np.concatenate((arr[:, 1:], np.zeros((arr.shape[0], arr.shape[1]))), axis=1)
        return arr[:, start] * (1 - frac) + shifted_arr[ :, start] * frac
    
    def pitch_shift(self, semitones, seg_ratio=0.1, overlap=0.5):         
        scaling = 2 ** (semitones / 12)
        f,t,Zxx = self.stft()
        

        anls_frames = np.arange(len(Zxx))
        n_synth_frames = np.floor(len(Zxx) * scaling).astype(int)
        synth_frames = np.arange(n_synth_frames)
        og_idxs = np.minimum(synth_frames / scaling,len(Zxx) - 1)

        mags = np.abs(Zxx)
        phases = np.angle(Zxx)
        #print((np.zeros((len(Zxx), 1)), phases[:, :-1]))
        phase_diffs = phases - np.concatenate((np.zeros(( len(Zxx), 1)), phases[:, :-1]), axis=1)
        phase_diffs = np.mod(phase_diffs, np.pi * 2)


        shifted_mags = self.interpolate_time(og_idxs, mags)
        shifted_phase_diffs = self.interpolate_time(og_idxs, phase_diffs)

        shifted_phases = np.cumsum(shifted_phase_diffs, axis=2)

        synth_stft = shifted_mags * np.exp(shifted_phases * 1j)


        new_waveform = self.inverse_stft(synth_stft.astype(np.complex64))
        return Audio(new_waveform.astype(np.int16), rate=int(self.samplerate*scaling))

gettysburg = Audio_object("gettysburg.wav", mono=True)

audio_widget = gettysburg.pitch_shift(12)
audio_widget

Ive been trying to mess around with variables, but the stft arrays are a little too complex for me

0

There are 0 best solutions below