What is wrong with this code performing unsupervised segmentation?

60 Views Asked by At

This is the block of code performing MFCC after taking an audio input file. After the mfcc, the boundary segments are found, segment duration duration for each segment is calculated using the function calculate_duration_features(), stored in the list duration_feats which is then concatenated with corresponding mfccs.

import librosa
import numpy as np

sr = 16000  # Sampling rate
n_mfcc = 20  # Number of MFCC coefficients
n_fft_values = [128, 256, 512]
hop_length_values = [32, 64, 128]
zcr_thresholds = [0.05, 0.08, 0.10, 0.12] 

class FeatureExtractorSegmenter:
    def __init__(self, sr, n_mfcc):
        self.sr = sr
        self.n_mfcc = n_mfcc

    def extract_features(self, audio_file):
        y, _ = librosa.load(audio_file, sr=self.sr)

        for n_fft in n_fft_values:
            for hop_length in hop_length_values:
                for zcr_threshold in zcr_thresholds:
                    # Coarse segmentation with ZCR
                    zcr = librosa.feature.zero_crossing_rate(y=y)
                    coarse_segments = np.where(zcr > zcr_threshold)[0]

                    # Refining boundaries with spectral change analysis (if coarse segments available)
                    refined_segments = []
                    if coarse_segments.any(): 
                        for start, end in zip(coarse_segments[:-1], coarse_segments[1:]):
                            segment = y[start:end]
                            onset_frames = librosa.onset.onset_detect(y=segment, sr=self.sr, units="frames")
                            onset_frames += start  # Adjusting onset positions to global indices
                            refined_segments.extend(segment[i:j] for i, j in zip(onset_frames[:-1], onset_frames[1:]))

                    # Extracingt MFCCs from refined segments (if available)
                    mfccs = []
                    duration_features = []
                    if refined_segments:
                        for segment in refined_segments:
                            segment_mfccs = librosa.feature.mfcc(y=segment, sr=self.sr, n_mfcc=self.n_mfcc,
                                                              n_fft=n_fft, hop_length=hop_length)
                            duration_feats = self.calculate_duration_features(segment)
                            mfccs.append(segment_mfccs)
                            duration_features.append(duration_feats)
                    print(f"MFCCs for n_fft={n_fft}, hop_length={hop_length}, zcr_threshold={zcr_threshold}:")
                    print(mfccs)
                    print(f"Duration features for n_fft={n_fft}, hop_length={hop_length}, zcr_threshold={zcr_threshold}:")
                    print(duration_features)


    def calculate_duration_features(self, segment):
        num_frames = len(segment)
        total_duration = num_frames * 0.025  # Assuming 25ms frame duration
        return [num_frames, total_duration]

audio_file = "/content/speech.wav"
extractor = FeatureExtractorSegmenter(sr, n_mfcc)
extractor.extract_features(audio_file)

Here is the link to the audio file I am using: http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.0.wav

I am expecting a list containing the feature duration and MFCCs but every time i am running the code, it's giving empty mfcc and duration_feature. Can someone tell me what is wrong with the code?

0

There are 0 best solutions below