I am trying to separate voice from background noise in audio file using python and then extract mfcc features
but I get "librosa.util.exceptions.ParameterError: Invalid shape for monophonic audio: ndim=2, shape=(1025, 5341) " error
here's the code
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import scipy
from scipy.io.wavfile import write
import soundfile as sf
from sklearn.preprocessing import normalize
from scipy.io.wavfile import read, write
from scipy.fftpack import rfft, irfft
y, sr = librosa.load('/home/osboxes/Desktop/AccentReco1/audio-files/egyptiansong.mp3', duration=124)
y=rfft(y)
# And compute the spectrogram magnitude and phase
S_full, phase = librosa.magphase(librosa.stft(y))
# We'll compare frames using cosine similarity, and aggregate similar frames
# by taking their (per-frequency) median value.
#
# To avoid being biased by local continuity, we constrain similar frames to be
# separated by at least 2 seconds.
#
# This suppresses sparse/non-repetetitive deviations from the average spectrum,
# and works well to discard vocal elements.
S_filter = librosa.decompose.nn_filter(S_full,
aggregate=np.median,
metric='cosine',
width=int(librosa.time_to_frames(2, sr=sr)))
# The output of the filter shouldn't be greater than the input
# if we assume signals are additive. Taking the pointwise minimium
# with the input spectrum forces this.
S_filter = np.minimum(S_full, S_filter)
# We can also use a margin to reduce bleed between the vocals and instrumentation masks.
# Note: the margins need not be equal for foreground and background separation
margin_i, margin_v = 2, 10
power = 2
mask_i = librosa.util.softmask(S_filter,
margin_i * (S_full - S_filter),
power=power)
mask_v = librosa.util.softmask(S_full - S_filter,
margin_v * S_filter,
power=power)
# Once we have the masks, simply multiply them with the input spectrum
# to separate the components
S_foreground = mask_v * S_full
S_background = mask_i * S_full
# extract mfcc feature from data
mfccs = np.mean(librosa.feature.mfcc(y=S_foreground, sr=sr, n_mfcc=40).T,axis=0)
print(mfccs)
any idea?