Click on Projects, then on Create project. Enter a Project board name, add a Description of the project, and select a Project template. Finally, click Create project.
We used the RandomOverSampler method from the imbalanced-learn library to create new samples for the underrepresented neutral class:

import os
from collections import Counter

import joblib
from imblearn.over_sampling import RandomOverSampler


def oversample(X, y):
    # Load the previously extracted features and labels
    X = joblib.load("speech_emotion_recognition/features/X.joblib")
    y = joblib.load("speech_emotion_recognition/features/y.joblib")
    print(Counter(y))  # class distribution before oversampling
    # Randomly duplicate minority-class samples until the classes are balanced
    oversample = RandomOverSampler(sampling_strategy="minority")
    X_over, y_over = oversample.fit_resample(X, y)
    # Persist the oversampled dataset next to the original features
    X_over_save, y_over_save = "X_over.joblib", "y_over.joblib"
    joblib.dump(X_over, os.path.join("speech_emotion_recognition/features/", X_over_save))
    joblib.dump(y_over, os.path.join("speech_emotion_recognition/features/", y_over_save))
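As a quick sanity check, the class counts can be compared before and after resampling. A minimal sketch, assuming the oversample function above has already been run so the .joblib files exist at the paths it uses:

from collections import Counter

import joblib

y = joblib.load("speech_emotion_recognition/features/y.joblib")
y_over = joblib.load("speech_emotion_recognition/features/y_over.joblib")
print("Before:", Counter(y))      # the neutral class is underrepresented
print("After:", Counter(y_over))  # its count now matches the largest class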
Mel-frequency cepstrum (MFC) is a representation of the short-term power spectrum of a sound, based on a linear cosine transform of a log power spectrum on a nonlinear mel scale of frequency. Mel-frequency cepstral coefficients (MFCCs) are coefficients that collectively make up an MFC.
The difference between the cepstrum and the mel-frequency cepstrum is that in the MFC, the frequency bands are equally spaced on the mel scale, which approximates the human auditory system's response more closely than the linearly-spaced frequency bands used in the normal spectrum. This frequency warping can allow for better representation of sound, for example, in audio compression.
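To make the warping concrete: the commonly used HTK formula maps a frequency f in Hz to mels as m = 2595 * log10(1 + f / 700), and librosa exposes this conversion directly. A minimal sketch:

import numpy as np
import librosa

freqs_hz = np.array([100.0, 1000.0, 4000.0, 8000.0])
# HTK-style mel conversion: m = 2595 * log10(1 + f / 700)
mels = librosa.hz_to_mel(freqs_hz, htk=True)
print(dict(zip(freqs_hz, np.round(mels, 1))))
# Equal steps on the mel axis correspond to increasingly wide steps in Hz,
# mirroring the ear's coarser resolution at high frequencies.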
We extracted the MFCC features with librosa:

import os
import time

import joblib
import librosa
import numpy as np


def extract_features(path, save_dir):
    feature_list = []
    start_time = time.time()
    for root, _, files in os.walk(path):
        for file in files:
            y_lib, sample_rate = librosa.load(
                os.path.join(root, file), res_type="kaiser_fast"
            )
            # Average the 40 MFCCs over time to get one fixed-length vector per file
            mfccs = np.mean(
                librosa.feature.mfcc(y=y_lib, sr=sample_rate, n_mfcc=40).T, axis=0
            )
            # Parse the emotion code out of the RAVDESS-style filename
            # (the third two-digit field; its last digit sits at index 7)
            label = int(file[7:8]) - 1
            feature_list.append((mfccs, label))
    print("Data loaded in %s seconds." % (time.time() - start_time))
    X, y = zip(*feature_list)
    X, y = np.asarray(X), np.asarray(y)
    print(X.shape, y.shape)
    X_save, y_save = "X.joblib", "y.joblib"
    joblib.dump(X, os.path.join(save_dir, X_save))
    joblib.dump(y, os.path.join(save_dir, y_save))
    return "Preprocessing completed."
Multi-Layer Perceptron (MLP)
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier


def mlp_classifier(X, y):
    # Hold out 20% of the data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    mlp_model = MLPClassifier(
        hidden_layer_sizes=(100,),
        solver="adam",
        alpha=0.001,
        shuffle=True,
        verbose=True,
        momentum=0.8,  # only used by the "sgd" solver; ignored by "adam"
    )
    mlp_model.fit(X_train, y_train)
    print("Test accuracy:", mlp_model.score(X_test, y_test))
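Beyond overall accuracy, a per-class report shows whether the oversampled neutral class is still the hardest to recognize. A sketch that could be appended inside mlp_classifier, reusing the mlp_model, X_test, and y_test defined there:

    from sklearn.metrics import classification_report

    # Per-class precision and recall, one row per emotion label
    y_pred = mlp_model.predict(X_test)
    print(classification_report(y_test, y_pred))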
Convolutional Neural Network (CNN)
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Activation,
    BatchNormalization,
    Conv1D,
    Dense,
    Flatten,
)


def cnn_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Conv1D expects a channel dimension: (samples, 40 MFCCs, 1)
    x_traincnn = np.expand_dims(X_train, axis=2)
    x_testcnn = np.expand_dims(X_test, axis=2)
    model = Sequential()
    model.add(Conv1D(16, 5, padding="same", input_shape=(40, 1)))
    model.add(Activation("relu"))
    model.add(Conv1D(8, 5, padding="same"))
    model.add(Activation("relu"))
    model.add(Conv1D(8, 5, padding="same"))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Flatten())
    model.add(Dense(8))  # one output per emotion class
    model.add(Activation("softmax"))
    # categorical_crossentropy expects one-hot encoded labels
    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    cnn_history = model.fit(
        x_traincnn,
        y_train,
        batch_size=50,
        epochs=100,
        validation_data=(x_testcnn, y_test),
    )
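Since the model is compiled with categorical_crossentropy, the integer labels need to be one-hot encoded before fitting, and the trained network can be saved to the path that the prediction step loads later. A sketch of both steps inside cnn_model:

    from tensorflow.keras.utils import to_categorical

    # Before calling model.fit: turn integer labels 0..7 into one-hot vectors
    y_train = to_categorical(y_train, num_classes=8)
    y_test = to_categorical(y_test, num_classes=8)

    # After training: persist the model where make_predictions expects it
    # (the LSTM model is saved the same way to models/lstm_model.h5)
    model.save("speech_emotion_recognition/models/cnn_model.h5")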
Long Short-Term Memory (LSTM)
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential


def lstm_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # LSTM layers expect 3D input: (samples, timesteps, features)
    X_train_lstm = np.expand_dims(X_train, axis=2)
    X_test_lstm = np.expand_dims(X_test, axis=2)
    lstm_model = Sequential()
    lstm_model.add(LSTM(64, input_shape=(40, 1), return_sequences=True))
    lstm_model.add(LSTM(32))
    lstm_model.add(Dense(32, activation="relu"))
    lstm_model.add(Dropout(0.1))
    lstm_model.add(Dense(8, activation="softmax"))
    lstm_model.compile(
        optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
    )
    lstm_model.summary()
    lstm_history = lstm_model.fit(X_train_lstm, y_train, batch_size=32, epochs=100)
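One way to see the overfitting mentioned in the next paragraph is to plot the training history. A minimal matplotlib sketch, assuming lstm_model.fit was also given validation_data so that val_loss gets recorded:

import matplotlib.pyplot as plt

# lstm_history is the History object returned by lstm_model.fit
plt.plot(lstm_history.history["loss"], label="train loss")
plt.plot(lstm_history.history["val_loss"], label="validation loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()
# A validation loss that rises while training loss keeps falling
# is the classic signature of overfitting.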
We got the best results with the adam optimizer and fewer layers. All models overfit (they couldn't generalize to unseen data), but this seems to be a common issue with neural networks trained on audio data.
To record your own voice for live predictions, we used the sounddevice library:

import soundfile as sf
import sounddevice as sd
from scipy.io.wavfile import write

def record_voice():
    fs = 44100  # Sample rate
    seconds = 3  # Duration of recording
    # sd.default.device = "Built-in Audio"  # Your audio device's full name here
    print("Say something:")
    myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
    sd.wait()  # Wait until recording is finished
    write("speech_emotion_recognition/recordings/myvoice.wav", fs, myrecording)
    print("Voice recording saved.")
import librosa
import numpy as np
from tensorflow import keras


def make_predictions(file):
    # Both trained models are loaded, though the predictions below use the LSTM
    cnn_model = keras.models.load_model(
        "speech_emotion_recognition/models/cnn_model.h5"
    )
    lstm_model = keras.models.load_model(
        "speech_emotion_recognition/models/lstm_model.h5"
    )
    # Preprocess the recording the same way as the training data
    prediction_data, prediction_sr = librosa.load(
        file,
        res_type="kaiser_fast",
        duration=3,
        sr=22050,
        offset=0.5,
    )
    mfccs = np.mean(
        librosa.feature.mfcc(y=prediction_data, sr=prediction_sr, n_mfcc=40).T, axis=0
    )
    # Reshape to (1, 40, 1) as expected by the network
    x = np.expand_dims(mfccs, axis=1)
    x = np.expand_dims(x, axis=0)
    # predict_classes was removed in recent Keras versions;
    # take the argmax over the softmax output instead
    predictions = np.argmax(lstm_model.predict(x), axis=1)
    emotions_dict = {
        "0": "neutral",
        "1": "calm",
        "2": "happy",
        "3": "sad",
        "4": "angry",
        "5": "fearful",
        "6": "disgusted",
        "7": "surprised",
    }
    # Map the predicted class index back to its emotion name
    label = emotions_dict[str(int(predictions[0]))]
    print("This voice sounds", label)