Match audio signals with dynamic time warping (DTW)
preprocess.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 23 08:57:19 2018
@author: PeterTsai
"""
import os
import numpy as np
import glob
import librosa
import librosa.display
import re
from pydub import AudioSegment
import matplotlib.pyplot as plt
# Recordings to segment, visited in sorted (deterministic) order.
speechFileList = sorted(glob.glob('./choices/*.wav'))

# Sampling rate that every recording is resampled to when loaded by librosa.
sr = 16000

# Print the length of the first reference clip, computed from its raw pydub
# samples.
# NOTE(review): the raw sample count is interpreted at `sr` regardless of the
# clip's own frame rate / channel count -- confirm 1.wav is 16 kHz mono.
au1 = AudioSegment.from_wav('1.wav')
y1l = np.array(au1.get_array_of_samples(), dtype=np.float32)
print('duration {}s'.format(librosa.get_duration(y=y1l, sr=sr)))

# Load the reference recordings of the spoken answer indices (1, 2, 3, 4).
# No amplitude normalisation is applied to the waveforms.
y1, y2, y3, y4 = (
    librosa.load(wav_name, sr=sr)[0]
    for wav_name in ('1.wav', '2.wav', '3.wav', '4.wav')
)

# One MFCC matrix per reference, kept in answer-index order.
refs = [librosa.feature.mfcc(y=waveform, sr=sr) for waveform in (y1, y2, y3, y4)]
X1, X2, X3, X4 = refs
def sentenceSegmentation(speechFile, sr, ref_audios_mfcc):
    """Cut a recording into the stretches that follow each reference cue.

    Each reference MFCC is located inside the recording with subsequence
    DTW.  Segment ``i`` is the audio between the end of the match for
    reference ``i`` and the start of the match for reference ``i + 1`` (the
    last segment runs to the end of the recording), with leading and
    trailing silence trimmed.

    Parameters
    ----------
    speechFile : str
        Path of the audio file to segment.
    sr : int
        Sampling rate the file is resampled to on load.
    ref_audios_mfcc : list of np.ndarray
        MFCC matrices of the reference cues, in the order they are expected
        to occur in the recording.

    Returns
    -------
    audio_segments : list of np.ndarray
        One trimmed waveform per reference.
    sr : int
        The sampling rate that was passed in.
    """
    # Hop size used to turn MFCC frame indices back into sample indices;
    # librosa.feature.mfcc is called with its default hop below.
    hop_length = 512

    y, _ = librosa.load(speechFile, sr=sr)
    print('{} duration: {}s'.format(speechFile, librosa.get_duration(y=y, sr=sr)))
    X = librosa.feature.mfcc(y=y, sr=sr)

    choices_num = len(ref_audios_mfcc)
    # Row t holds [start, end] (in samples) of the match for reference t;
    # the extra final row is a sentinel marking the end of the recording.
    segment_time_point = np.zeros((choices_num + 1, 2), dtype=np.int64)
    for t, ref in enumerate(ref_audios_mfcc):
        _, wp = librosa.sequence.dtw(X, ref, subseq=True)
        # The warping path is ordered end-first, so its last row is the
        # first matched frame and its first row is the last matched frame.
        segment_time_point[t, :] = [wp[-1, 0] * hop_length, wp[0, 0] * hop_length]
    segment_time_point[-1, :] = [len(y), len(y)]

    audio_segments = []
    for i in range(choices_num):
        begin_i = segment_time_point[i, 1]
        end_i = segment_time_point[i + 1, 0]
        segment = y[begin_i:end_i]
        # Trim the beginning and ending silence.  `trim_idx` is the
        # [start, end) interval of `segment` that was kept.
        yt, trim_idx = librosa.effects.trim(segment, top_db=25, ref=np.max)
        # Report where the trimmed audio actually sits in the recording;
        # the untrimmed `begin_i` would be too early whenever leading
        # silence was removed.
        kept_begin = begin_i + trim_idx[0]
        kept_end = begin_i + trim_idx[1]
        print('choice {} begin at {}s, end at {}s (duration: {}s)'.format(
            i + 1,
            format(kept_begin / sr, '.3f'),
            format(kept_end / sr, '.3f'),
            librosa.get_duration(y=yt, sr=sr)))
        audio_segments.append(yt)
    return audio_segments, sr
# Segment every recording and write its chunks to segment/<name>/choiceN.wav.
for c, speechFile in enumerate(speechFileList, start=1):
    print('choices: {}'.format(c))
    # Recording name without directory or extension (e.g. 'C0000026').
    # Derived with os.path so it does not depend on the path separator that
    # glob returns on the current platform.
    filename = os.path.splitext(os.path.basename(speechFile))[0]
    path = 'segment/{}'.format(filename)
    os.makedirs(path, exist_ok=True)
    chunks, sr = sentenceSegmentation(speechFile, sr, refs)
    for i, chunk in enumerate(chunks, start=1):
        # NOTE: librosa.output was removed in librosa 0.8; on newer releases
        # this call has to be replaced by another WAV writer.
        librosa.output.write_wav('{}/choice{}.wav'.format(path, i), chunk, sr=sr)
match.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 11 08:08:59 2018
@author: PeterTsai
"""
import os
import numpy as np
import glob
import librosa
import librosa.display
import re
import matplotlib.pyplot as plt
import pandas as pd
# Analysis parameters: target sampling rate and the hop size used to convert
# MFCC frame indices into sample indices.
sr = 16000
hop_length = 512

# Results (plots and a text log) are written below `out_path`.
logfilename = 'log.txt'
out_path = 'match_segment'
os.makedirs(out_path, exist_ok=True)
logfile = open('{}/{}'.format(out_path, logfilename), 'w')

# Lookup table built from Kaggle.csv: value of the ID column -> value of the
# Answer column.
df = pd.read_csv('Kaggle.csv', index_col=False)
register_dataset = {
    record_id: answer for record_id, answer in zip(df.ID, df.Answer)
}

# Recordings to analyse, in sorted order.
speechFileList = sorted(glob.glob('./choices/*.wav'))
def createMFCC(register_person, path='./audio_registeration', sr=16000, num_refs=4):
    """Compute the reference MFCCs of one registered speaker.

    Loads ``<path>/<register_person>/1.wav`` .. ``<num_refs>.wav`` and
    returns their MFCC matrices in that order.

    Parameters
    ----------
    register_person : str
        Name of the speaker's sub-directory under `path`.
    path : str
        Directory that holds one sub-directory per registered speaker.
    sr : int
        Sampling rate the reference files are resampled to on load.
    num_refs : int
        Number of consecutively numbered reference files to load
        (defaults to 4, i.e. 1.wav to 4.wav).

    Returns
    -------
    list of np.ndarray
        One MFCC matrix per reference file.
    """
    ref_mfccs = []
    for ref_index in range(1, num_refs + 1):
        wav_path = '{}/{}/{}.wav'.format(path, register_person, ref_index)
        waveform, _ = librosa.load(wav_path, sr=sr)
        ref_mfccs.append(librosa.feature.mfcc(y=waveform, sr=sr))
    return ref_mfccs
# For every recording of the selected speaker, locate each reference cue with
# subsequence DTW, plot the matches on the waveform, and record whether the
# cues were found in the expected order.

# Reference MFCCs depend only on the speaker, so they are computed once per
# speaker instead of once per recording.
ref_mfcc_cache = {}

try:
    for speechFile in speechFileList:
        # Recording name without directory or extension (e.g. 'C0000026').
        # os.path is used so the result does not depend on the path
        # separator glob returns on the current platform.
        filename = os.path.splitext(os.path.basename(speechFile))[0]
        # Numeric part of the name after its one-character prefix.
        choiceID = int(filename[1:])
        register_person = register_dataset[choiceID]
        # Only this speaker is analysed; skip before doing any audio work.
        if register_person != 'maleB':
            continue

        y, _ = librosa.load(speechFile, sr=sr)
        duration = librosa.get_duration(y=y, sr=sr)
        time_ticks = np.arange(0, duration, step=0.5)

        if register_person not in ref_mfcc_cache:
            ref_mfcc_cache[register_person] = createMFCC(register_person)
        ref_mfccs = ref_mfcc_cache[register_person]

        X = librosa.feature.mfcc(y=y, sr=sr)
        choices_num = len(ref_mfccs)
        # Row t holds [start, end] (in samples) of the match for reference
        # t; the extra final row is a sentinel at the end of the recording.
        segment_time_point = np.zeros((choices_num + 1, 2), dtype=np.int64)
        for t, ref in enumerate(ref_mfccs):
            _, wp = librosa.sequence.dtw(X, ref, subseq=True)
            # The warping path is ordered end-first: its last row is the
            # first matched frame, its first row the last matched frame.
            segment_time_point[t, :] = [wp[-1, 0] * hop_length, wp[0, 0] * hop_length]
        segment_time_point[-1, :] = [len(y), len(y)]
        bound_times = segment_time_point / sr

        # The match is accepted only if every cue ends before the next one
        # (or the end of the recording) begins.
        in_order = bool(np.all(bound_times[1:, 0] > bound_times[:-1, 1]))

        plt.figure(figsize=(20, 5), dpi=200)
        ax1 = plt.subplot(1, 1, 1)
        ax1.tick_params(axis='both', which='major', labelsize=5)
        ax1.tick_params(axis='both', which='minor', labelsize=4)
        # NOTE: librosa.display.waveplot was removed in later librosa
        # releases (replaced by waveshow); this call targets the older API.
        librosa.display.waveplot(y, sr=sr, x_axis='time')
        ax1.set_xticks(time_ticks)
        ymin, ymax = plt.ylim()
        # Red lines mark where each matched cue starts, purple where it ends.
        plt.vlines(bound_times[:-1, 0], ymin, ymax, color='red', linestyle='-.', linewidth=2, alpha=0.9)
        plt.vlines(bound_times[:-1, 1], ymin, ymax, color='purple', linestyle='-.', linewidth=2, alpha=0.9)
        for tc in range(choices_num):
            plt.text(bound_times[tc, 0], 0.9 * ymax, 'ref. {}'.format(tc + 1),
                     bbox=dict(facecolor='yellow', alpha=0.5))

        # Plots are sorted into success/ and fail/ sub-directories.
        if in_order:
            response = 'Success'
            subpath = '{}/success/'.format(out_path)
        else:
            response = 'Fail'
            subpath = '{}/fail/'.format(out_path)
        os.makedirs(subpath, exist_ok=True)
        plt.savefig('{}/{}.png'.format(subpath, filename))
        plt.close()

        print('choices ID: {} [{}]'.format(filename, response))
        logfile.write('choices ID: {} [{}]\n'.format(filename, response))
finally:
    # Close the log even if a recording fails part-way through the run.
    logfile.close()