Match Audio Signal (DTW)

preprocess.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 23 08:57:19 2018

@author: PeterTsai
"""

import os
import numpy as np
import glob
import librosa
import librosa.display
import re
from pydub import AudioSegment
import matplotlib.pyplot as plt

# Recordings to segment, visited in a stable (sorted) order.
speechFileList = sorted(glob.glob('./choices/*.wav'))

# Load reference answer index audio (1,2,3,4)
sr = 16000
au1 = AudioSegment.from_wav('1.wav')
y1l = np.array(au1.get_array_of_samples(),dtype=np.float32)
# y1l holds pydub's raw samples at the file's own frame rate (and channel
# layout), so dividing its length by `sr` is only right for 16 kHz mono input.
# Ask pydub for the duration instead, which is correct for any input file.
print('duration {}s'.format(au1.duration_seconds))

# Decode the four reference prompts at the working sample rate `sr`. The
# waveforms are used exactly as loaded (no extra normalization is applied).
y1, y2, y3, y4 = [
    librosa.load('{}.wav'.format(index), sr=sr)[0] for index in (1, 2, 3, 4)
]

# One MFCC matrix per reference prompt, kept in prompt order; `refs` is what
# the segmentation routine aligns each recording against.
refs = [librosa.feature.mfcc(y=wave, sr=sr) for wave in (y1, y2, y3, y4)]
X1, X2, X3, X4 = refs



def sentenceSegmentation(speechFile, sr, ref_audios_mfcc):
    """Cut a recording into the stretches of audio that follow each reference.

    Every reference MFCC matrix is located inside the recording with
    subsequence DTW. For reference ``i`` the audio between the end of its
    match and the start of the match of reference ``i + 1`` (or the end of
    the recording for the last reference) is extracted and trimmed of
    leading/trailing silence.

    Parameters
    ----------
    speechFile : str
        Path of the audio file to segment; loaded with ``librosa.load``.
    sr : int
        Sampling rate used for loading and for all time conversions.
    ref_audios_mfcc : sequence of np.ndarray
        MFCC matrices of the reference prompts. Segments are cut between
        consecutive references in the order given here.

    Returns
    -------
    audio_segments : list of np.ndarray
        One silence-trimmed waveform per reference.
    sr : int
        The sampling rate, returned unchanged.
    """
    # Hop length librosa.feature.mfcc uses by default; converts an MFCC
    # frame index into a sample index.
    hop_length = 512
    y, _ = librosa.load(speechFile, sr=sr)
    print('{} duration: {}s'.format(speechFile, librosa.get_duration(y=y, sr=sr)))
    X = librosa.feature.mfcc(y=y, sr=sr)

    choices_num = len(ref_audios_mfcc)
    # Row t holds [start_sample, end_sample] of reference t inside `y`; the
    # extra last row is a sentinel at the end of the recording so the final
    # segment has a closing boundary.
    segment_time_point = np.zeros((choices_num + 1, 2), dtype=np.int64)
    for t, ref in enumerate(ref_audios_mfcc):
        _, wp = librosa.sequence.dtw(X, ref, subseq=True)
        # The warping path is taken as listed end-to-start: its last row is
        # used as the first matched frame of X, its first row as the last.
        segment_time_point[t, :] = [wp[-1, 0] * hop_length, wp[0, 0] * hop_length]
    segment_time_point[-1, :] = [len(y), len(y)]

    audio_segments = []
    for i in range(choices_num):
        begin_i = segment_time_point[i, 1]
        end_i = segment_time_point[i + 1, 0]
        # Trim the beginning and ending silence. `kept` is the [start, end)
        # sample interval of the retained audio relative to the untrimmed
        # slice, which is needed to report where the segment really sits.
        yt, kept = librosa.effects.trim(y[begin_i:end_i], top_db=25, ref=np.max)
        seg_begin = begin_i + kept[0]
        seg_end = begin_i + kept[1]
        print('choice {} begin at {}s, end at {}s (duration: {}s)'.format(
            i + 1,
            format(seg_begin / sr, '.3f'),
            format(seg_end / sr, '.3f'),
            librosa.get_duration(y=yt, sr=sr)))
        audio_segments.append(yt)

    return audio_segments, sr


# Segment every recording and store its pieces under segment/<recording name>/.
for c, speechFile in enumerate(speechFileList):
    print('choices: {}'.format(c+1))
    # Derive the bare recording name with os.path so it works whichever path
    # separator glob produced (a hard-coded './choices/' pattern does not).
    filename = os.path.splitext(os.path.basename(speechFile))[0]
    path = 'segment/{}'.format(filename)
    os.makedirs(path, exist_ok=True)
    chunks, sr = sentenceSegmentation(speechFile, sr, refs)

    # NOTE(review): librosa.output was dropped from newer librosa releases;
    # this call needs a librosa version that still ships it - confirm.
    for i, chunk in enumerate(chunks):
        librosa.output.write_wav('{}/choice{}.wav'.format(path,i+1),chunk,sr = sr)

match.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 11 08:08:59 2018

@author: PeterTsai
"""

import os
import numpy as np
import glob
import librosa
import librosa.display
import re
import matplotlib.pyplot as plt
import pandas as pd

# Output locations for the per-file plots and the run log.
out_path = 'match_segment'
logfilename = 'log.txt'

# Audio analysis settings: working sample rate and the hop (in samples) used
# to turn MFCC frame indices into sample positions.
sr = 16000
hop_length = 512

# Make sure the output directory exists, then open the log for writing; the
# handle stays open for the rest of the script.
if not os.path.isdir(out_path):
    os.makedirs(out_path)
logfile = open('/'.join((out_path, logfilename)), 'w')

# Lookup table built from the 'ID' and 'Answer' columns of Kaggle.csv.
df = pd.read_csv('Kaggle.csv', index_col=False)
register_dataset = dict(zip(df['ID'], df['Answer']))

# Recordings to analyse, in sorted order.
speechFileList = glob.glob('./choices/*.wav')
speechFileList.sort()

def createMFCC(register_person, path='./audio_registeration',sr=16000,
               choice_ids=(1, 2, 3, 4)):
    """Load a speaker's reference recordings and return their MFCCs.

    For every id in ``choice_ids`` the file
    ``<path>/<register_person>/<id>.wav`` is loaded at ``sr`` and converted
    to an MFCC matrix with librosa's default settings.

    Parameters
    ----------
    register_person : str
        Name of the sub-directory of ``path`` holding the recordings.
    path : str
        Root directory of the registration recordings.
    sr : int
        Sampling rate used for loading and for the MFCC computation.
    choice_ids : iterable
        File stems to load, in output order. Defaults to ``(1, 2, 3, 4)``,
        i.e. ``1.wav`` ... ``4.wav``.

    Returns
    -------
    list of np.ndarray
        One MFCC matrix per entry of ``choice_ids``, in the same order.
    """
    ref_mfccs = []
    for choice_id in choice_ids:
        wav_path = '{}/{}/{}.wav'.format(path, register_person, choice_id)
        y, _ = librosa.load(wav_path, sr=sr)
        ref_mfccs.append(librosa.feature.mfcc(y=y, sr=sr))
    return ref_mfccs

# Reference MFCCs per registered speaker, computed on first use and reused so
# the same reference audio is not decoded again for every recording.
ref_mfcc_cache = {}

# For each recording of the selected speaker: locate the four reference
# prompts with subsequence DTW, plot the match boundaries on the waveform and
# log whether the prompts were found in increasing time order.
try:
    for speechFile in speechFileList:
        # Bare recording name, independent of the path separator glob used.
        filename = os.path.splitext(os.path.basename(speechFile))[0]

        # The name is one leading character followed by the numeric ID that
        # keys register_dataset.
        choiceID = int(filename[1:])
        register_person = register_dataset[choiceID]
        # Only recordings registered to 'maleB' are analysed; filter before
        # any audio is decoded so skipped files cost nothing.
        if register_person != 'maleB':
            continue
        if register_person not in ref_mfcc_cache:
            ref_mfcc_cache[register_person] = createMFCC(register_person)
        ref_mfccs = ref_mfcc_cache[register_person]

        y, _ = librosa.load(speechFile, sr=sr)
        duration = librosa.get_duration(y=y, sr=sr)
        time_ticks = np.arange(0, duration, step=0.5)
        X = librosa.feature.mfcc(y=y, sr=sr)

        choices_num = len(ref_mfccs)
        # Row t: [start_sample, end_sample] of reference t in the recording;
        # the extra last row is a sentinel at the end of the recording.
        segment_time_point = np.zeros((choices_num + 1, 2), dtype=np.int64)
        for t, ref in enumerate(ref_mfccs):
            _, wp = librosa.sequence.dtw(X, ref, subseq=True)
            # Last path row is used as the match start, first row as its end.
            segment_time_point[t, :] = [wp[-1, 0] * hop_length, wp[0, 0] * hop_length]
        segment_time_point[-1, :] = [len(y), len(y)]

        bound_times = segment_time_point / sr
        # A match layout is accepted when every boundary's start lies strictly
        # after the end of the preceding reference match.
        in_order = bool(np.all(bound_times[1:, 0] > bound_times[:-1, 1]))

        plt.figure(figsize=(20, 5), dpi=200)
        ax1 = plt.subplot(1, 1, 1)
        ax1.tick_params(axis='both', which='major', labelsize=5)
        ax1.tick_params(axis='both', which='minor', labelsize=4)
        # NOTE(review): newer librosa releases replace waveplot with waveshow;
        # this call needs a librosa version that still provides it - confirm.
        librosa.display.waveplot(y, sr=sr, x_axis='time')
        ax1.set_xticks(time_ticks)
        ymin, ymax = plt.ylim()
        # Red lines mark match starts, purple lines mark match ends.
        plt.vlines(bound_times[:-1, 0], ymin, ymax, color='red', linestyle='-.', linewidth=2, alpha=0.9)
        plt.vlines(bound_times[:-1, 1], ymin, ymax, color='purple', linestyle='-.', linewidth=2, alpha=0.9)

        for tc in range(choices_num):
            plt.text(bound_times[tc, 0], 0.9 * ymax, 'ref. {}'.format(tc + 1),
                     bbox=dict(facecolor='yellow', alpha=0.5))

        # Plots are sorted into <out_path>/success or <out_path>/fail.
        response = 'Success' if in_order else 'Fail'
        subpath = os.path.join(out_path, response.lower())
        os.makedirs(subpath, exist_ok=True)
        plt.savefig(os.path.join(subpath, '{}.png'.format(filename)))
        plt.close()

        print('choices ID: {} [{}]'.format(filename, response))
        logfile.write('choices ID: {} [{}]\n'.format(filename, response))
finally:
    # Close the log even when a recording fails part-way through the run.
    logfile.close()

Match audio Signal (DTW)

Match audio Signal (DTW)

results matching ""

No results matching ""