Main block of transcribe_file_with_word_time_offsets.py
Fig.1 Flowchart of Main Block of transcribe_file_with_word_time_offsets.py
Fig.1 呈現 Main block 的流程圖。行 155-175 是「External setup using os.environ and argparse.ArgumentParser」區塊；行 176-209 是「Get Processing File List」區塊；行 212-236 是「Set output_csv_filename and choices_list」區塊；行 238-269 是「Batch-wise Speech Transcription」區塊。
def parse_batch_range(spec, n_batches):
    """Translate a ``--batch_idx`` value into a 0-based half-open batch range.

    Batch numbers on the command line are 1-based. Accepted forms:

    * ``"0"``   -- every batch, i.e. ``(0, n_batches)``
    * ``"N"``   -- only batch ``N``, i.e. ``(N - 1, N)``
    * ``"A-B"`` -- batches ``A`` through ``B`` inclusive; an omitted ``A``
      means the first batch and an omitted ``B`` means the last one.

    Args:
        spec: The raw ``--batch_idx`` string.
        n_batches: Total number of batches available.

    Returns:
        ``(begin_idx, end_idx)`` suitable for ``range(begin_idx, end_idx)``.

    Raises:
        ValueError: If ``spec`` matches none of the accepted forms.
    """
    spec = spec.strip()

    ranged = re.match(r'^([0-9]*)-([0-9]*)$', spec)
    if ranged:
        first, last = ranged.group(1), ranged.group(2)
        # Clamp at 0 so that a spec such as "0-5" cannot yield a negative
        # start index (which would silently select an empty slice).
        begin_idx = max(int(first) - 1, 0) if first else 0
        end_idx = int(last) if last else n_batches
        return begin_idx, end_idx

    if not re.match(r'^[0-9]+$', spec):
        raise ValueError('invalid batch range: {!r}'.format(spec))

    number = int(spec)
    if number == 0:
        return 0, n_batches
    # A single number uses the same 1-based convention as the "A-B" form.
    return number - 1, number


if __name__ == '__main__':

    # ------------------------------------------------------------------
    # External setup using os.environ and argparse.ArgumentParser
    # (lines 155-175 of the original listing)
    # ------------------------------------------------------------------
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "GC-AI-Challenge-4dc21e90cad0.json"
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Paired on/off switches; their defaults are set in set_defaults() below.
    parser.add_argument('--beta', dest='beta', action='store_true')
    parser.add_argument('--no-beta', dest='beta', action='store_false')
    parser.add_argument('--nr', dest='nr', action='store_true')
    parser.add_argument('--no-nr', dest='nr', action='store_false')
    parser.add_argument('--reset', dest='reset', action='store_true')
    parser.add_argument('--no-reset', dest='reset', action='store_false')

    parser.add_argument('-v', dest='verbose', action='store_true', help='Verbose mode')

    parser.add_argument('--batch_idx', dest='batch_idx', help='[1-60]')
    # dest must be 'batch_size' (it used to overwrite batch_idx) and the value
    # must be an int because it is used in slicing and division below.
    parser.add_argument('--batch_size', dest='batch_size', type=int, help='default 25')
    parser.add_argument('--input_dir', dest='input_dir', help='input wav folder')
    parser.set_defaults(beta=True, nr=True, reset=False, batch_idx='0', batch_size=25, input_dir='choices(entire)')

    args = parser.parse_args()
    batch_size = args.batch_size
    if batch_size <= 0:
        parser.error('--batch_size must be a positive integer')
    string_batch_range = args.batch_idx
    input_dir = args.input_dir

    # ------------------------------------------------------------------
    # Get Processing File List
    # (lines 176-209 of the original listing)
    # ------------------------------------------------------------------
    input_nr_dir = os.path.join(input_dir, 'nr')
    if not os.path.isdir(input_nr_dir):
        os.makedirs(input_nr_dir)

    output_dir = 'result_' + input_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    entireSpeechFileList = sorted(glob.glob(os.path.join(input_dir, '*.wav')))

    # Ceiling division: the last batch may hold fewer than batch_size files.
    n_batches = (len(entireSpeechFileList) + batch_size - 1) // batch_size
    try:
        begin_idx, end_idx = parse_batch_range(string_batch_range, n_batches)
    except ValueError as err:
        parser.error('--batch_idx: {}'.format(err))

    # ------------------------------------------------------------------
    # Set output_csv_filename and choices_list
    # (lines 212-236 of the original listing)
    # ------------------------------------------------------------------
    print('beta:', args.beta)
    suffix = 'beta' if args.beta else ''
    print('suffix:', suffix)

    print('nr:', args.nr)
    suffix1 = 'nr' if args.nr else ''
    print('suffix1:', suffix1)

    output_csv_filename = os.path.join(output_dir, 'transcribe_output_{}_{}.csv'.format(suffix, suffix1))

    # Reuse results from an earlier run unless --reset was given.
    if os.path.isfile(output_csv_filename) and not args.reset:
        df = pd.read_csv(output_csv_filename, index_col=0)
        existSpeechFileList = df.filename.tolist()
        choices_list = df.sentence.tolist()
    else:
        choices_list = []
        existSpeechFileList = []

    # ------------------------------------------------------------------
    # Batch-wise Speech Transcription
    # (lines 238-269 of the original listing)
    # ------------------------------------------------------------------
    for b in range(begin_idx, end_idx):

        timestamp_xls_filename = os.path.join(output_dir, 'transcribe_timestamp{}_{:02d}.xlsx'.format(suffix, b+1))
        batch_files = entireSpeechFileList[b*batch_size:(b+1)*batch_size]

        # Batch workbook status: 1. complete  2. started but unfinished  3. new
        if os.path.isfile(timestamp_xls_filename):
            print('load {} ...'.format(timestamp_xls_filename))

            book = load_workbook(timestamp_xls_filename)

            # Recover which wav files already have sheets in the workbook.
            finished = set()
            for ws in book.worksheets:
                mmm = re.match(r'(.+)\.wav_(.+)', ws.title)
                if mmm is None:
                    # Sheet not named "<name>.wav_<suffix>"; ignore it.
                    continue
                finished.add(os.path.join(input_dir, mmm.group(1) + '.wav'))
            print('number of sheets: {}'.format(len(finished)))

            speechFileQueue = sorted(set(batch_files) - finished)
            if not speechFileQueue:
                # Every file of this batch is already in the workbook.
                continue

            # The writer is created only now, so a skipped batch never opens
            # (or risks truncating) its workbook.
            # NOTE(review): assigning .book/.sheets is the legacy
            # pandas+openpyxl append idiom; newer pandas releases may expose
            # these as read-only -- confirm against the pinned pandas version.
            timestamp_writer = pd.ExcelWriter(timestamp_xls_filename)
            timestamp_writer.book = book
            timestamp_writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
        else:
            print('create new {} !!!'.format(timestamp_xls_filename))
            speechFileQueue = batch_files
            timestamp_writer = pd.ExcelWriter(timestamp_xls_filename)

        batch_transcribe_speechFile(speechFileQueue, existSpeechFileList, choices_list, timestamp_writer, args.verbose, args.nr, beta=args.beta)