from vosk import Model, KaldiRecognizer, SetLogLevel
from tqdm.notebook import tqdm
import wave
import os
import json

SetLogLevel(-1)  # silence Vosk/Kaldi log output


def transcript_file(input_file, model_path):
    # Check that the audio file exists
    if not os.path.isfile(input_file):
        raise FileNotFoundError(os.path.basename(input_file) + " not found")

    # Check that the model path exists
    if not os.path.exists(model_path):
        raise FileNotFoundError(os.path.basename(model_path) + " not found")

    # Open the audio file
    wf = wave.open(input_file, "rb")

    # Check that the wave file has the right properties: mono, 16-bit, uncompressed PCM
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        raise TypeError("Audio file must be WAV format mono PCM.")

    # Initialize the model and a recognizer at the file's sample rate
    model = Model(model_path)
    rec = KaldiRecognizer(model, wf.getframerate())

    # Get the file size in bytes (to drive the progress bar)
    file_size = os.path.getsize(input_file)

    # Run the transcription
    pbar = tqdm(total=file_size)

    # To store our results
    transcription = []

    while True:
        data = wf.readframes(4000)  # read in chunks of 4000 frames
        pbar.update(len(data))
        if len(data) == 0:
            pbar.set_description("Transcription finished")
            break
        if rec.AcceptWaveform(data):
            # Convert the JSON output to a dict
            result_dict = json.loads(rec.Result())
            # Extract the text value and append it to the transcription list
            transcription.append(result_dict.get("text", ""))

    # Get the final bits of audio and flush the pipeline
    final_result = json.loads(rec.FinalResult())
    transcription.append(final_result.get("text", ""))

    transcription_text = ' '.join(transcription)
    return transcription_text


wave_file = '/input/already_converted/drive_thu.wav'
transcription = transcript_file(wave_file, 'models/en')
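
# A minimal follow-up sketch (an assumption, not part of the original
# workflow): persist the returned transcript to a .txt file next to the
# source audio. The output path handling is illustrative.
output_file = os.path.splitext(wave_file)[0] + '.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(transcription)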
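
# The example path suggests the audio was converted beforehand. If you start
# from another format, a pre-processing sketch like the following produces the
# mono 16-bit PCM WAV that transcript_file() expects. It assumes ffmpeg is
# installed and on PATH; the source filename is hypothetical.
import subprocess

subprocess.run([
    'ffmpeg',
    '-i', 'input/drive_thu.mp3',  # hypothetical source file
    '-ac', '1',                   # downmix to mono
    '-ar', '16000',               # resample to 16 kHz
    '-sample_fmt', 's16',         # 16-bit samples (PCM for .wav output)
    wave_file,
], check=True)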