transcription/transcribe_2.py

59 lines
1.8 KiB
Python
Raw Permalink Normal View History

2024-09-30 14:52:39 +02:00
from vosk import Model, KaldiRecognizer, SetLogLevel
from tqdm.notebook import tqdm
import wave
import os
import json
def transcript_file(input_file, model_path):
    """Transcribe a mono 16-bit PCM WAV file with a Vosk model.

    Args:
        input_file: Path to the WAV file to transcribe.
        model_path: Path to the Vosk model passed to ``Model``.

    Returns:
        The recognized text segments joined with single spaces. Segments
        for which the recognizer returned no text are skipped, so the
        result has no doubled or trailing spaces.

    Raises:
        FileNotFoundError: If ``input_file`` is not an existing file or
            ``model_path`` does not exist.
        TypeError: If the audio is not mono, 16-bit, uncompressed PCM.
    """
    # Check if file exists
    if not os.path.isfile(input_file):
        raise FileNotFoundError(os.path.basename(input_file) + " not found")
    # Check if model path exists
    if not os.path.exists(model_path):
        raise FileNotFoundError(os.path.basename(model_path) + " not found")

    # Open the audio file in a context manager so the handle is released on
    # every exit path, including the format check below.
    with wave.open(input_file, "rb") as wf:
        # Check if wave file has the right properties
        if (
            wf.getnchannels() != 1
            or wf.getsampwidth() != 2
            or wf.getcomptype() != "NONE"
        ):
            raise TypeError("Audio file must be WAV format mono PCM.")

        # Initialize model (only after validation, so bad input fails fast)
        model = Model(model_path)
        rec = KaldiRecognizer(model, wf.getframerate())

        # Size the progress bar by the audio payload, which is what
        # readframes() returns; the on-disk file size also counts the WAV
        # header, so the bar could never reach its total.
        total_bytes = wf.getnframes() * wf.getnchannels() * wf.getsampwidth()

        # To store our results
        transcription = []
        pbar = tqdm(total=total_bytes)
        try:
            while True:
                data = wf.readframes(4000)  # use buffer of 4000 frames
                if not data:
                    pbar.set_description("Transcription finished")
                    break
                pbar.update(len(data))
                if rec.AcceptWaveform(data):
                    # Convert json output to dict and keep non-empty text
                    text = json.loads(rec.Result()).get("text", "")
                    if text:
                        transcription.append(text)
        finally:
            # Always release the progress bar, even if recognition fails.
            pbar.close()

        # Get final bits of audio and flush the pipeline
        final_text = json.loads(rec.FinalResult()).get("text", "")
        if final_text:
            transcription.append(final_text)

    return ' '.join(transcription)
# Script driver: transcribe a pre-converted WAV recording with the English
# model directory and keep the text in the module-level `transcription`.
wave_file = '/input/already_converted/drive_thu.wav'
transcription = transcript_file(
    input_file=wave_file,
    model_path='models/en',
)