diff --git a/extract_srt_en.py b/extract_srt_en.py new file mode 100644 index 0000000..7736f14 --- /dev/null +++ b/extract_srt_en.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# TODO tqdm +from vosk import Model, KaldiRecognizer, SetLogLevel +import sys +import os +import subprocess +import json +import argparse +from collections import namedtuple +from pprint import pprint +try: + from tqdm import tqdm + tqdm_installed = True +except: + tqdm_installed = False + +class SubPart: + + def __init__(self, start, end, text): + self.start = start + self.end = end + self.text = text + + @staticmethod + def ftot(f): + h = int(f//3600) + m = int(f//60 % 60) + s = int(f//1 % 60) + ms = int((1000 * f) % 1000) + s = f"{h:02}:{m:02}:{s:02},{ms:03}" + return s + + def __repr__(self): + return f""" +{self.ftot(self.start)} --> {self.ftot(self.end)} +{self.text} +"""[1:-1] + + +def gen_subparts(input_file, model_dir, verbose=False, partlen=4, progress=False): + SetLogLevel(0 if verbose else -1) + + model = Model(model_dir) + rec = KaldiRecognizer(model, 16000) + + process = subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i', + input_file, + '-ar', str(16000) , '-ac', '1', '-f', 's16le', '-'], + stdout=subprocess.PIPE) + + r = subprocess.run("ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1".split() + [input_file], stdout=subprocess.PIPE) + duration = float(r.stdout.decode('utf-8').strip()) + + if progress: + pbar = tqdm(total=duration, unit="s") + + prev_end = 0 + while True: + data = process.stdout.read(4000) + if len(data) == 0: + break + if rec.AcceptWaveform(data): + r = json.loads(rec.Result()) + if 'result' in r: + resultpart = [] # TODO: use this across AccesptForm + for result in r['result']: + if len(resultpart) > 0 and float(result['end']) - float(resultpart[0]['start']) >= partlen: + yield SubPart(start=resultpart[0]['start'], + end=float(resultpart[-1]['end']), + text=" ".join(r['word'] for r in resultpart)) + prev_end = float(resultpart[-1]['end']) + resultpart = [] + if float(result['end'] - result['start']) >= partlen: + yield SubPart(start=float(result['start']), + end=float(result['end']), + text=result['word']) + prev_end = float(result['end']) + resultpart = [] + else: + resultpart.append(result) + if progress: + pbar.update(float(result['end'] - pbar.n)) + + + if len(resultpart) > 0: + yield SubPart(start=float(resultpart[0]['start']), + end=float(resultpart[-1]['end']), + text=" ".join(r['word'] for r in resultpart)) + prev_end = float(resultpart[-1]['end']) + resultpart = [] + + else: + pass + #print(rec.PartialResult()) + #pprint(rec.PartialResult()) + if progress: + pbar.close() + r = json.loads(rec.PartialResult()) + text = r['partial'] + yield SubPart(start=prev_end, end=duration, text=text) + + +def create_parser(): + parser = argparse.ArgumentParser(prog="SRT file extractor using Speech-To-Text") + parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument("-o", "--output", type=argparse.FileType('w+'), default=sys.stdout) + parser.add_argument("-m", "--model", required=False) + parser.add_argument("-i", "--interval", default=4) + if tqdm_installed: + parser.add_argument("-p", "--progress", action="store_true") + parser.add_argument("input") + return parser + + +def main(): + args = create_parser().parse_args() + if tqdm_installed: + it = enumerate(gen_subparts(args.input, "models/en", args.verbose, args.interval, args.progress)) + else: + it = enumerate(gen_subparts(args.input, "models/en", args.verbose, args.interval, False)) + for i,subpart in it: + n = i+1 + args.output.write(f"""{n} +{subpart} + +""" +) + + + +if __name__ == "__main__": + main() diff --git a/website/download.php b/website/download.php index 56720ab..720ecad 100644 --- a/website/download.php +++ b/website/download.php @@ -34,12 +34,10 @@ include( '_head.php' ); $uniqid = time(); # exemple url https://peertube.cipherbliss.com/videos/watch/e6a37508-042e-4d83-8598-5d36b764bb3d - $old = getcwd(); - echo $old; - chdir( $old ); -// exec( $old . '/youtube-dl.sh ' . $uniqid . ' ' . $url, $output, $result ); - exec( $old . '/test.sh ' . $uniqid . ' ' . $url, $output, $result ); + +// exec( './youtube-dl.sh uniqueid_facho https://peertube.cipherbliss.com/videos/watch/b88a9568-517c-4a49-ab07-75c79323a825', $output, $result ); +// exec( '.'.$old . '/test.sh ' . $uniqid . ' ' . $url, $output, $result ); echo "
résultat du script.
"; var_dump( $result ); @@ -66,6 +64,8 @@ include( '_head.php' ); echo "
pas d'url envoyée. Vérifiez le formulaire. Retour"; } + exec( './youtube-dl.sh canadien https://www.youtube.com/watch?v=w97pAEr3svc', $output, $result ); + ?> diff --git a/website/run.php b/website/run.php index e69de29..9856ca6 100644 --- a/website/run.php +++ b/website/run.php @@ -0,0 +1,5 @@ +