youtube-dl script OK to run directly; add English version of the subtitle extraction script

This commit is contained in:
tykayn 2021-03-23 13:26:29 +01:00
parent ed4d03342d
commit c0f2f4d225
5 changed files with 145 additions and 5 deletions

132
extract_srt_en.py Normal file
View File

@ -0,0 +1,132 @@
#!/usr/bin/env python3
# TODO tqdm
from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import subprocess
import json
import argparse
from collections import namedtuple
from pprint import pprint
# tqdm is an optional dependency: it is only needed for the progress bar.
try:
    from tqdm import tqdm
    tqdm_installed = True
except ImportError:
    # Only a missing package should disable the progress bar; any other
    # error raised while importing is a real problem and must surface.
    tqdm_installed = False
class SubPart:
    """One subtitle cue: a time span and the text displayed during it.

    Attributes:
        start: Cue start time, in seconds.
        end: Cue end time, in seconds.
        text: Text of the cue.
    """

    def __init__(self, start, end, text):
        self.start = start
        self.end = end
        self.text = text

    @staticmethod
    def ftot(f):
        """Format a time in seconds as an ``HH:MM:SS,mmm`` timestamp.

        The value is rounded to the nearest millisecond *before* being split
        into fields. Truncating each field separately loses a millisecond to
        float representation error (``1.001 * 1000`` is ``1000.9999999999999``)
        and cannot carry into the seconds field.
        """
        total_ms = round(float(f) * 1000)
        total_s, ms = divmod(total_ms, 1000)
        total_m, s = divmod(total_s, 60)
        h, m = divmod(total_m, 60)
        return f"{h:02}:{m:02}:{s:02},{ms:03}"

    def __repr__(self):
        """Return the cue as a timing line followed by the text line."""
        return f"{self.ftot(self.start)} --> {self.ftot(self.end)}\n{self.text}"
def gen_subparts(input_file, model_dir, verbose=False, partlen=4, progress=False):
    """Transcribe *input_file* and yield its speech as ``SubPart`` cues.

    The audio is decoded by ``ffmpeg`` to 16 kHz mono signed 16-bit PCM and
    streamed to a Vosk ``KaldiRecognizer``. Recognised words are grouped into
    cues whose span stays below *partlen* seconds; a single word that is
    itself at least *partlen* long gets a cue of its own.

    Args:
        input_file: Path of the media file to transcribe.
        model_dir: Directory of the Vosk model to load.
        verbose: When true, enable Vosk logging (log level 0 instead of -1).
        partlen: Maximum span of one cue, in seconds. Coerced with ``float``
            so a string coming straight from the command line also works.
        progress: When true, show a tqdm progress bar. The caller must make
            sure tqdm is importable before passing true.

    Yields:
        SubPart: cues in chronological order. Whatever text the recogniser
        still holds as a partial result at end of stream is emitted as a last
        cue running up to the media duration; it is skipped when empty.

    Raises:
        ValueError: if ``ffprobe`` does not print a parsable duration.
    """
    sample_rate = 16000
    partlen = float(partlen)

    SetLogLevel(0 if verbose else -1)
    model = Model(model_dir)
    rec = KaldiRecognizer(model, sample_rate)
    # Recent Vosk releases only include per-word timings (the 'result' key)
    # when asked to; older releases have no such switch, hence the guard.
    if hasattr(rec, "SetWords"):
        rec.SetWords(True)

    probe = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', input_file],
        stdout=subprocess.PIPE)
    duration = float(probe.stdout.decode('utf-8').strip())

    def make_part(words):
        # Merge consecutive word entries into one cue.
        return SubPart(start=float(words[0]['start']),
                       end=float(words[-1]['end']),
                       text=" ".join(w['word'] for w in words))

    pbar = tqdm(total=duration, unit="s") if progress else None
    prev_end = 0
    try:
        # The context manager closes the pipe and reaps ffmpeg even when the
        # consumer stops iterating before the end of the stream.
        with subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i',
                               input_file,
                               '-ar', str(sample_rate), '-ac', '1',
                               '-f', 's16le', '-'],
                              stdout=subprocess.PIPE) as process:
            while True:
                data = process.stdout.read(4000)
                if not data:
                    break
                if not rec.AcceptWaveform(data):
                    # Utterance still in progress; nothing final to emit yet.
                    continue
                r = json.loads(rec.Result())
                pending = []  # TODO: carry this across AcceptWaveform calls
                for word in r.get('result', ()):
                    w_start = float(word['start'])
                    w_end = float(word['end'])
                    # Flush the pending words if adding this one would make
                    # the cue span reach partlen.
                    if pending and w_end - float(pending[0]['start']) >= partlen:
                        yield make_part(pending)
                        prev_end = float(pending[-1]['end'])
                        pending = []
                    if w_end - w_start >= partlen:
                        # A word that long becomes a cue on its own.
                        yield SubPart(start=w_start, end=w_end, text=word['word'])
                        prev_end = w_end
                    else:
                        pending.append(word)
                    if pbar is not None:
                        # Advance the bar to this word's end time.
                        pbar.update(w_end - pbar.n)
                if pending:
                    yield make_part(pending)
                    prev_end = float(pending[-1]['end'])
        tail = json.loads(rec.PartialResult()).get('partial', '')
    finally:
        if pbar is not None:
            pbar.close()

    if tail:
        yield SubPart(start=prev_end, end=duration, text=tail)
def create_parser():
    """Build the command-line parser for the SRT extractor.

    Returns:
        argparse.ArgumentParser: parser exposing ``--verbose``, ``--output``,
        ``--model``, ``--interval``, the positional ``input`` and, only when
        tqdm could be imported, ``--progress``.
    """
    # The sentence is a description, not a program name: passing it as
    # ``prog`` puts the whole sentence in the usage line.
    parser = argparse.ArgumentParser(
        description="SRT file extractor using Speech-To-Text")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="enable recogniser logging")
    parser.add_argument("-o", "--output", type=argparse.FileType('w+'),
                        default=sys.stdout,
                        help="file to write the subtitles to (default: stdout)")
    parser.add_argument("-m", "--model", required=False,
                        help="path to the speech model directory")
    # type=float: without it a value given on the command line stays a str
    # and cannot be compared with the word timings.
    parser.add_argument("-i", "--interval", type=float, default=4,
                        help="maximum length of one subtitle, in seconds")
    if tqdm_installed:
        parser.add_argument("-p", "--progress", action="store_true",
                            help="show a progress bar")
    parser.add_argument("input", help="media file to transcribe")
    return parser
def main():
    """Parse the command line and write the numbered subtitle cues."""
    args = create_parser().parse_args()

    # Honour --model when given; "models/en" stays the default location.
    model_dir = args.model or "models/en"
    # --progress only exists on the parser when tqdm is installed.
    show_progress = getattr(args, "progress", False)

    parts = gen_subparts(args.input, model_dir, args.verbose,
                         float(args.interval), show_progress)
    for n, subpart in enumerate(parts, start=1):
        # SRT cue: index line, then the cue itself, then a blank separator
        # line before the next cue.
        args.output.write(f"{n}\n{subpart}\n\n")


if __name__ == "__main__":
    main()

View File

@ -34,12 +34,10 @@ include( '_head.php' );
// The current Unix timestamp is passed to the shell script as its first argument.
$uniqid = time();
# example URL: https://peertube.cipherbliss.com/videos/watch/e6a37508-042e-4d83-8598-5d36b764bb3d
$old = getcwd();
echo $old;
// NOTE(review): chdir() into the directory getcwd() just returned looks like a no-op — confirm.
chdir( $old );
// exec( $old . '/youtube-dl.sh ' . $uniqid . ' ' . $url, $output, $result );
// NOTE(review): $url is concatenated into the shell command unescaped. If it
// comes from the submitted form, wrap it in escapeshellarg() — confirm where
// $url is assigned (not visible in this hunk).
exec( $old . '/test.sh ' . $uniqid . ' ' . $url, $output, $result );
// exec( './youtube-dl.sh uniqueid_facho https://peertube.cipherbliss.com/videos/watch/b88a9568-517c-4a49-ab07-75c79323a825', $output, $result );
// exec( '.'.$old . '/test.sh ' . $uniqid . ' ' . $url, $output, $result );
echo "<br/> résultat du script. <br>";
// $result receives the exit status of the command run by exec().
var_dump( $result );
@ -66,6 +64,8 @@ include( '_head.php' );
echo " <br> pas d'url envoyée. Vérifiez le formulaire. <a href='index.php'>Retour</a>";
}
exec( './youtube-dl.sh canadien https://www.youtube.com/watch?v=w97pAEr3svc', $output, $result );
?>
</div>

View File

@ -0,0 +1,5 @@
<?php
// Manual smoke test: run the download script with fixed arguments and dump
// the captured output lines followed by the exit status.
$command = './youtube-dl.sh uniqueid_facho https://peertube.cipherbliss.com/videos/watch/b88a9568-517c-4a49-ab07-75c79323a825';
exec( $command, $output, $result );
var_dump( $output );
var_dump( $result );

1
website/test_works.txt Normal file
View File

@ -0,0 +1 @@
coucou

View File

@ -1,6 +1,8 @@
#!/bin/bash
# Usage: youtube-dl.sh UNIQID URL
# Read the arguments BEFORE using them: with UNIQID still unset the cleanup
# below expands to "rm -rf ../input/ydl/" and wipes every earlier download.
UNIQID=$1
URL=$2

# Refuse to run without an id, for the same reason.
if [ -z "$UNIQID" ]; then
    echo "usage: $0 UNIQID URL" >&2
    exit 1
fi

mkdir -p ../input/ydl
# Remove leftovers of a previous run with the same id (quoted so an id with
# spaces or glob characters cannot expand into other paths).
rm -rf "../input/ydl/${UNIQID}.mp3"
rm -rf "../input/ydl/${UNIQID}"