def download_lang_data(lang: str):
    """Download any missing language data files into ``constants.TESSDATA_DIR``.

    ``lang`` is a ``+``-separated list of language names (for example
    ``'eng+deu'``). For every name, ``<name>.traineddata`` is fetched into
    ``constants.TESSDATA_DIR`` unless a file with that name already exists
    there. Names whose first character is uppercase are fetched from
    ``constants.TESSDATA_SCRIPT_URL``; all other names are fetched from
    ``constants.TESSDATA_URL``. Empty components (as produced by ``''`` or
    ``'eng+'``) are ignored.

    Any error raised while opening the URL or writing the file propagates
    to the caller; in that case no ``<name>.traineddata`` file is left
    behind for the failed language.
    """
    constants.TESSDATA_DIR.mkdir(parents=True, exist_ok=True)

    for lang_name in lang.split('+'):
        if not lang_name:
            # Nothing to fetch for an empty component, and indexing
            # lang_name[0] below would raise IndexError.
            continue

        filepath = constants.TESSDATA_DIR / '{}.traineddata'.format(lang_name)
        if filepath.is_file():
            continue

        if lang_name[0].isupper():
            url = constants.TESSDATA_SCRIPT_URL.format(lang_name)
        else:
            url = constants.TESSDATA_URL.format(lang_name)

        # Download to a sibling temporary file first: writing directly to
        # `filepath` would leave a truncated file behind if the transfer is
        # interrupted, and the is_file() check above would then accept that
        # corrupt file on every later run.
        partial_path = filepath.with_name(filepath.name + '.part')
        try:
            with urlopen(url) as res, open(partial_path, 'wb') as f:
                shutil.copyfileobj(res, f)
        except BaseException:
            # Clean up the incomplete download (also on KeyboardInterrupt),
            # then let the original error propagate unchanged.
            try:
                partial_path.unlink()
            except FileNotFoundError:
                pass
            raise

        # Same directory, so the rename moves the complete file into place
        # in a single step.
        partial_path.replace(filepath)