From 9b37319961afe28f360b44cafde061cd9832177c Mon Sep 17 00:00:00 2001 From: Yun Date: Fri, 16 Jul 2021 16:58:44 +0200 Subject: [PATCH 1/2] Update model to use PaddleOCR results --- videocr/models.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/videocr/models.py b/videocr/models.py index 121cb0f..d0bb090 100644 --- a/videocr/models.py +++ b/videocr/models.py @@ -17,25 +17,15 @@ class PredictedFrame: confidence: int # total confidence of all words text: str - def __init__(self, index: int, pred_data: str, conf_threshold: int): + def __init__(self, index: int, pred_data: list[list], conf_threshold: int): self.index = index self.words = [] - block = 0 # keep track of line breaks - - for l in pred_data.splitlines()[1:]: - word_data = l.split() - if len(word_data) < 12: - # no word is predicted + for l in pred_data: + if len(l) < 2: continue - _, _, block_num, *_, conf, text = word_data - block_num, conf = int(block_num), int(conf) - - # handle line breaks - if block < block_num: - block = block_num - if self.words and self.words[-1].text != '\n': - self.words.append(PredictedWord(0, '\n')) + text = l[1][0] + conf = int(l[1][1] * 100) # word predictions with low confidence will be filtered out if conf >= conf_threshold: -- 2.39.5 From 37de9b3e5f1c1b5bb861fa3583ea1d808996b0ad Mon Sep 17 00:00:00 2001 From: Yun Date: Fri, 16 Jul 2021 17:01:18 +0200 Subject: [PATCH 2/2] Update image processing to use PaddleOCR instead of tesseract --- videocr/video.py | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/videocr/video.py b/videocr/video.py index dbf2356..7510e41 100644 --- a/videocr/video.py +++ b/videocr/video.py @@ -1,15 +1,12 @@ from __future__ import annotations from typing import List -import sys import multiprocessing -import pytesseract import cv2 -import numpy as np -from . import constants from . import utils from .models import PredictedFrame, PredictedSubtitle from .opencv_adapter import Capture +from paddleocr import PaddleOCR class Video: @@ -19,8 +16,6 @@ class Video: num_frames: int fps: float height: int - width: int - resize_dim: List[int] pred_frames: List[PredictedFrame] pred_subs: List[PredictedSubtitle] @@ -30,9 +25,6 @@ class Video: self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT)) self.fps = v.get(cv2.CAP_PROP_FPS) self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT)) - self.width = int(v.get(cv2.CAP_PROP_FRAME_WIDTH)) - scale_percent = 47 # apparently 32 pixels is the optimal character height for tesseract. - self.resize_dim=(int(self.width * scale_percent/100), int(self.height * scale_percent/100)) def run_ocr(self, lang: str, time_start: str, time_end: str, conf_threshold: int, use_fullframe: bool) -> None: @@ -58,23 +50,15 @@ class Video: for i, data in enumerate(it_ocr) ] - def _image_to_data(self, img) -> str: + def _image_to_data(self, img) -> list[list]: + if img is None: + return '\n' if not self.use_fullframe: # only use bottom half of the frame by default img = img[self.height // 2:, :] - img = cv2.dilate(img, np.ones((2, 2), np.uint8)) _, img = cv2.threshold(img, 215, 255, cv2.THRESH_BINARY) - color_mask = cv2.inRange(img, (255, 255, 255), (255, 255, 255)) - img = cv2.bitwise_and(img, img, mask=color_mask) - img = cv2.erode(img, np.ones((2, 2), np.uint8)) - img = cv2.bitwise_not(img) - img = cv2.resize(img, self.resize_dim, interpolation=cv2.INTER_AREA) - img = cv2.copyMakeBorder(img, 20, 20, 0, 0, cv2.BORDER_CONSTANT, None, (255,255,255)) - config = '--tessdata-dir "{}" --psm 7 -c preserve_interword_spaces=1'.format(constants.TESSDATA_DIR) - try: - return pytesseract.image_to_data(img, lang=self.lang, config=config) - except Exception as e: - sys.exit('{}: {}'.format(e.__class__.__name__, e)) + return PaddleOCR(lang='ch').ocr(img) + def get_subtitles(self, sim_threshold: int) -> str: self._generate_subtitles(sim_threshold) -- 2.39.5