Replace tesseract with PaddleOCR #5

Closed
Yun wants to merge 2 commits from Yun/videocr:master into master
Showing only changes of commit 37de9b3e5f - Show all commits

View File

@@ -1,15 +1,12 @@
from __future__ import annotations from __future__ import annotations
from typing import List from typing import List
import sys
import multiprocessing import multiprocessing
import pytesseract
import cv2 import cv2
import numpy as np
from . import constants
from . import utils from . import utils
from .models import PredictedFrame, PredictedSubtitle from .models import PredictedFrame, PredictedSubtitle
from .opencv_adapter import Capture from .opencv_adapter import Capture
from paddleocr import PaddleOCR
class Video: class Video:
@@ -19,8 +16,6 @@ class Video:
num_frames: int num_frames: int
fps: float fps: float
height: int height: int
width: int
resize_dim: List[int]
pred_frames: List[PredictedFrame] pred_frames: List[PredictedFrame]
pred_subs: List[PredictedSubtitle] pred_subs: List[PredictedSubtitle]
@@ -30,9 +25,6 @@ class Video:
self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT)) self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT))
self.fps = v.get(cv2.CAP_PROP_FPS) self.fps = v.get(cv2.CAP_PROP_FPS)
self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT)) self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT))
self.width = int(v.get(cv2.CAP_PROP_FRAME_WIDTH))
scale_percent = 47 # apparently 32 pixels is the optimal character height for tesseract.
self.resize_dim=(int(self.width * scale_percent/100), int(self.height * scale_percent/100))
def run_ocr(self, lang: str, time_start: str, time_end: str, def run_ocr(self, lang: str, time_start: str, time_end: str,
conf_threshold: int, use_fullframe: bool) -> None: conf_threshold: int, use_fullframe: bool) -> None:
@@ -58,23 +50,15 @@ class Video:
for i, data in enumerate(it_ocr) for i, data in enumerate(it_ocr)
] ]
def _image_to_data(self, img) -> str: def _image_to_data(self, img) -> list[list]:
if img is None:
return '\n'
if not self.use_fullframe: if not self.use_fullframe:
# only use bottom half of the frame by default # only use bottom half of the frame by default
img = img[self.height // 2:, :] img = img[self.height // 2:, :]
img = cv2.dilate(img, np.ones((2, 2), np.uint8))
_, img = cv2.threshold(img, 215, 255, cv2.THRESH_BINARY) _, img = cv2.threshold(img, 215, 255, cv2.THRESH_BINARY)
color_mask = cv2.inRange(img, (255, 255, 255), (255, 255, 255)) return PaddleOCR(lang='ch').ocr(img)
img = cv2.bitwise_and(img, img, mask=color_mask)
img = cv2.erode(img, np.ones((2, 2), np.uint8))
img = cv2.bitwise_not(img)
img = cv2.resize(img, self.resize_dim, interpolation=cv2.INTER_AREA)
img = cv2.copyMakeBorder(img, 20, 20, 0, 0, cv2.BORDER_CONSTANT, None, (255,255,255))
config = '--tessdata-dir "{}" --psm 7 -c preserve_interword_spaces=1'.format(constants.TESSDATA_DIR)
try:
return pytesseract.image_to_data(img, lang=self.lang, config=config)
except Exception as e:
sys.exit('{}: {}'.format(e.__class__.__name__, e))
def get_subtitles(self, sim_threshold: int) -> str: def get_subtitles(self, sim_threshold: int) -> str:
self._generate_subtitles(sim_threshold) self._generate_subtitles(sim_threshold)