From 9b37319961afe28f360b44cafde061cd9832177c Mon Sep 17 00:00:00 2001
From: Yun <mrqianhuzi@gmail.com>
Date: Fri, 16 Jul 2021 16:58:44 +0200
Subject: [PATCH 1/2] Update model to use PaddleOCR results

---
 videocr/models.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/videocr/models.py b/videocr/models.py
index 121cb0f..d0bb090 100644
--- a/videocr/models.py
+++ b/videocr/models.py
@@ -17,25 +17,15 @@ class PredictedFrame:
     confidence: int  # total confidence of all words
     text: str
 
-    def __init__(self, index: int, pred_data: str, conf_threshold: int):
+    def __init__(self, index: int, pred_data: list[list], conf_threshold: int):
         self.index = index
         self.words = []
 
-        block = 0  # keep track of line breaks
-
-        for l in pred_data.splitlines()[1:]:
-            word_data = l.split()
-            if len(word_data) < 12:
-                # no word is predicted
+        for l in pred_data:
+            if len(l) < 2:
                 continue
-            _, _, block_num, *_, conf, text = word_data
-            block_num, conf = int(block_num), int(conf)
-
-            # handle line breaks
-            if block < block_num:
-                block = block_num
-                if self.words and self.words[-1].text != '\n':
-                    self.words.append(PredictedWord(0, '\n'))
+            text = l[1][0]
+            conf = int(l[1][1] * 100)
 
             # word predictions with low confidence will be filtered out
             if conf >= conf_threshold:
-- 
2.39.5


From 37de9b3e5f1c1b5bb861fa3583ea1d808996b0ad Mon Sep 17 00:00:00 2001
From: Yun <mrqianhuzi@gmail.com>
Date: Fri, 16 Jul 2021 17:01:18 +0200
Subject: [PATCH 2/2] Update image processing to use PaddleOCR instead of
 tesseract

---
 videocr/video.py | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

diff --git a/videocr/video.py b/videocr/video.py
index dbf2356..7510e41 100644
--- a/videocr/video.py
+++ b/videocr/video.py
@@ -1,15 +1,12 @@
 from __future__ import annotations
 from typing import List
-import sys
 import multiprocessing
-import pytesseract
 import cv2
-import numpy as np
 
-from . import constants
 from . import utils
 from .models import PredictedFrame, PredictedSubtitle
 from .opencv_adapter import Capture
+from paddleocr import PaddleOCR
 
 
 class Video:
@@ -19,8 +16,6 @@ class Video:
     num_frames: int
     fps: float
     height: int
-    width: int
-    resize_dim: List[int]
     pred_frames: List[PredictedFrame]
     pred_subs: List[PredictedSubtitle]
 
@@ -30,9 +25,6 @@ class Video:
             self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT))
             self.fps = v.get(cv2.CAP_PROP_FPS)
             self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT))
-            self.width = int(v.get(cv2.CAP_PROP_FRAME_WIDTH))
-            scale_percent = 47 # apparently 32 pixels is the optimal character height for tesseract.
-            self.resize_dim=(int(self.width * scale_percent/100), int(self.height * scale_percent/100))
 
     def run_ocr(self, lang: str, time_start: str, time_end: str,
                 conf_threshold: int, use_fullframe: bool) -> None:
@@ -58,23 +50,15 @@ class Video:
                 for i, data in enumerate(it_ocr)
             ]
 
-    def _image_to_data(self, img) -> str:
+    def _image_to_data(self, img) -> list[list]:
+        if img is None:
+            return []  # no frame to read: report zero OCR lines, as a list like the annotation says
         if not self.use_fullframe:
             # only use bottom half of the frame by default
             img = img[self.height // 2:, :]
-        img = cv2.dilate(img, np.ones((2, 2), np.uint8))
         _, img = cv2.threshold(img, 215, 255, cv2.THRESH_BINARY)
-        color_mask = cv2.inRange(img, (255, 255, 255), (255, 255, 255))
-        img = cv2.bitwise_and(img, img, mask=color_mask)
-        img = cv2.erode(img, np.ones((2, 2), np.uint8))
-        img = cv2.bitwise_not(img)
-        img = cv2.resize(img, self.resize_dim, interpolation=cv2.INTER_AREA)
-        img = cv2.copyMakeBorder(img, 20, 20, 0, 0, cv2.BORDER_CONSTANT, None, (255,255,255))
-        config = '--tessdata-dir "{}" --psm 7 -c preserve_interword_spaces=1'.format(constants.TESSDATA_DIR)
-        try:
-            return pytesseract.image_to_data(img, lang=self.lang, config=config)
-        except Exception as e:
-            sys.exit('{}: {}'.format(e.__class__.__name__, e))
+        # NOTE(review): a new PaddleOCR engine is built on every call with lang hard-coded to 'ch'; self.lang is not consulted here
+        return PaddleOCR(lang='ch').ocr(img)
 
     def get_subtitles(self, sim_threshold: int) -> str:
         self._generate_subtitles(sim_threshold)
-- 
2.39.5