forked from pradana.aumars/videocr
Compare commits
1 Commits
Author | SHA1 | Date | |
---|---|---|---|
92d131ecd6 |
@ -17,15 +17,25 @@ class PredictedFrame:
|
|||||||
confidence: int # total confidence of all words
|
confidence: int # total confidence of all words
|
||||||
text: str
|
text: str
|
||||||
|
|
||||||
def __init__(self, index: int, pred_data: list[list], conf_threshold: int):
|
def __init__(self, index: int, pred_data: str, conf_threshold: int):
|
||||||
self.index = index
|
self.index = index
|
||||||
self.words = []
|
self.words = []
|
||||||
|
|
||||||
for l in pred_data:
|
block = 0 # keep track of line breaks
|
||||||
if len(l) < 2:
|
|
||||||
|
for l in pred_data.splitlines()[1:]:
|
||||||
|
word_data = l.split()
|
||||||
|
if len(word_data) < 12:
|
||||||
|
# no word is predicted
|
||||||
continue
|
continue
|
||||||
text = l[1][0]
|
_, _, block_num, *_, conf, text = word_data
|
||||||
conf = int(l[1][1] * 100)
|
block_num, conf = int(block_num), int(conf)
|
||||||
|
|
||||||
|
# handle line breaks
|
||||||
|
if block < block_num:
|
||||||
|
block = block_num
|
||||||
|
if self.words and self.words[-1].text != '\n':
|
||||||
|
self.words.append(PredictedWord(0, '\n'))
|
||||||
|
|
||||||
# word predictions with low confidence will be filtered out
|
# word predictions with low confidence will be filtered out
|
||||||
if conf >= conf_threshold:
|
if conf >= conf_threshold:
|
||||||
|
@ -1,12 +1,15 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import List
|
from typing import List
|
||||||
|
import sys
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
|
import pytesseract
|
||||||
import cv2
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from . import constants
|
||||||
from . import utils
|
from . import utils
|
||||||
from .models import PredictedFrame, PredictedSubtitle
|
from .models import PredictedFrame, PredictedSubtitle
|
||||||
from .opencv_adapter import Capture
|
from .opencv_adapter import Capture
|
||||||
from paddleocr import PaddleOCR
|
|
||||||
|
|
||||||
|
|
||||||
class Video:
|
class Video:
|
||||||
@ -16,6 +19,8 @@ class Video:
|
|||||||
num_frames: int
|
num_frames: int
|
||||||
fps: float
|
fps: float
|
||||||
height: int
|
height: int
|
||||||
|
width: int
|
||||||
|
resize_dim: List[int]
|
||||||
pred_frames: List[PredictedFrame]
|
pred_frames: List[PredictedFrame]
|
||||||
pred_subs: List[PredictedSubtitle]
|
pred_subs: List[PredictedSubtitle]
|
||||||
|
|
||||||
@ -25,6 +30,9 @@ class Video:
|
|||||||
self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT))
|
self.num_frames = int(v.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||||
self.fps = v.get(cv2.CAP_PROP_FPS)
|
self.fps = v.get(cv2.CAP_PROP_FPS)
|
||||||
self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
self.height = int(v.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||||
|
self.width = int(v.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||||
|
scale_percent = 47 # apparently 32 pixels is the optimal character height for tesseract.
|
||||||
|
self.resize_dim=(int(self.width * scale_percent/100), int(self.height * scale_percent/100))
|
||||||
|
|
||||||
def run_ocr(self, lang: str, time_start: str, time_end: str,
|
def run_ocr(self, lang: str, time_start: str, time_end: str,
|
||||||
conf_threshold: int, use_fullframe: bool) -> None:
|
conf_threshold: int, use_fullframe: bool) -> None:
|
||||||
@ -50,15 +58,23 @@ class Video:
|
|||||||
for i, data in enumerate(it_ocr)
|
for i, data in enumerate(it_ocr)
|
||||||
]
|
]
|
||||||
|
|
||||||
def _image_to_data(self, img) -> list[list]:
|
def _image_to_data(self, img) -> str:
|
||||||
if img is None:
|
|
||||||
return '\n'
|
|
||||||
if not self.use_fullframe:
|
if not self.use_fullframe:
|
||||||
# only use bottom half of the frame by default
|
# only use bottom half of the frame by default
|
||||||
img = img[self.height // 2:, :]
|
img = img[self.height // 2:, :]
|
||||||
|
img = cv2.dilate(img, np.ones((2, 2), np.uint8))
|
||||||
_, img = cv2.threshold(img, 215, 255, cv2.THRESH_BINARY)
|
_, img = cv2.threshold(img, 215, 255, cv2.THRESH_BINARY)
|
||||||
return PaddleOCR(lang='ch').ocr(img)
|
color_mask = cv2.inRange(img, (255, 255, 255), (255, 255, 255))
|
||||||
|
img = cv2.bitwise_and(img, img, mask=color_mask)
|
||||||
|
img = cv2.erode(img, np.ones((2, 2), np.uint8))
|
||||||
|
img = cv2.bitwise_not(img)
|
||||||
|
img = cv2.resize(img, self.resize_dim, interpolation=cv2.INTER_AREA)
|
||||||
|
img = cv2.copyMakeBorder(img, 20, 20, 0, 0, cv2.BORDER_CONSTANT, None, (255,255,255))
|
||||||
|
config = '--tessdata-dir "{}" --psm 7 -c preserve_interword_spaces=1'.format(constants.TESSDATA_DIR)
|
||||||
|
try:
|
||||||
|
return pytesseract.image_to_data(img, lang=self.lang, config=config)
|
||||||
|
except Exception as e:
|
||||||
|
sys.exit('{}: {}'.format(e.__class__.__name__, e))
|
||||||
|
|
||||||
def get_subtitles(self, sim_threshold: int) -> str:
|
def get_subtitles(self, sim_threshold: int) -> str:
|
||||||
self._generate_subtitles(sim_threshold)
|
self._generate_subtitles(sim_threshold)
|
||||||
|
Loading…
Reference in New Issue
Block a user