mirror of
https://gitlab.os-k.eu/neox/CNIRevelator.git
synced 2023-08-25 14:03:10 +02:00
Delete pytesseract.py
This commit is contained in:
parent
99ce7d59f7
commit
ea40cf5938
326
pytesseract.py
326
pytesseract.py
@ -1,326 +0,0 @@
|
||||
"""
|
||||
********************************************************************************
|
||||
* CNIRevelator *
|
||||
* *
|
||||
* Desc: Pytesseract modification to comply with Pyinstaller *
|
||||
* *
|
||||
* Copyright © 2017-2018 Matthias A. Lee (madmaze) *
|
||||
* Copyright © 2018-2019 Adrien Bourmault (neox95) *
|
||||
* *
|
||||
* This file is part of CNIRevelator. *
|
||||
* *
|
||||
* CNIRevelator is free software: you can redistribute it and/or modify *
|
||||
* it under the terms of the GNU General Public License as published by *
|
||||
* the Free Software Foundation, either version 3 of the License, or *
|
||||
* any later version. *
|
||||
* *
|
||||
* CNIRevelator is distributed in the hope that it will be useful, *
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
||||
* GNU General Public License for more details. *
|
||||
* *
|
||||
* You should have received a copy of the GNU General Public License *
|
||||
* along with CNIRevelator. If not, see <https://www.gnu.org/licenses/>. *
|
||||
********************************************************************************
|
||||
"""
|
||||
|
||||
|
||||
try:
|
||||
import Image
|
||||
except ImportError:
|
||||
from PIL import Image
|
||||
|
||||
import os, sys, subprocess, tempfile, shlex, string
|
||||
from glob import iglob
|
||||
from pkgutil import find_loader
|
||||
from distutils.version import LooseVersion
|
||||
from os.path import realpath, normpath, normcase
|
||||
# numpy support is optional: ndarray inputs are only accepted when numpy can
# actually be imported.  Attempting the import directly (instead of probing
# with pkgutil.find_loader, which is deprecated in recent Python releases)
# also copes with a numpy that is discoverable but fails to import.
try:
    from numpy import ndarray
except ImportError:
    numpy_installed = False
else:
    numpy_installed = True

# Executable placed at the front of the command line built by run_tesseract.
tesseract_cmd = 'tesseract'

# Pillow mode images are converted to before they are written to disk.
RGB_MODE = 'RGB'

# Maps each label of the OSD report to the (result key, converter) pair used
# by osd_to_dict when it builds its dictionary.
OSD_KEYS = {
    'Page number': ('page_num', int),
    'Orientation in degrees': ('orientation', int),
    'Rotate': ('rotate', int),
    'Orientation confidence': ('orientation_conf', float),
    'Script': ('script', str),
    'Script confidence': ('script_conf', float),
}
|
||||
|
||||
class Output:
    """Names of the result formats accepted by the ``output_type`` arguments."""

    # Raw bytes exactly as read from Tesseract's output file.
    BYTES = 'bytes'
    # Result converted to a dictionary.
    DICT = 'dict'
    # Decoded, whitespace-stripped text (the default everywhere).
    STRING = 'string'
|
||||
|
||||
|
||||
class TesseractError(RuntimeError):
    """Failure of a Tesseract run, carrying its exit status and error text."""

    def __init__(self, status, message):
        # The base initializer records (status, message) as ``args``.
        super(TesseractError, self).__init__(status, message)
        self.status, self.message = status, message
|
||||
|
||||
|
||||
class TesseractNotFoundError(EnvironmentError):
    """Signals that the executable named by ``tesseract_cmd`` is unavailable."""

    def __init__(self):
        # The message names whatever command is configured at raise time.
        reason = tesseract_cmd + " is not installed or it's not in your path"
        super().__init__(reason)
|
||||
|
||||
|
||||
class TSVNotSupported(EnvironmentError):
    """Signals that the installed Tesseract cannot produce TSV output."""

    def __init__(self):
        reason = 'TSV output not supported. Tesseract >= 3.05 required'
        super().__init__(reason)
|
||||
|
||||
|
||||
def run_once(func):
    """Decorate *func* so that it is executed at most once.

    The value returned by the first call that completes is stored on the
    wrapper and handed back by every later call; the arguments of those
    later calls are ignored.  If *func* raises, nothing is stored and the
    next call runs it again.

    The wrapper keeps the name and docstring of *func*.
    """
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # The wrapper object itself marks "not computed yet", so that any
        # value func may return (including None) can be cached.
        if wrapper._result is wrapper:
            wrapper._result = func(*args, **kwargs)
        return wrapper._result

    wrapper._result = wrapper
    return wrapper
|
||||
|
||||
|
||||
def get_errors(error_string):
    """Collapse raw stderr bytes into a single-line message.

    Args:
        error_string: bytes captured from the process' stderr.

    Returns:
        The decoded text with its lines joined by single spaces and the
        surrounding whitespace removed.
    """
    # Undecodable bytes are replaced rather than raised on: this helper only
    # formats the text of an error that is about to be reported, and a
    # UnicodeDecodeError here would hide that error.
    text = error_string.decode('utf-8', 'replace')
    return ' '.join(text.splitlines()).strip()
|
||||
|
||||
|
||||
def cleanup(temp_name):
    """Remove every file whose path starts with *temp_name*.

    Removal is best effort: files that cannot be deleted are skipped.  A
    falsy *temp_name* is handed to the glob unchanged, without a wildcard.
    """
    pattern = temp_name
    if temp_name:
        pattern = temp_name + '*'

    for path in iglob(pattern):
        try:
            os.remove(path)
        except OSError:
            # Deliberately ignored: a leftover temp file is not fatal.
            continue
|
||||
|
||||
|
||||
def prepare(image):
    """Return *image* as a PIL image.

    Args:
        image: a PIL ``Image.Image`` or, when numpy is installed, a numpy
            ``ndarray``.

    Returns:
        *image* itself when it already is a PIL image, otherwise the PIL
        image built from the array.

    Raises:
        TypeError: if *image* is neither of the supported types.
    """
    if isinstance(image, Image.Image):
        return image

    # Only real ndarrays are converted.  Previously the ndarray check guarded
    # nothing but a ``pass``; it now explicitly gates the conversion.
    if numpy_installed and isinstance(image, ndarray):
        return Image.fromarray(image)

    raise TypeError('Unsupported image object')
|
||||
|
||||
|
||||
def save_image(image):
    """Make *image* available as a file and pick a base name for temp files.

    Args:
        image: a path string, or an object accepted by ``prepare``.

    Returns:
        A ``(temp_name, input_file_name)`` tuple.  ``temp_name`` is a fresh
        temporary path prefix; ``input_file_name`` is the normalized path of
        the given file, or the path of the image file written here.
    """
    # NOTE(review): tempfile.mktemp only picks a name, it does not create the
    # file; kept as is because cleanup() globs on this prefix.
    base_name = tempfile.mktemp(prefix='tess_')

    if isinstance(image, str):
        # A path was supplied: nothing is written, the path is normalized.
        source_path = realpath(normpath(normcase(image)))
        return base_name, source_path

    picture = prepare(image)

    # Formats outside this set are written as PNG.
    file_format = picture.format
    if file_format not in {'BMP', 'JPEG', 'GIF', 'TIFF', 'PNG'}:
        file_format = 'PNG'

    if not picture.mode.startswith(RGB_MODE):
        picture = picture.convert(RGB_MODE)

    if 'A' in picture.getbands():
        # Paste onto a white canvas, using the image itself as the mask.
        canvas = Image.new(RGB_MODE, picture.size, (255, 255, 255))
        canvas.paste(picture, (0, 0), picture)
        picture = canvas

    target_path = base_name + os.extsep + file_format
    picture.save(target_path, format=file_format, **picture.info)
    return base_name, target_path
|
||||
|
||||
|
||||
def subprocess_args(include_stdout=True):
    """Build the keyword arguments used to launch the tesseract process.

    Args:
        include_stdout: when true, the result also redirects stdout to a pipe.

    Returns:
        A dict with ``stdin``/``stderr`` pipes, ``startupinfo`` and ``env``
        entries, preceded by a ``stdout`` pipe when requested.
    """
    startup, environment = None, None
    if hasattr(subprocess, 'STARTUPINFO'):
        # Only taken when the subprocess module offers STARTUPINFO.
        startup = subprocess.STARTUPINFO()
        startup.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        environment = os.environ

    options = {'stdout': subprocess.PIPE} if include_stdout else {}
    options['stdin'] = subprocess.PIPE
    options['stderr'] = subprocess.PIPE
    options['startupinfo'] = startup
    options['env'] = environment
    return options
|
||||
|
||||
|
||||
def run_tesseract(input_filename, output_filename_base, extension, lang, config='', nice=0):
    """Run the tesseract executable on one input file.

    Args:
        input_filename: path of the image handed to tesseract.
        output_filename_base: output path without extension.
        extension: requested output kind; appended to the command line
            unless it is 'box', 'osd' or 'tsv'.
        lang: value for the ``-l`` option, or None to omit the option.
        config: extra command-line arguments, split with ``shlex.split``.
        nice: when non-zero and not on win32, the command is prefixed with
            ``nice -n <nice>``.

    Returns:
        True when the process exits with status 0.

    Raises:
        TesseractNotFoundError: if launching the process raises OSError.
        TesseractError: if the process exits with a non-zero status.
    """
    cmd_args = []
    if nice != 0 and not sys.platform.startswith('win32'):
        cmd_args += ('nice', '-n', str(nice))

    cmd_args += (tesseract_cmd, input_filename, output_filename_base)
    if lang is not None:
        cmd_args += ('-l', lang)

    cmd_args += shlex.split(config)
    if extension not in ('box', 'osd', 'tsv'):
        cmd_args.append(extension)

    try:
        proc = subprocess.Popen(cmd_args, **subprocess_args())
    except OSError:
        raise TesseractNotFoundError()

    # communicate() drains stdout and stderr while waiting and then closes
    # every pipe.  Calling wait() before reading can deadlock once the child
    # fills a pipe buffer, and it left the stdin/stdout pipes open.
    _, error_string = proc.communicate()
    status_code = proc.returncode
    if status_code:
        raise TesseractError(status_code, get_errors(error_string))

    return True
|
||||
|
||||
|
||||
def run_and_get_output(image, extension, lang=None, config='', nice=0, return_bytes=False):
    """Run Tesseract on *image* and return the content of its output file.

    Args:
        image: path string or image object, as accepted by ``save_image``.
        extension: output kind, also the extension of the file read back.
        lang, config, nice: forwarded unchanged to ``run_tesseract``.
        return_bytes: when true the raw bytes are returned, otherwise the
            UTF-8 decoded and stripped text.

    The temporary files are removed whether or not the run succeeds.
    """
    # Stays empty if save_image fails, so the cleanup below has a value.
    temp_name = ''
    try:
        temp_name, input_filename = save_image(image)
        output_base = temp_name + '_out'
        run_tesseract(
            input_filename=input_filename,
            output_filename_base=output_base,
            extension=extension,
            lang=lang,
            config=config,
            nice=nice,
        )
        with open(output_base + os.extsep + extension, 'rb') as result_file:
            raw = result_file.read()
        if return_bytes:
            return raw
        return raw.decode('utf-8').strip()
    finally:
        cleanup(temp_name)
|
||||
|
||||
|
||||
def file_to_dict(tsv, cell_delimiter, str_col_idx):
    """Convert delimited text with a header row into a dict of columns.

    Args:
        tsv: the text; the first non-blank line holds the column names.
        cell_delimiter: separator between the cells of a line.
        str_col_idx: index of the single column kept as strings (negative
            values count from the end); all other columns are converted
            with ``int``.

    Returns:
        A dict mapping each column name to the list of its values.  Empty
        input gives an empty dict; a header without data rows gives empty
        lists.
    """
    # Blank lines carry no cells.  Skipping them makes empty input and a
    # header followed only by a newline well defined instead of raising.
    rows = [line.split(cell_delimiter) for line in tsv.split('\n') if line]
    if not rows:
        return {}

    header = rows.pop(0)
    # Pad a last row that is one cell short with an empty cell.
    if rows and len(rows[-1]) < len(header):
        rows[-1].append('')

    if str_col_idx < 0:
        str_col_idx += len(header)

    return {
        head: [row[i] if i == str_col_idx else int(row[i]) for row in rows]
        for i, head in enumerate(header)
    }
|
||||
|
||||
|
||||
def is_valid(val, _type):
    """Tell whether the string *val* can be converted with *_type*.

    Args:
        val: the text to check.
        _type: the target converter; ``int`` and ``float`` are checked,
            any other type accepts every value.

    Returns:
        True if the conversion is expected to succeed, False otherwise.
    """
    if _type is int:
        return val.isdigit()

    # The float probe applies to float only.  It used to run for every
    # non-int type, so ordinary text was rejected for ``str``.
    if _type is float:
        try:
            float(val)
        except ValueError:
            return False
        return True

    return True
|
||||
|
||||
|
||||
def osd_to_dict(osd):
    """Parse an OSD report into a dict keyed by the names in ``OSD_KEYS``.

    Each line is split on ': '; lines that do not give exactly two parts are
    skipped, and so are values that fail ``is_valid`` for their converter.
    """
    parsed = {}
    for line in osd.split('\n'):
        parts = line.split(': ')
        if len(parts) != 2:
            continue
        label, raw_value = parts
        if is_valid(raw_value, OSD_KEYS[label][1]):
            parsed[OSD_KEYS[label][0]] = OSD_KEYS[label][1](raw_value)
    return parsed
|
||||
|
||||
|
||||
@run_once
def get_tesseract_version():
    """
    Returns LooseVersion object of the Tesseract version

    The version is the second whitespace-separated token printed by
    ``tesseract --version``, with leading non-digit characters removed.

    Raises TesseractNotFoundError if launching the executable raises OSError.
    """
    options = subprocess_args(False)
    # stderr is merged into the captured output instead of going to a
    # separate, discarded pipe.  NOTE(review): older Tesseract releases
    # appear to print the version banner on stderr - confirm on a 3.x install.
    options['stderr'] = subprocess.STDOUT
    try:
        banner = subprocess.check_output([tesseract_cmd, '--version'], **options)
    except OSError:
        raise TesseractNotFoundError()

    version_token = banner.decode('utf-8').split()[1]
    # string.printable[10:] is every printable character except the digits.
    return LooseVersion(version_token.lstrip(string.printable[10:]))
|
||||
|
||||
|
||||
def image_to_string(image, lang=None, config='', nice=0, boxes=False, output_type=Output.STRING):
    """
    Returns the result of a Tesseract OCR run on the provided image to string

    With ``boxes`` set, a deprecation warning is printed and the call is
    delegated to image_to_boxes.  Otherwise the 'txt' output is returned as
    {'text': ...} for Output.DICT, as bytes for Output.BYTES and as text for
    anything else.
    """
    if boxes:
        print("\nWarning: Argument 'boxes' is deprecated and will be removed in future versions. Use function image_to_boxes instead.\n")
        return image_to_boxes(image, lang, config, nice, output_type)

    if output_type == Output.DICT:
        return {'text': run_and_get_output(image, 'txt', lang, config, nice)}

    if output_type == Output.BYTES:
        return run_and_get_output(image, 'txt', lang, config, nice, True)
    return run_and_get_output(image, 'txt', lang, config, nice)
|
||||
|
||||
|
||||
def image_to_boxes(image, lang=None, config='', nice=0, output_type=Output.STRING):
    """
    Returns string containing recognized characters and their box boundaries

    For Output.DICT the box text is given a header line and converted with
    file_to_dict; for Output.BYTES the raw bytes are returned.
    """
    config += ' batch.nochop makebox'

    if output_type == Output.DICT:
        box_header = 'char left bottom right top page\n'
        box_text = run_and_get_output(image, 'box', lang, config, nice)
        return file_to_dict(box_header + box_text, ' ', 0)

    if output_type == Output.BYTES:
        return run_and_get_output(image, 'box', lang, config, nice, True)
    return run_and_get_output(image, 'box', lang, config, nice)
|
||||
|
||||
|
||||
def image_to_data(image, lang=None, config='', nice=0, output_type=Output.STRING):
    """
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+

    Raises TSVNotSupported when the detected Tesseract version is older.
    For Output.DICT the TSV is converted with file_to_dict, keeping the last
    column as strings; for Output.BYTES the raw bytes are returned.
    """
    if get_tesseract_version() < '3.05':
        raise TSVNotSupported()

    # The TSV switch goes first, followed by the caller's own options.
    config = '{} {}'.format('-c tessedit_create_tsv=1', config.strip()).strip()

    if output_type == Output.DICT:
        return file_to_dict(run_and_get_output(image, 'tsv', lang, config, nice), '\t', -1)

    if output_type == Output.BYTES:
        return run_and_get_output(image, 'tsv', lang, config, nice, True)
    return run_and_get_output(image, 'tsv', lang, config, nice)
|
||||
|
||||
|
||||
def image_to_osd(image, lang='osd', config='', nice=0, output_type=Output.STRING):
    """
    Returns string containing the orientation and script detection (OSD)

    For Output.DICT the report is parsed with osd_to_dict; for Output.BYTES
    the raw bytes are returned.
    """
    # Versions below 3.05 get '-psm', newer ones '--psm'.
    dash = '' if get_tesseract_version() < '3.05' else '-'
    config = '{}-psm 0 {}'.format(dash, config.strip()).strip()

    if output_type == Output.DICT:
        return osd_to_dict(run_and_get_output(image, 'osd', lang, config, nice))

    if output_type == Output.BYTES:
        return run_and_get_output(image, 'osd', lang, config, nice, True)
    return run_and_get_output(image, 'osd', lang, config, nice)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: OCR one image file and print the text.

    Accepted forms are ``input_file`` and ``-l lang input_file``.  Any other
    argument list prints the usage line to stderr and exits with status 2;
    an IOError while processing the file exits with status 1.
    """
    argv = sys.argv
    if len(argv) == 2:
        filename, lang = argv[1], None
    elif len(argv) == 4 and argv[1] == '-l':
        # The usage error no longer fires for this valid form.
        filename, lang = argv[3], argv[2]
    else:
        sys.stderr.write('Usage: python pytesseract.py [-l lang] input_file\n')
        # sys.exit rather than the bare ``exit`` helper, which the site
        # module injects and which is not guaranteed to exist.
        sys.exit(2)

    try:
        print(image_to_string(Image.open(filename), lang=lang))
    except IOError:
        sys.stderr.write('ERROR: Could not open file "%s"\n' % filename)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
Loading…
x
Reference in New Issue
Block a user