From 8ea7686bdd6f591a4866597bd03f792a0fa5d286 Mon Sep 17 00:00:00 2001 From: Adrien Bourmault Date: Tue, 9 Jul 2019 16:10:30 +0200 Subject: [PATCH] Delete CNI_pytesseract.py --- src/analyzer/CNI_pytesseract.py | 326 -------------------------------- 1 file changed, 326 deletions(-) delete mode 100644 src/analyzer/CNI_pytesseract.py diff --git a/src/analyzer/CNI_pytesseract.py b/src/analyzer/CNI_pytesseract.py deleted file mode 100644 index ba023a5..0000000 --- a/src/analyzer/CNI_pytesseract.py +++ /dev/null @@ -1,326 +0,0 @@ -""" -******************************************************************************** -* CNIRevelator * -* * -* Desc: Pytesseract modification to comply with Pyinstaller * -* * -* Copyright © 2017-2018 Matthias A. Lee (madmaze) * -* Copyright © 2018-2019 Adrien Bourmault (neox95) * -* * -* This file is part of CNIRevelator. * -* * -* CNIRevelator is free software: you can redistribute it and/or modify * -* it under the terms of the GNU General Public License as published by * -* the Free Software Foundation, either version 3 of the License, or * -* any later version. * -* * -* CNIRevelator is distributed in the hope that it will be useful, * -* but WITHOUT ANY WARRANTY*without even the implied warranty of * -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * -* GNU General Public License for more details. * -* * -* You should have received a copy of the GNU General Public License * -* along with CNIRevelator. If not, see . * -******************************************************************************** -""" - - -try: - import Image -except ImportError: - from PIL import Image - -import os, sys, subprocess, tempfile, shlex, string -from glob import iglob -from pkgutil import find_loader -from distutils.version import LooseVersion -from os.path import realpath, normpath, normcase -numpy_installed = find_loader('numpy') is not None -if numpy_installed: - from numpy import ndarray -tesseract_cmd = 'tesseract' -RGB_MODE = 'RGB' -OSD_KEYS = {'Page number':( - 'page_num', int), - 'Orientation in degrees':( - 'orientation', int), - 'Rotate':( - 'rotate', int), - 'Orientation confidence':( - 'orientation_conf', float), - 'Script':( - 'script', str), - 'Script confidence':( - 'script_conf', float)} - -class Output: - STRING = 'string' - BYTES = 'bytes' - DICT = 'dict' - - -class TesseractError(RuntimeError): - - def __init__(self, status, message): - self.status = status - self.message = message - self.args = (status, message) - - -class TesseractNotFoundError(EnvironmentError): - - def __init__(self): - super(TesseractNotFoundError, self).__init__(tesseract_cmd + " is not installed or it's not in your path") - - -class TSVNotSupported(EnvironmentError): - - def __init__(self): - super(TSVNotSupported, self).__init__('TSV output not supported. Tesseract >= 3.05 required') - - -def run_once(func): - - def wrapper(*args, **kwargs): - if wrapper._result is wrapper: - wrapper._result = func(*args, **kwargs) - return wrapper._result - - wrapper._result = wrapper - return wrapper - - -def get_errors(error_string): - return ' '.join(line for line in error_string.decode('utf-8').splitlines()).strip() - - -def cleanup(temp_name): - """ Tries to remove files by filename wildcard path. """ - for filename in iglob(temp_name + '*' if temp_name else temp_name): - try: - os.remove(filename) - except OSError: - pass - - -def prepare(image): - if isinstance(image, Image.Image): - return image - if numpy_installed: - if isinstance(image, ndarray): - pass - return Image.fromarray(image) - raise TypeError('Unsupported image object') - - -def save_image(image): - temp_name = tempfile.mktemp(prefix='tess_') - if isinstance(image, str): - return (temp_name, realpath(normpath(normcase(image)))) - else: - image = prepare(image) - img_extension = image.format - if image.format not in frozenset({'BMP', 'JPEG', 'GIF', 'TIFF', 'PNG'}): - img_extension = 'PNG' - if not image.mode.startswith(RGB_MODE): - image = image.convert(RGB_MODE) - if 'A' in image.getbands(): - background = Image.new(RGB_MODE, image.size, (255, 255, 255)) - background.paste(image, (0, 0), image) - image = background - input_file_name = temp_name + os.extsep + img_extension - (image.save)(input_file_name, format=img_extension, **image.info) - return ( - temp_name, input_file_name) - - -def subprocess_args(include_stdout=True): - if hasattr(subprocess, 'STARTUPINFO'): - si = subprocess.STARTUPINFO() - si.dwFlags |= subprocess.STARTF_USESHOWWINDOW - env = os.environ - else: - si = None - env = None - if include_stdout: - ret = {'stdout': subprocess.PIPE} - else: - ret = {} - ret.update({'stdin':subprocess.PIPE, 'stderr':subprocess.PIPE, - 'startupinfo':si, - 'env':env}) - return ret - - -def run_tesseract(input_filename, output_filename_base, extension, lang, config='', nice=0): - cmd_args = [] - if not sys.platform.startswith('win32'): - if nice != 0: - cmd_args += ('nice', '-n', str(nice)) - cmd_args += (tesseract_cmd, input_filename, output_filename_base) - if lang is not None: - cmd_args += ('-l', lang) - cmd_args += shlex.split(config) - if extension not in ('box', 'osd', 'tsv'): - cmd_args.append(extension) - try: - proc = (subprocess.Popen)(cmd_args, **subprocess_args()) - except OSError: - raise TesseractNotFoundError() - - status_code, error_string = proc.wait(), proc.stderr.read() - proc.stderr.close() - if status_code: - raise TesseractError(status_code, get_errors(error_string)) - return True - - -def run_and_get_output(image, extension, lang=None, config='', nice=0, return_bytes=False): - temp_name, input_filename = ('', '') - try: - temp_name, input_filename = save_image(image) - kwargs = {'input_filename':input_filename, - 'output_filename_base':temp_name + '_out', - 'extension':extension, - 'lang':lang, - 'config':config, - 'nice':nice} - run_tesseract(**kwargs) - filename = kwargs['output_filename_base'] + os.extsep + extension - with open(filename, 'rb') as (output_file): - if return_bytes: - return output_file.read() - return output_file.read().decode('utf-8').strip() - finally: - cleanup(temp_name) - - -def file_to_dict(tsv, cell_delimiter, str_col_idx): - result = {} - rows = [row.split(cell_delimiter) for row in tsv.split('\n')] - if not rows: - return result - else: - header = rows.pop(0) - if len(rows[(-1)]) < len(header): - rows[(-1)].append('') - if str_col_idx < 0: - str_col_idx += len(header) - for i, head in enumerate(header): - result[head] = [int(row[i]) if i != str_col_idx else row[i] for row in rows] - - return result - - -def is_valid(val, _type): - if _type is int: - return val.isdigit() - else: - if _type is float: - pass - try: - float(val) - return True - except ValueError: - return False - - return True - - -def osd_to_dict(osd): - return {OSD_KEYS[kv[0]][0]:OSD_KEYS[kv[0]][1](kv[1]) for kv in (line.split(': ') for line in osd.split('\n')) if len(kv) == 2 if is_valid(kv[1], OSD_KEYS[kv[0]][1])} - - -@run_once -def get_tesseract_version(): - """ - Returns LooseVersion object of the Tesseract version - """ - try: - return LooseVersion((subprocess.check_output)([tesseract_cmd, '--version'], **subprocess_args(False)).decode('utf-8').split()[1].lstrip(string.printable[10:])) - except OSError: - raise TesseractNotFoundError() - - -def image_to_string(image, lang=None, config='', nice=0, boxes=False, output_type=Output.STRING): - """ - Returns the result of a Tesseract OCR run on the provided image to string - """ - if boxes: - print("\nWarning: Argument 'boxes' is deprecated and will be removed in future versions. Use function image_to_boxes instead.\n") - return image_to_boxes(image, lang, config, nice, output_type) - else: - args = [ - image, 'txt', lang, config, nice] - if output_type == Output.DICT: - return {'text': run_and_get_output(*args)} - if output_type == Output.BYTES: - args.append(True) - return run_and_get_output(*args) - - -def image_to_boxes(image, lang=None, config='', nice=0, output_type=Output.STRING): - """ - Returns string containing recognized characters and their box boundaries - """ - config += ' batch.nochop makebox' - args = [image, 'box', lang, config, nice] - if output_type == Output.DICT: - box_header = 'char left bottom right top page\n' - return file_to_dict(box_header + run_and_get_output(*args), ' ', 0) - else: - if output_type == Output.BYTES: - args.append(True) - return run_and_get_output(*args) - - -def image_to_data(image, lang=None, config='', nice=0, output_type=Output.STRING): - """ - Returns string containing box boundaries, confidences, - and other information. Requires Tesseract 3.05+ - """ - if get_tesseract_version() < '3.05': - raise TSVNotSupported() - config = '{} {}'.format('-c tessedit_create_tsv=1', config.strip()).strip() - args = [image, 'tsv', lang, config, nice] - if output_type == Output.DICT: - return file_to_dict(run_and_get_output(*args), '\t', -1) - else: - if output_type == Output.BYTES: - args.append(True) - return run_and_get_output(*args) - - -def image_to_osd(image, lang='osd', config='', nice=0, output_type=Output.STRING): - """ - Returns string containing the orientation and script detection (OSD) - """ - config = '{}-psm 0 {}'.format('' if get_tesseract_version() < '3.05' else '-', config.strip()).strip() - args = [ - image, 'osd', lang, config, nice] - if output_type == Output.DICT: - return osd_to_dict(run_and_get_output(*args)) - else: - if output_type == Output.BYTES: - args.append(True) - return run_and_get_output(*args) - - -def main(): - if len(sys.argv) == 2: - filename, lang = sys.argv[1], None - else: - if len(sys.argv) == 4: - if sys.argv[1] == '-l': - filename, lang = sys.argv[3], sys.argv[2] - sys.stderr.write('Usage: python pytesseract.py [-l lang] input_file\n') - exit(2) - try: - print(image_to_string((Image.open(filename)), lang=lang)) - except IOError: - sys.stderr.write('ERROR: Could not open file "%s"\n' % filename) - exit(1) - - -if __name__ == '__main__': - main() \ No newline at end of file