Little_CAT_Helpers/omegat_tools/extract_segments.py

203 lines
6.9 KiB
Python
Raw Normal View History

2022-08-15 07:46:35 +02:00
# -*- coding: utf-8 -*-
'''Create per translator TMX files from a base TMX file.
2022-08-15 07:59:41 +02:00
This script is used to parse TMX files from OmegaT projects involving more than
one translator, and creates individual TMX files containing the translations
made by each translator.
2022-08-15 07:46:35 +02:00
2022-08-15 07:59:41 +02:00
The user name of each translator and the two letter code to identify that
translator in the TMX file name must be entered in the "Translators" section
of the "omegat-tools.conf" file.
2022-08-15 07:46:35 +02:00
Requires:
- Python 3.6 or higher (for f-strings)
- lxml
'''
###########################################################################
#
# Segment Extractor
# ---------------------------
# Version 0.1, released August 15, 2022
# Author: Philippe Tourigny
# License: GPL3+
# https://www.gnu.org/licenses/gpl-3.0.en.html
#
# Rebuilt implementation script targeted at OmegaT team projects originally
# designed to splits a team project TMX file into separate files containing
# the segments translated by each individual translator. The individual
# files are named using the two-letter target language code of the project
# and a two-letter translator identifier specified in the configuration file.
#
# The files can then be placed in the "tmx2source" subfolder of an OmegaT
# project tm folder to show each translator's original translation immediately
# below the source text during revision, for example.
#
# The script can also recognize revisers if they are defined in the
# configuration file, and add a note to a segment to indicate that it has been
# revised. The OmegaT option to apply coloring to segments with notes can
# then be used to quickly identify which segments have been revised.
#
# TODO:
# - Automatically save the files to the "tmx2source" subfolder if a team
# project tmx file is selected.
# - Allow user selected individual TMX file names in addition to the
# "tmx2source" file name format.
# - Also accept a command line argument for the file to parse.
# - Improve the revision notes to mark the differences between the original
# and revised translations.
# - Allow batch processing of multiple files.
# - Enable the extraction of segments based on other criteria.
# - Optionally output to a form of two-column table for review
# outside a CAT tool.
###########################################################################
from lxml import etree
import common
from tmxhelpers import OmegaT_TMX
def set_tmxpath():
'''Establish a starting path for the TMX file selection dialog.'''
# Check whether user has defined a path for TMX files or projects
if common.config.has_option('Paths', 'tmxpath'):
configpath = common.config['Paths']['tmxpath']
else:
configpath = common.config['Paths']['projects']
tmxpath = common.set_basepath(configpath)
return configpath
def parse_tmx_tree(tmxfile=None):
'''Get the XML tree of the TMX file to parse'''
def get_tmx_file():
'''Get the TMX file to process.'''
tmxpath = set_tmxpath()
filetype=[('Translation memories', '*.tmx')]
asktmx = 'Select TMX file'
tmxfile = common.select_file(tmxpath, filetype, asktmx)
return tmxfile
# Ask the user to specify the file to parse if none was passed
if tmxfile is None:
tmxfile = get_tmx_file()
tmxparser = etree.XMLParser(remove_blank_text=True)
tmxtree = etree.parse(tmxfile, tmxparser)
return tmxtree
def get_translator_list():
'''Read list of translators and identifiers.
The parser for the config file returns a list of tuples,
which is converted to a dictionary. The list is then pruned
down to the translators involved in the project from which
the TMX file is taken'''
translator_list = dict(common.config.items('Translators'))
# Identify project translators whose work needs to be revised
# based on the premise that a revised translation will have
# a changeid that is not in the list of translators to revise.
project_translators = set(BODY.xpath('//tuv/@changeid'))
# Set up a dictionary containing the name and code of each
# translator in the configuration file involved in the project.
translators = {name:code for name, code in translator_list.items()
if name in project_translators}
return translators
def prepare_tmx_containers():
'''Set up a dictionary to hold each of the TMX files to revise.'''
# Get the first two characters of the target language from the
# translated tuv language attribute.
tgtlang = BODY.xpath('//tuv[2]/@*[local-name() = "lang"]')[0][:2]
sorted_tmxes = {}
for translator in TRANSLATORS.keys():
code = tgtlang + '-'+ TRANSLATORS[translator]
sorted_tmxes[code] = OmegaT_TMX(header=HEADER.attrib,
version=VERSION)
return sorted_tmxes
def sort_unrevised_tus():
'''Sort unrevised translations into separate lists for each translator'''
# Setup container for individual translator tmxes.
sorted_tmxes = prepare_tmx_containers()
# Retrieve all tuv elements containing a translation.
# The translation is always in the second tuv element.
translations = BODY.xpath('//tuv[2]')
# Sort unrevised tuvs by translator
for tuv in translations:
creationid = tuv.attrib.get('creationid')
changeid = tuv.attrib.get('changeid')
if creationid in TRANSLATORS.keys() and changeid == creationid:
translator = TRANSLATORS[changeid]
code = [tmxid for tmxid in sorted_tmxes.keys()
if translator in tmxid].pop()
tu = tuv.getparent()
sorted_tmxes[code].add_tu(tu)
return sorted_tmxes
def finalize_tmxdoc(tmxname, tmxcontent):
'''Define the tmx tree for output to a file.'''
# Set the full path and name for the TMX file.
tmxpath = common.Path(TMXTREE.docinfo.URL).parent
tmxfile = common.Path(tmxpath/tmxname).with_suffix('.tmx')
tmxcontent.insert_alt_comment()
tmxdoc = etree.ElementTree(tmxcontent.tmx)
return (tmxfile, tmxdoc)
def write_tmx(tmxfile, tmxdoc):
'''Output a TMX document to a file.'''
tmxdoc.write(tmxfile, encoding='utf-8', pretty_print=True,
xml_declaration=True, doctype=DOCTYPE)
if __name__ == '__main__':
# Parse the TMX file into an XML tree, and retrieve the main elements
# and information needed to create the individual files.
TMXTREE = parse_tmx_tree()
TMXROOT = TMXTREE.getroot()
HEADER, BODY = TMXROOT.getchildren()
DOCTYPE = TMXTREE.docinfo.doctype
VERSION = TMXROOT.attrib.get('version')
# Get the list of translators whose work will be revised
TRANSLATORS = get_translator_list()
unrevised_translations = sort_unrevised_tus()
for name, tmxcontent in unrevised_translations.items():
unrevised_file, unrevised_doc = finalize_tmxdoc(name, tmxcontent)
write_tmx(unrevised_file, unrevised_doc)