diff --git a/omegat_tools/extract_segments.py b/omegat_tools/extract_segments.py new file mode 100644 index 0000000..587739f --- /dev/null +++ b/omegat_tools/extract_segments.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +'''Create per translator TMX files from a base TMX file. + +This script is used to parse TMX files from OmegaT projects involving more than one translator, and creates individual TMX files containing the translations made by each translator. + +The user name of each translator and the name of the corresponding TMX file must be entered in the "Translators" section of the "omegat-tools.conf" file. + +Requires: + - Python 3.6 or higher (for f-strings) + - lxml +''' + +########################################################################### +# +# Segment Extractor +# --------------------------- +# Version 0.1, released August 15, 2022 +# Author: Philippe Tourigny +# License: GPL3+ +# https://www.gnu.org/licenses/gpl-3.0.en.html +# +# Rebuilt implementation script targeted at OmegaT team projects originally +# designed to splits a team project TMX file into separate files containing +# the segments translated by each individual translator. The individual +# files are named using the two-letter target language code of the project +# and a two-letter translator identifier specified in the configuration file. +# +# The files can then be placed in the "tmx2source" subfolder of an OmegaT +# project tm folder to show each translator's original translation immediately +# below the source text during revision, for example. +# +# The script can also recognize revisers if they are defined in the +# configuration file, and add a note to a segment to indicate that it has been +# revised. The OmegaT option to apply coloring to segments with notes can +# then be used to quickly identify which segments have been revised. +# +# TODO: +# - Automatically save the files to the "tmx2source" subfolder if a team +# project tmx file is selected. +# - Allow user selected individual TMX file names in addition to the +# "tmx2source" file name format. +# - Also accept a command line argument for the file to parse. +# - Improve the revision notes to mark the differences between the original +# and revised translations. +# - Allow batch processing of multiple files. +# - Enable the extraction of segments based on other criteria. +# - Optionally output to a form of two-column table for review +# outside a CAT tool. +########################################################################### + +from lxml import etree + +import common +from tmxhelpers import OmegaT_TMX + + +def set_tmxpath(): + '''Establish a starting path for the TMX file selection dialog.''' + + # Check whether user has defined a path for TMX files or projects + if common.config.has_option('Paths', 'tmxpath'): + configpath = common.config['Paths']['tmxpath'] + else: + configpath = common.config['Paths']['projects'] + + tmxpath = common.set_basepath(configpath) + + return configpath + + +def parse_tmx_tree(tmxfile=None): + '''Get the XML tree of the TMX file to parse''' + + def get_tmx_file(): + '''Get the TMX file to process.''' + + tmxpath = set_tmxpath() + + filetype=[('Translation memories', '*.tmx')] + asktmx = 'Select TMX file' + tmxfile = common.select_file(tmxpath, filetype, asktmx) + + return tmxfile + + # Ask the user to specify the file to parse if none was passed + if tmxfile is None: + tmxfile = get_tmx_file() + + tmxparser = etree.XMLParser(remove_blank_text=True) + tmxtree = etree.parse(tmxfile, tmxparser) + + return tmxtree + + +def get_translator_list(): + '''Read list of translators and identifiers. + + The parser for the config file returns a list of tuples, + which is converted to a dictionary. The list is then pruned + down to the translators involved in the project from which + the TMX file is taken''' + + translator_list = dict(common.config.items('Translators')) + + # Identify project translators whose work needs to be revised + # based on the premise that a revised translation will have + # a changeid that is not in the list of translators to revise. + project_translators = set(BODY.xpath('//tuv/@changeid')) + + # Set up a dictionary containing the name and code of each + # translator in the configuration file involved in the project. + translators = {name:code for name, code in translator_list.items() + if name in project_translators} + + return translators + + +def prepare_tmx_containers(): + '''Set up a dictionary to hold each of the TMX files to revise.''' + + # Get the first two characters of the target language from the + # translated tuv language attribute. + tgtlang = BODY.xpath('//tuv[2]/@*[local-name() = "lang"]')[0][:2] + sorted_tmxes = {} + + for translator in TRANSLATORS.keys(): + code = tgtlang + '-'+ TRANSLATORS[translator] + sorted_tmxes[code] = OmegaT_TMX(header=HEADER.attrib, + version=VERSION) + + return sorted_tmxes + + +def sort_unrevised_tus(): + '''Sort unrevised translations into separate lists for each translator''' + + # Setup container for individual translator tmxes. + sorted_tmxes = prepare_tmx_containers() + + # Retrieve all tuv elements containing a translation. + # The translation is always in the second tuv element. + translations = BODY.xpath('//tuv[2]') + + # Sort unrevised tuvs by translator + for tuv in translations: + creationid = tuv.attrib.get('creationid') + changeid = tuv.attrib.get('changeid') + + if creationid in TRANSLATORS.keys() and changeid == creationid: + translator = TRANSLATORS[changeid] + code = [tmxid for tmxid in sorted_tmxes.keys() + if translator in tmxid].pop() + tu = tuv.getparent() + sorted_tmxes[code].add_tu(tu) + + return sorted_tmxes + + +def finalize_tmxdoc(tmxname, tmxcontent): + '''Define the tmx tree for output to a file.''' + + # Set the full path and name for the TMX file. + tmxpath = common.Path(TMXTREE.docinfo.URL).parent + tmxfile = common.Path(tmxpath/tmxname).with_suffix('.tmx') + + tmxcontent.insert_alt_comment() + tmxdoc = etree.ElementTree(tmxcontent.tmx) + + return (tmxfile, tmxdoc) + + +def write_tmx(tmxfile, tmxdoc): + '''Output a TMX document to a file.''' + + tmxdoc.write(tmxfile, encoding='utf-8', pretty_print=True, + xml_declaration=True, doctype=DOCTYPE) + + +if __name__ == '__main__': + + # Parse the TMX file into an XML tree, and retrieve the main elements + # and information needed to create the individual files. + TMXTREE = parse_tmx_tree() + TMXROOT = TMXTREE.getroot() + HEADER, BODY = TMXROOT.getchildren() + DOCTYPE = TMXTREE.docinfo.doctype + VERSION = TMXROOT.attrib.get('version') + + + # Get the list of translators whose work will be revised + TRANSLATORS = get_translator_list() + + unrevised_translations = sort_unrevised_tus() + + for name, tmxcontent in unrevised_translations.items(): + unrevised_file, unrevised_doc = finalize_tmxdoc(name, tmxcontent) + + write_tmx(unrevised_file, unrevised_doc) \ No newline at end of file