mirror of
https://codeberg.org/kazephil/Little_CAT_Helpers.git
synced 2023-08-25 14:05:45 +02:00
First working version.
This commit is contained in:
parent
1678049cf9
commit
b4c8716da2
199
omegat_tools/extract_segments.py
Normal file
199
omegat_tools/extract_segments.py
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
'''Create per translator TMX files from a base TMX file.
|
||||||
|
|
||||||
|
This script is used to parse TMX files from OmegaT projects involving more than one translator, and creates individual TMX files containing the translations made by each translator.
|
||||||
|
|
||||||
|
The user name of each translator and the name of the corresponding TMX file must be entered in the "Translators" section of the "omegat-tools.conf" file.
|
||||||
|
|
||||||
|
Requires:
|
||||||
|
- Python 3.6 or higher (for f-strings)
|
||||||
|
- lxml
|
||||||
|
'''
|
||||||
|
|
||||||
|
###########################################################################
|
||||||
|
#
|
||||||
|
# Segment Extractor
|
||||||
|
# ---------------------------
|
||||||
|
# Version 0.1, released August 15, 2022
|
||||||
|
# Author: Philippe Tourigny
|
||||||
|
# License: GPL3+
|
||||||
|
# https://www.gnu.org/licenses/gpl-3.0.en.html
|
||||||
|
#
|
||||||
|
# Rebuilt implementation script targeted at OmegaT team projects originally
|
||||||
|
# designed to splits a team project TMX file into separate files containing
|
||||||
|
# the segments translated by each individual translator. The individual
|
||||||
|
# files are named using the two-letter target language code of the project
|
||||||
|
# and a two-letter translator identifier specified in the configuration file.
|
||||||
|
#
|
||||||
|
# The files can then be placed in the "tmx2source" subfolder of an OmegaT
|
||||||
|
# project tm folder to show each translator's original translation immediately
|
||||||
|
# below the source text during revision, for example.
|
||||||
|
#
|
||||||
|
# The script can also recognize revisers if they are defined in the
|
||||||
|
# configuration file, and add a note to a segment to indicate that it has been
|
||||||
|
# revised. The OmegaT option to apply coloring to segments with notes can
|
||||||
|
# then be used to quickly identify which segments have been revised.
|
||||||
|
#
|
||||||
|
# TODO:
|
||||||
|
# - Automatically save the files to the "tmx2source" subfolder if a team
|
||||||
|
# project tmx file is selected.
|
||||||
|
# - Allow user selected individual TMX file names in addition to the
|
||||||
|
# "tmx2source" file name format.
|
||||||
|
# - Also accept a command line argument for the file to parse.
|
||||||
|
# - Improve the revision notes to mark the differences between the original
|
||||||
|
# and revised translations.
|
||||||
|
# - Allow batch processing of multiple files.
|
||||||
|
# - Enable the extraction of segments based on other criteria.
|
||||||
|
# - Optionally output to a form of two-column table for review
|
||||||
|
# outside a CAT tool.
|
||||||
|
###########################################################################
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
import common
|
||||||
|
from tmxhelpers import OmegaT_TMX
|
||||||
|
|
||||||
|
|
||||||
|
def set_tmxpath():
|
||||||
|
'''Establish a starting path for the TMX file selection dialog.'''
|
||||||
|
|
||||||
|
# Check whether user has defined a path for TMX files or projects
|
||||||
|
if common.config.has_option('Paths', 'tmxpath'):
|
||||||
|
configpath = common.config['Paths']['tmxpath']
|
||||||
|
else:
|
||||||
|
configpath = common.config['Paths']['projects']
|
||||||
|
|
||||||
|
tmxpath = common.set_basepath(configpath)
|
||||||
|
|
||||||
|
return configpath
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tmx_tree(tmxfile=None):
|
||||||
|
'''Get the XML tree of the TMX file to parse'''
|
||||||
|
|
||||||
|
def get_tmx_file():
|
||||||
|
'''Get the TMX file to process.'''
|
||||||
|
|
||||||
|
tmxpath = set_tmxpath()
|
||||||
|
|
||||||
|
filetype=[('Translation memories', '*.tmx')]
|
||||||
|
asktmx = 'Select TMX file'
|
||||||
|
tmxfile = common.select_file(tmxpath, filetype, asktmx)
|
||||||
|
|
||||||
|
return tmxfile
|
||||||
|
|
||||||
|
# Ask the user to specify the file to parse if none was passed
|
||||||
|
if tmxfile is None:
|
||||||
|
tmxfile = get_tmx_file()
|
||||||
|
|
||||||
|
tmxparser = etree.XMLParser(remove_blank_text=True)
|
||||||
|
tmxtree = etree.parse(tmxfile, tmxparser)
|
||||||
|
|
||||||
|
return tmxtree
|
||||||
|
|
||||||
|
|
||||||
|
def get_translator_list():
|
||||||
|
'''Read list of translators and identifiers.
|
||||||
|
|
||||||
|
The parser for the config file returns a list of tuples,
|
||||||
|
which is converted to a dictionary. The list is then pruned
|
||||||
|
down to the translators involved in the project from which
|
||||||
|
the TMX file is taken'''
|
||||||
|
|
||||||
|
translator_list = dict(common.config.items('Translators'))
|
||||||
|
|
||||||
|
# Identify project translators whose work needs to be revised
|
||||||
|
# based on the premise that a revised translation will have
|
||||||
|
# a changeid that is not in the list of translators to revise.
|
||||||
|
project_translators = set(BODY.xpath('//tuv/@changeid'))
|
||||||
|
|
||||||
|
# Set up a dictionary containing the name and code of each
|
||||||
|
# translator in the configuration file involved in the project.
|
||||||
|
translators = {name:code for name, code in translator_list.items()
|
||||||
|
if name in project_translators}
|
||||||
|
|
||||||
|
return translators
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_tmx_containers():
|
||||||
|
'''Set up a dictionary to hold each of the TMX files to revise.'''
|
||||||
|
|
||||||
|
# Get the first two characters of the target language from the
|
||||||
|
# translated tuv language attribute.
|
||||||
|
tgtlang = BODY.xpath('//tuv[2]/@*[local-name() = "lang"]')[0][:2]
|
||||||
|
sorted_tmxes = {}
|
||||||
|
|
||||||
|
for translator in TRANSLATORS.keys():
|
||||||
|
code = tgtlang + '-'+ TRANSLATORS[translator]
|
||||||
|
sorted_tmxes[code] = OmegaT_TMX(header=HEADER.attrib,
|
||||||
|
version=VERSION)
|
||||||
|
|
||||||
|
return sorted_tmxes
|
||||||
|
|
||||||
|
|
||||||
|
def sort_unrevised_tus():
|
||||||
|
'''Sort unrevised translations into separate lists for each translator'''
|
||||||
|
|
||||||
|
# Setup container for individual translator tmxes.
|
||||||
|
sorted_tmxes = prepare_tmx_containers()
|
||||||
|
|
||||||
|
# Retrieve all tuv elements containing a translation.
|
||||||
|
# The translation is always in the second tuv element.
|
||||||
|
translations = BODY.xpath('//tuv[2]')
|
||||||
|
|
||||||
|
# Sort unrevised tuvs by translator
|
||||||
|
for tuv in translations:
|
||||||
|
creationid = tuv.attrib.get('creationid')
|
||||||
|
changeid = tuv.attrib.get('changeid')
|
||||||
|
|
||||||
|
if creationid in TRANSLATORS.keys() and changeid == creationid:
|
||||||
|
translator = TRANSLATORS[changeid]
|
||||||
|
code = [tmxid for tmxid in sorted_tmxes.keys()
|
||||||
|
if translator in tmxid].pop()
|
||||||
|
tu = tuv.getparent()
|
||||||
|
sorted_tmxes[code].add_tu(tu)
|
||||||
|
|
||||||
|
return sorted_tmxes
|
||||||
|
|
||||||
|
|
||||||
|
def finalize_tmxdoc(tmxname, tmxcontent):
|
||||||
|
'''Define the tmx tree for output to a file.'''
|
||||||
|
|
||||||
|
# Set the full path and name for the TMX file.
|
||||||
|
tmxpath = common.Path(TMXTREE.docinfo.URL).parent
|
||||||
|
tmxfile = common.Path(tmxpath/tmxname).with_suffix('.tmx')
|
||||||
|
|
||||||
|
tmxcontent.insert_alt_comment()
|
||||||
|
tmxdoc = etree.ElementTree(tmxcontent.tmx)
|
||||||
|
|
||||||
|
return (tmxfile, tmxdoc)
|
||||||
|
|
||||||
|
|
||||||
|
def write_tmx(tmxfile, tmxdoc):
|
||||||
|
'''Output a TMX document to a file.'''
|
||||||
|
|
||||||
|
tmxdoc.write(tmxfile, encoding='utf-8', pretty_print=True,
|
||||||
|
xml_declaration=True, doctype=DOCTYPE)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
# Parse the TMX file into an XML tree, and retrieve the main elements
|
||||||
|
# and information needed to create the individual files.
|
||||||
|
TMXTREE = parse_tmx_tree()
|
||||||
|
TMXROOT = TMXTREE.getroot()
|
||||||
|
HEADER, BODY = TMXROOT.getchildren()
|
||||||
|
DOCTYPE = TMXTREE.docinfo.doctype
|
||||||
|
VERSION = TMXROOT.attrib.get('version')
|
||||||
|
|
||||||
|
|
||||||
|
# Get the list of translators whose work will be revised
|
||||||
|
TRANSLATORS = get_translator_list()
|
||||||
|
|
||||||
|
unrevised_translations = sort_unrevised_tus()
|
||||||
|
|
||||||
|
for name, tmxcontent in unrevised_translations.items():
|
||||||
|
unrevised_file, unrevised_doc = finalize_tmxdoc(name, tmxcontent)
|
||||||
|
|
||||||
|
write_tmx(unrevised_file, unrevised_doc)
|
Loading…
Reference in New Issue
Block a user