# Little_CAT_Helpers/omegat_tools/merge_omegat_glossaries.py
# -*- coding: utf-8 -*-
'''
Create a single OmegaT glossary from several glossaries.

This script collects all entries from every OmegaT text-format glossary in
the specified folder, removes duplicates, and writes them to a single file.

An OmegaT glossary is a plain text file in which each entry is a line of up
to three tab-separated fields: source term, target term and, optionally,
additional notes or comments.

Entries are considered duplicates if they have identical source and target
terms (including capitalization, hyphenation, and spacing). Entries with
only a source term are discarded, but entries with a source term and a note
are retained even if they have no target term. If two entries have identical
source and target terms and one of them also has a supplementary note, the
entry with the note is preferred.
'''
###########################################################################
#
# Merge OmegaT Glossaries
# ---------------------------
# Version 0.2, released July 5, 2022
# Author: Philippe Tourigny
# License: GPL3+
# https://www.gnu.org/licenses/gpl-3.0.en.html
#
# The purpose of this script is to consolidate glossaries that were
# originally created independently but have a significant amount of
# overlapping content.
#
# TODO:
# - Allow path to be passed as a command line argument
# - Check that the files are valid glossary files
#
###########################################################################
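
# Example of the merge rules: given these hypothetical tab-separated entries
# spread across the input glossaries,
#
#     apple<TAB>pomme
#     apple<TAB>pomme<TAB>fruit
#     banana
#     cherry<TAB><TAB>check with client
#
# the merged glossary keeps 'apple / pomme / fruit' (the duplicate pair with
# a note wins), drops the source-only 'banana' line, and keeps the 'cherry'
# entry because it has a note even though it lacks a target term.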

import csv

import common


def get_glossary_settings():
    '''Load the glossary-related settings from the configuration file.'''

    settings = {'configpath': common.config['Paths']['glossaries'],
                'main': common.config['Files']['main_glossary'],
                'extensions': common.config['Files']['glossary_files'].replace(',', '')
                }

    return settings
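
# For illustration, the configuration file loaded by the common module is
# assumed to provide sections along these lines (all values hypothetical):
#
#     [Paths]
#     glossaries = ~/OmegaT/glossaries
#
#     [Files]
#     main_glossary = glossary.txt
#     glossary_files = .txt,.utf8,.tab
#
# With those values, 'extensions' above becomes the string '.txt.utf8.tab'
# once the commas are stripped.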


def set_base_glossary_path():
    '''Assign the default path for glossary files.'''

    configpath = glossary_settings['configpath']
    base_glossary_path = common.set_basepath(configpath)

    return base_glossary_path


def get_glossary_list(glossary_path):
    '''Retrieve the list of glossary files to merge from the glossary path.'''

    # Syntax to retrieve more than one extension inspired by
    # this Stack Overflow answer: https://stackoverflow.com/a/57893015/8123921
    # The 'extensions' setting is the configured list of glossary extensions
    # with the commas stripped out, so the membership test below is a
    # substring check (e.g. '.txt' in '.txt.utf8').
    extensions = glossary_settings['extensions']
    glossary_list = (g for g in glossary_path.rglob('*')
                     if g.suffix in extensions)

    return glossary_list


def get_glossary_entries(glossary_file):
    '''Build a list of the entries in a glossary file.'''

    fields = ['source', 'target', 'notes']
    skip = ('', None)
    entries = []

    with open(glossary_file, 'r', encoding='utf-8', newline='') as gf:
        greader = csv.DictReader(gf, fieldnames=fields, delimiter='\t')

        for line in greader:
            # Discard entries that have only a source term.
            if line['target'] in skip and line['notes'] in skip:
                continue
            elif line['notes'] is not None:
                entry = (line['source'], line['target'], line['notes'])
            else:
                # Normalize a missing notes field to an empty string.
                entry = (line['source'], line['target'], '')
            entries.append(entry)

    return entries
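
# For example, a glossary line such as 'dog<TAB>chien<TAB>noun' (hypothetical
# terms) yields the tuple ('dog', 'chien', 'noun'), while 'cat<TAB>chat'
# yields ('cat', 'chat', ''), so every entry ends up with exactly three fields.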


def remove_redundant_pairs(entries):
    '''Retain term pairs with a note if the exact same pair
       exists both with and without a note.'''

    # Build a list of duplicates differentiated only by the presence of a note.
    def find_duplicates(entries):
        pairs = [(term[0], term[1]) for term in entries]
        duplicates = []
        found = set()

        for pair in pairs:
            if pair in found:
                duplicates.append(pair)
            else:
                found.add(pair)

        return duplicates

    # Identify duplicate pairs that have no notes and should be discarded.
    def identify_discards(duplicates):
        discard = []

        for entry in entries:
            pair = (entry[0], entry[1])
            if pair in duplicates and entry[2] == '':
                discard.append(entry)

        return discard

    # Main function code starts here
    discard = identify_discards(find_duplicates(entries))
    glossary = [entry for entry in entries if entry not in discard]

    return glossary


def write_glossary(glossary):
    '''Write the final merged glossary to a new file.'''

    title = 'Enter name of file to save'
    glossary_files = [('Glossary file', glossary_settings['extensions'])]
    merged_name = common.get_save_file_name(glossary_path,
                                            glossary_files,
                                            title)
    merged_file = common.Path(glossary_path / merged_name)
    glossary_header = ['# Glossary in tab-separated format -*- coding: utf-8 -*-']

    with open(merged_file, 'w', encoding='utf-8', newline='') as mf:
        gwriter = csv.writer(mf, delimiter='\t')
        gwriter.writerow(glossary_header)
        gwriter.writerows(glossary)


if __name__ == '__main__':

    # Retrieve configuration information for glossaries
    glossary_settings = get_glossary_settings()

    # Retrieve list of glossaries to merge
    askfolder = 'Select folder with glossary files'
    glossary_path = common.select_folder(set_base_glossary_path(), askfolder)
    glossary_list = get_glossary_list(glossary_path)

    # Build list of all entries from each glossary
    all_entries = []
    for glossary_file in glossary_list:
        entries = get_glossary_entries(glossary_file)
        all_entries.extend(entries)

    # Remove exact duplicates and any remaining redundant pairs
    all_entries = list(set(all_entries))
    merged_glossary = remove_redundant_pairs(all_entries)

    # Write merged glossary to a file
    write_glossary(merged_glossary)