Add glossary merging tool.

2023-08-25 14:05:45 +02:00 · 2022-07-16 12:19:48 +09:00 · 2022-07-16 12:19:48 +09:00 · e42ea6db94
commit e42ea6db94
parent 3c88016e37
1 changed files with 179 additions and 0 deletions
--- a/omegat_tools/merge_omegat_glossaries.py
+++ b/omegat_tools/merge_omegat_glossaries.py
@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+'''
+Create a single OmegaT glossary from several glossaries.
+
+This script collects all entries from every OmegaT text format glossary in the specified folder and its subfolders, removes duplicates, and writes them to a single file.
+
+The OmegaT glossary is simply a text file containing up to three fields: source term, target term and, optionally, additional notes or comments.
+
+Entries are considered duplicates if they have identical source and target terms (including capitalization, hyphenation, and spacing). Entries with only a source term are discarded. However, entries with a source term and a note are retained even if they have no target term. If two entries have identical source and target terms, but one also has a supplementary note, the latter is preferred.
+'''
+
+###########################################################################
+#
+# Merge OmegaT Glossaries
+# ---------------------------
+# Version 0.2, released July 5, 2022
+# Author: Philippe Tourigny
+# License: GPL3+
+# https://www.gnu.org/licenses/gpl-3.0.en.html
+#
+# The purpose of this script is to consolidate glossaries that were
+# originally created independently but have a significant amount of
+# overlapping content.
+#
+# TODO:
+#   - Allow path to be passed as a command line argument
+#   - Check that the files are valid glossary files
+#
+###########################################################################
+
+import csv
+from pathlib import Path
+
+import common
+
+
+def get_glossary_settings():
+    '''Collect the glossary-related values from the shared configuration.
+
+    Returns a dict with the keys 'configpath', 'main' and 'extensions';
+    commas are removed from the configured extension list.'''
+    cfg = common.config
+    return {'configpath': cfg['Paths']['glossaries'],
+            'main': cfg['Files']['main_glossary'],
+            'extensions': cfg['Files']['glossary_files'].replace(',', '')}
+
+
+def set_base_glossary_path():
+    '''Return the default base path for glossary files.
+
+    The candidates, in order, are the configured glossary folder,
+    common.DEFAULT_DOCHOME and common.USER_HOME; choosing among them
+    is left to common.set_basepath.'''
+    configured = Path(glossary_settings['configpath'])
+    return common.set_basepath([configured, common.DEFAULT_DOCHOME, common.USER_HOME])
+
+
+def get_glossary_list(glossary_path):
+    '''Yield the glossary files to merge found anywhere under glossary_path.
+
+    The configured extension string is split into individual suffixes and
+    matched exactly: a substring test on the raw string would also accept
+    folders, extensionless paths (empty suffix) and partial ones like '.t'.'''
+    raw = glossary_settings['extensions']
+    wanted = {'.' + ext for ext in
+              raw.replace('*', ' ').replace(',', ' ').replace('.', ' ').split()}
+
+    return (g for g in glossary_path.rglob('*') if g.suffix in wanted and g.is_file())
+
+
+def get_glossary_entries(glossary_file):
+    '''Build a list of the entries in a glossary file.
+
+    Each entry is a (source, target, notes) tuple of strings; a missing
+    notes field becomes ''. Rows that have neither a target term nor a
+    note (this includes one-field lines such as the header comment) are
+    skipped. Fields beyond the third are ignored.
+    '''
+
+    fields = ['source', 'target', 'notes']
+    entries = []
+
+    # 'utf-8-sig' strips a leading byte order mark if present; plain
+    # 'utf-8' would leave it glued to the first source term, so that term
+    # would never match its BOM-less duplicates from other glossaries.
+    with open(glossary_file, 'r', encoding='utf-8-sig', newline='') as gf:
+        for row in csv.DictReader(gf, fieldnames=fields, delimiter='\t'):
+            # Missing fields are read as None, present-but-empty ones as ''.
+            if not row['target'] and not row['notes']:
+                continue
+            entries.append((row['source'], row['target'], row['notes'] or ''))
+    return entries
+
+
+def remove_redundant_pairs(entries):
+    '''Retain term pairs with a note if the exact same pair
+       exists both with and without a note.
+
+    A note-less entry is dropped only when another entry with the same
+    (source, target) pair carries a note. Everything else is returned in
+    its original order, including entries that share a pair but have
+    different notes.
+
+    Merely counting how often a pair occurs is not enough: a list that
+    still contains exact duplicates of a note-less entry would then lose
+    that pair entirely. Membership is tested against a set rather than
+    a list so that large glossaries are handled in linear rather than
+    quadratic time.
+
+    entries: iterable of (source, target, notes) tuples, where notes is
+             '' when the entry has no note.
+
+    Returns a new list; the input is not modified.
+
+    >>> remove_redundant_pairs([('a', 'b', ''), ('a', 'b', 'n'), ('c', 'd', '')])
+    [('a', 'b', 'n'), ('c', 'd', '')]
+    '''
+
+    # Identify the term pairs that have at least one annotated entry.
+    def find_noted_pairs(entries):
+        return {(entry[0], entry[1]) for entry in entries if entry[2] != ''}
+
+    # A note-less entry is redundant when its pair is also annotated.
+    def is_redundant(entry, noted_pairs):
+        return entry[2] == '' and (entry[0], entry[1]) in noted_pairs
+
+    # Main function code starts here
+    entries = list(entries)  # allow one-shot iterables; two passes follow
+    noted_pairs = find_noted_pairs(entries)
+    glossary = [entry for entry in entries
+                if not is_redundant(entry, noted_pairs)]
+
+    return glossary
+
+
+def write_glossary(glossary):
+    '''Save the merged glossary to a file named by the user.
+
+    The file name is requested through common.get_save_file_name and is
+    joined onto glossary_path. A header comment line is written first,
+    followed by one tab-separated row per entry in glossary.'''
+
+    file_types = [('Glossary file', glossary_settings['extensions'])]
+    prompt = 'Enter name of file to save'
+    chosen_name = common.get_save_file_name(glossary_path, file_types, prompt)
+    destination = Path(glossary_path / chosen_name)
+    header_row = ['# Glossary in tab-separated format -*- coding: utf-8 -*-']
+
+    with open(destination, 'w', encoding='utf-8', newline='') as out:
+        rows = csv.writer(out, delimiter='\t')
+        rows.writerow(header_row)
+        rows.writerows(glossary)
+
+
+if __name__ == '__main__':
+
+    # Retrieve configuration information for glossaries.
+    # glossary_settings and glossary_path are read as module globals by
+    # the functions above, so both must be set before those are called.
+    glossary_settings = get_glossary_settings()
+
+    # Ask for the folder that holds the glossaries to merge
+    askfolder = 'Select folder with glossary files'
+    glossary_path = common.select_folder(set_base_glossary_path(), askfolder)
+
+    # Build list of all entries from each glossary
+    all_entries = []
+    for glossary_file in get_glossary_list(glossary_path):
+        all_entries.extend(get_glossary_entries(glossary_file))
+
+    # Remove exact duplicates, then any remaining redundant pairs.
+    # dict.fromkeys keeps the first occurrence of each entry in reading
+    # order, whereas list(set(...)) reorders the merged file on every run.
+    all_entries = list(dict.fromkeys(all_entries))
+    merged_glossary = remove_redundant_pairs(all_entries)
+
+    # Write merged glossary to a file
+    write_glossary(merged_glossary)