#!/usr/bin/env python3
"""
Convert UTF-8 text files to ASCII files with hexadecimal Numeric
Character References (like &#x153;).

Usage: pass one or more file names on the command line. For each input
"name.ext" the result is written to "name-ncr.ext"; an input without a
recognised extension gets "-ncr" appended to its name.
"""

import sys
import re

# Splits "stem.ext" into (stem, ext). Greedy on the stem, so only the
# last dot-separated component is treated as the extension.
extension = re.compile(r"^(.*)\.([a-z0-9_-]+)$", re.IGNORECASE)


def convert(thematch):
    """Return the character for a hexadecimal code point captured by a regex.

    thematch: a match object whose group 1 is a hexadecimal number.
    This is the reverse mapping (hex digits -> character); the conversion
    performed by this script does not call it.
    """
    codepoint = int(thematch.group(1), 16)
    return chr(codepoint)


def to_ncr(text):
    """Return *text* with every character above code point 127 replaced
    by a hexadecimal Numeric Character Reference ("&#x<hex>;", lowercase).

    Characters in the ASCII range are copied unchanged.
    """
    return "".join(
        ch if ord(ch) <= 127 else "&#x%x;" % ord(ch)
        for ch in text
    )


def output_name(ifilename):
    """Return the output file name derived from *ifilename*.

    "-ncr" is inserted before the extension when the name matches the
    `extension` pattern, and appended to the whole name otherwise.
    """
    match = extension.search(ifilename)
    if match:
        return match.group(1) + "-ncr." + match.group(2)
    return ifilename + "-ncr"


def main(argv=None):
    """Convert every file named in *argv* (default: sys.argv[1:])."""
    if argv is None:
        argv = sys.argv[1:]
    for ifilename in argv:
        print("Converting %s..." % ifilename)
        ofilename = output_name(ifilename)
        # Read the input completely before creating the output, so a
        # failed read does not leave an empty output file behind.
        # Encodings are explicit so the result does not depend on the locale.
        with open(ifilename, "r", encoding="utf-8") as ifile:
            data = ifile.read()
        with open(ofilename, "w", encoding="ascii") as ofile:
            ofile.write(to_ncr(data))


if __name__ == "__main__":
    main()