Web-LangTag/registries/ltru2xml.awk

330 lines
13 KiB
Awk
Raw Normal View History

2023-06-09 10:02:30 +02:00
#! /usr/common/bin/gawk -f
#
# Usage: ltru2xml.awk registry > registry.xml
# Or : ltru2xml.awk /dev/null > registry.dtd
#
# The File-Date record is noted in a 'date' attribute of the root
# element <LanguageSubtagRegistry>.
#
# Other records are converted to elements <language>, <extlang>,
# <script>, <region>, <variant>, <grandfathered>, or <redundant>.
#
# The fields Added (required), Deprecated (optional), Description
# (one or more), and Comment (optional) are converted to elements
# <added>, <deprecated>, <description>, and <comment> in that order.
#
#### version 0.6 ##################################################
#
# Multiple descriptions and comments can be separated by an empty
# <alt /> element, squeezing them all into a single <description>
# or <comment> respectively. The cleaner approach is to allow
# multiple <description> or <comment> elements. The BEGIN block
# below sets MULT = 1 to select this style; change that line to
# MULT = 0 for the <alt /> style.
#
# The tags zh-cmn, zh-hakka, and yi-latn are handled as special
# cases for RFC 4646 registries, for 4646bis cmn is an ordinary
# <extlang> subtag.
#
# Known issues: If the language subtag review list introduces a
# language subtag with 5-8 characters which clashes with a variant
# subtag these subtags would get the same xml:id resulting in an
# XML syntax error. In practice it's unlikely that there ever
# will be any IANA language subtag not derived from ISO 639, let
# alone using the same string also for a variant subtag.
#
# "XML Notepad 2007", an experimental Microsoft tool, does not yet
# support xml:id attributes without an explicit declaration of the
# xml namespace. The W3C validator accepts xml:id without explicit
# namespace declaration.
#
#### version 0.7 ##################################################
#
# The Internet drafts 4646bis-08 and 4645bis-02 introduced a new
# optional field "Macrolanguage" for language and extlang subtags.
#
# Frank Ellermann, 2007
#
#### remove leading and trailing spaces ###########################
function STRIP( STR )
{   # Trim in one pass: a leading run and/or a trailing run of
    # blanks and tabs is replaced by nothing.  STR is a by-value
    # copy, so the caller's string is not modified.
    gsub( /^[\t ]+|[\t ]+$/, "", STR )
    return STR
}
#### add underscore to subtags starting with a digit ##############
function XMLID( STR )
{   # A subtag whose first character is a digit gets a "_" prefix;
    # anything else is returned unchanged.
    if ( STR ~ /^[0-9]/ )
        return "_" STR
    return STR
}
#### convert tag to IDREFS (subtags) ##############################
# Split the tag STR at "-" and return its subtags, each passed
# through XMLID(), joined by single spaces.  An empty STR yields
# an empty string.
# The names after the wide gap in the parameter list are locals
# (awk convention); callers pass STR only.  Keeping them local
# stops this function from overwriting same-named globals.
function IDREF( STR,    n, part, i, out )
{   n = split( STR, part, "-" )
    out = ""
    for ( i = 1; i <= n; ++i )
        out = out " " XMLID( part[ i ] )
    # drop the blank that precedes the first subtag
    return substr( out, 2 )
}
#### escape less-than (and greater-than) characters ###############
# Return STR with every "<" replaced by "&lt;" and every ">" by
# "&gt;".
# In a gsub() replacement an unescaped "&" stands for the matched
# text, so the ampersand has to be written as "\\&" to be emitted
# literally.
# NOTE(review): "&" itself is left untouched, presumably so that
# character references already present in the registry text pass
# through unchanged -- confirm against the input format.
function CANON( STR )
{   gsub( /</, "\\&lt;", STR )
    gsub( />/, "\\&gt;", STR )
    return STR
}
#### error ########################################################
function FATAL( STR )
{   # Report STR together with the current input record number on
    # standard output, clear the global success flag OKAY, and
    # return 1 so that callers can write "exit FATAL( ... )".
    printf "error near line %d: %s\n", NR, STR
    OKAY = 0
    return 1
}
#### save unfolded field body #####################################
function FIELD()
{   # Store the pending field (globals NAME and BODY).
    # description, comments and prefix may repeat and are appended
    # to the arrays D, C and P (counters DD, CC, PP); every other
    # field goes into F[ NAME ].
    if ( NAME == "description" )
    {   DD++
        D[ DD ] = BODY
        return
    }
    if ( NAME == "comments" )
    {   CC++
        C[ CC ] = BODY
        return
    }
    if ( NAME == "prefix" )
    {   PP++
        P[ PP ] = BODY
        return
    }
    # a non-empty value already stored under this name is an error
    if ( F[ NAME ] != "" )
        exit FATAL( NAME ": " BODY )
    F[ NAME ] = BODY
}
#### output record elements #######################################
# Emit one registry record as an XML element.
# Input (globals filled by FIELD()): F[] holds the single-valued
# fields, P[1..PP] the prefixes, D[1..DD] the descriptions and
# C[1..CC] the comments.  The element name is the lower-cased Type
# field.  Children are printed in this order: suppress, prefix,
# macro, subtag or tag, added, preferred, deprecated, description,
# comment.  Prefixes, descriptions and comments are printed from
# the last stored entry to the first, which leaves PP, DD and CC
# at 0.
# Any inconsistency ends the program via "exit FATAL( ... )".
function READY()
{   T = tolower( F[ "type" ] )
    if ( T == "" ) exit FATAL( "missing type" )
    L = "<" T
    S = F[ "subtag" ]
    if ( S == "" )
    {   # no Subtag field: the record must carry a whole Tag
        S = F[ "tag" ]
        if ( S == "" ) exit FATAL( "missing tag" )
        # only redundant elements get a subtags attribute
        if ( T != "redundant" ) print L ">"
        else if ( S == "yi-latn" )
            print L " subtags='yi Latn'>"
        else print L " subtags='" IDREF( S ) "'>"
        B = "\t<tag> " S " </tag>"
        # NOTE(review): S is a whole tag here, so a comparison with
        # the bare string "hak" looks unlikely to ever switch HACK
        # off -- confirm the intended condition.
        HACK = HACK && ( S != "hak" )
    }
    else if ( F[ "tag" ] == "" )
    {   print L " xml:id='" XMLID( S ) "'>"
        B = "\t<subtag> " S " </subtag>"
    }
    else exit FATAL( "conflicting subtag " S )

    S = F[ "suppress-script" ]
    if ( S != "" && T == "language" )
    {   L = "\t<suppress script='" S "'> "
        print L S " </suppress>"
    }
    else if ( S != "" )
        exit FATAL( "unexpected Suppress-Script: " S )

    if ( T == "extlang" && PP != 1 )
        exit FATAL( "missing or extraneous prefix" )
    while ( PP )
    {   L = "\t<prefix subtags='" IDREF( P[ PP ] ) "'> "
        print L P[ PP-- ] " </prefix>"
    }

    S = F[ "macrolanguage" ]
    if ( S != "" && ( T == "language" || T == "extlang" ))
    {   L = "\t<macro language='" S "'> "
        print L S " </macro>"
    }
    else if ( S != "" )
        exit FATAL( "unexpected Macrolanguage: " S )

    A = F[ "added" ]
    if ( A == "" ) exit FATAL( "missing date" )
    # the subtag or tag element shares a line with <added>
    print B "<added> " A " </added>"

    S = F[ "preferred-value" ]
    A = F[ "deprecated" ]
    if ( A != "" )
    {   L = "\t"
        if ( S != "" )
        {   L = L "<preferred"
            # while HACK is set, the preferred values zh-cmn and
            # zh-hakka refer to 'zh' only
            H = S == "zh-cmn" || S == "zh-hakka"
            if ( HACK && H )
                L = L " subtags='zh'> "
            else L = L " subtags='" IDREF( S ) "'> "
            L = L S " </preferred>"
        }
        print L "<deprecated> " A " </deprecated>"
    }
    else if ( S != "" )
        exit FATAL( "missing deprecated" )

    if ( ! DD ) exit FATAL( "missing description" )
    L = "\t<description> "
    while ( DD )
    {   L = L CANON( D[ DD-- ] )
        if ( DD )
        {   # more to come: start a new element (MULT) or insert
            # an <alt /> separator inside the same element
            if ( MULT )
            {   print L " </description> "
                L = "\t<description>"
            }
            else
            {   print L
                L = "\t<alt /> "
            }
        }
    }
    print L " </description>"

    if ( CC )
    {   L = "\t<comment> "
        while ( CC )
        {   L = L CANON( C[ CC-- ] )
            if ( CC )
            {   if ( MULT )
                {   print L " </comment> "
                    L = "\t<comment>"
                }
                else
                {   print L
                    L = "\t<alt /> "
                }
            }
        }
        print L " </comment>"
    }
    print "</" T ">"
    return
}
#### output DOCTYPE ###############################################
BEGIN {
    # global settings used by the rules and functions of this file
    ROOT = "LanguageSubtagRegistry"
    VERS = "ltru2xml/0.7"
    HACK = 1
    OKAY = 0
    MULT = 1
    if ( ! MULT ) VERS = VERS "alt"

    # XML declaration and start of the internal DTD subset
    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>"
    print "<!DOCTYPE " ROOT " ["

    # root element
    DTDEL( ROOT, "(language*, extlang*,\n\tscript*, region*, variant*, grandfathered*, redundant*)>" )
    DTDAT( ROOT, "\tdate NMTOKEN #REQUIRED" )

    # second line of every record content model
    if ( MULT )
        TAIL = " description+, comment*"
    else
        TAIL = " description, comment?"
    TAIL = "\n\tadded, (preferred?, deprecated)?," TAIL ")>"

    IDATT  = "\txml:id ID #REQUIRED"
    REFATT = "\tsubtags IDREFS #REQUIRED"

    # record elements
    DTDEL( "language", "(suppress?, macro?, subtag," TAIL )
    DTDAT( "language", IDATT )
    DTDEL( "extlang", "(prefix, macro?, subtag," TAIL )
    DTDAT( "extlang", IDATT )
    DTDEL( "script", "(subtag," TAIL )
    DTDAT( "script", IDATT )
    DTDEL( "region", "(subtag," TAIL )
    DTDAT( "region", IDATT )
    DTDEL( "variant", "(prefix*, subtag," TAIL )
    DTDAT( "variant", IDATT )
    DTDEL( "grandfathered", "(tag," TAIL )
    DTDEL( "redundant", "(tag," TAIL )
    DTDAT( "redundant", REFATT )

    # field elements
    DTDEL( "suppress", "(#PCDATA)><!-- a script subtag -->" )
    DTDAT( "suppress", "\tscript IDREF #REQUIRED" )
    DTDEL( "macro", "(#PCDATA)><!-- a macrolang subtag -->" )
    DTDAT( "macro", "\tlanguage IDREF #REQUIRED" )
    DTDEL( "prefix", "(#PCDATA)><!-- a prefix tag -->" )
    DTDAT( "prefix", REFATT )
    DTDEL( "added", "(#PCDATA)><!-- a date -->" )
    DTDEL( "deprecated", "(#PCDATA)><!-- a date -->" )
    DTDEL( "preferred", "(#PCDATA)><!-- a (sub)tag -->" )
    DTDAT( "preferred", REFATT )
    DTDEL( "subtag", "(#PCDATA)><!-- a (sub)tag -->" )
    DTDEL( "tag", "(#PCDATA)><!-- a (sub)tag -->" )

    # text elements: plain text, or text mixed with <alt />
    if ( MULT )
        TEXT = "(#PCDATA)><!-- text -->"
    else
        TEXT = "(#PCDATA | alt)*>"
    DTDEL( "description", TEXT )
    DTDEL( "comment", TEXT )
    if ( ! MULT )
        DTDEL( "alt", "EMPTY> <!-- separator -->" )

    # end of the DTD, followed by one empty line
    print "]>"
    print ""
}
# print an ELEMENT declaration; rest holds the content model
# including the closing ">" and an optional trailing comment
function DTDEL( name, rest )
{   print "<!ELEMENT " name " " rest
}
# print a two-line ATTLIST declaration with one attribute
function DTDAT( name, attr )
{   print "<!ATTLIST " name
    print attr ">"
}
#### record separator #############################################
# A line consisting of "%%" ends the current record.  The pattern
# uses plain "%" characters: "%" is not special in a regular
# expression and "\%" is not a defined escape sequence.
/^%%$/ {
    # store the last field of the record that just ended
    FIELD()
    if ( NN++ ) READY()
    else
    {   # the first record only carries the File-Date; it opens
        # the root element
        A = F[ "file-date" ]
        if ( A == "" )
            exit FATAL( "missing File-Date" )
        print "<" ROOT " date='" A "'>"
        OKAY = 1
    }
    # reset all per-record state for the next record
    for ( NAME in F ) delete F[ NAME ]
    NAME = "" ; CC = 0 ; PP = 0
    BODY = "" ; DD = 0 ; next
}
#### start of new field ###########################################
/^[A-Za-z0-9]/ {
    # a new field begins: store the previous one first
    FIELD()
    COLON = index( $0, ":" )
    if ( COLON == 0 ) exit FATAL( $0 )
    # lower-cased name before the first colon, trimmed body after it
    NAME = tolower( substr( $0, 1, COLON - 1 ))
    BODY = STRIP( substr( $0, COLON + 1 ))
    next
}
#### unfold field body ############################################
/^[\t ]/ {
    # continuation line: append its trimmed text to the pending
    # field body, separated by a single blank
    BODY = sprintf( "%s %s", BODY, STRIP( $0 ))
    next
}
#### garbage ######################################################
{   # reached only by lines that none of the rules above consumed
    # with "next": report the line and stop with the status that
    # FATAL() returns (1)
    STATUS = FATAL( $0 )
    exit STATUS
}
###################################################################
END {
    # "exit FATAL( ... )" in a rule or function still runs this END
    # rule.  OKAY is set to 1 when the first separator has been
    # processed and is cleared again by FATAL(), so NN > 0 with
    # OKAY == 0 means an error was already reported.  Stop here
    # without touching the half-processed record again.
    if ( NN && ! OKAY ) exit 1
    if ( NN )
    {   # flush the last field and the last record, close the root
        FIELD() ; READY()
        print "</" ROOT ">"
    }
    # trailer: program version and number of converted records
    print "<!-- " VERS " (" ( NN + 0 ) " records) -->"
    # status 0 only if at least the File-Date record was accepted
    exit 1 - OKAY
}