forked from bortzmeyer/Web-LangTag
330 lines
13 KiB
Awk
330 lines
13 KiB
Awk
#! /usr/common/bin/gawk -f
|
|
#
|
|
# Usage: ltru2xml.awk registry > registry.xml
|
|
# Or : ltru2xml.awk /dev/null > registry.dtd
|
|
#
|
|
# The File-Date record is noted in a 'date' attribute of the root
|
|
# element <LanguageSubtagRegistry>.
|
|
#
|
|
# Other records are converted to elements <language>, <extlang>,
|
|
# <script>, <region>, <variant>, <grandfathered>, or <redundant>.
|
|
#
|
|
# The fields Added (required), Deprecated (optional), Description
|
|
# (one or more), and Comment (optional) are converted to elements
|
|
# <added>, <deprecated>, <description>, and <comment> in that order.
|
|
#
|
|
#### version 0.6 ##################################################
|
|
#
|
|
# Multiple descriptions and comments can be separated by an empty
|
|
# <alt /> element, squeezing them all into a single <description>
|
|
# or <comment> resp. The cleaner approach is to allow multiple
|
|
# <description> or <comment> elements. The MULT setting in the
# BEGIN block below selects the style: 1 (as set now) emits multiple
# elements, 0 emits the single-element <alt /> style.
|
|
#
|
|
# The tags zh-cmn, zh-hakka, and yi-latn are handled as special
|
|
# cases for RFC 4646 registries, for 4646bis cmn is an ordinary
|
|
# <extlang> subtag.
|
|
#
|
|
# Known issues: If the language subtag review list introduces a
|
|
# language subtag with 5-8 characters which clashes with a variant
|
|
# subtag these subtags would get the same xml:id resulting in an
|
|
# XML syntax error. In practice it's unlikely that there ever
|
|
# will be any IANA language subtag not derived from ISO 639, let
|
|
# alone using the same string also for a variant subtag.
|
|
#
|
|
# "XML Notepad 2007", an experimental Microsoft tool, does not yet
|
|
# support xml:id attributes without an explicit declaration of the
|
|
# xml namespace. The W3C validator accepts xml:id without explicit
|
|
# namespace declaration.
|
|
#
|
|
#### version 0.7 ##################################################
|
|
#
|
|
# The Internet drafts 4646bis-08 and 4645bis-02 introduced a new
|
|
# optional field "Macrolanguage" for language and extlang subtags.
|
|
#
|
|
# Frank Ellermann, 2007
|
|
#
|
|
#### remove leading and trailing spaces ###########################
|
|
# STRIP( STR ): return STR with leading and trailing blanks removed.
# Only tabs and spaces count as blanks; interior blanks are kept.
function STRIP( STR )
{
    # One anchored pass trims both ends: the first alternative can only
    # match at the start of the string, the second only at its end.
    gsub( /^[\t ]+|[\t ]+$/, "", STR )
    return STR
}
|
|
#### add underscore to subtags starting with a digit ##############
|
|
# XMLID( STR ): return STR usable as an xml:id value.
# A string starting with a digit gets an underscore put in front of it;
# any other string is returned unchanged.
function XMLID( STR )
{
    if ( STR ~ /^[0-9]/ )
        STR = "_" STR
    return STR
}
|
|
#### convert tag to IDREFS (subtags) ##############################
|
|
# IDREF( STR ): convert a hyphen-separated tag into an IDREFS value.
# The tag is split at "-", every piece is passed through XMLID(), and
# the results are joined with single spaces.  An empty STR yields "".
#
# PARTS, COUNT, K, OUT and SEP are locals (awk's extra-parameter idiom);
# callers pass STR only.
function IDREF( STR,    PARTS, COUNT, K, OUT, SEP )
{
    COUNT = split( STR, PARTS, "-" )
    OUT = ""
    SEP = ""
    for ( K = 1; K <= COUNT; ++K )
    {
        OUT = OUT SEP XMLID( PARTS[ K ] )
        SEP = " "
    }
    return OUT
}
|
|
#### escape less-than (and greater-than) characters ###############
|
|
# CANON( STR ): return STR with "<" and ">" replaced by the XML
# entities "&lt;" and "&gt;".
# Ampersands in STR are left alone, so anything already written as an
# entity or character reference passes through unchanged.
function CANON( STR )
{
    # In a gsub() replacement a bare "&" stands for the matched text, so
    # the ampersand has to be backslash-escaped to come out literally.
    gsub( /</, "\\&lt;", STR )
    gsub( />/, "\\&gt;", STR )
    return STR
}
|
|
#### error ########################################################
|
|
# FATAL( STR ): report an error and mark the run as failed.
# Prints "error near line <NR>: <STR>", clears the global OKAY flag and
# returns 1, so that callers can write  exit FATAL( "..." ).
#
# The message goes to standard error: standard output carries the
# generated XML and is normally redirected into a file.
function FATAL( STR )
{
    print "error near line " NR ": " STR > "/dev/stderr"
    OKAY = 0
    return 1
}
|
|
#### save unfolded field body #####################################
|
|
# FIELD(): store the field collected in the globals NAME and BODY.
#   description -> appended to D[], counted in DD
#   comments    -> appended to C[], counted in CC
#   prefix      -> appended to P[], counted in PP
#   other names -> stored once in F[ NAME ]; a second non-empty value
#                  for the same name ends the run through FATAL().
function FIELD()
{
    if ( NAME == "description" ) { DD++ ; D[ DD ] = BODY ; return }
    if ( NAME == "comments" )    { CC++ ; C[ CC ] = BODY ; return }
    if ( NAME == "prefix" )      { PP++ ; P[ PP ] = BODY ; return }

    # Single-valued field: refuse to overwrite an existing value.
    if ( F[ NAME ] != "" )
        exit FATAL( NAME ": " BODY )
    F[ NAME ] = BODY
}
|
|
#### output record elements #######################################
|
|
# EMIT_TEXT( TAG, ITEMS, COUNT ): helper of READY().
# Prints the COUNT strings in ITEMS[ 1 .. COUNT ] as <TAG> content,
# starting with the highest index.  Each string goes through CANON().
# With MULT set every string gets its own <TAG> element; otherwise all
# strings share one element and are separated by empty <alt /> elements.
# LINE is a local variable.
function EMIT_TEXT( TAG, ITEMS, COUNT,    LINE )
{
    LINE = "\t<" TAG "> "
    while ( COUNT )
    {
        LINE = LINE CANON( ITEMS[ COUNT-- ] )
        if ( ! COUNT ) break
        if ( MULT )
        {
            # Close this element and open the next one, using the same
            # "<TAG> text </TAG>" spacing for every element.
            print LINE " </" TAG ">"
            LINE = "\t<" TAG "> "
        }
        else
        {
            print LINE
            LINE = "\t<alt /> "
        }
    }
    print LINE " </" TAG ">"
}

# READY(): write the record collected in F[], D[], C[] and P[] as one
# XML element named after its (lower-cased) Type field.
#
# Output order inside the element: <suppress>, <prefix>..., <macro>,
# <subtag> or <tag> followed by <added> on the same line, optional
# <preferred> plus <deprecated>, <description>..., <comment>...
#
# Any inconsistency (missing type, tag, date or description, a subtag
# together with a tag, misplaced Suppress-Script or Macrolanguage, wrong
# number of prefixes for an extlang, Preferred-Value without Deprecated)
# ends the run through  exit FATAL( ... ).
#
# Reads the globals F, D, C, P, MULT; updates HACK and leaves the
# counters PP, DD and CC at zero.  KIND, OPEN, VAL, IDLINE, DATE and
# ZHOLD are locals.
function READY(    KIND, OPEN, VAL, IDLINE, DATE, ZHOLD )
{
    KIND = tolower( F[ "type" ] )
    if ( KIND == "" ) exit FATAL( "missing type" )
    OPEN = "<" KIND

    #### start tag: whole tags carry 'subtags', subtags an xml:id ####
    VAL = F[ "subtag" ]
    if ( VAL == "" )
    {
        VAL = F[ "tag" ]
        if ( VAL == "" ) exit FATAL( "missing tag" )
        if ( KIND != "redundant" )
            print OPEN ">"
        else if ( VAL == "yi-latn" )
            # Special case: the reference is written as "Latn" here,
            # not as the lower-case "latn" IDREF() would derive.
            print OPEN " subtags='yi Latn'>"
        else
            print OPEN " subtags='" IDREF( VAL ) "'>"
        IDLINE = "\t<tag> " VAL " </tag>"
        # NOTE(review): VAL is a whole tag in this branch, so it looks as
        # if it can never equal "hak" and HACK never gets switched off;
        # confirm whether this test was meant for the subtag branch.
        HACK = HACK && ( VAL != "hak" )
    }
    else if ( F[ "tag" ] == "" )
    {
        print OPEN " xml:id='" XMLID( VAL ) "'>"
        IDLINE = "\t<subtag> " VAL " </subtag>"
    }
    else exit FATAL( "conflicting subtag " VAL )

    #### Suppress-Script is accepted for language records only ####
    VAL = F[ "suppress-script" ]
    if ( VAL != "" )
    {
        if ( KIND != "language" )
            exit FATAL( "unexpected Suppress-Script: " VAL )
        print "\t<suppress script='" VAL "'> " VAL " </suppress>"
    }

    #### prefixes: an extlang needs exactly one ####
    if ( KIND == "extlang" && PP != 1 )
        exit FATAL( "missing or extraneous prefix" )
    while ( PP )
    {
        OPEN = "\t<prefix subtags='" IDREF( P[ PP ] ) "'> "
        print OPEN P[ PP-- ] " </prefix>"
    }

    #### Macrolanguage is accepted for language and extlang only ####
    VAL = F[ "macrolanguage" ]
    if ( VAL != "" )
    {
        if ( KIND != "language" && KIND != "extlang" )
            exit FATAL( "unexpected Macrolanguage: " VAL )
        print "\t<macro language='" VAL "'> " VAL " </macro>"
    }

    #### <subtag>/<tag> and <added> share one output line ####
    DATE = F[ "added" ]
    if ( DATE == "" ) exit FATAL( "missing date" )
    print IDLINE "<added> " DATE " </added>"

    #### Preferred-Value is only valid together with Deprecated ####
    VAL = F[ "preferred-value" ]
    DATE = F[ "deprecated" ]
    if ( DATE != "" )
    {
        OPEN = "\t"
        if ( VAL != "" )
        {
            # While HACK is set, zh-cmn and zh-hakka are referenced
            # through the single ID "zh" instead of through IDREF().
            ZHOLD = ( VAL == "zh-cmn" || VAL == "zh-hakka" )
            if ( HACK && ZHOLD )
                OPEN = OPEN "<preferred subtags='zh'> "
            else
                OPEN = OPEN "<preferred subtags='" IDREF( VAL ) "'> "
            OPEN = OPEN VAL " </preferred>"
        }
        print OPEN "<deprecated> " DATE " </deprecated>"
    }
    else if ( VAL != "" )
        exit FATAL( "missing deprecated" )

    #### at least one description, any number of comments ####
    if ( ! DD ) exit FATAL( "missing description" )
    EMIT_TEXT( "description", D, DD )
    DD = 0

    if ( CC ) EMIT_TEXT( "comment", C, CC )
    CC = 0

    print "</" KIND ">"
}
|
|
#### output DOCTYPE ###############################################
|
|
# BEGIN: initialise the global settings and print the XML declaration
# followed by the internal DTD subset.
#
# Globals set here and used by the rest of the script:
#   ROOT  name of the root element
#   HACK  1 while the zh-cmn / zh-hakka special case in READY() applies
#   OKAY  0 until the first record has been accepted
#   VERS  version string printed in the trailing comment
#   MULT  1: several <description>/<comment> elements per record,
#         0: one element each, values separated by <alt />.
#         Defaults to 1; override with  gawk -v MULT=0 -f ltru2xml.awk ...
#         (-v is applied before BEGIN runs, so the DTD printed here and
#         the records printed later agree).
# L, A and S are scratch strings.
BEGIN {
    ROOT = "LanguageSubtagRegistry"
    HACK = 1
    OKAY = 0

    VERS = "ltru2xml/0.7"
    if ( MULT == "" ) MULT = 1
    if ( ! MULT ) VERS = VERS "alt"

    #### XML declaration and start of the DOCTYPE ####
    L = "<?xml version=\"1.0\" "
    L = L "encoding=\"UTF-8\" "
    print L "standalone=\"yes\" ?>"
    print "<!DOCTYPE " ROOT " ["

    #### root element ####
    A = "\tdate NMTOKEN #REQUIRED"
    S = "grandfathered*, redundant*"
    print "<!ELEMENT " ROOT " (language*, extlang*,"
    print "\tscript*, region*, variant*, " S ")>"
    print "<!ATTLIST " ROOT
    print A ">"

    #### subtag records: S is the content-model tail they all share ####
    A = "\txml:id ID #REQUIRED"
    S = "added, (preferred?, deprecated)?,"
    if ( MULT )
        S = S " description+, comment*"
    else
        S = S " description, comment?"
    L = "macro?, subtag,"

    print "<!ELEMENT language (suppress?, " L
    print "\t" S ")>"
    print "<!ATTLIST language"
    print A ">"

    print "<!ELEMENT extlang (prefix, " L
    print "\t" S ")>"
    print "<!ATTLIST extlang"
    print A ">"

    print "<!ELEMENT script (subtag,"
    print "\t" S ")>"
    print "<!ATTLIST script"
    print A ">"

    print "<!ELEMENT region (subtag,"
    print "\t" S ")>"
    print "<!ATTLIST region"
    print A ">"

    print "<!ELEMENT variant (prefix*, subtag,"
    print "\t" S ")>"
    print "<!ATTLIST variant"
    print A ">"

    #### whole-tag records; only <redundant> gets the attribute ####
    A = "\tsubtags IDREFS #REQUIRED"
    print "<!ELEMENT grandfathered (tag,"
    print "\t" S ")>"
    print "<!ELEMENT redundant (tag,"
    print "\t" S ")>"
    print "<!ATTLIST redundant"
    print A ">"

    #### leaf elements ####
    L = "<!-- a script subtag -->"
    S = "\tscript IDREF #REQUIRED"
    print "<!ELEMENT suppress (#PCDATA)>" L
    print "<!ATTLIST suppress"
    print S ">"

    L = "<!-- a macrolang subtag -->"
    S = "\tlanguage IDREF #REQUIRED"
    print "<!ELEMENT macro (#PCDATA)>" L
    print "<!ATTLIST macro"
    print S ">"

    # A still holds the 'subtags IDREFS' attribute declaration.
    L = "<!-- a prefix tag -->"
    print "<!ELEMENT prefix (#PCDATA)>" L
    print "<!ATTLIST prefix"
    print A ">"

    L = "<!-- a date -->"
    print "<!ELEMENT added (#PCDATA)>" L
    print "<!ELEMENT deprecated (#PCDATA)>" L

    L = "<!-- a (sub)tag -->"
    print "<!ELEMENT preferred (#PCDATA)>" L
    print "<!ATTLIST preferred"
    print A ">"

    print "<!ELEMENT subtag (#PCDATA)>" L
    print "<!ELEMENT tag (#PCDATA)>" L

    #### text elements; <alt /> exists only in the single-element style ####
    if ( MULT )
        L = "(#PCDATA)><!-- text -->"
    else
        L = "(#PCDATA | alt)*>"
    print "<!ELEMENT description " L
    print "<!ELEMENT comment " L
    if ( ! MULT )
    {
        L = "<!-- separator -->"
        print "<!ELEMENT alt EMPTY> " L
    }

    print "]>"
    print ""
}
|
|
|
|
#### record separator #############################################
|
|
# A line consisting of "%%" ends the current record.
# The first separator ends the File-Date header record, which opens the
# root element; every later separator writes the finished record with
# READY().  Afterwards all per-record state is cleared.
/^%%$/ {
    FIELD()                     # store the last field of the record
    if ( NN++ )
        READY()
    else
    {
        A = F[ "file-date" ]
        if ( A == "" )
            exit FATAL( "missing File-Date" )
        print "<" ROOT " date='" A "'>"
        OKAY = 1
    }

    # Reset the collectors for the next record.
    delete F
    NAME = "" ; BODY = ""
    CC = 0 ; PP = 0 ; DD = 0
    next
}
|
|
#### start of new field ###########################################
|
|
# A line starting with a letter or digit begins a new field.
# The previous field is stored first; the line is then cut at its first
# colon into a lower-cased NAME and a blank-trimmed BODY.  A line
# without a colon ends the run through FATAL().
/^[A-Za-z0-9]/ {
    FIELD()
    if ( match( $0, ":" ) == 0 )
        exit FATAL( $0 )
    # RSTART is the position of the colon found by match().
    BODY = STRIP( substr( $0, RSTART + 1 ) )
    NAME = tolower( substr( $0, 1, RSTART - 1 ) )
    next
}
|
|
#### unfold field body ############################################
|
|
# A line starting with a tab or space continues the current field: its
# trimmed text is appended to BODY after a single space.
/^[\t ]/ {
    BODY = BODY " " STRIP( $0 )
    next
}

#### garbage ######################################################

# Every rule above ends with "next", so a line that gets here matched
# none of them (an empty line, for instance) and ends the run.
{
    exit FATAL( $0 )
}
|
|
###################################################################
|
|
# END: write the last record, close the root element, print a trailing
# comment with the version and record count, and set the exit status
# (0 on success, 1 otherwise).
#
# awk also runs END after an "exit" in a main rule.  In that case OKAY
# is 0 and the half-read record is not finalised; doing so would only
# repeat the error or print a partial record.
END {
    if ( NN && OKAY )
    {
        FIELD()
        READY()
        print "</" ROOT ">"
    }
    else OKAY = 0

    # NN counts the "%%" separators seen, which equals the number of
    # records after the File-Date header; "+ 0" prints 0 when NN is unset.
    print "<!-- " VERS " (" ( NN + 0 ) " records) -->"
    exit 1 - OKAY
}