Web-LangTag/registries/ltru2xml.awk

#! /usr/common/bin/gawk -f
#
# Usage: ltru2xml.awk registry  > registry.xml
# Or   : ltru2xml.awk /dev/null > registry.dtd
#
# The File-Date record is noted in a 'date' attribute of the root
# element <LanguageSubtagRegistry>.
#
# Other records are converted to elements <language>, <extlang>,
# <script>, <region>, <variant>, <grandfathered>, or <redundant>.
#
# The fields Added (required), Deprecated (optional), Description
# (one or more), and Comment (optional) are converted to elements
# <added>, <deprecated>, <description>, and <comment> in that order.
#
#### version 0.6 ##################################################
#
# Multiple descriptions and comments can be separated by an empty
# <alt /> element, squeezing them all into a single <description>
# or <comment> resp.  The cleaner approach is to allow multiple
# <description> or <comment> elements.  The line MULT = 1 below
# selects this style; change it to MULT = 0 for the <alt /> style.
#
# The tags zh-cmn, zh-hakka, and yi-latn are handled as special
# cases for RFC 4646 registries, for 4646bis cmn is an ordinary
# <extlang> subtag.
#
# Known issues:  If the language subtag review list introduces a
# language subtag with 5-8 characters which clashes with a variant
# subtag these subtags would get the same xml:id resulting in an
# XML syntax error.  In practice it's unlikely that there ever
# will be any IANA language subtag not derived from ISO 639, let
# alone using the same string also for a variant subtag.
#
# "XML Notepad 2007", an experimental Microsoft tool, does not yet
# support xml:id attributes without an explicit declaration of the
# xml namespace.  The W3C validator accepts xml:id without explicit
# namespace declaration.
#
#### version 0.7 ##################################################
#
# The Internet drafts 4646bis-08 and 4645bis-02 introduced a new
# optional field "Macrolanguage" for language and extlang subtags.
#
#                                             Frank Ellermann, 2007
#
#### remove leading and trailing spaces ###########################
# Return STR without its leading and trailing blanks and tabs.
# Inner white space is left alone.
function STRIP( STR )
{
               # a single anchored alternation trims both ends at once
               gsub( /^[\t ]+|[\t ]+$/, "", STR )
               return STR
}
#### add underscore to subtags starting with a digit ##############
# Turn a subtag into a usable XML ID: a subtag that starts with a
# digit gets an underscore in front, anything else is returned as is.
function XMLID( STR )
{
               if ( STR ~ /^[0-9]/ )
                    return "_" STR
               return STR
}
#### convert tag to IDREFS (subtags) ##############################
# Convert a tag such as "zh-min-nan" into a blank separated list of
# XML IDs ("zh min nan"), passing every subtag through XMLID().
# An empty STR yields an empty string.
#
# Callers pass STR only.  PARTS, COUNT, K and OUT are local scratch
# variables declared as extra parameters (the usual awk idiom), so
# that the function no longer overwrites global variables.
function IDREF( STR,    PARTS, COUNT, K, OUT )
{              COUNT = split( STR, PARTS, "-" )
               OUT = ""
               for ( K = 1; K <= COUNT; ++K )
                    OUT = OUT " " XMLID( PARTS[ K ] )
               # drop the blank in front of the first ID
               return substr( OUT, 2 )
}
#### escape less-than (and greater-than) characters ###############
# Escape "<" and ">" in STR as the XML entities &lt; and &gt;.
# Ampersands already present in STR are passed through unchanged
# (presumably because the registry text carries &#x...; character
# references - confirm before adding any "&" escaping here).
function CANON( STR )
{
               # In a gsub() replacement a bare "&" stands for the
               # matched text, so "&lt;" would yield "<lt;".  The
               # escaped form "\\&" produces a literal ampersand.
               gsub( /</, "\\&lt;", STR )
               gsub( />/, "\\&gt;", STR )
               return STR
}
#### error ########################################################
# Report the problem STR together with the current input line number
# on standard output, clear the global success flag OKAY, and return
# 1 so that callers can write "exit FATAL( ... )".
function FATAL( STR )
{
               OKAY = 0
               print sprintf( "error near line %d: %s", NR, STR )
               return 1
}
#### save unfolded field body #####################################
# Store the field that has just been completed (globals NAME and
# BODY).  Description, Comments and Prefix may repeat and are
# appended to D[], C[] and P[] (counters DD, CC, PP).  Every other
# field goes to F[ NAME ]; a second non-empty value for the same
# name is a fatal error.
function FIELD()
{
               if ( NAME == "description" )
               {    D[ ++DD ] = BODY
                    return
               }
               if ( NAME == "comments" )
               {    C[ ++CC ] = BODY
                    return
               }
               if ( NAME == "prefix" )
               {    P[ ++PP ] = BODY
                    return
               }
               # single-valued field: refuse to overwrite a value
               if ( F[ NAME ] != "" )
                    exit FATAL( NAME ": " BODY )
               F[ NAME ] = BODY
}
#### output record elements #######################################
# Print the XML element for the record collected by FIELD().
#
# Input (globals): F[] with the single-valued fields, P[]/PP with
# the prefixes, D[]/DD with the descriptions, C[]/CC with the
# comments, and the flags HACK and MULT.
#
# Output order: start tag, <suppress>, <prefix>..., <macro>, then
# <subtag> or <tag> together with <added> on one line, optionally
# <preferred> with <deprecated>, the descriptions, the comments,
# and the end tag.  Prefixes, descriptions, and comments are taken
# from the highest index downwards, i.e. in reverse input order,
# which leaves PP, DD, and CC at zero.
#
# Any inconsistency ends the program with "exit FATAL( ... )".
# The scratch variables T, L, S, B, A, and H are globals.
function READY()
{              T = tolower( F[ "type" ] )
               if ( T == "" ) exit FATAL( "missing type" )
               L = "<" T

               # a record has either a Subtag or a Tag, never both
               S = F[ "subtag" ]
               if ( S == "" )
               {    S = F[ "tag" ]
                    if ( S == "" ) exit FATAL( "missing tag" )
                    # only <redundant> carries a subtags attribute;
                    # yi-latn needs the script ID in its real case
                    if ( T != "redundant" )  print L ">"
                    else if ( S == "yi-latn" )
                         print L " subtags='yi Latn'>"
                    else print L " subtags='" IDREF( S ) "'>"
                    B = "\t<tag> " S " </tag>"
                    # NOTE(review): S is a whole tag here, while
                    # "hak" looks like a subtag, so this test may
                    # have been meant for the subtag branch below.
                    # Behaviour kept as is - confirm.
                    HACK = HACK && ( S != "hak" )
               }
               else if ( F[ "tag" ] == "" )
               {    print L " xml:id='" XMLID( S ) "'>"
                    B = "\t<subtag> " S " </subtag>"
               }
               else exit FATAL( "conflicting subtag " S )

               # Suppress-Script is accepted for languages only
               S = F[ "suppress-script" ]
               if ( S != "" && T == "language" )
               {    L = "\t<suppress script='" S "'> "
                    print L S " </suppress>"
               }
               else if ( S != "" )
                    exit FATAL( "unexpected Suppress-Script: " S )

               # an extlang needs exactly one prefix
               if ( T == "extlang" && PP != 1 )
                    exit FATAL( "missing or extraneous prefix" )
               while ( PP )
               {    L = "\t<prefix subtags='" IDREF( P[ PP ] ) "'> "
                    print L P[ PP-- ] " </prefix>"
               }

               # Macrolanguage is accepted for language and extlang
               S = F[ "macrolanguage" ]
               if ( S != "" && ( T == "language" || T == "extlang" ))
               {    L = "\t<macro language='" S "'> "
                    print L S " </macro>"
               }
               else if ( S != "" )
                    exit FATAL( "unexpected Macrolanguage: " S )

               # B holds the <subtag> or <tag> element built above
               A = F[ "added" ]
               if ( A == "" ) exit FATAL( "missing date" )
               print B "<added> " A " </added>"

               # a Preferred-Value is only valid if Deprecated is set
               S = F[ "preferred-value" ]
               A = F[ "deprecated" ]
               if ( A != "" )
               {    L = "\t"
                    if ( S != "" )
                    {    L = L "<preferred"
                         # while HACK is set zh-cmn and zh-hakka
                         # refer to the ID of "zh" only
                         H = S == "zh-cmn" || S == "zh-hakka"
                         if ( HACK && H )
                              L = L " subtags='zh'> "
                         else L = L " subtags='" IDREF( S ) "'> "
                         L = L S " </preferred>"
                    }
                    print L "<deprecated> " A " </deprecated>"
               }
               else if ( S != "" )
                    exit FATAL( "missing deprecated" )

               # MULT: one <description> per entry, otherwise one
               # <description> with <alt /> between the entries
               if ( DD )
               {    L = "\t<description> "
                    while ( DD )
                    {    L = L CANON( D[ DD-- ] )
                         if ( DD )
                         {    if ( MULT )
                              {    print L " </description> "
                                   L = "\t<description>"
                              }
                              else
                              {    print L
                                   L = "\t<alt /> "
                    }    }    }
                    print L " </description>"
               }
               else exit FATAL( "missing description" )

               # comments are optional, same layout as descriptions
               if ( CC )
               {    L = "\t<comment> "
                    while ( CC )
                    {    L = L CANON( C[ CC-- ] )
                         if ( CC )
                         {    if ( MULT )
                              {    print L " </comment> "
                                   L = "\t<comment>"
                              }
                              else
                              {    print L
                                   L = "\t<alt /> "
                    }    }    }
                    print L " </comment>"
               }

               print "</" T ">"
               return
}
#### output DOCTYPE ###############################################
# Set up the global state, then print the XML declaration and the
# complete internal DTD subset before any input is read.  A, S, and
# L are scratch variables that get reused from section to section.
BEGIN          {    ROOT = "LanguageSubtagRegistry"
                    # HACK enables the 'zh' special case in READY()
                    HACK = 1
                    # OKAY is set by the record separator rule as
                    # soon as the File-Date record was accepted
                    OKAY = 0

                    # VERS is shown in the trailing comment (END)
                    VERS = "ltru2xml/0.7"
                    # MULT = 1: repeated <description> / <comment>
                    # MULT = 0: one element, parts split by <alt />
                    MULT = 1
                    if ( ! MULT ) VERS = VERS "alt"

                    # XML declaration and start of the DOCTYPE
                    L = "<?xml version=\"1.0\" "
                    L = L "encoding=\"UTF-8\" "
                    print L "standalone=\"yes\" ?>"
                    print "<!DOCTYPE " ROOT " ["

                    # root element with its required date attribute
                    A = "\tdate   NMTOKEN #REQUIRED"
                    S = "grandfathered*, redundant*"
                    print "<!ELEMENT " ROOT " (language*, extlang*,"
                    print "\tscript*, region*, variant*, " S ")>"
                    print "<!ATTLIST " ROOT
                    print A ">"

                    # A: xml:id attribute shared by all subtag types
                    # S: content model tail shared by all records
                    A = "\txml:id  ID     #REQUIRED"
                    S = "added, (preferred?, deprecated)?,"
                    if ( MULT )
                         S = S " description+, comment*"
                    else S = S " description, comment?"
# version 0.7: an optional <macro> precedes <subtag> in <language>
# and <extlang>
                    L = "macro?, subtag,"
                    print "<!ELEMENT language (suppress?, " L
                    print "\t" S ")>"
                    print "<!ATTLIST language"
                    print A ">"

                    print "<!ELEMENT extlang (prefix, " L
                    print "\t" S ")>"
                    print "<!ATTLIST extlang"
                    print A ">"

                    print "<!ELEMENT script (subtag,"
                    print "\t" S ")>"
                    print "<!ATTLIST script"
                    print A ">"

                    print "<!ELEMENT region (subtag,"
                    print "\t" S ")>"
                    print "<!ATTLIST region"
                    print A ">"

                    print "<!ELEMENT variant (prefix*, subtag,"
                    print "\t" S ")>"
                    print "<!ATTLIST variant"
                    print A ">"

                    # from here on A is the IDREFS attribute used by
                    # <redundant>, <prefix>, and <preferred>;
                    # <grandfathered> gets no attribute list
                    A = "\tsubtags IDREFS #REQUIRED"
                    print "<!ELEMENT grandfathered (tag,"
                    print "\t" S ")>"
                    print "<!ELEMENT redundant (tag,"
                    print "\t" S ")>"
                    print "<!ATTLIST redundant"
                    print A ">"

                    # leaf elements, each followed by a DTD comment
                    L = "<!-- a script subtag -->"
                    S = "\tscript  IDREF  #REQUIRED"
                    print "<!ELEMENT suppress     (#PCDATA)>" L
                    print "<!ATTLIST suppress"
                    print S ">"
# version 0.7: the <macro> element for the Macrolanguage field
                    L = "<!-- a macrolang subtag -->"
                    S = "\tlanguage IDREF #REQUIRED"
                    print "<!ELEMENT macro        (#PCDATA)>" L
                    print "<!ATTLIST macro"
                    print S ">"

                    L = "<!-- a prefix tag -->"
                    print "<!ELEMENT prefix       (#PCDATA)>" L
                    print "<!ATTLIST prefix"
                    print A ">"

                    L = "<!-- a date -->"
                    print "<!ELEMENT added        (#PCDATA)>" L
                    print "<!ELEMENT deprecated   (#PCDATA)>" L

                    L = "<!-- a (sub)tag -->"
                    print "<!ELEMENT preferred    (#PCDATA)>" L
                    print "<!ATTLIST preferred"
                    print A ">"

                    print "<!ELEMENT subtag       (#PCDATA)>" L
                    print "<!ELEMENT tag          (#PCDATA)>" L

                    # text elements: plain text with MULT, mixed
                    # content with <alt /> separators without it
                    if ( MULT )
                         L = "(#PCDATA)><!-- text -->"
                    else L = "(#PCDATA | alt)*>"
                    print "<!ELEMENT description  " L
                    print "<!ELEMENT comment      " L
                    if ( ! MULT )
                    {    L = "<!-- separator -->"
                         print "<!ELEMENT alt          EMPTY>    " L
                    }

                    # close the DTD; the bare print adds an empty
                    # line, as $0 is still empty in BEGIN
                    print "]>"
                    print
               }

#### record separator #############################################
# A "%%" line ends the current record.  The first record is the
# File-Date header and opens the root element; every later record
# is written by READY().  Afterwards all per-record state is reset.
/^\%\%$/       {    FIELD()
                    if ( ++NN == 1 )
                    {    A = F[ "file-date" ]
                         if ( A == "" )
                              exit FATAL( "missing File-Date" )
                         OKAY = 1
                         print "<" ROOT " date='" A "'>"
                    }
                    else READY()

                    # forget the fields of the finished record
                    split( "", F )
                    NAME = BODY = ""
                    CC = DD = PP = 0
                    next
               }
#### start of new field ###########################################
# A line starting with a letter or digit begins a new field: store
# the previous field, then split the line at its first colon into
# the lower case NAME and the stripped BODY.
/^[A-Za-z0-9]/ {    FIELD()
                    COLON = index( $0, ":" )
                    if ( COLON == 0 ) exit FATAL( $0 )
                    NAME = tolower( substr( $0, 1, COLON - 1 ))
                    BODY = STRIP( substr( $0, COLON + 1 ))
                    next
               }
#### unfold field body ############################################
# A line starting with white space continues the current field:
# append its stripped text to BODY, separated by one blank.
/^[\t ]/       {    BODY = sprintf( "%s %s", BODY, STRIP( $0 ))
                    next
               }
#### garbage ######################################################
               # anything not handled by the rules above is an error
               {    FATAL( $0 )
                    exit 1
               }
###################################################################
# Write the last record, close the root element, and add a trailing
# comment with the version and the number of "%%" separators seen.
# The exit status is 0 only for a completely converted registry.
#
# END also runs after an "exit FATAL( ... )" in one of the rules.
# OKAY is set at the first separator and cleared by FATAL(), so
# NN && OKAY is true only if the root element was opened and no
# error occurred.  In the error case the half-built record is not
# processed again, which used to cause misleading extra output.
END            {    if ( NN && OKAY )
                    {    FIELD()   ;   READY()
                         print "</" ROOT ">"
                    }
                    else OKAY = 0
                    print "<!-- " VERS " (" NN " records) -->"
                    exit 1 - OKAY
               }