From 85f4c9e1e51f8d4aeaa0a58a54a5b1abcabe3494 Mon Sep 17 00:00:00 2001
From: Stephane Bortzmeyer <stephane+chapril@bortzmeyer.org>
Date: Fri, 9 Jun 2023 10:02:30 +0200
Subject: [PATCH] Initial import

---
 Makefile                       |  26 +++
 README.md                      |   7 +-
 TODO                           |   4 +
 favicon.ico                    | Bin 0 -> 766 bytes
 find-subtags.xml               |  68 +++++++
 index.xml                      |  47 +++++
 ltru.css                       |  64 +++++++
 page.xslt                      | 101 ++++++++++
 philips-regexp.xml             |  15 ++
 register-new-subtag.xml        | 196 ++++++++++++++++++++
 registries.xml                 |  35 ++++
 registries/Makefile            |   9 +
 registries/copy-and-convert.sh |  93 ++++++++++
 registries/fill-in-database.sh |   6 +
 registries/lsr2atom.py         | 103 ++++++++++
 registries/ltru.dtd            |  74 ++++++++
 registries/ltru.rnc            |  71 +++++++
 registries/ltru2xml.awk        | 330 +++++++++++++++++++++++++++++++++
 registries/utf82ncr.py         |  33 ++++
 tag-wisely.xml                 |  59 ++++++
 test-suites.xml                |  24 +++
 web-site.xml                   |  18 ++
 whatare.xml                    |  29 +++
 why-tagging.xml                |  67 +++++++
 24 files changed, 1478 insertions(+), 1 deletion(-)
 create mode 100644 Makefile
 create mode 100644 TODO
 create mode 100644 favicon.ico
 create mode 100644 find-subtags.xml
 create mode 100644 index.xml
 create mode 100644 ltru.css
 create mode 100644 page.xslt
 create mode 100644 philips-regexp.xml
 create mode 100644 register-new-subtag.xml
 create mode 100644 registries.xml
 create mode 100644 registries/Makefile
 create mode 100755 registries/copy-and-convert.sh
 create mode 100755 registries/fill-in-database.sh
 create mode 100755 registries/lsr2atom.py
 create mode 100644 registries/ltru.dtd
 create mode 100644 registries/ltru.rnc
 create mode 100644 registries/ltru2xml.awk
 create mode 100755 registries/utf82ncr.py
 create mode 100644 tag-wisely.xml
 create mode 100644 test-suites.xml
 create mode 100644 web-site.xml
 create mode 100644 whatare.xml
 create mode 100644 why-tagging.xml

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..dfb3e44
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,43 @@
+# Build the static pages of www.langtag.net: each *.xml source of this
+# directory becomes an XHTML page, produced and validated by the %.html rule.
+
+# ALLHTML and ME are simply expanded (:=) so that the directory scan and
+# the hostname command run once, at parse time, not at every reference.
+ALLHTML:=$(patsubst %.xml,%.html,$(sort $(wildcard *.xml)))
+STYLESHEET=page.xslt
+IMAGES=ltag-icon-en.png favicon.ico
+ME:=$(shell hostname)
+# On host "lilith" the pages are installed in a local directory; from any
+# other host, the destination is a remote rsync path.
+ifeq ("${ME}","lilith")
+WEBSERVER=/var/www/www.langtag.net
+else
+WEBSERVER=bortzmeyer@www.langtag.net:/var/www/www.langtag.net
+endif
+GOOGLEVERIF=google75f3cadf7e9fc996.html
+
+# These targets are commands, not files: always run them when asked.
+.PHONY: all install clean
+
+# Delete a page whose transformation or validation failed; otherwise the
+# bad file would look up to date and "make install" would publish it.
+.DELETE_ON_ERROR:
+
+all: ${ALLHTML}
+
+# Transform one source with xsltproc, then validate the result with xmllint.
+# language-subtag-registry-version is copied here by registries/Makefile.
+%.html: %.xml ${STYLESHEET} language-subtag-registry-version
+	xsltproc --stringparam lsr-version `cat language-subtag-registry-version` \
+		 --output $@ ${STYLESHEET} $< && xmllint --noout --valid $@
+
+# Copy the content of ../SQL here, then send everything to ${WEBSERVER}.
+install: all
+	cp -a ../SQL/* .
+	touch ${GOOGLEVERIF}
+	rsync -q -a ${ALLHTML} ${GOOGLEVERIF} ${IMAGES} ltru.css registries test-suites PostgreSQL SQLite ${WEBSERVER}
+
+clean:
+	rm -f *.html
+
+
diff --git a/README.md b/README.md
index c80d55f..e21d0b8 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
 # Web-LangTag
 
-Code for the www.langtag.net Web site (language tag registry)
\ No newline at end of file
+Code for the [www.langtag.net](https://www.langtag.net/) Web site
+(language tag registry).
+
+The programs depend on the [GaBuZoMeu
+tools](https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html)
+and on some programs such as xsltproc and trang.
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..8023dc9
--- /dev/null
+++ b/TODO
@@ -0,0 +1,4 @@
+* Check all the links
+* Export database schemas (link to GaBuZoMeu?)
+* Configure Web site
+* HTTPS
diff --git a/favicon.ico b/favicon.ico
new file mode 100644
index 0000000000000000000000000000000000000000..a4367be4daad7125fffdcb13ce080297d0f09ba8
GIT binary patch
literal 766
zcmcJNy$*sv5QJAD*n)+&#Kgps!~|O^p9BB4_NTa(=d$8)EPVr<f#W<HwQ=EK=f0VR
z1VlzS(lo|3l*d?PCL$}W6sy4E{^zPhstRW%bzO@zjohCNZq4TysByL4xR%XLf`QJo
z+;|(eNYyuFs=k5PH)MK{?b`sHZDHvrARUIeo4|Bb=&?5dYY}Jw17Cp{Rp!S|6QgJ#
zw^+k6*9r8w!<^8@dyQ>k$L)0J(_-0XS`586SL!xRzttV4o|9W_OthOaZ!rQpFU=<=
j)D9aHL%KAu!H0R{%bXckm&CPflA#>1AH>%m<j;l|a7EeM

literal 0
HcmV?d00001

diff --git a/find-subtags.xml b/find-subtags.xml
new file mode 100644
index 0000000..51bfd3f
--- /dev/null
+++ b/find-subtags.xml
@@ -0,0 +1,68 @@
+<page title="Find a subtag for my language / dialect / script">
+<p>Very often, people who are not used to <a href="whatare.html">language
+tagging</a> hesitate before choosing a tag for a given
+combination of language, region, script, etc. You can find the result
+of these hesitations on the Web, with people tagging, for instance,
+<wikipedia name="Japanese language">japanese</wikipedia> as <code>jp</code> (the proper
+subtag for the japanese <em>language</em> is <code>ja</code>,
+<code>jp</code> is for the country) or using subtags that are not
+registered 
+because they did not find the valid
+ones. The purpose of this small text is to explain how to find a
+subtag registered in the <em>Language Subtag Registry</em>. There are
+many ways to do so, of course, and you are free to report
+better ways.</p>
+<p>The first thing to try is probably to use <a
+href="http://people.w3.org/rishida/utils/subtags/">Richard Ishida's
+Language Subtag Registry Search</a>. You can enter text which appears
+in the Description or Comments field of the registry and the
+corresponding subtags will be displayed. For instance, for japanese,
+it will correctly report <code>ja</code> (and the script
+<code>Jpan</code>, which indicates the mix of
+<wikipedia name="Kanji">Han</wikipedia>, <wikipedia>Hiragana</wikipedia> and
+<wikipedia>Katakana</wikipedia>).</p>
+<p>A more powerful, but probably less user-friendly, method, is to use
+the registry directly. Since its canonical form is more adapted to
+computer programs than to humans (for instance,
+<wikipedia>Unicode</wikipedia> characters are reported as XML escapes,
+like &amp;#xE7;), it may be better to use one of the <a
+href="registries.html">many unofficial forms</a>, automatically
+computed from the official one, and available for
+various environments. For instance, you may load the <a
+href="registries/language-subtag-registry-utf8">text version</a> and use the Find
+function of your Web browser (Control-F in
+<wikipedia>Firefox</wikipedia>). Say that you are not sure of the
+proper subtag for the <wikipedia name="Canadian Aboriginal Syllabics">canadian
+aboriginal script</wikipedia>, searching "canadian" that way soon discovers
+the subtag <code>Cans</code>.</p>
+<p>Both Richard Ishida's Web service and the above method have a
+limit: they only use information that is in the registry. If the relationship between common
+names and the tags is not in the registry, you will not find it. For
+instance, if you want to identify "<wikipedia name="British English">British English</wikipedia>", you have to realize
+that that is done by constructing the tag <code>en-GB</code> (not <code>en-UK</code>) from
+subtags in the registry. Similarly, if you want to
+write texts in <wikipedia name="Alsatian language">Alsatian</wikipedia>, and search this word
+in the registry, you won't find anything.</p>
+<p>You have to use external tools, but please check their results
+against the registry, with the tools mentioned above. A good search
+tool is
+<wikipedia>Wikipedia</wikipedia>. The english-speaking Wikipedia
+displays the <wikipedia>ISO</wikipedia> code names for most of the
+languages it talks about. Since most subtags in the registry are based
+on ISO standards, this works most of the time. For instance, the article on Alsatian will
+show the language code <code>gsw</code>
+(<wikipedia name="Alemannic German">Alemannic</wikipedia>), which, even if it is broader than the Alsatian dialect, is a good start.</p>
+<p>For languages (but not scripts or dialects), another very useful
+source is <a href="http://www.ethnologue.com/">Ethnologue</a>, which is
+managed by the <wikipedia>ISO 639</wikipedia> registration agency. It
+has a <a href="http://www.ethnologue.com/site_search.asp">search function</a> that allows you to use words that are not in the
+formal standard. For example,
+searching for "Alsatian", you'll find
+<code><a href="http://www.ethnologue.com/show_language.asp?code=gsw">http://www.ethnologue.com/show_language.asp?code=gsw</a></code>, where there is the
+comment: "Called 'Schwyzerd&#xFC;tsch' in Switzerland, and 'Alsatian' in France". But be careful:
+Ethnologue displays only 3-letter codes, while the registry
+uses 2-letter codes whenever they are available. For instance, <wikipedia name="French language">French</wikipedia> is
+<code>fr</code>, not <code>fra</code>.</p>
+<p>TODO: endonyms and exonyms</p>
+</page>
+
diff --git a/index.xml b/index.xml
new file mode 100644
index 0000000..ac8009f
--- /dev/null
+++ b/index.xml
@@ -0,0 +1,47 @@
+<page title="Home" pagetitle="Language Tags">
+<div class="menu3">
+<h2>For users</h2>
+<ul>
+<li><a href="whatare.html">What are language tags?</a></li>
+<li><a href="why-tagging.html">Why tagging</a>?</li>
+<li>Tag <a href="tag-wisely.html">wisely</a></li>
+<li>The <wikipedia name="IETF language code">Wikipedia article</wikipedia></li>
+<li>The <a
+href="http://www.w3.org/International/articles/language-tags/">excellent
+article from the W3C</a> about language tags</li>
+<li>Also from the <wikipedia>W3C</wikipedia>, a <a href="http://www.w3.org/International/tutorials/language-decl/">tutorial "Declaring Language in XHTML and HTML"</a></li>
+<li><a href="http://www.inter-locale.com">Other resources</a></li>
+<li><a href="http://people.w3.org/rishida/utils/subtags/">Search subtags online in the registry</a> (some explanations are provided, but this tool is more useful if you know the concepts behind the registry)</li>
+<li>Browse <a href="http://unicode.org/cldr/utility/languageid.jsp">language tags</a> and see their meaning</li>
+</ul>
+</div>
+<div class="menu3">
+<h2>For software developers</h2>
+<ul>
+<li>The subtag registry (version <em><lsr-version/></em>) in <a href="registries.html">various formats</a></li>
+<li><a href="http://unicode.org/cldr/data/tools/java/org/unicode/cldr/util/data/langtagRegex.txt">Mark Davis' parser (ICU)</a>, written as a <wikipedia name="Regular expression">regexp</wikipedia> suitable for <wikipedia>Perl</wikipedia> or <wikipedia>Java</wikipedia></li>
+<li><a href="http://www.dpawson.co.uk/java/rfc4646.html">Dave Pawson's implementation</a> in <wikipedia name="Java (programming language)">Java</wikipedia></li>
+<li>Martin D&#xFC;rst's implementation in <wikipedia name="Ruby (programming language)">Ruby</wikipedia> is available as a <wikipedia name="RubyGems">Gem</wikipedia> named "langtag" at <wikipedia>RubyForge</wikipedia></li>
+<li><a href="http://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html">St&#xE9;phane Bortzmeyer's implementation</a> in <wikipedia name="Haskell (programming language)">Haskell</wikipedia>.</li>
+<li><a href="philips-regexp.html">Addison Phillips' code</a>, as a <wikipedia name="Regular expression">regexp</wikipedia> for <wikipedia name="Java (programming language)">Java</wikipedia></li>
+<li>An <a
+href="http://www.sasakiatcf.com/felix/lta/05/lta.xsl">implementation</a>
+in <wikipedia>XSLT</wikipedia> (requires an XSLT 2.0 processor) and the
+<a href="http://www.sasakiatcf.com/felix/lta/">Web page it powers</a> (by Felix Sasaki).</li>
+<li><a href="test-suites.html">Test suites</a></li>
+</ul>
+</div>
+<div class="menu3">
+<h2>For standard authors</h2>
+<ul>
+<li>The current standard: <a href="http://www.rfc-editor.org/rfc/rfc5646.txt">RFC 5646</a> and <a href="http://www.rfc-editor.org/rfc/rfc4647.txt">RFC 4647</a></li>
+<li>How to <a href="register-new-subtag.html">register a new
+subtag</a> (for instance, for a variant of a language),</li>
+<li>A <a href="registries/lsr.atom">syndication feed</a> (<wikipedia name="Atom (standard)">Atom</wikipedia> format) of the changes in the registry</li>
+<li>The IETF <a href="http://www.ietf.org/html.charters/ltru-charter.html">LTRU Working Group</a></li>
+<li>Working on <a href="web-site.html">this Web site</a></li>
+</ul>
+</div>
+
+<p xml:lang="fr"><cite>Tout est al&#xE9;a, confusion et pr&#xE9;carit&#xE9;, sauf le Catalogue.</cite> (<wikipedia>Fred Vargas</wikipedia>)</p>
+</page>
diff --git a/ltru.css b/ltru.css
new file mode 100644
index 0000000..62d0dc8
--- /dev/null
+++ b/ltru.css
@@ -0,0 +1,62 @@
+/* Style sheet of the site; page.xslt links it from every generated page. */
+
+/* Left-floated column; index.xml uses three of them. */
+.menu3 {
+  float: left;
+  width: 30%;
+  padding: 1%;
+}
+
+/* Disabled rule, kept for reference:
+.menu3 ul {
+  list-style-type: none;
+}
+*/
+
+.menu3 h2 { text-align: center; }
+
+/* Block that stops floating and goes below the previous floats. */
+.back-to-normal {
+  margin-top: 7%;
+  clear: both;
+  float: none;
+  padding: 1%;
+  width: 98%;
+}
+
+body {
+  padding: 1%;
+  color: #000000;
+  background-color: #ffffff;
+  background-image: none;
+}
+
+/* Class of the h1 written by page.xslt. */
+.main-title {
+  text-align: center;
+  float: none;
+  clear: both;
+}
+
+em { font-weight: bolder; }
+
+/* Currently sets no declaration for these elements. */
+a, p, li, h1, h2, h3, pre { }
+
+a:visited {
+  /* Disabled: text-decoration: line-through; (blog-like :-) */
+}
+
+a:active { font-size: 125%; }
+
+code { font-size: 130%; }
+
+/* Id of the icon that page.xslt puts at the top of each page. */
+#ltag-icon { float: left; }
+
+#headline {
+  margin-left: 2%;
+  font-size: 140%;
+  font-weight: bolder;
+}
+
diff --git a/page.xslt b/page.xslt
new file mode 100644
index 0000000..0182c74
--- /dev/null
+++ b/page.xslt
@@ -0,0 +1,116 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!-- Turns a <page> source document into an XHTML 1.0 Strict page.
+     Besides markup copied as is, a source may use <wikipedia> (link to
+     the English-language Wikipedia) and <lsr-version/> (value of the
+     "lsr-version" parameter). -->
+<xsl:stylesheet version="1.0"
+  xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+  xmlns:str="http://exslt.org/strings"
+  exclude-result-prefixes="str">
+
+  <xsl:output method="xml"
+    encoding="UTF-8"
+    omit-xml-declaration="no"
+    doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
+    doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
+    indent="yes"/>
+
+  <!-- Registry version shown by <lsr-version/>; this default is used
+       when the caller does not set the parameter. -->
+  <xsl:param name="lsr-version">VERSION UNDEFINED</xsl:param>
+
+  <!-- Skeleton shared by all pages: head, icon, main title, content of
+       the source page, footer. -->
+  <xsl:template match="page">
+    <xsl:variable name="title" select="@title"/>
+    <!-- The h1 shows @pagetitle when there is one, @title otherwise. -->
+    <xsl:variable name="pagetitle">
+      <xsl:choose>
+        <xsl:when test="not(@pagetitle)">
+          <xsl:value-of select="@title"/>
+        </xsl:when>
+        <xsl:otherwise>
+          <xsl:value-of select="@pagetitle"/>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:variable>
+    <html xml:lang="en">
+      <head>
+        <link rel="stylesheet" type="text/css" href="ltru.css" />
+        <title>Language Tags: <xsl:value-of select="$title"/></title>
+      </head>
+      <body>
+        <div><img id="ltag-icon" src="ltag-icon-en.png" alt=""/></div>
+        <h1 class="main-title"><xsl:value-of select="$pagetitle"/></h1>
+        <xsl:apply-templates select="*"/>
+        <hr class="before-footer"/>
+        <p class="footer"><a href="index.html">Home</a>. Web site maintained by 
+        <code><a href="mailto:webmaster@langtag.net">webmaster@langtag.net</a></code>. 
+        Hosted by <a href="http://www.afnic.fr/">AFNIC</a>.
+      </p>
+      </body>
+    </html>
+  </xsl:template>
+
+  <!-- <lsr-version/> is replaced by the value of the parameter. -->
+  <xsl:template match="lsr-version">
+    <xsl:value-of select="$lsr-version"/>
+  </xsl:template>
+
+  <!-- Writes a link to the English-language Wikipedia. "text" is the
+       visible text; "link" is the article part of the URL and, when
+       empty, is replaced by "text". -->
+  <xsl:template name="wikipedia">
+    <xsl:param name="link"/>
+    <xsl:param name="text"/>
+    <xsl:variable name="actuallink">
+      <xsl:choose>
+        <xsl:when test="not($link = '')">
+          <xsl:value-of select="$link"/>
+        </xsl:when>
+        <xsl:otherwise>
+          <xsl:value-of select="$text"/>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:variable>
+    <a href="http://en.wikipedia.org/wiki/{$actuallink}"><xsl:value-of select="$text"/></a>
+  </xsl:template>
+
+  <!-- <wikipedia name="Article">text</wikipedia>: the article is @name
+       or, without @name, the text itself. Its name is URI-encoded when
+       the processor offers the EXSLT function str:encode-uri. -->
+  <xsl:template match="wikipedia">
+    <xsl:variable name="word">
+      <xsl:choose>
+        <xsl:when test="not(@name)">
+          <xsl:value-of select="text()"/>
+        </xsl:when>
+        <xsl:otherwise>
+          <xsl:value-of select="@name"/>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:variable>
+    <xsl:variable name="path">
+      <xsl:choose>
+        <xsl:when test="not(function-available('str:encode-uri'))">
+          <xsl:value-of select="$word"/>
+        </xsl:when>
+        <xsl:otherwise>
+          <xsl:value-of select="str:encode-uri($word, true())"/>
+        </xsl:otherwise>
+      </xsl:choose>
+    </xsl:variable>
+    <xsl:call-template name="wikipedia">
+      <xsl:with-param name="link" select="$path"/>
+      <xsl:with-param name="text" select="text()"/>
+    </xsl:call-template>
+  </xsl:template>
+
+  <!-- Everything else is copied unchanged to the output. -->
+  <xsl:template match="@*|node()">
+    <xsl:copy>
+      <xsl:apply-templates select="@*|node()"/>
+    </xsl:copy>
+  </xsl:template>
+
+</xsl:stylesheet>
diff --git a/philips-regexp.xml b/philips-regexp.xml
new file mode 100644
index 0000000..8843a43
--- /dev/null
+++ b/philips-regexp.xml
@@ -0,0 +1,15 @@
+<page title="Addison Phillips Java regexp for language tags">
+<p>Here is a <wikipedia>regular expression</wikipedia> to parse the <em>future</em> versions of
+<a href="index.html">language tags</a>. Suitable for the syntax of the RFC 5646. Written by Addison Phillips, <code>addison - at - amazon.com</code> for the <wikipedia name="Java (programming language)">Java programming language</wikipedia>.</p>
+<pre>
+     static final String langtag_ex =
+     "(\\A[xX]([\\x2d]\\p{Alnum}{1,8})*\\z)"
+       + "|(((\\A\\p{Alpha}{2,8}(?=\\x2d|\\z)){1}"
+       + "(([\\x2d]\\p{Alpha}{3})(?=\\x2d|\\z)){0,3}"
+       + "([\\x2d]\\p{Alpha}{4}(?=\\x2d|\\z))?"
+       + "([\\x2d](\\p{Alpha}{2}|\\d{3})(?=\\x2d|\\z))?"
+       + "([\\x2d](\\d\\p{Alnum}{3}|\\p{Alnum}{5,8})(?=\\x2d|\\z))*)"
+       + "(([\\x2d]([a-wyzA-WYZ](?=\\x2d))([\\x2d](\\p{Alnum}{2,8})+)*))*"
+       + "([\\x2d][xX]([\\x2d]\\p{Alnum}{1,8})*)?)\\z";
+</pre>
+</page>
\ No newline at end of file
diff --git a/register-new-subtag.xml b/register-new-subtag.xml
new file mode 100644
index 0000000..6b3b846
--- /dev/null
+++ b/register-new-subtag.xml
@@ -0,0 +1,196 @@
+<?xml version="1.0" encoding="us-ascii"?>
+<page title="How to register a new subtag">
+<p>The <em><a href="registries.html">language subtag registry</a></em>
+includes many <a href="whatare.html">subtags</a> identifying countries, languages or variants
+such as local dialects. Your favorite language and/or variant is
+probably already there. But, if it is not, you can ask for the
+registration of a new subtag. This text explains how, but the
+complete and authoritative explanation is in <a
+href="http://www.rfc-editor.org/rfc/rfc5646.txt">RFC 5646</a>,
+especially its section 3.5, "<a href="http://tools.ietf.org/html/rfc5646#section-3.5">Registration Procedure for Subtags</a>". It is
+recommended to read at least this section.</p>
+<p>Before you start, a warning: the process takes time, documentation
+and the ability to defend your proposal and to back it with facts and
+references. Just sending an email saying "People in my hometown speaks
+<wikipedia name="Alsatian language">Alsatian</wikipedia>, I want 'als' to be registered as a language subtag" is not sufficient.</p>
+<p>You have different sorts of subtags and the rules are not the same
+for all:</p>
+<ul>
+<li>Subtags for types "countries" or "scripts" cannot be registered
+directly with the <wikipedia>IETF</wikipedia>. You have to go through
+the maintenance agencies of <wikipedia name="International Organization for Standardization">ISO</wikipedia>, the language
+subtag registry managed by IETF copies the ISO standards here.</li>
+<li>Only subtags of types "language" and "variant" are therefore
+considered here. In practice, chances that a "language" subtag
+registration succeeds seem limited (you will probably be redirected to
+the maintenance agencies of <wikipedia>ISO 639</wikipedia>; if you
+already asked them and were turned down, prepare a very good proposal if
+you want the IETF to make another choice). We then concentrate on
+"variant" subtags.</li>
+</ul>
+<p>The process is the following (it is a simplified version; did I
+tell you to read the full story in <a href="http://tools.ietf.org/html/rfc5646#section-3.5">section 3.5</a> of <a
+href="http://www.rfc-editor.org/rfc/rfc5646.txt">RFC 5646</a>?):</p>
+<ol>
+<li>Collect background information, typically references to published
+descriptions of the language or dialect. A Wikipedia page is possible but
+may be insufficient, specially since the page may change
+easily. Stable references are preferred.</li>
+<li>Choose a subtag which must conform to the syntax rules explained
+in RFC 5646 (<a href="http://tools.ietf.org/html/rfc5646#section-2.1">section 2.1</a>). A variant subtag must be either a string of
+five to eight alphanumeric characters, <em>or</em> a string of four
+alphanumeric characters, starting with a digit. So, <code><wikipedia
+name="Valencia (province)">valencian</wikipedia></code> is illegal
+(too long) while <code>valencia</code> is legal. <code><wikipedia
+name="German spelling reform of 1996">1996</wikipedia></code> is legal, too, but
+not <code>732</code>.</li>
+<li>Fill-in the registration form whose template is:
+<pre>
+   LANGUAGE SUBTAG REGISTRATION FORM
+   1. Name of requester:
+   2. E-mail address of requester:
+   3. Record Requested:
+
+      Type:
+      Subtag:
+      Description:
+      Prefix:
+      Preferred-Value:
+      Deprecated:
+      Suppress-Script:
+      Comments:
+
+   4. Intended meaning of the subtag:
+   5. Reference to published description
+      of the language (book or article):
+   6. Any other relevant information:
+</pre> 
+
+Pay special attention to Prefix (in practice, most variants have a
+Prefix, which is the main language of this variant, such as
+<code><wikipedia name="Catalan language">ca</wikipedia></code> for
+valencian).<br/> Think twice about Description (in general a short
+one-line sentence) and Comments (which may be longer), because the
+consistency of tagging among different taggers will heavily depend on
+the quality of these fields.<br/>Keep
+detailed scholar references for the Reference section of the request:
+the registry is not a library.<br/> Some
+fields are typically not used for a variant such as
+Suppress-Script.</li>
+<li>Send it to the mailing list <code><a
+href="mailto:ietf-languages@iana.org">ietf-languages@iana.org</a></code>
+(you may choose to <a href="http://www.alvestrand.no/mailman/listinfo/ietf-languages">subscribe to the mailing list</a> before, to get an
+idea of the people and discussions, and to be sure to have the
+complete thread).</li>
+<li>Reply to questions, address objections, be prepared to modify your
+registration form and keep cool.</li>
+</ol>
+<p>Let's see a complete example showing many issues (thanks to CE
+Whitehead for the nice example). The current registry has three
+entries for the <wikipedia>french language</wikipedia>,
+<code>fr</code> for today's French, <code>frm</code> for Middle French
+(the language spoken during the <wikipedia>Renaissance</wikipedia>)
+and <code>fro</code> for Old French (the language spoken during the
+<wikipedia name="Middle Ages">Middle Ages</wikipedia>). This is not
+always sufficiently fine-grained to classify some old texts. So, here
+is a possible proposal to register a variant, <code>1606Nict</code>,
+for the late Middle French, as described in the famous <wikipedia
+name="Jean Nicot">Nicot</wikipedia>'s book:</p>
+<pre>
+LANGUAGE SUBTAG REGISTRATION FORM
+1. Name of requester:  C. E. Whitehead
+2. E-mail address of requester: cewcathar@hotmail.com
+3. Record Requested:
+Type: Variant
+Subtag:  1606Nict
+        (or alternately 16siecle)
+Description: Late Middle French
+Prefix: frm
+Preferred-Value:
+Deprecated:
+Suppress-Script:
+Comments: French as catalogued in Jean Nicot, "Thresor de la langue francoyse" 1606
+
+4. Intended meaning of the subtag:
+5. Reference to published description
+of the language (book or article):
+
+* Joachim du Bellay, La deffence et illustration de la langue francoyse,
+1549; ed critique by Henri Chamard, Geneve, Slatkine Rpt. 1969
+
+* Jean Nicot, "Thresor de la langue francoyse" 1606; ARTFL Project,
+University of Chicago:
+http://portail.atilf.fr/dictionnaires/TLF-NICOT/index.htm
+
+6. Any other relevant information:
+See second request below
+</pre>
+<p>Do note the detailed references and the use of Prefix to
+clearly state that it is a variant of Middle French.</p>
+<p>Let's see a second example from the same author, with the added
+difficulty that we use <wikipedia>XML</wikipedia>-like encoding for
+the composed characters (see section 3.1 of RFC 5646). This specifies the early modern French, as
+described by the <wikipedia name="Academie francaise">french academy</wikipedia>:</p>
+<pre>
+LANGUAGE SUBTAG REGISTRATION FORM
+1. Name of requester:  C. E. Whitehead
+2. E-mail address of requester: cewcathar@hotmail.com
+3. Record Requested:
+
+Type: Variant
+Subtag:  1694acad
+            (alternately 17siecle)
+Description: Early modern French
+Prefix: fr 
+Preferred-Value:
+Deprecated:
+Suppress-Script:
+Comments:  As catalogued in the "Dictionnaire de
+l'acad&amp;#xe9;mie fran&amp;#xe7;oise", 4eme ed. 1694; includes
+elements of Middle French; also new terms from the Americas
+
+4. Intended meaning of the subtag:
+5. Reference to published description
+of the language (book or article):
+
+* Dictionnaire de l'acad&#xE9;mie fran&#xE7;oise, 4eme ed. 1694; ARTFL Project,
+University of Chicago:
+http://portail.atilf.fr/dictionnaires/ACADEMIE/index.htm
+
+* F&#xE9;nelon, Fran&#xE7;ois de Salignac de La Mothe (1984), Fenelon's Letter to the
+French Academy : with an introduction and commentary.
+
+* Ayres-Bennett, Wendy (2004), Sociolinguistic variation in
+seventeenth-century France : methodology and case studies.
+
+also:
+* http://www.tsl.state.tx.us/treasures/giants/lasalle/lasalle-cover.html
+ http://teacherweb.com/FL/Cocoa/CEWhitehead/HTMLPage15.stm
+</pre>
+<p>It is probably useful to list some mistakes that people seem to
+make often. Keep in mind that:</p>
+<ul>
+<li>Language issues are always extremely passionate, both for
+psychological (people feel very strong about their language) and
+political reasons (wars have been fought about languages). Please, try
+to keep easy and do not forget that it is perfectly normal that an
+international audience does not know your language (or the language
+you champion) and does not see things the way you do.</li>
+<li>Pay attention to syntax issues (you may wish to ask a computer
+person, maybe with the help of some of the software tools listed <a
+href="/">on the home page</a>) and also be sure to fill in the form
+properly - or do not be surprised if the first reactions are on the
+syntax, not on the proposal itself. If the IETF yells at your form
+errors, do not assume it is a refusal of your language: it is simply a
+desire to enforce the documented process.</li>
+<li>The IETF is not a general appeal mechanism for other standard
+bodies decisions. Other standards are imperfect, true, but so is IETF
+work, too. Please do not use the IETF language registration mechanism just
+because ISO turned you down. Variant registration is typically fine
+because no other standard body does it.</li>
+</ul>
+ <p>Thanks
+for reading and good luck for your future subtag
+registrations. Remember: it may seem difficult but it is worth
+it.</p>
+</page>
diff --git a/registries.xml b/registries.xml
new file mode 100644
index 0000000..60cffbc
--- /dev/null
+++ b/registries.xml
@@ -0,0 +1,35 @@
+<page title="The registry in various formats">
+
+<p>Files available here were automatically produced from the official
+registry maintained by <wikipedia>IANA</wikipedia>. The current version of
+the registry is <em><lsr-version/></em>.</p>
+
+<ul>
+<li><a href="https://www.iana.org/assignments/language-subtag-registry">Official format</a></li>
+<li>As text files (one line per record, fields separated by
+tabulations, the subtag is the first field, the addition date the
+second):
+<ul>
+<li><a href="registries/lsr-language.txt">List of languages</a></li>
+<li><a href="registries/lsr-script.txt">List of scripts</a></li>
+<li><a href="registries/lsr-region.txt">List of regions</a> (including countries)</li>
+<li><a href="registries/lsr-variant.txt">List of variants</a> (orthography, local dialects)</li>
+<li><a href="registries/lsr-grandfathered.txt">List of grand-fathered</a> (tags which would otherwise be illegal but are maintained for compatibility, because they were previously used)</li>
+<li><a href="registries/lsr-redundant.txt">List of redundants</a> (subtags redundant with other subtags)</li>
+</ul>
+</li>
+<li>As <wikipedia>XML</wikipedia> :
+<ul>
+<li>According to <a href="registries/ltru.rnc">this schema</a> (written in <wikipedia>RelaxNG</wikipedia>): <a href="registries/language-subtag-registry.xml">registry in XML</a>, St&#xE9;phane Bortzmeyer's version</li>
+</ul>
+</li>
+<li>As <wikipedia>SQL</wikipedia>. Since SQL is not really portable, there are
+several versions :
+<ul>
+<li>For <wikipedia>PostgreSQL</wikipedia> (create the database with <a href="PostgreSQL/create-db-subtag.sql">this schema</a>) : <a href="registries/lsr-postgres.sql">registry in SQL</a>,</li>
+<li>For <wikipedia>SQLite</wikipedia> (create the database with <a href="SQLite/create-db-subtag.sql">this schema</a>) : <a href="registries/lsr-sqlite.sql">registry in SQL</a>,</li>
+</ul>
+</li>
+</ul>
+
+</page>
diff --git a/registries/Makefile b/registries/Makefile
new file mode 100644
index 0000000..53214ff
--- /dev/null
+++ b/registries/Makefile
@@ -0,0 +1,16 @@
+# Fetch and convert the IANA registry, then hand its version to the
+# parent directory, whose Makefile uses it to build the pages.
+
+# "all" and "clean" are commands, not files to build.
+.PHONY: all clean
+
+all:
+	./copy-and-convert.sh
+	cp ./language-subtag-registry-version ..
+
+# Schema converted by trang from ltru.rnc (-Irnc) to ltru.rng (-Orng).
+ltru.rng: ltru.rnc
+	trang -Irnc -Orng $< $@
+
+clean:
+	rm -f language-subtag-registry language-subtag-registry.xml language-subtag-registry2.xml lsr-*.txt
diff --git a/registries/copy-and-convert.sh b/registries/copy-and-convert.sh
new file mode 100755
index 0000000..30aed0a
--- /dev/null
+++ b/registries/copy-and-convert.sh
@@ -0,0 +1,103 @@
+#!/bin/sh
+
+# Download the IANA language subtag registry when it has changed, check it,
+# mail the differences to ${MAINTAINER}, then regenerate the derived formats
+# and write the registry version to ${LTR_LOCAL}-version.
+
+MYURL=https://www.langtag.net/
+LTR_URL=https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
+LTR_LOCAL=language-subtag-registry
+PROGRAMS_DIR=../../GaBuZoMeu
+TEST_PROGRAM=${PROGRAMS_DIR}/check-registry
+OS="$(uname)"
+if [ "$OS" = "FreeBSD" ]; then
+  # FreeBSD's mktemp is stupid enough to have *no*
+  # default template :-(
+  OUTPUT=`mktemp /tmp/$(basename $0).tmp.XXXXXX`
+  TMPDIFF=`mktemp /tmp/$(basename $0).tmp.XXXXXX`
+else
+  OUTPUT=`mktemp`
+  TMPDIFF=`mktemp`
+fi
+MAINTAINER=stephane+langtag@bortzmeyer.org
+
+# Conversions
+CONVERT_XML_BORTZMEYER=${PROGRAMS_DIR}/registry2xml
+CONVERT_XML_ELLERMANN="awk -f ltru2xml.awk "
+CONVERT_POSTGRESQL=${PROGRAMS_DIR}/registry2postgresql
+CONVERT_SQLITE=${PROGRAMS_DIR}/registry2sqlite
+CONVERT_TXT=${PROGRAMS_DIR}/registry2txt
+CONVERT_HTML=${PROGRAMS_DIR}/registry2mulhtml
+FILL_DATABASE=./fill-in-database.sh
+# --force is to avoid spurious warnings about "Ambiguous output"
+#CRLF_TO_LOCAL="recode --force /CR-LF..US-ASCII "
+
+# Remove the temporary files when the script ends or is interrupted.
+trap  "rm -f $OUTPUT $TMPDIFF; exit 1" 1 2 3 15
+trap  "rm -f $OUTPUT $TMPDIFF" EXIT
+
+if [ -e ${LTR_LOCAL} ]; then
+  ltr_date=`head -n 1 ${LTR_LOCAL} | cut -d" " -f2`
+  # Allow time to elapse. The date of the file at IANA is often the day after
+  # the date written in the LSR. Heuristically, we add one day and a few hours.
+  current_date=`date +"%Y%m%d %H:%M:%S" --date="${ltr_date} +1 day +4 hour"`
+else
+  # Trick to force a downloading
+  current_date="19700101"
+  #current_date=`date --utc +"%Y%m%d"`
+fi
+# --show-error: with --silent alone, curl would not even print its error
+# messages. stdout is redirected before "2>&1" so that these messages
+# really go to ${OUTPUT}, which is mailed in case of failure.
+curl --silent --show-error --output ${LTR_LOCAL}.TMP \
+    --compressed \
+    --referer ${MYURL} \
+    --proxy "" \
+    --time-cond "${current_date}" \
+    --header "From: ${MAINTAINER}" \
+  ${LTR_URL} > ${OUTPUT} 2>&1
+if [ $? != 0 ]; then
+    mutt -s "Network error getting ${LTR_URL}" ${MAINTAINER} < ${OUTPUT}
+    exit 1
+fi
+if [ -e ${LTR_LOCAL}.TMP ]; then
+    #$CRLF_TO_LOCAL ${LTR_LOCAL}.TMP
+    ${TEST_PROGRAM} ${LTR_LOCAL}.TMP >> ${OUTPUT} 2>&1
+    if [ $? = 0 ]; then
+        if [ -e ${LTR_LOCAL} ]; then
+            diff -u ${LTR_LOCAL} ${LTR_LOCAL}.TMP > $TMPDIFF
+            # -s: mail only when the diff is not empty (the file name
+            # itself is never an empty string).
+            if [ -s $TMPDIFF ]; then
+                mutt -s "New LTR registry at ${MYURL}" ${MAINTAINER} < $TMPDIFF
+            fi
+        fi
+        mv ${LTR_LOCAL}.TMP ${LTR_LOCAL}
+        # Now, the various conversions
+        ${CONVERT_XML_BORTZMEYER}
+        # trang is in Java and therefore fails frequently
+        # trang -Irnc -Orng ltru.rnc ltru.rng
+        xmllint --noout --relaxng ltru.rng ${LTR_LOCAL}.xml
+        ${CONVERT_TXT}
+        #${CONVERT_XML_ELLERMANN} < ${LTR_LOCAL} > ${LTR_LOCAL}2.xml
+        #xmllint --noout --valid ${LTR_LOCAL}2.xml
+        ${CONVERT_POSTGRESQL} > lsr-postgres.sql
+        ${CONVERT_SQLITE} > lsr-sqlite.sql
+        # TODO: UTF-8 support on SQLite was never tested
+        ./utf82ncr.py lsr-sqlite.sql
+        mv lsr-sqlite.sql lsr-sqlite-utf8.sql
+        mv lsr-sqlite-ncr.sql lsr-sqlite.sql
+        ${CONVERT_HTML}
+        ${FILL_DATABASE}
+        # Needs to be ported away from mx.DateTime
+        #./lsr2atom.py > lsr.atom
+        version=`head -n 1 ${LTR_LOCAL} | awk '{print $2}'`
+        echo $version > ${LTR_LOCAL}-version
+        exit 0
+    else
+        mutt -s "Invalid registry ${LTR_URL}" ${MAINTAINER} < ${OUTPUT}
+        exit 1
+    fi
+else # File not downloaded, probably because there was nothing new.
+    exit 0
+fi
diff --git a/registries/fill-in-database.sh b/registries/fill-in-database.sh
new file mode 100755
index 0000000..19ea814
--- /dev/null
+++ b/registries/fill-in-database.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+# Runs clean-postgres.sql, then lsr-postgres.sql, with psql against ${DATABASE}.
+DATABASE=lsr
+
+psql -f clean-postgres.sql ${DATABASE}
+psql -f lsr-postgres.sql ${DATABASE}
diff --git a/registries/lsr2atom.py b/registries/lsr2atom.py
new file mode 100755
index 0000000..1e85365
--- /dev/null
+++ b/registries/lsr2atom.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+__version__ = "BETA"
+domain = "langtag.net"
+tag_prefix = "tag:%s,2007-05:LSR" % domain
+
+import sys
+import urllib.request, urllib.parse, urllib.error
+import psycopg2
+# ElementTree is painful, with all its renamings :-(
+try:
+    import cElementTree as ET
+except ImportError:
+    try:
+        import ElementTree as ET
+    except ImportError:
+        # Now a standard part of Python >= 2.5
+        import xml.etree.ElementTree as ET
+import mx.DateTime as DateTime # TODO move to another package
+
+max = 10
+
+db_module = psycopg2
+
+def process_type(tree, type="language"):
+    request = ("SELECT code,description, added FROM %ss_with_descr" % type) + \
+              " ORDER BY added DESC LIMIT %(max)s"
+    cursor.execute(
+        request,
+        {'max': max})
+    for tuplee in cursor.fetchall():
+        code = tuplee[0]
+        description = tuplee[1]
+        added = tuplee[2]
+        utype = type.capitalize()
+        entry = ET.SubElement(tree, "entry")
+        title = ET.SubElement(entry, "title")
+        title.text = "%s: %s" % (utype, description)
+        entry_id = ET.SubElement(entry, "id")
+        entry_id.text = tag_prefix + "/" + urllib.parse.quote_plus("%s %s" % (type, code))
+        published = ET.SubElement(entry, "published")
+        published.text = added.strftime("%Y-%m-%dT00:00:00Z")
+        # TODO: records in the LSR are sometimes updated but it is not obvious to see it,
+        # since there is only an "Added" field.
+        updated = ET.SubElement(entry, "updated")
+        updated.text = published.text
+        category = ET.SubElement(entry, "category")
+        category.attrib["scheme"] = tag_prefix
+        category.attrib["term"] = type
+        category.attrib["label"] = utype
+        link = ET.SubElement(entry, "link")
+        link.attrib["rel"] = "alternate"
+        link.attrib["href"] = "http://www.%s/registries/registry-html/%s/%s.html" % \
+                              (domain, type, code)
+        content = ET.SubElement(entry, "content")
+        content.attrib["type"] = "text"
+        content.text = """
+        %s
+
+        %s
+
+        %s
+
+        Added on %s
+        """ % (type, code, description, added.strftime("%Y-%m-%d"))
+        # TODO: an alternate Content in HTML?
+        
+connection = db_module.connect("dbname=lsr")
+cursor = connection.cursor()
+
+feed = ET.Element("feed")
+feed.attrib["xmlns"] = "http://www.w3.org/2005/Atom"
+title = ET.SubElement(feed, "title")
+title.text = "Language Tag Registry syndication feed"
+updated = ET.SubElement(feed, "updated")
+updated.text = DateTime.now().strftime("%Y-%m-%dT%H:%M:00Z")
+link_html = ET.SubElement(feed, "link")
+link_html.attrib["rel"] = "alternate"
+link_html.attrib["type"] = "text/html"
+link_html.attrib["href"] = "http://www.%s/" % domain
+link_self = ET.SubElement(feed, "link")
+link_self.attrib["rel"] = "self"
+link_self.attrib["type"] = "application/atom+xml"
+link_self.attrib["href"] = "http://www.%s/registries/lsr.atom" % domain
+author = ET.SubElement(feed, "author")
+name = ET.SubElement(author, "name")
+name.text = "Stephane Bortzmeyer"
+email = ET.SubElement(author, "email")
+email.text = "webmaster@langtag.net"
+feed_id = ET.SubElement(feed, "id")
+feed_id.text = tag_prefix
+generator = ET.SubElement(feed, "generator")
+generator.text = "%s %s running with Python %s" % \
+             ("lsr2atom", __version__, sys.version.split()[0])
+
+process_type(feed, "language")
+process_type(feed, "variant")
+process_type(feed, "script")
+process_type(feed, "region")
+process_type(feed, "extlang")
+cursor.close()
+connection.close()
+print(ET.tostring(feed, encoding="UTF-8"))
diff --git a/registries/ltru.dtd b/registries/ltru.dtd
new file mode 100644
index 0000000..6ce56cc
--- /dev/null
+++ b/registries/ltru.dtd
@@ -0,0 +1,74 @@
+<!-- 
+
+Written by Frank Ellermann 
+
+TODO: does not seem consistent with the awk script which produces the XML:
+ltru2xml.awk emits a LanguageSubtagRegistry root and subtag/added child elements.
+-->
+
+<!ELEMENT ltru (language*, extlang*, script*, region*, variant*,
+grandfathered*, redundant*)>
+<!ATTLIST ltru
+        date NMTOKEN #REQUIRED
+>
+
+<!ELEMENT language (suppress?, deprecated?, description+, comment*)>
+<!ATTLIST language
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT extlang (prefix, deprecated?, description+, comment*)>
+<!ATTLIST extlang
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT script (deprecated?, description+, comment*)>
+<!ATTLIST script
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT region (deprecated?, description+, comment*)>
+<!ATTLIST region
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT variant (prefix*, deprecated?, description+, comment*)>
+<!ATTLIST variant
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT grandfathered (deprecated?, description+, comment*)>
+<!ATTLIST grandfathered
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT redundant (deprecated?, description+, comment*)>
+<!ATTLIST redundant
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT suppress EMPTY>
+<!ATTLIST suppress
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT prefix EMPTY>
+<!ATTLIST prefix
+        tag  NMTOKEN #REQUIRED
+>
+
+<!ELEMENT deprecated EMPTY>
+<!ATTLIST deprecated
+        date NMTOKEN #REQUIRED
+        tag  NMTOKEN #IMPLIED
+>
+
+<!ELEMENT description (#PCDATA)>
+<!ELEMENT comment (#PCDATA)>
diff --git a/registries/ltru.rnc b/registries/ltru.rnc
new file mode 100644
index 0000000..2a19f22
--- /dev/null
+++ b/registries/ltru.rnc
@@ -0,0 +1,71 @@
+# RelaxNG schema for the "language tag" registry specified in RFC 4646
+# and available at http://www.iana.org/assignments/language-subtag-registry
+
+# Not standard in any way, just an individual proposal
+
+# Stephane Bortzmeyer <bortzmeyer@nic.fr>
+
+# TODO: add Schematron rules for constraints such as "Records that
+# contain a 'Preferred-Value' field MUST also have a 'Deprecated'
+# field. " This specific constraint does not really require
+# Schematron, but others may.
+
+start = registry
+
+registry = element registry {date & languages & extlangs & scripts & regions & variants & 
+   redundants & grandfathereds} # TODO: extensions
+
+date = element date {xsd:date}
+
+languages = language*
+
+language = element language {subtag & common & scope? & suppress-script? & macrolanguage?}
+
+extlangs = extlang*
+
+extlang = element extlang {subtag & common & scope? & macrolanguage?}
+
+scripts = script*
+
+script = element script {subtag & common}
+
+regions = region*
+
+region = element region {subtag & common}
+
+variants = variant*
+
+variant = element variant {subtag & common & prefix*} # "Records of type 'variant' 
+                    # MAY have more than one field of type" 'Prefix'. 
+
+grandfathereds = grandfathered*
+
+grandfathered = element grandfathered {tag & common}
+
+redundants = redundant*
+
+redundant = element redundant {tag & common}
+
+common = added & descriptions & deprecated? & preferred-value?
+
+added = element added {xsd:date}
+
+suppress-script = element suppress-script {text}
+
+descriptions = description+ # Each record MUST contain the following fields
+
+description = element description {text}
+
+subtag = element subtag {text}
+
+tag = element tag {text}
+
+prefix = element prefix {text}
+
+macrolanguage = element macrolanguage {text}
+
+deprecated = element deprecated {xsd:date}
+
+preferred-value = element preferred-value {text}
+
+scope = element scope {text}
\ No newline at end of file
diff --git a/registries/ltru2xml.awk b/registries/ltru2xml.awk
new file mode 100644
index 0000000..b02038f
--- /dev/null
+++ b/registries/ltru2xml.awk
@@ -0,0 +1,330 @@
+#! /usr/common/bin/gawk -f
+#
+# Usage: ltru2xml.awk registry  > registry.xml
+# Or   : ltru2xml.awk /dev/null > registry.dtd
+#
+# The File-Date record is noted in a 'date' attribute of the root
+# element <LanguageSubtagRegistry>.
+#
+# Other records are converted to elements <language>, <extlang>,
+# <script>, <region>, <variant>, <grandfathered>, or <redundant>.
+#
+# The fields Added (required), Deprecated (optional), Description
+# (one or more), and Comment (optional) are converted to elements
+# <added>, <deprecated>, <description>, and <comment> in that order.
+#
+#### version 0.6 ##################################################
+#
+# Multiple descriptions and comments can be separated by an empty
+# <alt /> element, squeezing them all into a single <description>
+# or <comment> resp.  The cleaner approach is to allow multiple
+# <description> or <comment> elements.  Modify the line MULT = 0
+# below to MULT = 1 for this style.
+#
+# The tags zh-cmn, zh-hakka, and yi-latn are handled as special
+# cases for RFC 4646 registries, for 4646bis cmn is an ordinary
+# <extlang> subtag.
+#
+# Known issues:  If the language subtag review list introduces a
+# language subtag with 5-8 characters which clashes with a variant
+# subtag these subtags would get the same xml:id resulting in an
+# XML syntax error.  In practice it's unlikely that there ever
+# will be any IANA language subtag not derived from ISO 639, let
+# alone using the same string also for a variant subtag.
+#
+# "XML Notepad 2007", an experimental Microsoft tool, does not yet
+# support xml:id attributes without an explicit declaration of the
+# xml namespace.  The W3C validator accepts xml:id without explicit
+# namespace declaration.
+#
+#### version 0.7 ##################################################
+#
+# The Internet drafts 4646bis-08 and 4645bis-02 introduced a new
+# optional field "Macrolanguage" for language and extlang subtags.
+# 
+#                                             Frank Ellermann, 2007
+#
+#### trim blanks and tabs at both ends of STR #####################
+function STRIP( STR )
+{              # one pass: a leading run or a trailing run of blanks/tabs
+               gsub( /^[\t ]+|[\t ]+$/, "", STR )
+               return STR
+}
+#### prefix subtags that start with a digit with an underscore ####
+function XMLID( STR )
+{              if ( STR ~ /^[0-9]/ ) return "_" STR
+               return STR
+}
+#### turn a tag into a space-separated list of subtag IDs #########
+function IDREF( STR )
+{              N = split( STR, REF, "-" )
+               STR = ( N ? XMLID( REF[ 1 ] ) : "" )
+               for ( I = 2; I <= N; ++I )
+                    STR = STR " " XMLID( REF[ I ] )
+               return STR
+}
+#### escape less-than (and greater-than) as &lt; / &gt; ############
+function CANON( STR )
+{              gsub( /</, "\\&lt;", STR )   # "\\&" = literal ampersand;
+               gsub( />/, "\\&gt;", STR )   # a bare & is the matched text
+               return STR
+}
+#### error: print the message, clear OKAY, return 1 for "exit" ####
+function FATAL( STR )
+{              print "error near line " NR ": " STR
+               OKAY = 0  ;    return 1
+}
+#### store BODY under NAME; a repeated single-valued field is fatal
+function FIELD()
+{              if ( NAME == "description" )   D[ ++DD ] = BODY
+               else if ( NAME == "comments" ) C[ ++CC ] = BODY
+               else if ( NAME == "prefix" )   P[ ++PP ] = BODY
+               else if ( F[ NAME ] == "" )    F[ NAME ] = BODY
+               else exit FATAL( NAME ": " BODY )
+               return
+}
+#### print one record as XML from the collected F, D, C, P arrays ##
+function READY()
+{              T = tolower( F[ "type" ] )
+               if ( T == "" ) exit FATAL( "missing type" )
+               L = "<" T
+# opening tag: a record with a Subtag gets xml:id, one with a Tag does not
+               S = F[ "subtag" ]
+               if ( S == "" )
+               {    S = F[ "tag" ]
+                    if ( S == "" ) exit FATAL( "missing tag" )
+                    if ( T != "redundant" )  print L ">"
+                    else if ( S == "yi-latn" )
+                         print L " subtags='yi Latn'>"
+                    else print L " subtags='" IDREF( S ) "'>"
+                    B = "\t<tag> " S " </tag>"
+                    HACK = HACK && ( S != "hak" )
+               }
+               else if ( F[ "tag" ] == "" )
+               {    print L " xml:id='" XMLID( S ) "'>"
+                    B = "\t<subtag> " S " </subtag>"
+               }
+               else exit FATAL( "conflicting subtag " S )
+# Suppress-Script is accepted for language records only
+               S = F[ "suppress-script" ]
+               if ( S != "" && T == "language" )
+               {    L = "\t<suppress script='" S "'> "
+                    print L S " </suppress>"
+               }
+               else if ( S != "" )
+                    exit FATAL( "unexpected Suppress-Script " S )
+# an extlang takes exactly one Prefix; prefixes are printed last-to-first
+               if ( T == "extlang" && PP != 1 )
+                    exit FATAL( "missing or extraneous prefix" )
+               while ( PP )
+               {    L = "\t<prefix subtags='" IDREF( P[ PP ] ) "'> "
+                    print L P[ PP-- ] " </prefix>"
+               }
+# modified: Macrolanguage is accepted for language and extlang only
+               S = F[ "macrolanguage" ]
+               if ( S != "" && ( T == "language" || T == "extlang" ))
+               {    L = "\t<macro language='" S "'> "
+                    print L S " </macro>"
+               }
+               else if ( S != "" )
+                    exit FATAL( "unexpected Macrolanguage " S )
+# B (the <tag> or <subtag> line built above) is printed with <added>
+               A = F[ "added" ]
+               if ( A == "" ) exit FATAL( "missing date" )
+               print B "<added> " A " </added>"
+# Preferred-Value is only accepted together with Deprecated
+               S = F[ "preferred-value" ]
+               A = F[ "deprecated" ]
+               if ( A != "" )
+               {    L = "\t"
+                    if ( S != "" )
+                    {    L = L "<preferred"
+                         H = S == "zh-cmn" || S == "zh-hakka"
+                         if ( HACK && H )
+                              L = L " subtags='zh'> "
+                         else L = L " subtags='" IDREF( S ) "'> "
+                         L = L S " </preferred>"
+                    }
+                    print L "<deprecated> " A " </deprecated>"
+               }
+               else if ( S != "" )
+                    exit FATAL( "missing deprecated" )
+# descriptions, last-to-first: one element each if MULT, else <alt /> separated
+               if ( DD )
+               {    L = "\t<description> "
+                    while ( DD )
+                    {    L = L CANON( D[ DD-- ] )
+                         if ( DD )
+                         {    if ( MULT )
+                              {    print L " </description> "
+                                   L = "\t<description>"
+                              }
+                              else
+                              {    print L
+                                   L = "\t<alt /> "
+                    }    }    }
+                    print L " </description>"
+               }
+               else exit FATAL( "missing description" )
+# comments (optional), same layout as the descriptions
+               if ( CC )
+               {    L = "\t<comment> "
+                    while ( CC )
+                    {    L = L CANON( C[ CC-- ] )
+                         if ( CC )
+                         {    if ( MULT )
+                              {    print L " </comment> "
+                                   L = "\t<comment>"
+                              }
+                              else
+                              {    print L
+                                   L = "\t<alt /> "
+                    }    }    }
+                    print L " </comment>"
+               }
+
+               print "</" T ">"
+               return
+}
+#### BEGIN: set globals, print XML declaration and internal DTD ####
+BEGIN          {    ROOT = "LanguageSubtagRegistry"
+                    HACK = 1
+                    OKAY = 0
+# MULT = 1: one <description>/<comment> element per field (see file header)
+                    VERS = "ltru2xml/0.7"
+                    MULT = 1
+                    if ( ! MULT ) VERS = VERS "alt"
+
+                    L = "<?xml version=\"1.0\" "
+                    L = L "encoding=\"UTF-8\" "
+                    print L "standalone=\"yes\" ?>"
+                    print "<!DOCTYPE " ROOT " ["
+
+                    A = "\tdate   NMTOKEN #REQUIRED"
+                    S = "grandfathered*, redundant*"
+                    print "<!ELEMENT " ROOT " (language*, extlang*,"
+                    print "\tscript*, region*, variant*, " S ")>"
+                    print "<!ATTLIST " ROOT
+                    print A ">"
+# from here S is the content-model tail shared by every record element
+                    A = "\txml:id  ID     #REQUIRED"
+                    S = "added, (preferred?, deprecated)?,"
+                    if ( MULT )
+                         S = S " description+, comment*"
+                    else S = S " description, comment?"
+# modified:
+                    L = "macro?, subtag,"
+                    print "<!ELEMENT language (suppress?, " L
+                    print "\t" S ")>"
+                    print "<!ATTLIST language"
+                    print A ">"
+
+                    print "<!ELEMENT extlang (prefix, " L
+                    print "\t" S ")>"
+                    print "<!ATTLIST extlang"
+                    print A ">"
+
+                    print "<!ELEMENT script (subtag,"
+                    print "\t" S ")>"
+                    print "<!ATTLIST script"
+                    print A ">"
+
+                    print "<!ELEMENT region (subtag,"
+                    print "\t" S ")>"
+                    print "<!ATTLIST region"
+                    print A ">"
+
+                    print "<!ELEMENT variant (prefix*, subtag,"
+                    print "\t" S ")>"
+                    print "<!ATTLIST variant"
+                    print A ">"
+# from here A is the "subtags" attribute (redundant, prefix, preferred)
+                    A = "\tsubtags IDREFS #REQUIRED"
+                    print "<!ELEMENT grandfathered (tag,"
+                    print "\t" S ")>"
+                    print "<!ELEMENT redundant (tag,"
+                    print "\t" S ")>"
+                    print "<!ATTLIST redundant"
+                    print A ">"
+
+                    L = "<!-- a script subtag -->"
+                    S = "\tscript  IDREF  #REQUIRED"
+                    print "<!ELEMENT suppress     (#PCDATA)>" L
+                    print "<!ATTLIST suppress"
+                    print S ">"
+# modified:
+                    L = "<!-- a macrolang subtag -->"
+                    S = "\tlanguage IDREF #REQUIRED"
+                    print "<!ELEMENT macro        (#PCDATA)>" L
+                    print "<!ATTLIST macro"
+                    print S ">"
+
+                    L = "<!-- a prefix tag -->"
+                    print "<!ELEMENT prefix       (#PCDATA)>" L
+                    print "<!ATTLIST prefix"
+                    print A ">"
+
+                    L = "<!-- a date -->"
+                    print "<!ELEMENT added        (#PCDATA)>" L
+                    print "<!ELEMENT deprecated   (#PCDATA)>" L
+
+                    L = "<!-- a (sub)tag -->"
+                    print "<!ELEMENT preferred    (#PCDATA)>" L
+                    print "<!ATTLIST preferred"
+                    print A ">"
+
+                    print "<!ELEMENT subtag       (#PCDATA)>" L
+                    print "<!ELEMENT tag          (#PCDATA)>" L
+
+                    if ( MULT )
+                         L = "(#PCDATA)><!-- text -->"
+                    else L = "(#PCDATA | alt)*>"
+                    print "<!ELEMENT description  " L
+                    print "<!ELEMENT comment      " L
+                    if ( ! MULT )
+                    {    L = "<!-- separator -->"
+                         print "<!ELEMENT alt          EMPTY>    " L
+                    }
+
+                    print "]>"
+                    print
+               }
+
+#### "%%" line: store the pending field, then emit what it closes ##
+/^\%\%$/       {    FIELD()
+                    if ( NN++ )    READY()
+                    else
+                    {    A = F[ "file-date" ]
+                         if ( A == "" )
+                              exit FATAL( "missing File-Date" )
+                         print "<" ROOT " date='" A "'>"
+                         OKAY = 1
+                    }
+# reset the per-record state before the next record starts
+                    for ( NAME in F ) delete F[ NAME ]
+                    NAME = "" ;    CC = 0    ;    PP = 0
+                    BODY = "" ;    DD = 0    ;    next
+               }
+#### "Name: body" line: store the previous field, start a new one ##
+/^[A-Za-z0-9]/ {    FIELD()
+                    if ( ! match( $0, ":" )) exit FATAL( $0 )
+                    NAME = tolower( substr( $0, 1, RSTART - 1 ))
+                    BODY = STRIP( substr( $0, RSTART + 1 ))
+                    next
+               }
+#### continuation line (leading blank/tab): append to current BODY #
+/^[\t ]/       {    BODY = BODY " " STRIP( $0 )
+                    next
+               }
+#### any other line matched none of the rules above: fatal ########
+               {    exit FATAL( $0 )
+               }
+#### END: emit the last record, close the root, set exit status ####
+END            {    if ( NN++ )
+                    {    FIELD()   ;   READY()
+                         print "</" ROOT ">"
+                    }
+                    else OKAY = 0
+                    print "<!-- " VERS " (" --NN " records) -->"
+                    exit 1 - OKAY
+               }
\ No newline at end of file
diff --git a/registries/utf82ncr.py b/registries/utf82ncr.py
new file mode 100755
index 0000000..e2e1b92
--- /dev/null
+++ b/registries/utf82ncr.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+
+""" Converts an UTF-8 text file to an ASCII file with hexadecimal
+Numeric Character References (like &#x153;). """
+
+import sys
+import re
+
+extension = re.compile("^(.*)\.([a-z0-9_-]+)$", re.IGNORECASE)
+
+def convert(thematch):
+    codepoint = int(thematch.group(1), 16)
+    return chr(codepoint)
+
+for ifilename in sys.argv[1:]:
+    print("Converting %s..." % ifilename)
+    match = extension.search (ifilename)
+    if match:
+        ext_ifile = match.group(2)
+        ofilename = match.group(1) + "-ncr." + ext_ifile
+    else:
+        ofilename = ifilename + "-ncr"
+    ifile = open(ifilename, "r")
+    ofile = open(ofilename, "w")
+    data = ifile.read()
+    for ch in data:
+        if ord(ch) > 127:
+            ch = "&#x%x;" % ord(ch)
+        ofile.write(ch)
+    ifile.close()
+    ofile.close()
+    
+    
diff --git a/tag-wisely.xml b/tag-wisely.xml
new file mode 100644
index 0000000..e72570d
--- /dev/null
+++ b/tag-wisely.xml
@@ -0,0 +1,59 @@
+<page title="Tag wisely">
+<p>(Most of the content on this page comes directly from
+<wikipedia name="Request for Comments">RFC</wikipedia> 5646.)</p>
+
+<p>For the same body of text, you may have several possible
+tags. Interoperability is best served when all users use the same
+language tag for the same language. The rules here are intended to
+help in that respect.</p>
+
+<p>Subtags should only be used where they add useful distinguishing
+information; extraneous subtags interfere with the meaning,
+understanding, and processing of language tags. In particular, fields
+<code>Suppress-Script</code> in the registry should be obeyed: for
+instance, <code>fr</code> (<wikipedia name="French language">French</wikipedia>) has a
+<code>Suppress-Script: Latn</code> because the overwhelming majority
+of French texts are in the <wikipedia>Latin script</wikipedia>. Therefore, tagging text in French as
+<code>fr-Latn</code> is useless and confusing. A simple
+<code>fr</code> is enough. In the unlikely case that you meet French
+texts in the <wikipedia>Arabic script</wikipedia>, then you can add a subtag for the script:
+<code>fr-Arab</code>. (This is especially important since the former
+standard, in RFC 3066, did not have subtags for scripts and therefore
+old applications will have problems handling them.)</p>
+
+<p>Use as precise a tag as possible, but no more specific than is
+justified. Avoid using subtags that are not important for
+distinguishing content in an application. For example, <code>de</code>
+might suffice for tagging an email written in
+<wikipedia name="German language">German</wikipedia>, while <code>de-CH-1996</code>, while
+legal, is probably unnecessarily precise for such a task.</p>
+
+<p>But do not be too vague: the primary language subtag might not be
+sufficient to give all the information necessary to understand the
+text. For
+example, the tag <code>az</code> (for
+<wikipedia name="Azerbaijani language">Azerbaijani</wikipedia>) is probably insufficient in the
+absence of context, because this language has no dominant script. A person fluent in
+one script might not be able to read the other, even though the text
+might be identical.  Content tagged as <code>az</code> most probably is written
+in just one script and thus might not be intelligible to a reader
+familiar with the other script. <code>az-Latn</code>,
+<code>az-Cyrl</code> or <code>az-Arab</code> are probably necessary.</p>
+
+<p>If a tag or subtag has a <code>Preferred-Value</code> field in its registry
+entry, then the value of that field should be used to form the
+language tag. For example, use <code>he</code> for <wikipedia
+name="Hebrew language">Hebrew</wikipedia> in preference to
+<code>iw</code>.</p>
+
+<p>Validity of a tag is not everything. A tag may be both valid and
+meaningless. This is unavoidable with a generative system like the
+language subtag mechanism. So, <code>ar-Cyrl-AQ</code>
+(<wikipedia>Arabic</wikipedia> written with the <wikipedia name="Cyrillic alphabet">Cyrillic
+script</wikipedia>, as used in <wikipedia>Antarctica</wikipedia>) is
+perfectly valid but should nevertheless be avoided because it has no
+relationship with the reality (there is not a single document with
+these characteristics).</p>
+
+</page>
+
diff --git a/test-suites.xml b/test-suites.xml
new file mode 100644
index 0000000..5419b60
--- /dev/null
+++ b/test-suites.xml
@@ -0,0 +1,24 @@
+<page title="Test suites for language tag software">
+
+<ul>
+<li>St&#xE9;phane Bortzmeyer's test suites. The format is "tag
+whitespace optional-comment":
+<ul>
+<li><a href="test-suites/well-formed-tags.txt">Well-formed tags</a>
+(they may be invalid)</li>
+<li><a href="test-suites/broken-tags.txt">Not well-formed tags</a></li>
+<li><a href="test-suites/valid-tags.txt">Valid tags</a> (for the
+2006-11-15 registry)</li>
+<li><a href="test-suites/invalid-tags.txt">Invalid tags</a> (for the
+2006-11-15 registry, they may be valid now, the current registry is <lsr-version/>!)</li>
+</ul>
+</li>
+<li><a href="http://unicode.org/cldr/data/tools/java/org/unicode/cldr/util/data/langtagTest.txt">ICU test suite</a></li>
+<li>From the <wikipedia name="Formal grammar">grammar</wikipedia> in
+the RFC, you can use programs like <a href="http://www.quut.com/abnfgen/">abnfgen</a> or <a
+href="http://www.bortzmeyer.org/eustathius-test-grammars.html">eustathius</a> to generate tags for testing a
+parser. <em>Warning</em>: these tags will follow the grammar but may
+not be well-formed, since some rules are not in the grammar (for
+instance the rule that no two singletons must be identical).</li>
+</ul>
+</page>
\ No newline at end of file
diff --git a/web-site.xml b/web-site.xml
new file mode 100644
index 0000000..d041e52
--- /dev/null
+++ b/web-site.xml
@@ -0,0 +1,18 @@
+<page title="Management of this Web site">
+<p>The manager, <code><a
+href="mailto:webmaster@langtag.net">webmaster@langtag.net</a></code>
+is always glad to receive bug reports, fixes and improvements.</p>
+<p>The ideal way to send <wikipedia name="Patch
+(computing)">patches</wikipedia> is to retrieve the source and work on
+them.</p>
+<p>You can retrieve the source of the pages for this Web site with
+<wikipedia name="Subversion (software)">Subversion</wikipedia> at URL
+<code>https://svn.langtag.net/langtag/</code>. These sources are in
+<wikipedia>XML</wikipedia>, using a small superset of
+<wikipedia>XHTML</wikipedia> in a <code>&lt;page&gt;</code> element.</p>
+<p>The recommended method
+to request a change to the Web site is to send the <wikipedia
+name="Patch (computing)">patches</wikipedia> in <code>diff -u</code>
+format. If you cannot, send the modified XML page, and please try to
+convince your XML editor to do as little reformatting as possible.</p>
+</page>
\ No newline at end of file
diff --git a/whatare.xml b/whatare.xml
new file mode 100644
index 0000000..4949bb1
--- /dev/null
+++ b/whatare.xml
@@ -0,0 +1,29 @@
+<page title="What are they?">
+<p><em>Language tags</em> are a way to <em>tag</em> digital resources
+to indicate in what <wikipedia name="language">human
+language</wikipedia> they are. They are also used by software to tell
+a user's preference about languages.</p>
+<p>They can express the language itself but also the writing system,
+the national variant and many other things.</p>
+<p>A few examples of language tags:</p>
+<ul>
+<li><code>fr</code>: <wikipedia name="French language">French</wikipedia> language,</li>
+<li><code>en-AU</code>: <wikipedia name="English language">English</wikipedia> language, as
+written and spoken in <wikipedia>Australia</wikipedia>,</li>
+<li><code>az-Latn-IR</code>, <wikipedia name="Azerbaijani language">Azeri</wikipedia> language,
+written in the <wikipedia name="Latin alphabet">Latin</wikipedia> script, as used in <wikipedia>Iran</wikipedia>.</li>
+</ul>
+<p>They are specified in <wikipedia>IETF</wikipedia>
+<wikipedia name="Request for Comments">RFC</wikipedia> <em>5646</em> (<a
+href="http://www.rfc-editor.org/rfc/rfc5646.txt">available
+online</a>).</p>
+<p>Language tags are made of <em>subtags</em> separated by
+hyphens. The list of possible subtags is mostly directly copied from
+various <wikipedia>ISO</wikipedia> standards such as <wikipedia name="ISO
+639">ISO 639</wikipedia>.</p>
+<p>They are used in many formats and protocols for instance in
+<wikipedia>XML</wikipedia> (through the <code>xml:lang</code>
+attribute) and in <wikipedia>HTTP</wikipedia> (the browser can
+indicate to the <wikipedia>Web</wikipedia> server what language the
+user prefers, should the Web server have several versions).</p> 
+</page>
\ No newline at end of file
diff --git a/why-tagging.xml b/why-tagging.xml
new file mode 100644
index 0000000..0cc68e8
--- /dev/null
+++ b/why-tagging.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="utf-8"?>
+<page title="Why tagging?">
+<p>Executive summary: tagging your digital resources to indicate in
+what <wikipedia>language</wikipedia> they are written allows:</p>
+<ol>
+<li>Proper rendition,</li>
+<li>Correct behaviour of some software,</li>
+<li>Choice of the right tools,</li>
+<li>Correct filtering.</li>
+</ol>
+<h2>What is tagging?</h2>
+<p>Tagging is the process of giving <a href="whatare.html">language
+tags</a> to a digital resource. For instance, in legacy
+<wikipedia>HTML</wikipedia>, it is done with:</p>
+<pre>
+<![CDATA[
+<html lang="ar">
+<!-- Text in Arabic -->
+]]>
+</pre>
+<p>and in <wikipedia>XML</wikipedia> with the <code>xml:lang</code>
+special attribute:</p>
+<pre>
+<![CDATA[
+<book xml:lang="uk">
+<!-- Text in Ukrainian -->
+]]>
+</pre>
+<h2>What is tagging for?</h2>
+<p>The purpose of tagging is to give <em>unambiguous</em> information
+to the software processes that will handle the resource. For instance,
+properly rendering the content on the screen requires knowing the language it is
+written in. Actual <wikipedia>typography</wikipedia> rules are different for each
+language; language-independent rendition can only be an
+approximation. In the same way, knowing the language used is
+necessary for <wikipedia>speech synthesis</wikipedia>.</p>
+<p>Some programs may need the language to know what to do with
+requests like <wikipedia>CSS</wikipedia>' "first-letter"
+pseudo-property. The first letter of <wikipedia>Llobregat</wikipedia>
+is 'l' in <wikipedia name="English language">English</wikipedia> but
+'ll' in <wikipedia name="Spanish language">Spanish</wikipedia>.</p>
+<p>Tools like <wikipedia name="Spell checker">spell checkers</wikipedia> or an online dictionary must also be
+chosen depending on the language used.</p>
+<p>Language tagging also allows filters to keep only some documents,
+those written in a language that the user understands. At the present
+time, most <wikipedia>search engines</wikipedia>, like
+<wikipedia>Google</wikipedia>, use <a
+href="http://www.macchiato.com/slides/unicode_at_google.ppt">heuristics</a>
+to find out the language of a Web page. While it works fine to tell
+apart <wikipedia name="German language">German</wikipedia> from
+<wikipedia name="Japanese language">Japanese</wikipedia>, it is much
+more difficult with close languages like <wikipedia name="Danish
+language">Danish</wikipedia> and <wikipedia name="Norwegian
Norwegian">
+language">Norwegian</wikipedia>, especially if the text is short.</p>
+<h2>Current situation</h2>
+<p>At present, we are a bit stuck in a
+<wikipedia name="The chicken or the egg">chicken-and-egg</wikipedia> problem: many applications
+(like the search engines mentioned before) do not use the language
+information because it is not present or unreliable. Therefore,
+webmasters and other document maintainers are not keen on tagging
+because it brings no short-term benefits. Things are becoming better
+but certainly too slowly.</p>
+<h2>Further reading</h2>
+<ul>
+<li><a href="http://www.w3.org/TR/i18n-html-tech-lang/#ri20050208.091505539">Why specify language?</a> by the W3C</li>
+</ul>
+</page>