From 85f4c9e1e51f8d4aeaa0a58a54a5b1abcabe3494 Mon Sep 17 00:00:00 2001 From: Stephane Bortzmeyer Date: Fri, 9 Jun 2023 10:02:30 +0200 Subject: [PATCH] Initial import --- Makefile | 26 +++ README.md | 7 +- TODO | 4 + favicon.ico | Bin 0 -> 766 bytes find-subtags.xml | 68 +++++++ index.xml | 47 +++++ ltru.css | 64 +++++++ page.xslt | 101 ++++++++++ philips-regexp.xml | 15 ++ register-new-subtag.xml | 196 ++++++++++++++++++++ registries.xml | 35 ++++ registries/Makefile | 9 + registries/copy-and-convert.sh | 93 ++++++++++ registries/fill-in-database.sh | 6 + registries/lsr2atom.py | 103 ++++++++++ registries/ltru.dtd | 74 ++++++++ registries/ltru.rnc | 71 +++++++ registries/ltru2xml.awk | 330 +++++++++++++++++++++++++++++++++ registries/utf82ncr.py | 33 ++++ tag-wisely.xml | 59 ++++++ test-suites.xml | 24 +++ web-site.xml | 18 ++ whatare.xml | 29 +++ why-tagging.xml | 67 +++++++ 24 files changed, 1478 insertions(+), 1 deletion(-) create mode 100644 Makefile create mode 100644 TODO create mode 100644 favicon.ico create mode 100644 find-subtags.xml create mode 100644 index.xml create mode 100644 ltru.css create mode 100644 page.xslt create mode 100644 philips-regexp.xml create mode 100644 register-new-subtag.xml create mode 100644 registries.xml create mode 100644 registries/Makefile create mode 100755 registries/copy-and-convert.sh create mode 100755 registries/fill-in-database.sh create mode 100755 registries/lsr2atom.py create mode 100644 registries/ltru.dtd create mode 100644 registries/ltru.rnc create mode 100644 registries/ltru2xml.awk create mode 100755 registries/utf82ncr.py create mode 100644 tag-wisely.xml create mode 100644 test-suites.xml create mode 100644 web-site.xml create mode 100644 whatare.xml create mode 100644 why-tagging.xml diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..dfb3e44 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +ALLHTML=$(shell ls *.xml 2> /dev/null | sed 's/.xml$$/.html/' ) +STYLESHEET=page.xslt 
+IMAGES=ltag-icon-en.png favicon.ico +ME=$(shell hostname) +ifeq ("${ME}","lilith") +WEBSERVER=/var/www/www.langtag.net +else +WEBSERVER=bortzmeyer@www.langtag.net:/var/www/www.langtag.net +endif +GOOGLEVERIF=google75f3cadf7e9fc996.html + +all: ${ALLHTML} + +%.html: %.xml ${STYLESHEET} language-subtag-registry-version + xsltproc --stringparam lsr-version `cat language-subtag-registry-version` \ + --output $@ ${STYLESHEET} $< && xmllint --noout --valid $@ + +install: all + cp -a ../SQL/* . + touch ${GOOGLEVERIF} + rsync -q -a ${ALLHTML} ${GOOGLEVERIF} ${IMAGES} ltru.css registries test-suites PostgreSQL SQLite ${WEBSERVER} + +clean: + rm -f *.html + + diff --git a/README.md b/README.md index c80d55f..e21d0b8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ # Web-LangTag -Code for the www.langtag.net Web site (language tag registry) \ No newline at end of file +Code for the [www.langtag.net](https://www.langtag.net/) Web site +(language tag registry). + +The programs depend on the [GaBuZoMeu +tools](https://www.bortzmeyer.org/gabuzomeu-parsing-language-tags.html) +and of some programs like xsltproc and trang. diff --git a/TODO b/TODO new file mode 100644 index 0000000..8023dc9 --- /dev/null +++ b/TODO @@ -0,0 +1,4 @@ +* Check all the links +* Export database schemas (link to GaBuZoMeu?) +* Configure Web site +* HTTPS diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..a4367be4daad7125fffdcb13ce080297d0f09ba8 GIT binary patch literal 766 zcmcJNy$*sv5QJAD*n)+&#Kgps!~|O^p9BB4_NTa(=d$8)EPVrk$L)0J(_-0XS`586SL!xRzttV4o|9W_OthOaZ!rQpFU=<= j)D9aHL%KAu!H0R{%bXckm&CPflA#>1AH>%m +

Very often, people who are not used to language +tagging hesitate before choosing a tag for a given +combination of language, region, script, etc. You can find the result +of these hesitations on the Web, with people tagging, for instance, +Japanese as jp (the proper +subtag for the Japanese language is ja; +jp is for the country) or using subtags that are not +registered +because they did not find the valid +ones. The purpose of this short text is to explain how to find a +subtag registered in the Language Subtag Registry. There are +many ways to do so, of course, and you are free to report +better ways.

+

The first thing to try is probably to use Richard Ishida's +Language Subtag Registry Search. You can enter text which appears +in the Description or Comments field of the registry and the +corresponding subtags will be displayed. For instance, for japanese, +it will correctly report ja (and the script +Jpan, which indicates the mix of +Han, Hiragana and +Katakana).

+

A more powerful, but probably less user-friendly, method is to use +the registry directly. Since its canonical form is more adapted to +computer programs than to humans (for instance, +Unicode characters are reported as XML escapes, +like &#xE7;), it may be better to use one of the many unofficial forms, automatically +computed from the official one, and available for +various environments. For instance, you may load the text version and use the Find +function of your Web browser (Control-F in +Firefox). Say that you are not sure of the +proper subtag for the Canadian +Aboriginal script: searching "canadian" that way soon discovers +the subtag Cans.

+

Both Richard Ishida's Web service and the above method have a +limit: they only use information that is in the registry. If the relationship between common +names and the tags is not in the registry, you will not find it. For +instance, if you want to identify "British English", you have to realize +that that is done by constructing the tag en-GB (not en-UK) from +subtags in the registry. Similarly, if you want to +write texts in Alsatian, and search this word +in the registry, you won't find anything.

+

You have to use external tools, but please check their results +against the registry, with the tools mentioned above. A good search +tool is +Wikipedia. The English-language Wikipedia +displays the ISO code names for most of the +languages it talks about. Since most subtags in the registry are based +on ISO standards, this works most of the time. For instance, the article on Alsatian will +show the language code gsw +(Alemannic), which, even if it is broader than the Alsatian dialect, is a good start.

+

For languages (but not scripts or dialects), another very useful +source is Ethnologue, which is +managed by the ISO 639 registration agency. It +has a search function that allows you to use words that are not in the +formal standard. For example, +searching for "Alsatian", you'll find +http://www.ethnologue.com/show_language.asp?code=gsw, where there is the +comment: "Called 'Schwyzerdütsch' in Switzerland, and 'Alsatian' in France". But be careful: +Ethnologue displays only 3-letter codes, while the registry +uses 2-letter codes whenever they are available. For instance, French is +fr, not fra.

+

TODO: endonyms and exonyms

+ + diff --git a/index.xml b/index.xml new file mode 100644 index 0000000..ac8009f --- /dev/null +++ b/index.xml @@ -0,0 +1,47 @@ + + + + + +

Tout est aléa, confusion et précarité, sauf le Catalogue. (Fred Vargas)

+
diff --git a/ltru.css b/ltru.css new file mode 100644 index 0000000..62d0dc8 --- /dev/null +++ b/ltru.css @@ -0,0 +1,64 @@ +.menu3 { + float: left; + width: 30%; + padding: 1%; +} + +/* .menu3 ul { + list-style-type: none; +} */ + +.menu3 h2 { + text-align: center; +} + +.back-to-normal { + margin-top: 7%; + clear: both; + float: none; + padding: 1%; + width: 98%; +} + +body { + padding: 1%; + color: #000000; + background-color: #ffffff; + background-image: none; +} + +.main-title { + text-align: center; + float: none; + clear: both; +} + +em { + font-weight: bolder; + } + +a, p, li, h1, h2, h3, pre { + } + +a:visited { + /* text-decoration: line-through; /* Blog-like :-) */ + } + +a:active { + font-size: 125%; + } + +code { + font-size: 130%; + } + +#ltag-icon { + float: left; +} + +#headline { + margin-left: 2%; + font-size: 140%; + font-weight: bolder; +} + diff --git a/page.xslt b/page.xslt new file mode 100644 index 0000000..0182c74 --- /dev/null +++ b/page.xslt @@ -0,0 +1,101 @@ + + + + + + VERSION UNDEFINED + + + + + + + + + + + + + + + + + + + Language Tags: <xsl:value-of select="$title"/> + + +
+

+ + + + + +
+ + + + + + + + + + + + + + + + + + + http://en.wikipedia.org/wiki/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/philips-regexp.xml b/philips-regexp.xml new file mode 100644 index 0000000..8843a43 --- /dev/null +++ b/philips-regexp.xml @@ -0,0 +1,15 @@ + +

Here is a regular expression to parse the future versions of +language tags. Suitable for the syntax of the RFC 5646. Written by Addison Phillips, addison - at - amazon.com for the Java programming language.

+
+     static final String langtag_ex = // one pattern for a whole language tag; \\x2d is the hyphen, each piece is annotated below
+     "(\\A[xX]([\\x2d]\\p{Alnum}{1,8})*\\z)" // alternative 1: private-use-only tag, "x" then zero or more "-" + 1-8 alphanumerics
+       + "|(((\\A\\p{Alpha}{2,8}(?=\\x2d|\\z)){1}" // alternative 2 starts: primary language subtag, 2-8 letters
+       + "(([\\x2d]\\p{Alpha}{3})(?=\\x2d|\\z)){0,3}" // up to three 3-letter subtags (presumably the RFC 5646 extended language subtags)
+       + "([\\x2d]\\p{Alpha}{4}(?=\\x2d|\\z))?" // optional 4-letter subtag (presumably the script)
+       + "([\\x2d](\\p{Alpha}{2}|\\d{3})(?=\\x2d|\\z))?" // optional 2-letter or 3-digit subtag (presumably the region)
+       + "([\\x2d](\\d\\p{Alnum}{3}|\\p{Alnum}{5,8})(?=\\x2d|\\z))*)" // any number of variants: a digit + 3 alphanumerics, or 5-8 alphanumerics
+       + "(([\\x2d]([a-wyzA-WYZ](?=\\x2d))([\\x2d](\\p{Alnum}{2,8})+)*))*" // any number of extensions: one letter other than x, then "-"-prefixed alphanumeric groups
+       + "([\\x2d][xX]([\\x2d]\\p{Alnum}{1,8})*)?)\\z"; // optional trailing private-use part introduced by "-x"; \\z anchors the end of input
+
+
\ No newline at end of file diff --git a/register-new-subtag.xml b/register-new-subtag.xml new file mode 100644 index 0000000..6b3b846 --- /dev/null +++ b/register-new-subtag.xml @@ -0,0 +1,196 @@ + + +

The language subtag registry +includes many subtags identifying countries, languages or variants +such as local dialects. Your favorite language and/or variant is +probably already there. But, if it is not, you can ask for the +registration of a new subtag. This text explains how, but the +complete and authoritative explanation is in RFC 5646, +especially its section 3.5, "Registration Procedure for Subtags". It is +recommended to read at least this section.

+

Before you start, a warning: the process takes time, documentation +and the ability to defend your proposal and to back it with facts and +references. Just sending an email saying "People in my hometown speak +Alsatian, I want 'als' to be registered as a language subtag" is not sufficient.

+

You have different sorts of subtags and the rules are not the same +for all:

+
    +
  • Subtags for types "countries" or "scripts" cannot be registered +directly with the IETF. You have to go through +the maintenance agencies of ISO: the language +subtag registry managed by the IETF copies the ISO standards here.
  • +
  • Only subtags of types "language" and "variant" are therefore +considered here. In practice, chances that a "language" subtag +registration succeeds seem limited (you will probably be redirected to +the maintenance agencies of ISO 639; if you +have already asked them and were turned down, prepare a very good proposal if +you want the IETF to make another choice). We then concentrate on +"variant" subtags.
  • +
+

The process is the following (it is a simplified version; did I +tell you to read the full story in section 3.5 of RFC 5646?):

+
    +
  1. Collect background information, typically references to published +descriptions of the language or dialect. A Wikipedia page is possible but +may be insufficient, especially since the page may change +easily. Stable references are preferred.
  2. +
  3. Choose a subtag which must conform to the syntax rules explained +in RFC 5646 (section 2.1). A variant subtag must be either a string of +five to eight alphanumeric characters, or a string of four +alphanumeric characters, starting with a digit. So, valencian is illegal +(too long) while valencia is legal. 1996 is legal, too, but +not 732.
  4. +
  5. Fill-in the registration form whose template is: +
    +   LANGUAGE SUBTAG REGISTRATION FORM
    +   1. Name of requester:
    +   2. E-mail address of requester:
    +   3. Record Requested:
    +
    +      Type:
    +      Subtag:
    +      Description:
    +      Prefix:
    +      Preferred-Value:
    +      Deprecated:
    +      Suppress-Script:
    +      Comments:
    +
    +   4. Intended meaning of the subtag:
    +   5. Reference to published description
    +      of the language (book or article):
    +   6. Any other relevant information:
    +
    + +Pay special attention to Prefix (in practice, most variants have a +Prefix, which is the main language of this variant, such as +ca for +valencian).
    Think twice about Description (in general a short +one-line sentence) and Comments (which may be longer), because the +consistency of tagging among different taggers will heavily depend on +the quality of these fields.
    Keep +detailed scholarly references for the Reference section of the request: +the registry is not a library.
    Some +fields are typically not used for a variant such as +Suppress-Script.
  6. +
  7. Send it to the mailing list ietf-languages@iana.org +(you may choose to subscribe to the mailing list before, to get an +idea of the people and discussions, and to be sure to have the +complete thread).
  8. +
  9. Reply to questions, address objections, be prepared to modify your +registration form and keep cool.
  10. +
+

Let's see a complete example showing many issues (thanks to CE +Whitehead for the nice example). The current registry has three +entries for the French language, +fr for today's French, frm for Middle French +(the language spoken during the Renaissance) +and fro for Old French (the language spoken during the +Middle Ages). This is not +always sufficiently fine-grained to classify some old texts. So, here +is a possible proposal to register a variant, 1606Nict, +for the late Middle French, as described in the famous Nicot's book:

+
+LANGUAGE SUBTAG REGISTRATION FORM
+1. Name of requester:  C. E. Whitehead
+2. E-mail address of requester: cewcathar@hotmail.com
+3. Record Requested:
+Type: Variant
+Subtag:  1606Nict
+        (or alternately 16siecle)
+Description: Late Middle French
+Prefix: frm
+Preferred-Value:
+Deprecated:
+Suppress-Script:
+Comments: French as catalogued in Jean Nicot, "Thresor de la langue francoyse" 1606
+
+4. Intended meaning of the subtag:
+5. Reference to published description
+of the language (book or article):
+
+* Joachim du Bellay, La deffence et illustration de la langue francoyse,
+1549; ed critique by Henri Chamard, Geneve, Slatkine Rpt. 1969
+
+* Jean Nicot, "Thresor de la langue francoyse" 1606; ARTFL Project,
+University of Chicago:
+http://portail.atilf.fr/dictionnaires/TLF-NICOT/index.htm
+
+6. Any other relevant information:
+See second request below
+
+

Do note the detailed references and the use of Prefix to +clearly state that it is a variant of Middle French.

+

Let's see a second example from the same author, with the added +difficulty that we use XML-like encoding for +the composed characters (see section 3.1 of RFC 5646). This specifies Early Modern French, as +described by the French Academy:

+
+LANGUAGE SUBTAG REGISTRATION FORM
+1. Name of requester:  C. E. Whitehead
+2. E-mail address of requester: cewcathar@hotmail.com
+3. Record Requested:
+
+Type: Variant
+Subtag:  1694acad
+            (alternately 17siecle)
+Description: Early modern French
+Prefix: fr 
+Preferred-Value:
+Deprecated:
+Suppress-Script:
+Comments:  As catalogued in the "Dictionnaire de
+l'acad&#xe9;me fran&#xe7;oise", 4eme ed. 1694; includes
+elements of Middle French; also new terms from the Americas
+
+4. Intended meaning of the subtag:
+5. Reference to published description
+of the language (book or article):
+
+* Dictionnaire de l'académie françoise, 4eme ed. 1694; RTFL Project,
+University of Chicago:
+http://portail.atilf.fr/dictionnaires/ACADEMIE/index.htm
+
+* Fénelon, François de Salignac de La Mothe (1984), Fenelon's Letter to the
+French Academy : with an introduction and commentary.
+
+* Ayres-Bennett, Wendy (2004), Sociolinguistic variation in
+seventeenth-century France : methodology and case studies.
+
+also:
+* http://www.tsl.state.tx.us/treasures/giants/lasalle/lasalle-cover.html
+ http://teacherweb.com/FL/Cocoa/CEWhitehead/HTMLPage15.stm
+
+

It is probably useful to list some mistakes that people seem to +make often. Keep in mind that:

+
    +
  • Language issues are always extremely passionate, both for +psychological (people feel very strongly about their language) and +political reasons (wars have been fought about languages). Please, try +to stay calm and do not forget that it is perfectly normal that an +international audience does not know your language (or the language +you champion) and does not see things the way you do.
  • +
  • Pay attention to syntax issues (you may wish to ask a computer +person, maybe with the help of some of the software tools listed on the home page) and also be sure to fill in the form +properly - or do not be surprised if the first reactions are on the +syntax, not on the proposal itself. If the IETF yells at your form +errors, do not assume it is a refusal of your language: it is simply a +desire to enforce the documented process.
  • +
  • The IETF is not a general appeal mechanism for other standards +bodies' decisions. Other standards are imperfect, true, but so is IETF +work, too. Please do not use the IETF language registration mechanism just +because ISO turned you down. Variant registration is typically fine +because no other standards body does it.
  • +
+

Thanks +for reading and good luck for your future subtag +registrations. Remember: it may seem difficult but it is worth +it.

+
diff --git a/registries.xml b/registries.xml new file mode 100644 index 0000000..60cffbc --- /dev/null +++ b/registries.xml @@ -0,0 +1,35 @@ + + +

Files available here were automatically produced from the official +registry maintained by IANA. The current version of +the registry is .

+ + + +
diff --git a/registries/Makefile b/registries/Makefile new file mode 100644 index 0000000..53214ff --- /dev/null +++ b/registries/Makefile @@ -0,0 +1,9 @@ +all: + ./copy-and-convert.sh + cp ./language-subtag-registry-version .. + +ltru.rng: ltru.rnc + trang -Irnc -Orng ltru.rnc $@ + +clean: + rm -f language-subtag-registry language-subtag-registry.xml language-subtag-registry2.xml lsr-*.txt diff --git a/registries/copy-and-convert.sh b/registries/copy-and-convert.sh new file mode 100755 index 0000000..30aed0a --- /dev/null +++ b/registries/copy-and-convert.sh @@ -0,0 +1,93 @@ +#!/bin/sh +# Download the IANA language subtag registry if it changed, check it, then regenerate the derived files; problems are mailed to ${MAINTAINER}. +MYURL=https://www.langtag.net/ +LTR_URL=https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry +LTR_LOCAL=language-subtag-registry +PROGRAMS_DIR=../../GaBuZoMeu +TEST_PROGRAM=${PROGRAMS_DIR}/check-registry +OS="$(uname)" +if [ "$OS" = "FreeBSD" ]; then + # FreeBSD's mktemp is stupid enough to have *no* + # default template :-( + OUTPUT=`mktemp /tmp/$(basename $0).tmp.XXXXXX` + TMPDIFF=`mktemp /tmp/$(basename $0).tmp.XXXXXX` +else + OUTPUT=`mktemp` + TMPDIFF=`mktemp` +fi +MAINTAINER=stephane+langtag@bortzmeyer.org + +# Conversions +CONVERT_XML_BORTZMEYER=${PROGRAMS_DIR}/registry2xml +CONVERT_XML_ELLERMANN="awk -f ltru2xml.awk " +CONVERT_POSTGRESQL=${PROGRAMS_DIR}/registry2postgresql +CONVERT_SQLITE=${PROGRAMS_DIR}/registry2sqlite +CONVERT_TXT=${PROGRAMS_DIR}/registry2txt +CONVERT_HTML=${PROGRAMS_DIR}/registry2mulhtml +FILL_DATABASE=./fill-in-database.sh +# --force is to avoid spurious warnings about "Ambiguous output" +#CRLF_TO_LOCAL="recode --force /CR-LF..US-ASCII " + +trap "rm -f $OUTPUT $TMPDIFF; exit 1" 1 2 3 15 +trap "rm -f $OUTPUT $TMPDIFF" EXIT + +if [ -e ${LTR_LOCAL} ]; then + ltr_date=`head -n 1 ${LTR_LOCAL} | cut -d" " -f2` + # Allow time to elapse. The date of the file at IANA is often the day after + # the date written in the LSR. Heuristically, we add one day and a few hours.
+ current_date=`date +"%Y%m%d %H:%M:%S" --date="${ltr_date} +1 day +4 hour"` +else + # Trick to force a downloading + current_date="19700101" + #current_date=`date --utc +"%Y%m%d"` +fi +curl --silent --show-error --output ${LTR_LOCAL}.TMP \ + --compressed \ + --referer ${MYURL} \ + --proxy "" \ + --time-cond "${current_date}" \ + --header "From: ${MAINTAINER}" \ + ${LTR_URL} > ${OUTPUT} 2>&1 # file first, then 2>&1, so curl's error text lands in ${OUTPUT} +if [ $? != 0 ]; then + cat ${OUTPUT} | mutt -s "Network error getting ${LTR_URL}" ${MAINTAINER} + exit 1 +fi +if [ -e ${LTR_LOCAL}.TMP ]; then + #$CRLF_TO_LOCAL ${LTR_LOCAL}.TMP + ${TEST_PROGRAM} ${LTR_LOCAL}.TMP >> ${OUTPUT} 2>&1 # append both streams to the report mailed on failure + if [ $? = 0 ]; then + if [ -e ${LTR_LOCAL} ]; then + diff -u ${LTR_LOCAL} ${LTR_LOCAL}.TMP > $TMPDIFF + if [ -s "$TMPDIFF" ]; then # -s: mail only when the diff file is not empty + mutt -s "New LTR registry at ${MYURL}" ${MAINTAINER} < $TMPDIFF + fi + fi + mv ${LTR_LOCAL}.TMP ${LTR_LOCAL} + # Now, the various conversions + ${CONVERT_XML_BORTZMEYER} + # trang is in Java and therefore fails frequently + # trang -Irnc -Orng ltru.rnc ltru.rng + xmllint --noout --relaxng ltru.rng ${LTR_LOCAL}.xml + ${CONVERT_TXT} + #${CONVERT_XML_ELLERMANN} < ${LTR_LOCAL} > ${LTR_LOCAL}2.xml + #xmllint --noout --valid ${LTR_LOCAL}2.xml + ${CONVERT_POSTGRESQL} > lsr-postgres.sql + ${CONVERT_SQLITE} > lsr-sqlite.sql + # TODO: UTF-8 support on SQLite was never tested + ./utf82ncr.py lsr-sqlite.sql + mv lsr-sqlite.sql lsr-sqlite-utf8.sql + mv lsr-sqlite-ncr.sql lsr-sqlite.sql + ${CONVERT_HTML} + ${FILL_DATABASE} + # Needs to be ported away from mx.DateTime + #./lsr2atom.py > lsr.atom + version=`head -n 1 ${LTR_LOCAL} | awk '{print $2}'` + echo $version > ${LTR_LOCAL}-version + exit 0 + else + cat ${OUTPUT} | mutt -s "Invalid registry ${LTR_URL}" ${MAINTAINER} + exit 1 + fi +else # File not downloaded, probably because there was nothing new.
+ exit 0 +fi diff --git a/registries/fill-in-database.sh b/registries/fill-in-database.sh new file mode 100755 index 0000000..19ea814 --- /dev/null +++ b/registries/fill-in-database.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +DATABASE=lsr + +psql -f clean-postgres.sql ${DATABASE} +psql -f lsr-postgres.sql ${DATABASE} diff --git a/registries/lsr2atom.py b/registries/lsr2atom.py new file mode 100755 index 0000000..1e85365 --- /dev/null +++ b/registries/lsr2atom.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 + +__version__ = "BETA" +domain = "langtag.net" +tag_prefix = "tag:%s,2007-05:LSR" % domain + +import sys +import urllib.request, urllib.parse, urllib.error +import psycopg2 +# ElementTree is painful, with all its renamings :-( +try: + import cElementTree as ET +except ImportError: + try: + import ElementTree as ET + except ImportError: + # Now a standard part of Python >= 2.5 + import xml.etree.ElementTree as ET +import mx.DateTime as DateTime # TODO move to another package + +max = 10 + +db_module = psycopg2 + +def process_type(tree, type="language"): + request = ("SELECT code,description, added FROM %ss_with_descr" % type) + \ + " ORDER BY added DESC LIMIT %(max)s" + cursor.execute( + request, + {'max': max}) + for tuplee in cursor.fetchall(): + code = tuplee[0] + description = tuplee[1] + added = tuplee[2] + utype = type.capitalize() + entry = ET.SubElement(tree, "entry") + title = ET.SubElement(entry, "title") + title.text = "%s: %s" % (utype, description) + entry_id = ET.SubElement(entry, "id") + entry_id.text = tag_prefix + "/" + urllib.parse.quote_plus("%s %s" % (type, code)) + published = ET.SubElement(entry, "published") + published.text = added.strftime("%Y-%m-%dT00:00:00Z") + # TODO: records in the LSR are sometimes updated but it is not obvious to see it, + # since there is only an "Added" field. 
+ updated = ET.SubElement(entry, "updated") + updated.text = published.text + category = ET.SubElement(entry, "category") + category.attrib["scheme"] = tag_prefix + category.attrib["term"] = type + category.attrib["label"] = utype + link = ET.SubElement(entry, "link") + link.attrib["rel"] = "alternate" + link.attrib["href"] = "http://www.%s/registries/registry-html/%s/%s.html" % \ + (domain, type, code) + content = ET.SubElement(entry, "content") + content.attrib["type"] = "text" + content.text = """ + %s + + %s + + %s + + Added on %s + """ % (type, code, description, added.strftime("%Y-%m-%d")) + # TODO: an alternate Content in HTML? + +connection = db_module.connect("dbname=lsr") +cursor = connection.cursor() + +feed = ET.Element("feed") +feed.attrib["xmlns"] = "http://www.w3.org/2005/Atom" +title = ET.SubElement(feed, "title") +title.text = "Language Tag Registry syndication feed" +updated = ET.SubElement(feed, "updated") +updated.text = DateTime.now().strftime("%Y-%m-%dT%H:%M:00Z") +link_html = ET.SubElement(feed, "link") +link_html.attrib["rel"] = "alternate" +link_html.attrib["type"] = "text/html" +link_html.attrib["href"] = "http://www.%s/" % domain +link_self = ET.SubElement(feed, "link") +link_self.attrib["rel"] = "self" +link_self.attrib["type"] = "application/atom+xml" +link_self.attrib["href"] = "http://www.%s/registries/lsr.atom" % domain +author = ET.SubElement(feed, "author") +name = ET.SubElement(author, "name") +name.text = "Stephane Bortzmeyer" +email = ET.SubElement(author, "email") +email.text = "webmaster@langtag.net" +feed_id = ET.SubElement(feed, "id") +feed_id.text = tag_prefix +generator = ET.SubElement(feed, "generator") +generator.text = "%s %s running with Python %s" % \ + ("lsr2atom", __version__, sys.version.split()[0]) + +process_type(feed, "language") +process_type(feed, "variant") +process_type(feed, "script") +process_type(feed, "region") +process_type(feed, "extlang") +cursor.close() +connection.close() +print(ET.tostring(feed, 
encoding="UTF-8")) diff --git a/registries/ltru.dtd b/registries/ltru.dtd new file mode 100644 index 0000000..6ce56cc --- /dev/null +++ b/registries/ltru.dtd @@ -0,0 +1,74 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/registries/ltru.rnc b/registries/ltru.rnc new file mode 100644 index 0000000..2a19f22 --- /dev/null +++ b/registries/ltru.rnc @@ -0,0 +1,71 @@ +# RelaxNG schema for the "language tag" registry specified in RFC 4646 +# and available at http://www.iana.org/assignments/language-subtag-registry + +# Not standard in any way, just an individual proposal + +# Stephane Bortzmeyer + +# TODO: add Schematron rules for constraints such as "Records that +# contain a 'Preferred-Value' field MUST also have a 'Deprecated' +# field. " This specific constraint does not really require +# Schematron, but others may. + +start = registry + +registry = element registry {date & languages & extlangs & scripts & regions & variants & + redundants & grandfathereds} # TODO: extensions + +date = element date {xsd:date} + +languages = language* + +language = element language {subtag & common & scope? & suppress-script? & macrolanguage?} + +extlangs = extlang* + +extlang = element extlang {subtag & common & scope? & macrolanguage?} + +scripts = script* + +script = element script {subtag & common} + +regions = region* + +region = element region {subtag & common} + +variants = variant* + +variant = element variant {subtag & common & prefix*} # "Records of type 'variant' + # MAY have more than one field of type" 'Prefix'. + +grandfathereds = grandfathered* + +grandfathered = element grandfathered {tag & common} + +redundants = redundant* + +redundant = element redundant {tag & common} + +common = added & descriptions & deprecated? & preferred-value? 
+ +added = element added {xsd:date} + +suppress-script = element suppress-script {text} + +descriptions = description+ # Each record MUST contain the following fields + +description = element description {text} + +subtag = element subtag {text} + +tag = element tag {text} + +prefix = element prefix {text} + +macrolanguage = element macrolanguage {text} + +deprecated = element deprecated {xsd:date} + +preferred-value = element preferred-value {text} + +scope = element scope {text} \ No newline at end of file diff --git a/registries/ltru2xml.awk b/registries/ltru2xml.awk new file mode 100644 index 0000000..b02038f --- /dev/null +++ b/registries/ltru2xml.awk @@ -0,0 +1,330 @@ +#! /usr/common/bin/gawk -f +# +# Usage: ltru2xml.awk registry > registry.xml +# Or : ltru2xml.awk /dev/null > registry.dtd +# +# The File-Date record is noted in a 'date' attribute of the root +# element . +# +# Other records are converted to elements , , +#