Initial import

This commit is contained in:
Stephane Bortzmeyer 2023-06-08 21:49:35 +02:00
parent c9247066e2
commit 2fc541c0a7
39 changed files with 46325 additions and 2 deletions

129
Grammar.hs Normal file
View File

@ -0,0 +1,129 @@
{-# LANGUAGE NoMonomorphismRestriction,FlexibleContexts #-}
module Grammar (module Text.ParserCombinators.Parsec, tag, testTag,
getTag) where
import Text.ParserCombinators.Parsec
import Types
import Parsers
-- "Tags for Identifying Languages" RFC 5646
-- We use the names of the productions of the grammar in the RFC
dash = char '-' <?> "dash"
charI = char 'i' <|> char 'I'
charX = char 'x' <|> char 'X'
alphaNumNotX c =
c >= 'A' && c <= 'W' ||
c >= 'Y' && c <= 'Z' ||
c >= 'a' && c <= 'w' ||
c >= 'y' && c <= 'z' ||
c >= '0' && c <= '9'
languageTag :: GenParser Char () Tag
languageTag =do
value <- try (langtag) <|> try (privateuse) <|> irregular
return value
langtag = do
theLanguage <- language
theExtlang <- facultative (try (do {dash ; value <- extlang ; notFollowedBy alphaNum;
return value}));
theScript <- facultative (try (do {dash ; value <- script ; notFollowedBy alphaNum;
return value}));
theRegion <- facultative (try (do {dash ; value <- region ; notFollowedBy alphaNum;
return value}));
theVariants <- many (try (do {dash ; value <- variant ; notFollowedBy alphaNum;
return value}));
theExtensions <- many (try (do {dash ; value <- extension; notFollowedBy alphaNum;
return value}))
-- TODO: use a context-sensitive grammar (see Parsec
-- documentation) to enforce "2.2.6. Extension Subtags" which
-- says that an extension cannot be repeated.
optional (do {dash; privateuse})
eof
return (Tag theLanguage theExtlang theScript theRegion theVariants theExtensions)
language = do
value <- do
try (do {lang <- countBetween 2 3 letter; notFollowedBy alphaNum; return lang})
-- Shortest ISO 639 code
-- TODO: returns the extended, too!
<|>
try (do {lang <- count 4 letter; notFollowedBy alphaNum; return lang}) -- reserved for future use.
<|> (countBetween 5 8 letter) -- registered language subtag. TODO: return the value!
return value
-- RFC, section 2.2.2: "Although the ABNF production 'extlang' permits
-- up to three extended language tags"
extlang = do
value <- count 3 letter
return value
script = do
value <- count 4 letter -- ISO 15924 code
return value
region = do
count 2 letter -- ISO 3166 code
<|> count 3 digit -- United Nations M.49 code
variant = do
do {first <- digit ; rest <- countBetween 3 7 alphaNum; return (first:rest)}
<|> do {first <- letter ; rest <- countBetween 4 7 alphaNum; return (first:rest)}
extension = do
theSingleton <- singleton
theValues <- many1 (try (do {dash ; extensionName}))
return (theSingleton, theValues)
extensionName = do
countBetween 2 8 alphaNum
singleton = satisfy (alphaNumNotX)
privateuse = do
charX
values <- many1 (do {dash; countBetween 1 8 alphaNum})
eof
return (Priv values)
-- Irregular tags only
registeredIrregular = try (istring "en-GB-oed") <|> try (istring "i-ami") <|> try (istring "i-bnn") <|> try (istring "i-default")
<|> try (istring "i-enochian") <|> try (istring "i-hak") <|> try (istring "i-klingon") <|> try (istring "i-lux")
<|> try (istring "i-mingo") <|> try (istring "i-navajo") <|> try (istring "i-pwn") <|> try (istring "i-tao")
<|> try (istring "i-tay") <|> try (istring "i-tsu") <|> try (istring "sgn-BE-fr") <|>try ( istring "sgn-BE-nl")
<|> try (istring "sgn-CH-de") <|> try (istring "no-bok")
<|> try (istring"no-nyn")
<|> try (istring"zh-cmn")
<|> try (istring"zh-cmn-Hans")
<|> try (istring"zh-cmn-Hant")
<|> try (istring"zh-gan")
<|> try (istring"zh-min")
<|> try (istring"zh-min-nan")
<|> try (istring"zh-wuu")
<|> try (istring"zh-yue")
irregular = do
value <- registeredIrregular
return (GF value)
tag = do
value <- languageTag
return value
-- Utilities
testTag :: String -> Bool
testTag input
= case (parse languageTag "" input) of
Left err -> False
Right x -> True
getTag :: String -> Either String Tag
getTag input =
case (parse tag "" input) of
Left err -> Left (input ++ " is NOT well-formed: " ++ (show err) ++
"\n")
Right theTag -> Right theTag

4
INSTALL Normal file
View File

@ -0,0 +1,4 @@
Pre-requisites on Debian:
apt install ghc libghc-hunit-dev libghc-regex-compat-tdfa-dev

View File

@ -1,4 +1,4 @@
Copyright (c) <year> <owner>.
Copyright (c) 2006. Stéphane Bortzmeyer
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

38
Makefile Normal file
View File

@ -0,0 +1,38 @@
GHC=ghc
MODULES=Grammar.hs Types.hs
REGISTRY_MODULES=Registry/Grammar.hs Registry/Types.hs Registry/Registry.hs Registry/Utils.hs
TESTS=broken-tags.txt well-formed-tags.txt
ALL_MODULES=${MODULES}
REGISTRY=http://www.iana.org/assignments/language-subtag-registry
REGISTRYFILE=language-subtag-registry
ALL_PROGRAMS=check-wf check-valid tests-from-files display-tag registry2xml registry2txt registry2postgresql check-registry
TARBALL=/tmp/gabuzomeu.tar
default: check-wf check-valid check-registry
all: ${ALL_PROGRAMS}
%: %.hs ${ALL_MODULES}
${GHC} -o $@ --make $<
%: Registry/%.hs ${MODULES}
${GHC} -o $@ --make $<
test: test-tags test-regs
test-tags: tests-from-files ${ALL_MODULES} ${TESTS}
./$<
test-regs: test-registries ${ALL_MODULES} ${REGISTRY_MODULES} ${TESTS}
./$<
${REGISTRYFILE}: check-registry
wget -O $@ ${REGISTRY}
./check-registry ${REGISTRYFILE}
touch ${REGISTRYFILE}
dist: clean
(cd ..; tar cvf ${TARBALL} --exclude hnop --exclude ltr*.txt GaBuZoMeu && gzip --best --force --verbose ${TARBALL})
clean:
rm -f list-grandfathered-tags ${REGISTRYFILE} ${ALL_PROGRAMS} \#*\# *~ *.o *.hi *~ Registry/*.o Registry/*.hi Registry/*~

32
Parsers.hs Normal file
View File

@ -0,0 +1,32 @@
module Parsers where
import Text.ParserCombinators.Parsec
import Data.Maybe (catMaybes)
import Data.Char (toUpper)
import Control.Monad (liftM)
-- Parsing functions not in Text.ParserCombinators.Parsec
-- There is also a good ine in hsemail, manyNtoM
countBetween m n p
| n < m = error "First bound must be lower or equal than second bound"
| otherwise = do
xs <- try (count m p)
ys <- try (count (n - m) ((option Nothing) (do
y <- p
return (Just y))))
return (xs ++ catMaybes ys)
facultative p = liftM Just p <|> return Nothing
-- Case-insensitive parsers stloen from hsemail
-- Case-insensitive variant of Parsec's 'char' function.
ichar :: Char -> CharParser st Char
ichar c = satisfy (\x -> toUpper x == toUpper c)
-- Case-insensitive variant of Parsec's 'string' function.
istring :: String -> CharParser st String
istring cs = mapM ichar cs <?> cs
--
-- End of parsing functions

View File

@ -1,3 +1,8 @@
# GaBuZoMeu
GaBuZoMeu is a set of programs to parse and check language tags and language tag registry (see RFC 5646).
GaBuZoMeu is a set of programs to parse and check language tags and
language tag registry (see RFC 5646).
## Installation
See INSTALL

164
Registry/Grammar.hs Normal file
View File

@ -0,0 +1,164 @@
module Registry.Grammar where
import Text.ParserCombinators.Parsec as Parsec hiding (space, spaces, newline)
import Parsers
import Registry.Types
import Data.Maybe
import Data.Char(ord, chr)
import Numeric
import Prelude hiding (lookup)
-- Too low-level to be in Registry.Types
data Field = Field {field'name::String, field'value::String} deriving Show
colon = char ':' <?> "colon"
dash = char '-' <?> "dash"
space = char ' ' <?> "space"
newline = many1 (try (string "\n\r") <|> string "\n" <|> string "\r") <?> "new line"
spaces = many space
doublepercent = do {string "%%"; newline} <?> "double percent line"
extract :: Field -> (String, String)
extract f = (field'name f, field'value f)
-- TODO: make it case-insensitive
member :: String -> [(String, String)] -> Bool
member key [] = False
member key dict = let head = dict !! 0 in
if key == fst head then
True
else
member key (tail dict)
-- TODO: make it case-insensitive
lookup :: String -> [(String, String)] -> Maybe String
lookup key [] = Nothing
lookup key dict = let head = dict !! 0 in
if key == fst head then
Just (snd head)
else
lookup key (tail dict)
lookups :: String -> [(String, String)] -> [String]
lookups key [] = []
lookups key dict = let head = dict !! 0 in
if key == fst head then
(snd head) : lookups key (tail dict)
else
lookups key (tail dict)
suppressScript r = lookup "Suppress-Script" r
recordFactory :: [(String, String)] -> Either String Record
recordFactory r =
if member "Type" r then
let thetype = fromJust (lookup "Type" r) in
-- TODO: test that the field exist. fromJust will raise an exception but we can
-- provide a better error message if we test
case thetype of
"language" -> Right (Lang (Language
(fromJust (lookup "Subtag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))
(suppressScript r)
(lookup "Preferred-Value" r)
(lookup "Macrolanguage" r)
(lookup "Deprecated" r)
(lookup "Scope" r)
))
"extlang" -> Right (Extl (Extlang
(fromJust (lookup "Subtag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))
(suppressScript r)
(fromJust (lookup "Prefix" r))
(lookup "Preferred-Value" r)
(lookup "Macrolanguage" r)
(lookup "Scope" r)
))
"script" -> Right (Scr (Script
(fromJust (lookup "Subtag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))))
"region" -> Right (Reg (Region
(fromJust (lookup "Subtag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))))
"variant" -> Right (Var (Variant
(fromJust (lookup "Subtag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))
(lookups "Prefix" r)))
"redundant" -> Right (Red (Redundant
(fromJust (lookup "Tag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))))
"grandfathered" -> Right (GF (Grandfathered
(fromJust (lookup "Tag" r))
(lookups "Description" r)
(fromJust (lookup "Added" r))))
_ -> Left ("Unknown type \"" ++ thetype ++ "\"")
else
if member "File-Date" r then
Right (Date (fromJust (lookup "File-Date" r)))
else
Left "Invalid record: no Type or File-Date"
record = do
fields <- many1 field <?> "record"
let therecord = map extract fields
return (recordFactory therecord)
fieldname = do
start <- alphaNum
rest <- many (alphaNum <|> dash)
return ([start] ++ rest)
fieldvalue = many (unicodechar <|> space <|> try (do {newline; space})) <?> "field value"
unicodechar = satisfy (\thechar ->
let c = (ord thechar) in
(c >= 0x21 && c <= 0x10ffff))
<?> "Character"
field = do
name <- fieldname
spaces
colon
spaces
value <- fieldvalue
newline
return (Field name value)
<?> "field"
registry = do
allrecords <- record `sepBy` doublepercent
eof
return allrecords
isRight (Right a) = True
isRight (Left a) = False
fromRight (Right a) = a
fromRight (Left a) = error "Not a right value"
isLeft (Left a) = True
isLeft (Right a) = False
fromLeft (Left a) = a
fromLeft (Right a) = error "Not a left value"
parse :: String -> CheckResult
parse input =
case (Parsec.parse Registry.Grammar.registry "" input) of
Left err -> SyntaxError ("Registry is not legal: " ++ (show err))
Right reg -> let ok = and (map isRight reg) in
if ok then
Success (map fromRight reg)
else
let firstErr = (filter isLeft reg) !! 0 in
SyntaxError ("Registry is not legal: " ++
fromLeft firstErr)
-- TODO: the line number

229
Registry/Registry.hs Normal file
View File

@ -0,0 +1,229 @@
module Registry.Registry where
import qualified Text.ParserCombinators.Parsec as Parsec
import qualified System.IO
import Data.Maybe
import Registry.Types
import Registry.Utils
import Registry.Grammar (registry, member, parse)
import Types
import Grammar
defaultInfile = "./language-subtag-registry"
type Result = (Bool, String)
-- Combines a former result with a new result, yielding a result.
combine :: Result -> Result -> Result
combine (True, _) (False, s) = (False, s)
combine (False, s1) (False, s2) = (False, s1 ++ "; " ++ s2)
combine (False, s) (True, _) = (False, s)
combine (True, _) (True, _) = (True, "")
readRegistry filename = do
input <- System.IO.readFile filename
return (getRegistry input)
getRegistry input =
let reg = Registry.Grammar.parse input in
if not (checkOK reg) then
error "Illegal registry"
else
registryOf reg
isValidLanguage l dict =
if length (filter (onlyThisValue "language" l) dict) == 1 then
(True, "")
else
(False, "Unknown language " ++ l)
isValidScript Nothing _ = (True, "")
isValidScript s dict =
if length (filter (onlyThisValue "script" (fromJust s)) dict) == 1 then
(True, "")
else
(False, "Unknown script " ++ (fromJust s))
isValidRegion Nothing _ = (True, "")
isValidRegion r dict =
if length (filter (onlyThisValue "region" (fromJust r)) dict) == 1 then
(True, "")
else
(False, "Unknown region " ++ (fromJust r))
isValidVariants [] _ = (True, "")
isValidVariants v dict = foldl combine (True, "") (map (isValidVariant dict) v)
isValidVariant dict v =
if length (filter (onlyThisValue "variant" v) dict) == 1 then
(True, "")
else
(False, "Unknown variant " ++ v)
isValidExtension _ _ = (True, "")
-- TODO: a better test
-- Semantic testing
-- Only one extension by the same singleton
correctNumber :: [Singleton] -> Singleton -> Bool
correctNumber l s = length (filter (\c -> (c == s)) l) == 1
getSingleton :: Types.Extension -> Singleton
getSingleton (singleton, names) = singleton
norepeat :: Extensions -> Bool
norepeat extensions =
let singletons = map getSingleton extensions in
and (map (correctNumber singletons) singletons)
semanticTests (Tag lang extlang script region variants extensions) =
if norepeat extensions then
(True, "")
else
(False, "Repeated extension singleton")
semanticTests _ = (True, "") -- No tests for privateuse and grandfathered
isValid :: Registry -> Tag -> Result
isValid reg (Tag l el s r v e) =
let languages = filter (onlyThisType "language") reg in
let scripts = filter (onlyThisType "script") reg in
let regions = filter (onlyThisType "region") reg in
let variants = filter (onlyThisType "variant") reg in
let extensions = filter (onlyThisType "extension") reg in
semanticTests (Tag l el s r v e) `combine`
(isValidLanguage l languages) `combine` (isValidScript s scripts) `combine`
(isValidRegion r regions) `combine` (isValidVariants v variants) `combine`
(isValidExtension e extensions)
isValid reg (Types.GF tag) =
(True, "")
-- It is passed well-formedness (syntactic) tests, it is good
isValid reg (Priv p) =
(True, "") -- By definition, private tags are always valid
testTag :: Registry -> String -> Bool
testTag reg tag =
case (Parsec.parse Grammar.tag "" tag) of
Left err -> False
Right t -> if fst (semanticTests t) then
fst (isValid reg t)
else
False
checkOK (Success p) = True
checkOK _ = False
messageOf (Success p) = error "Check was a success, no message available"
messageOf (SyntaxError s) = s
messageOf (SemanticError s) = s
registryOf (Success p) = p
registryOf _ = error "Check was a failure, no registry available"
semanticCheck :: Registry -> CheckResult
semanticCheck r =
-- TODO: test that the extlangs exist
oneDateTest r `combineC` prefixesExist r `combineC`
scriptsExist r `combineC` noDuplicates r `combineC` preferredValuesExist r
`combineC` macrolanguagesExist r
oneDateTest reg =
if (length (filter onlyDate reg) == 1) then
Success reg
else
SemanticError "One and only one Field-Date record authorized"
macrolanguageExist reg language =
let macrolang = lang'macroLanguage (languageFrom language) in
if isJust macrolang then
length (filter (onlyThisValue "language" (fromJust macrolang))
(filter onlyLang reg)) > 0
else
True
macrolanguagesExist reg =
let languages = filter onlyLang reg in
let missing = filter (not.macrolanguageExist reg) languages in
if length missing == 0 then
Success reg
else
SemanticError ("Some languages have a Macrolanguage which does not appear in the registry: " ++
(show missing))
-- Extracts the langage subtag, the first one
languageOf tag =
takeWhile (/= '-') tag
prefixExist reg variant =
-- TODO: We test only that the language part of the prefix is OK, we should test
-- the rest
and (map (\r -> length (filter (onlyThisValue "language" (languageOf r))
(filter onlyLang reg)) > 0)
(variant'prefix (variantFrom variant)))
prefixesExist reg =
let variants = filter onlyVariant reg in
let missing = filter (not.prefixExist reg) variants in
if length missing == 0 then
Success reg
else
SemanticError ("Some variants have a Prefix which does not appear in the registry: " ++
(show missing))
scriptExist reg language =
let script = lang'script (languageFrom language) in
if isJust script then
length (filter (onlyThisValue "script" (fromJust script))
(filter onlyScript reg)) > 0
else
True
scriptsExist reg =
let languages = filter onlyLang reg in
let missing = filter (not.scriptExist reg) languages in
if length missing == 0 then
Success reg
else
SemanticError ("Some languages have a Suppress-script which does not appear in the registry: " ++
(show missing))
preferredLangValueExist reg language =
let preferredValue = lang'preferredValue (languageFrom language) in
if isJust preferredValue then
length (filter (onlyThisValue "language" (fromJust preferredValue))
(filter onlyLang reg)) > 0
else
True
preferredValuesExist reg =
-- TODO: test also regions and scripts
let languages = filter onlyLang reg in
let missing = filter (not.preferredLangValueExist reg) languages in
if length missing == 0 then
Success reg
else
SemanticError ("Some records have a Preferred-value which does not appear in the registry: " ++
(show missing))
otherRecord reg category val =
tail (filter (onlyThisValue category val) (filter (onlyThisType category) reg))
duplicate reg category =
case category of
"language" ->
filter (\t -> length t > 0) (map (otherRecord reg category)
(map (lang'subtag.languageFrom)
(filter (onlyThisType category) reg)))
_ -> [] -- TODO: implements them, too
noDuplicates reg =
let duplicates = concat (map (duplicate reg)
["language", "script", "variant", "region", "grandfathered"]) in
if length duplicates == 0 then
Success reg
else
SemanticError ("Duplicate values: " ++ (show duplicates))
-- TODO: tests Preferred-value

59
Registry/Types.hs Normal file
View File

@ -0,0 +1,59 @@
module Registry.Types where
-- TODO: import Time and use CalendarTime
type DateTime = String
-- TODO for all: Comments
data Language = Language {lang'subtag::String, lang'descr::[String],
lang'added::DateTime, lang'script::Maybe String,
lang'preferredValue::Maybe String,
lang'macroLanguage::Maybe String,
lang'deprecated::Maybe DateTime,
lang'scope::Maybe String}
deriving Show
data Extlang = Extlang {extlang'subtag::String, extlang'descr::[String],
extlang'added::DateTime, extlang'script::Maybe String,
extlang'prefix::String, -- An extlang can have only one prefix
extlang'preferredValue::Maybe String,
extlang'macroLanguage::Maybe String,
extlang'scope::Maybe String}
deriving Show
data Script = Script {script'subtag::String, script'descr::[String],
script'added::DateTime} deriving Show
data Region = Region {region'subtag::String, region'descr::[String],
region'added::DateTime} deriving Show
-- TODO: Deprecated and preferredValue
data Variant = Variant {variant'subtag::String, variant'descr::[String],
variant'added::DateTime, variant'prefix::[String]} deriving Show
data Redundant = Redundant {redundant'tag::String, redundant'descr::[String],
redundant'added::DateTime} deriving Show
data Grandfathered = Grandfathered {gf'tag::String, gf'descr::[String],
gf'added::DateTime} deriving Show
-- TODO: implements them
data Extension = Extension {ext'added::DateTime} deriving Show
data Record = Reg Region | Lang Language | Extl Extlang | Scr Script | Var Variant |
Red Redundant | GF Grandfathered | Ext Extension | Date DateTime deriving Show
type Registry = [Record]
data CheckResult = Success Registry | SyntaxError String | SemanticError String
deriving Show
combineC (Success r1) (Success r2) = Success r1
combineC (Success r) (SyntaxError e) = SyntaxError e
combineC (SyntaxError e) (Success r) = SyntaxError e
combineC (Success r) (SemanticError e) = SemanticError e
combineC (SemanticError e) (Success r) = SemanticError e
combineC (SyntaxError e1) (SemanticError e2) = SyntaxError e1
combineC (SemanticError e2) (SyntaxError e1) = SyntaxError e1
combineC (SyntaxError e1) (SyntaxError e2) = SyntaxError (e1 ++ " ; " ++ e2)
combineC (SemanticError e1) (SemanticError e2) = SemanticError (e1 ++ " ; " ++ e2)

216
Registry/Utils.hs Normal file
View File

@ -0,0 +1,216 @@
module Registry.Utils where
import Registry.Grammar
import Registry.Types
import qualified Data.List as List
import Data.Char (toUpper)
toUpperS :: String -> String
toUpperS = map toUpper
onlyDate (Date d) = True
onlyDate _ = False
onlyRecord (Date d) = False
onlyRecord _ = True
onlyGF (GF r) = True
onlyGF _ = False
onlyLang (Lang r) = True
onlyLang _ = False
onlyExtlang (Extl r) = True
onlyExtlang _ = False
onlyScript (Scr r) = True
onlyScript _ = False
onlyRegion (Reg r) = True
onlyRegion _ = False
onlyVariant (Var r) = True
onlyVariant _ = False
onlyRedundant (Red r) = True
onlyRedundant _ = False
onlyExtension (Red r) = True
onlyExtension _ = False
languageFrom (Lang r) = r
languageFrom x = error "Not a language"
extlangFrom (Extl r) = r
extlangFrom x = error "Not an extlang (extended language)"
scriptFrom (Scr r) = r
scriptFrom _ = error "Not a script"
regionFrom (Reg r) = r
regionFrom _ = error "Not a region"
variantFrom (Var v) = v
variantFrom _ = error "Not a variant"
redundantFrom (Red r) = r
redundantFrom _ = error "Not a redundant"
gfFrom (GF r) = r
gfFrom _ = error "Not a grandfathered"
onlyThisType :: String -> Record -> Bool
onlyThisType "language" = onlyLang
onlyThisType "extlang" = onlyExtlang
onlyThisType "grandfathered" = onlyGF
onlyThisType "script" = onlyScript
onlyThisType "region" = onlyRegion
onlyThisType "variant" = onlyVariant
onlyThisType "redundant" = onlyRedundant
onlyThisType "extension" = onlyExtension
onlyThisType t = error ("Unknown type in the registry: " ++ t)
onlyThisValue :: String -> String -> Record -> Bool
-- Remember that language tags are case-insensitive
onlyThisValue "language" v = (\r -> toUpperS (lang'subtag r) == toUpperS (v)).languageFrom
onlyThisValue "extlang" v = (\r -> toUpperS (extlang'subtag r) == toUpperS (v)).extlangFrom
onlyThisValue "grandfathered" v = (\r -> toUpperS (gf'tag r) == toUpperS (v)).gfFrom
onlyThisValue "script" v = (\r -> toUpperS (script'subtag r) == toUpperS (v)).scriptFrom
onlyThisValue "region" v = (\r -> toUpperS (region'subtag r) == toUpperS (v)).regionFrom
onlyThisValue "variant" v = (\r -> toUpperS (variant'subtag r) == toUpperS (v)).variantFrom
onlyThisValue "redundant" v = (\r -> toUpperS (redundant'tag r) == toUpperS (v)).redundantFrom
-- TODO: extensions
onlyThisValue t _ = error ("Unknown type of this value in the registry: " ++ t)
toSubtag (Lang l) = (lang'subtag l)
toSubtag (Scr s) = (script'subtag s)
toSubtag (GF t) = (gf'tag t)
toSubtag (Reg r) = (region'subtag r)
toSubtag (Var v) = (variant'subtag v)
toSubtag (Red r) = (redundant'tag r)
maybeToString Nothing = ""
maybeToString (Just s) = s
dateToString d = d
descrToString a = concat (List.intersperse " / " a)
prefixesToString a = concat (List.intersperse " / " a)
toString (Lang l) = (lang'subtag l) ++ "\t" ++ dateToString (lang'added l) ++ "\t" ++
descrToString (lang'descr l) ++ "\t" ++
(maybeToString (lang'script l)) ++ "\n"
toString (GF t) = (gf'tag t) ++ "\n"
toString (Scr s) = (script'subtag s) ++ "\t" ++ dateToString (script'added s) ++ "\t" ++
descrToString (script'descr s) ++ "\n"
toString (Reg r) = (region'subtag r) ++ "\t" ++ dateToString (region'added r) ++ "\t" ++
descrToString (region'descr r) ++ "\n"
toString (Var v) = (variant'subtag v) ++ "\t" ++ dateToString (variant'added v) ++ "\t" ++
descrToString (variant'descr v) ++ "\t" ++
prefixesToString (variant'prefix v) ++ "\n"
toString (Red r) = (redundant'tag r) ++ "\t" ++ dateToString (redundant'added r) ++ "\t" ++
descrToString (redundant'descr r) ++ "\n"
-- TODO: escape non-XML chars
dateToXML d = "<date>" ++ d ++ "</date>"
addedToXML d = "<added>" ++ d ++ "</added>"
deprecatedToXML Nothing = ""
deprecatedToXML (Just d) = "<deprecated>" ++ d ++ "</deprecated>"
sscriptToXML Nothing = ""
sscriptToXML (Just s) = "<suppress-script>" ++ s ++ "</suppress-script>"
prefToXML Nothing = ""
prefToXML (Just s) = "<preferred-value>" ++ s ++ "</preferred-value>"
macrolToXML Nothing = ""
macrolToXML (Just l) = "<macrolanguage>" ++ l ++ "</macrolanguage>"
scopeToXML Nothing = ""
scopeToXML (Just s) = "<scope>" ++ s ++ "</scope>"
onedescrToXML s = "<description>" ++ s ++ "</description>"
descrToXML a = concat (map onedescrToXML a)
oneprefixToXML s = "<prefix>" ++ s ++ "</prefix>"
prefixesToXML a = concat (map oneprefixToXML a)
toXML (Date d) = dateToXML d
toXML (Lang l) = "<language><subtag>" ++ (lang'subtag l) ++ "</subtag>" ++
addedToXML (lang'added l) ++
descrToXML (lang'descr l) ++
(sscriptToXML (lang'script l)) ++
deprecatedToXML (lang'deprecated l) ++
prefToXML (lang'preferredValue l) ++
macrolToXML (lang'macroLanguage l) ++
scopeToXML (lang'scope l) ++
"</language>\n"
toXML (Extl l) = "<extlang><subtag>" ++ (extlang'subtag l) ++ "</subtag>" ++
addedToXML (extlang'added l) ++
descrToXML (extlang'descr l) ++
(sscriptToXML (extlang'script l)) ++
macrolToXML (extlang'macroLanguage l) ++
scopeToXML (extlang'scope l) ++
"</extlang>\n"
toXML (GF t) = "<grandfathered><tag>" ++ (gf'tag t) ++ "</tag>" ++
addedToXML (gf'added t) ++
descrToXML (gf'descr t) ++ "</grandfathered>\n"
toXML (Scr s) = "<script><subtag>" ++ (script'subtag s) ++ "</subtag>" ++
addedToXML (script'added s) ++
descrToXML (script'descr s) ++ "</script>\n"
toXML (Reg r) = "<region><subtag>" ++ (region'subtag r) ++ "</subtag>" ++
addedToXML (region'added r) ++
descrToXML (region'descr r) ++ "</region>\n"
toXML (Var v) = "<variant><subtag>" ++ (variant'subtag v) ++ "</subtag>" ++
addedToXML (variant'added v) ++
descrToXML (variant'descr v) ++
prefixesToXML (variant'prefix v) ++ "</variant>\n"
toXML (Red r) = "<redundant><tag>" ++ (redundant'tag r) ++ "</tag>" ++
addedToXML (redundant'added r) ++
descrToXML (redundant'descr r) ++ "</redundant>\n"
toXML (Ext e) = error "No support for extensions"
-- TODO: escape non-HTML chars
html_head s = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n<head><link rel=\"stylesheet\" type=\"text/css\" href=\"/ltru.css\" /><title>" ++ s ++ "</title></head><body><h1>" ++
s ++ "</h1>"
html_tail = "<hr/><p><a href=\"/\">langtag.net Home</a></p></body></html>"
linkLanguage langcode =
if (length langcode) == 3 then
"<a href=\"http://www.sil.org/iso639-3/documentation.asp?id=" ++
langcode ++ "\">See the SIL entry</a> (if present)."
else if (length langcode) == 2 then
"<a href=\"http://www.loc.gov/standards/iso639-2/php/langcodes_name.php?iso_639_1=" ++
langcode ++ "\">See the LOC entry</a> (if present)."
else
""
dateToHTML d = "Date: " ++ d
addedToHTML d = "Added on " ++ d
sscriptToHTML Nothing = ""
sscriptToHTML (Just s) = " Suppress the script: <code><a href=\"../script/" ++ s ++
".html\">" ++ s ++ "</a></code>"
onedescrToHTML s = s
descrToHTML a = concat (List.intersperse " / " (map onedescrToHTML a))
-- TODO: a prefix is not always a simple language, it can be composed of several subtags
oneprefixToHTML s = " Possible prefix: <code><a href=\"../language/" ++ s ++ ".html\">" ++ s ++ "</a></code>"
prefixesToHTML a = concat (map oneprefixToHTML a)
toHTML (Date d) = "<p>Date: " ++ dateToHTML d ++ "</p>"
toHTML (Lang l) = (html_head ((lang'descr l) !! 0)) ++ "<p><code>" ++
(lang'subtag l) ++ "</code>. " ++
addedToHTML (lang'added l) ++ ". Description: " ++
descrToHTML (lang'descr l) ++ ". " ++
linkLanguage (lang'subtag l) ++
(sscriptToHTML (lang'script l)) ++ "</p>" ++ html_tail ++ "\n"
toHTML (GF t) = "<p>" ++ (gf'tag t) ++
addedToHTML (gf'added t) ++
descrToHTML (gf'descr t) ++ "</p>\n"
toHTML (Scr s) = (html_head ((script'descr s) !! 0)) ++ "<p><code>" ++
(script'subtag s) ++ "</code>." ++
addedToHTML (script'added s) ++ ". Description: " ++
descrToHTML (script'descr s) ++ "</p>" ++ html_tail ++ "\n"
toHTML (Reg r) = (html_head ((region'descr r) !! 0)) ++ "<p>" ++
(region'subtag r) ++ " " ++
addedToHTML (region'added r) ++
". Description: " ++
descrToHTML (region'descr r) ++ "</p>" ++
html_tail ++ "\n"
toHTML (Var v) = (html_head ((variant'descr v) !! 0)) ++ "<p><code>" ++
(variant'subtag v) ++ "</code>. " ++
addedToHTML (variant'added v) ++ ". Description: " ++
descrToHTML (variant'descr v) ++ ". " ++
prefixesToHTML (variant'prefix v) ++ "</p>" ++ html_tail ++ "\n"
toHTML (Red r) = (html_head ((redundant'descr r) !! 0)) ++
"<p>" ++ (redundant'tag r) ++
addedToHTML (redundant'added r) ++
descrToHTML (redundant'descr r) ++ "</p>" ++
html_tail ++ "\n"

View File

@ -0,0 +1,88 @@
import qualified System.IO
import qualified System.Environment
import qualified Data.List as List
import qualified System.Exit as Exit
import qualified System.Console.GetOpt as GetOpt
import Registry.Grammar
import Registry.Types
import Registry.Registry
nothing = putStr ""
-- Options handling
data Flag
= Verbose | Help deriving Show
options :: [GetOpt.OptDescr Flag]
options =
[ GetOpt.Option ['v'] ["verbose"] (GetOpt.NoArg Verbose) "Detailed output",
GetOpt.Option ['h'] ["help"] (GetOpt.NoArg Help) "Help message"
]
data OptionStore = ProgramOptions {verbose::Bool, help::Bool}
header = do
myself <- System.Environment.getProgName
return ("Usage: " ++ myself ++ " [-v] registry-file")
fatal detailed msg = do
myheader <- header
if detailed then
System.IO.hPutStrLn System.IO.stderr (GetOpt.usageInfo myheader options)
else
System.IO.hPutStrLn System.IO.stderr (myheader)
if msg /= "" then
System.IO.hPutStrLn System.IO.stderr msg
else
nothing
Exit.exitWith (Exit.ExitFailure 1)
-- Defaults
emptyOptionStore :: OptionStore
emptyOptionStore =
ProgramOptions {verbose = False, help = False}
processFlags :: [Flag] -> OptionStore
processFlags [] = emptyOptionStore
processFlags (first : rest) =
case first of
Verbose -> restOpts { verbose = True }
Help -> restOpts { help = True }
where restOpts = processFlags rest
programOpts :: [String] -> IO ([Flag], [String])
programOpts argv =
case GetOpt.getOpt GetOpt.Permute options argv of
(o,n,[] ) -> return(o,n)
(_,_,errs) -> fatal True (concat errs)
getOptionStore :: ([Flag], [String]) -> OptionStore
getOptionStore (f, n) = processFlags f
--
main = do
myargs <- System.Environment.getArgs
results <- programOpts myargs
let opts = getOptionStore results
let args = snd results
if help opts then
fatal True ""
else
nothing
if length (args) /= 1 then
fatal False "One and only one argument accepted"
else
nothing
let registryfilename = args !! 0
input <- System.IO.readFile registryfilename
let syntaxTree = parse input
if (checkOK syntaxTree) then do
let result = semanticCheck (registryOf syntaxTree)
if (checkOK result) then do
if verbose opts then
putStrLn (registryfilename ++ " is OK") -- TODO: display the date
else
nothing
Exit.exitWith Exit.ExitSuccess
else do
putStrLn ("Semantic error in " ++ registryfilename ++ ": " ++
(messageOf result))
Exit.exitWith (Exit.ExitFailure 3)
else do
putStrLn ("Syntax error in " ++ registryfilename ++ ": " ++
(messageOf syntaxTree))
Exit.exitWith (Exit.ExitFailure 2)

View File

@ -0,0 +1,37 @@
import qualified IO
import qualified System
import Registry.Utils
import Registry.Grammar (registry)
import Registry.Registry
infile = "./language-subtag-registry"
outdir = "./registry-html"
display reg thistype thisvalue =
-- We should always retrieve only one value, so we can take the head
toHTML ((filter (onlyThisValue thistype thisvalue)
(filter (onlyThisType thistype) reg)) !! 0)
toHTMLFiles reg thetype =
mapM (toHTMLfile reg thetype) (filter (onlyThisType thetype) reg)
toHTMLfile reg thistype thisvalue = do
let outfile = outdir ++ "/" ++ thistype ++ "/" ++ (toSubtag thisvalue) ++ ".html"
f <- IO.openFile (outfile) IO.WriteMode
IO.hPutStr f (display reg thistype (toSubtag thisvalue))
IO.hClose f
foreach [] func = return ()
foreach a func = do
func (head a)
foreach (tail a) func
main = do
f <- IO.openFile (infile) IO.ReadMode
input <- IO.hGetContents f
let theregistry = getRegistry input
foreach ["language", "script", "region", "variant",
"redundant", "grandfathered"]
(toHTMLFiles theregistry)

View File

@ -0,0 +1,23 @@
import qualified System.IO
import Registry.Utils
import Registry.Utils.PostgreSQL
import Registry.Grammar (registry)
import Registry.Registry
infile = "./language-subtag-registry"
displayAll reg thistype =
(foldr (++) "" (map toPostgreSQL
(filter (onlyThisType thistype) reg)))
main = do
f <- System.IO.openFile (infile) System.IO.ReadMode
input <- System.IO.hGetContents f
let theregistry = getRegistry input
putStrLn "BEGIN;"
putStrLn (displayAll theregistry "script")
putStrLn (displayAll theregistry "region")
putStrLn (displayAll theregistry "language")
putStrLn (displayAll theregistry "variant")
putStrLn "COMMIT;"

View File

@ -0,0 +1,24 @@
import qualified IO
import qualified System
import Registry.Utils
import Registry.Utils.SQLite
import Registry.Grammar (registry)
import Registry.Registry
infile = "./language-subtag-registry"
displayAll reg thistype =
(foldr (++) "" (map toSQLite
(filter (onlyThisType thistype) reg)))
main = do
f <- IO.openFile (infile) IO.ReadMode
input <- IO.hGetContents f
let theregistry = getRegistry input
putStrLn "BEGIN;"
putStrLn (displayAll theregistry "script")
putStrLn (displayAll theregistry "region")
putStrLn (displayAll theregistry "language")
putStrLn (displayAll theregistry "variant")
putStrLn "COMMIT;"

35
Registry/registry2txt.hs Normal file
View File

@ -0,0 +1,35 @@
import qualified System.IO
import Registry.Utils
import Registry.Grammar (registry)
import Registry.Registry
infile = "./language-subtag-registry"
displayAll reg thistype =
(foldr (++) "" (map toString
(filter (onlyThisType thistype) reg)))
displayOne reg thistype thisvalue =
concat (map toString
(filter (onlyThisValue thistype thisvalue) reg))
toTextFile reg thetype = do
-- TODO: add the date in the filename
let outfile = "lsr-" ++ thetype ++ ".txt"
f <- System.IO.openFile (outfile) System.IO.WriteMode
System.IO.hPutStr f (displayAll reg thetype)
System.IO.hClose f
foreach [] func = return ()
foreach a func = do
func (head a)
foreach (tail a) func
main = do
f <- System.IO.openFile (infile) System.IO.ReadMode
input <- System.IO.hGetContents f
let theregistry = getRegistry input
foreach ["language", "script", "region", "variant", "redundant", "grandfathered"]
(toTextFile theregistry)

32
Registry/registry2xml.hs Normal file
View File

@ -0,0 +1,32 @@
import qualified System.IO
import qualified Text.ParserCombinators.Parsec as Parsec
import Registry.Utils
import Registry.Grammar (registry)
import Registry.Registry
infile = "./language-subtag-registry"
displayAll reg = "<registry>" ++
(foldr (++) "" (map toXML reg)) ++
"</registry>"
displayOne reg thistype thisvalue =
concat (map toXML
(filter (onlyThisValue thistype thisvalue) reg))
foreach [] func = return ()
foreach a func = do
func (head a)
foreach (tail a) func
main = do
f <- System.IO.openFile (infile) System.IO.ReadMode
input <- System.IO.hGetContents f
let theregistry = getRegistry input
let outfile = "language-subtag-registry.xml"
f <- System.IO.openFile (outfile) System.IO.WriteMode
System.IO.hPutStr f (displayAll theregistry)
System.IO.hClose f

21
Types.hs Normal file
View File

@ -0,0 +1,21 @@
-- Haskell types to support the language tags
module Types where
type Language = String
type Extlang = Maybe String -- Only one extlang authorized
type Script = Maybe String
type Region = Maybe String
type Singleton = Char
type Variants = [Variant]
type Variant = String
type Extension = (Singleton, [String])
type Extensions = [Extension]
type Grandfathered = String
type Privateuse = [String]
data Tag = Tag Language Extlang Script Region Variants Extensions | GF Grandfathered | Priv Privateuse deriving (Show, Eq)
-- TODO: warning, Eq must
-- be redefined because it needs to take into account case-insensitivity and
-- (maybe) canonicalization

View File

@ -0,0 +1,11 @@
File-Date: 2006-09-20
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrl

View File

@ -0,0 +1,12 @@
File-Date: 2006-09-20
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrl

View File

@ -0,0 +1,12 @@
File-Date: 2006-09-20
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: languge
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrl

30
broken-tags.txt Normal file
View File

@ -0,0 +1,30 @@
f
f-Latn # Main tag too short
fr-fra # Extended tag are no longer well-formed
fr-Lat # Extended, obsolete
xr-lxs-qut # extlangS
xr-lqt-qu # extlang + region
fr-Latn-F
a-value
tlh-a-b-foo
i-notexist # grandfathered but not registered: invalid, even if we only test well-formedness
abcdefghi-012345678
ab-abc-abc-abc-abc
ab-abcd-abc
ab-ab-abc
ab-123-abc
a-Hant-ZH
a1-Hant-ZH
ab-abcde-abc
ab-1abc-abc
ab-ab-abcd
ab-123-abcd
ab-abcde-abcd
ab-1abc-abcd
ab-a-b
ab-a-x
ab--ab
ab-abc-
-ab-abc
abcd-efg
aabbccddE

21
check-valid.hs Normal file
View File

@ -0,0 +1,21 @@
import Grammar
import System.Environment (getArgs)
import Registry.Registry
infile = "./language-subtag-registry"
analyze reg input
= case (getTag input) of
Left err -> err
Right tag -> let result = isValid reg tag in
if fst result then
(input ++ " is valid\n") -- TODO: indicate the date of the registry
else
(input ++ " is NOT valid: " ++ snd result ++ "\n")
main = do
myargs <- getArgs
theregistry <- readRegistry infile
putStr (concat (map (analyze theregistry) myargs))

15
check-wf.hs Normal file
View File

@ -0,0 +1,15 @@
import Grammar
import System.Environment (getArgs)
analyze language input
= case (parse language "" input) of
Left err -> (input ++ " is NOT well-formed: " ++ (show err) ++
"\n")
Right x -> (input ++ " is well-formed\n")
main = do
myargs <- getArgs
putStr (concat (map (analyze tag) myargs))
-- TODO: set return code? What code to use if some are well-formed and some are not?
-- TODO: provide a version with -q and -v ?

View File

@ -0,0 +1,18 @@
File-Date: 2006-09-25
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrl
%%
Type: variant
Subtag: 1901
Description: Traditional German orthography
Added: 2005-10-16
Prefix: fx

View File

@ -0,0 +1,12 @@
File-Date: 2006-09-25
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrk

77
display-tag.hs Normal file
View File

@ -0,0 +1,77 @@
import Grammar
import Types
import Registry.Registry (readRegistry, isValid)
import Registry.Types
import Registry.Utils (onlyThisValue, onlyThisType, scriptFrom, languageFrom,
regionFrom, variantFrom)
import Data.List (intersperse)
import System.Environment (getArgs)
infile = "./language-subtag-registry"
langToString registry l =
"language \"" ++ concat (intersperse " / "
(lang'descr (languageFrom (filter (onlyThisValue
"language" l)
(filter (onlyThisType "language")
registry) !! 0)))) ++ "\""
scriptToString registry Nothing = ""
scriptToString registry (Just s) =
let theScript = (filter (onlyThisValue "script" s)
(filter (onlyThisType "script")
registry)) !! 0 in
", " ++ s ++ ": script " ++
" \"" ++ concat (intersperse " / " (script'descr
(scriptFrom theScript))) ++ "\""
regionToString registry Nothing = ""
regionToString registry (Just r) =
let theRegion = (filter (onlyThisValue "region" r)
(filter (onlyThisType "region")
registry)) !! 0 in
", " ++ r ++ ": region " ++
" \"" ++ concat (intersperse " / " (region'descr
(regionFrom theRegion))) ++ "\""
variantsToString registry [] = ""
variantsToString registry v = ", " ++ concat (intersperse ", "
(map (variantToString registry) v))
variantToString registry v =
let theVariant = (filter (onlyThisValue "variant" v)
(filter (onlyThisType "variant")
registry)) !! 0 in
v ++ ": variant " ++
" \"" ++ concat (intersperse " / " (variant'descr
(variantFrom theVariant))) ++ "\""
toString :: Registry -> Tag -> String
toString registry (Tag l el s r v e) =
-- TODO: display extlangs
l ++ ": " ++ langToString registry l ++
scriptToString registry s ++ regionToString registry r ++
variantsToString registry v
toString registry (Types.GF tag) = tag
toString registry (Priv p) =
(concat p) ++ " (private tag so no info available)"
analyze :: Registry -> String -> String
analyze reg input
= case (getTag input) of
Left err -> err
Right tag -> let result = isValid reg tag in
if fst result then
input ++ ": (" ++ (toString reg tag) ++ ")"
else
(input ++ " is NOT valid: " ++ snd result ++ "\n")
main = do
myargs <- getArgs
theregistry <- readRegistry infile
putStrLn (concat (intersperse "\n" (map (analyze theregistry) myargs)))

View File

@ -0,0 +1,33 @@
File-Date: 2006-09-25
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrl
%%
Type: language
Subtag: ab
Description: Abkhazian again
Added: 2005-10-16
Suppress-Script: Cyrl
%%
Type: script
Subtag: Arab
Description: Arabic
Added: 2005-10-16
%%
Type: script
Subtag: Armn
Description: Armenian
Added: 2005-10-16
%%
Type: script
Subtag: Cyrl
Description: Cyrillic
Added: 2005-10-16

View File

@ -0,0 +1,18 @@
File-Date: 2006-09-25
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrk
%%
Type: script
Subtag: Cyrl
Description: Cyrillic
Added: 2005-10-16

View File

@ -0,0 +1,12 @@
File-Date: 2006-09-25
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrk

View File

@ -0,0 +1,13 @@
File-Date: 2008-02-20
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: foo
Description: Test only
Macrolanguage: xx
Added: 2008-02-20

View File

@ -0,0 +1,28 @@
File-Date: 2006-09-25
%%
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
Preferred-value: xx
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16
Suppress-Script: Cyrx
%%
Type: script
Subtag: Arab
Description: Arabic
Added: 2005-10-16
%%
Type: script
Subtag: Armn
Description: Armenian
Added: 2005-10-16
%%
Type: script
Subtag: Cyrl
Description: Cyrillic
Added: 2005-10-16

View File

@ -0,0 +1,10 @@
Type: language
Subtag: aa
Description: Afar
Added: 2005-10-16
%%
Type: language
Subtag: ab
Description: Abkhazian
Added: 2005-10-16

11
invalid-tags.txt Normal file
View File

@ -0,0 +1,11 @@
ax-TZ # Not in the registry, but well-formed
fra-Latn # ISO 639 can be 3-letters
fra
fra-FX
abcd-Latn # Language of 4 chars reserved for future use
AaBbCcDd-x-y-any-x # Language of 5-8 chars, registered
zh-Latm-CN # Typo
de-DE-1902 # Wrong variant
fr-shadok # Variant
ab-c-abc-r-toto-c-abc # 'c' appears twice
en-a-bbb-a-ccc # 'a' appears twice

26
irregular.hs Normal file
View File

@ -0,0 +1,26 @@
import qualified IO
import Text.ParserCombinators.Parsec as Parsec
import qualified Data.List as List
import Grammar
import Registry.Grammar
import Registry.Types
infile = "./language-subtag-registry"
onlyIrregularGF (GF r) = isLeft (Grammar.getTag (gf'tag r))
onlyIrregularGF _ = False
displayTag (GF r) = "\"" ++ (gf'tag r) ++ "\" / "
run parser input
= case (Parsec.parse parser "" input) of
Left err -> error ("Registry \"" ++ infile ++ "\" is not legal: " ++ (show err) ++ "\n")
Right x -> IO.putStr ("-- AUTOMATICALLY GENERATED! Do *not* edit!!! \n" ++
(foldr (++) "" (List.intersperse "\n" (map displayTag (filter onlyIrregularGF (map fromRight x))))) ++
"\n")
main = do
f <- IO.openFile (infile) IO.ReadMode
input <- IO.hGetContents f
run registry input

76
test-registries.hs Normal file
View File

@ -0,0 +1,76 @@
import qualified Test.HUnit as HUnit
import qualified System.IO as IO
import qualified Text.ParserCombinators.Parsec as Parsec
import qualified Data.List as List
import qualified Data.Maybe as Maybe
import qualified Text.Regex as RE
import qualified System.Directory as Dir
import Registry.Grammar
import Registry.Types
import Registry.Registry
brokenDir = "./broken-registries"
correctDir = "./correct-registries"
invalidDir = "./invalid-registries"
validDir = "./valid-registries"
data RegistryText = Registry {filename::FilePath, filecontents::String}
buildTestCaseSyn :: Bool -> RegistryText -> HUnit.Test
buildTestCaseSyn True reg =
HUnit.TestCase (HUnit.assertBool ("Should be syntactically OK (" ++ (filename reg) ++ ")")
(checkOK (parse (filecontents reg))))
buildTestCaseSyn False reg =
HUnit.TestCase (HUnit.assertBool ("Should be syntactically wrong (" ++ (filename reg) ++ ")")
(not (checkOK (parse (filecontents reg)))))
buildTestCaseSem :: Bool -> RegistryText -> HUnit.Test
buildTestCaseSem True reg =
let registry = parse (filecontents reg) in
HUnit.TestCase (HUnit.assertBool ("Should be valid (" ++ (filename reg) ++ ")")
(checkOK (registry) &&
checkOK (semanticCheck (registryOf registry))))
buildTestCaseSem False reg =
let registry = parse (filecontents reg) in
HUnit.TestCase (HUnit.assertBool ("Should be invalid (" ++ (filename reg) ++ ")")
(checkOK (registry) &&
not (checkOK (semanticCheck (registryOf registry)))))
endsWith :: String -> String -> Bool
endsWith s1 s2 = Maybe.isJust (RE.matchRegex (RE.mkRegex (s2 ++ "$")) s1)
-- TODO: use mapM instead
foreach :: (Monad m) => [a1] -> (a1 -> m a) -> m [a]
foreach [] func = do
return []
foreach x func = do
newHead <- func (head x)
newTail <- foreach (tail x) func
return (newHead : newTail)
contentOf :: FilePath -> IO RegistryText
contentOf f = do
content <- IO.readFile f
return (Registry f content)
addDir d f = d ++ "/" ++ f
filesOf :: FilePath -> IO [RegistryText]
filesOf d = do
files <- Dir.getDirectoryContents d
foreach (map (addDir d) (filter (\n -> n /= "." && n /= ".." && (not (endsWith n "~")))
files))
contentOf
main = do
brokenFiles <- filesOf brokenDir
let testsBroken = map (buildTestCaseSyn False) brokenFiles
invalidFiles <- filesOf invalidDir
let testsInvalid = map (buildTestCaseSem False) invalidFiles
correctFiles <- filesOf correctDir
let testsCorrect = map (buildTestCaseSyn True) correctFiles
validFiles <- filesOf validDir
let testsValid = map (buildTestCaseSem True) validFiles
HUnit.runTestTT (HUnit.TestList (testsBroken ++ testsInvalid ++
testsCorrect ++ testsValid))

54
tests-from-files.hs Normal file
View File

@ -0,0 +1,54 @@
import qualified Grammar
import qualified Registry.Registry as Registry
import qualified Test.HUnit as HUnit
import qualified System.IO
import qualified Text.Regex as Regex
wfTagsFile = "./well-formed-tags.txt"
brokenTagsFile = "./broken-tags.txt"
validTagsFile = "./valid-tags.txt"
invalidTagsFile = "./invalid-tags.txt"
registryfile = "./language-subtag-registry"
languageTag = Grammar.tag
getTag line =
let fields = Regex.splitRegex (Regex.mkRegex " +") line in
fields !! 0
shouldBeWellFormed tag =
HUnit.TestCase (HUnit.assertBool (tag ++ " should be well-formed")
(Grammar.testTag tag == True))
shouldBeBroken tag =
HUnit.TestCase (HUnit.assertBool (tag ++ " should *not* be well-formed")
(Grammar.testTag tag == False))
shouldBeValid reg tag =
HUnit.TestCase (HUnit.assertBool (tag ++ " should be valid")
(Registry.testTag reg tag == True))
shouldBeInvalid reg tag =
HUnit.TestCase (HUnit.assertBool (tag ++ " should *not* be valid")
(Registry.testTag reg tag == False))
tagsFromFile filename = do
myContents <- System.IO.readFile filename
let myLines = lines myContents
return (map getTag myLines)
main = do
brokenTags <- tagsFromFile brokenTagsFile
wfTags <- tagsFromFile wfTagsFile
theregistry <- Registry.readRegistry registryfile
invalidTags <- tagsFromFile invalidTagsFile
validTags <- tagsFromFile validTagsFile
let tests = HUnit.TestList (map shouldBeBroken (brokenTags) ++
map shouldBeWellFormed (wfTags) ++
map (shouldBeValid theregistry) (validTags) ++
map (shouldBeInvalid theregistry) (invalidTags))
HUnit.runTestTT tests

File diff suppressed because it is too large Load Diff

48
valid-tags.txt Normal file
View File

@ -0,0 +1,48 @@
fr
fr-Latn
fr-Latn-FR
fr-Latn-419
fr-FR
fr-y-myext-myext2
apa-Latn # ISO 639 can be 3-letters
apa
apa-CA
i-klingon # grandfathered with singleton
no-bok # grandfathered without singleton
mn-Cyrl-MN
mN-cYrL-Mn
fr-Latn-CA
en-US
fr-Latn-CA
i-enochian # Grand fathered
x-fr-CH
sr-Latn-CS
es-419
sl-nedis
de-CH-1996
de-Latg-1996
sl-IT-nedis
en-a-bbb-x-a-ccc
de-a-value
en-x-US
az-Arab-x-AZE-derbend
es-Latn-CO-x-private
ab-x-abc-x-abc # anything goes after x
ab-x-abc-a-a # ditto
i-default # grandfathered
i-klingon # grandfathered
en
de-AT
es-419
de-CH-1901
sr-Cyrl
sr-Cyrl-CS
sl-Latn-IT-rozaj
en-US-x-twain
zh-cmn
zh-cmn-Hant
zh-cmn-Hant-HK
zh-gan
zh-yue-Hant-HK
en-Latn-GB-boont-r-extended-sequence-x-private
en-US-boont

54
well-formed-tags.txt Normal file
View File

@ -0,0 +1,54 @@
fr
fr-Latn
fr-Latn-FR
fr-Latn-419
fr-FR
ax-TZ # Not in the registry, but well-formed
fr-shadok # Variant
fr-y-myext-myext2
fra-Latn # ISO 639 can be 3-letters
fra
fra-FX
i-klingon # grandfathered with singleton
I-kLINgon # tags are case-insensitive...
no-bok # grandfathered without singleton
mn-Cyrl-MN
mN-cYrL-Mn
fr-Latn-CA
en-US
fr-Latn-CA
i-enochian # Grand fathered
x-fr-CH
sr-Latn-CS
es-419
sl-nedis
de-CH-1996
de-Latg-1996
sl-IT-nedis
en-a-bbb-x-a-ccc
de-a-value
en-Latn-GB-boont-r-extended-sequence-x-private
en-x-US
az-Arab-x-AZE-derbend
es-Latn-CO-x-private
en-US-boont
ab-x-abc-x-abc # anything goes after x
ab-x-abc-a-a # ditto
i-default # grandfathered
i-klingon # grandfathered
abcd-Latn # Language of 4 chars reserved for future use
AaBbCcDd-x-y-any-x # Language of 5-8 chars, registered
en
de-AT
es-419
de-CH-1901
sr-Cyrl
sr-Cyrl-CS
sl-Latn-IT-rozaj
en-US-x-twain
zh-cmn
zh-cmn-Hant
zh-cmn-Hant-HK
zh-gan
zh-yue-Hant-HK
xr-p-lze # Extension