130 lines
4.9 KiB
Haskell
130 lines
4.9 KiB
Haskell
{-# LANGUAGE NoMonomorphismRestriction,FlexibleContexts #-}
|
|
|
|
module Grammar (module Text.ParserCombinators.Parsec, tag, testTag,
|
|
getTag) where
|
|
|
|
import Text.ParserCombinators.Parsec
|
|
import Types
|
|
import Parsers
|
|
|
|
-- "Tags for Identifying Languages" RFC 5646
|
|
-- We use the names of the productions of the grammar in the RFC
|
|
|
|
dash = char '-' <?> "dash"
|
|
|
|
charI = char 'i' <|> char 'I'
|
|
charX = char 'x' <|> char 'X'
|
|
|
|
alphaNumNotX c =
|
|
c >= 'A' && c <= 'W' ||
|
|
c >= 'Y' && c <= 'Z' ||
|
|
c >= 'a' && c <= 'w' ||
|
|
c >= 'y' && c <= 'z' ||
|
|
c >= '0' && c <= '9'
|
|
|
|
languageTag :: GenParser Char () Tag
|
|
languageTag =do
|
|
value <- try (langtag) <|> try (privateuse) <|> irregular
|
|
return value
|
|
|
|
langtag = do
|
|
theLanguage <- language
|
|
theExtlang <- facultative (try (do {dash ; value <- extlang ; notFollowedBy alphaNum;
|
|
return value}));
|
|
theScript <- facultative (try (do {dash ; value <- script ; notFollowedBy alphaNum;
|
|
return value}));
|
|
theRegion <- facultative (try (do {dash ; value <- region ; notFollowedBy alphaNum;
|
|
return value}));
|
|
theVariants <- many (try (do {dash ; value <- variant ; notFollowedBy alphaNum;
|
|
return value}));
|
|
theExtensions <- many (try (do {dash ; value <- extension; notFollowedBy alphaNum;
|
|
return value}))
|
|
-- TODO: use a context-sensitive grammar (see Parsec
|
|
-- documentation) to enforce "2.2.6. Extension Subtags" which
|
|
-- says that an extension cannot be repeated.
|
|
optional (do {dash; privateuse})
|
|
eof
|
|
return (Tag theLanguage theExtlang theScript theRegion theVariants theExtensions)
|
|
|
|
language = do
|
|
value <- do
|
|
try (do {lang <- countBetween 2 3 letter; notFollowedBy alphaNum; return lang})
|
|
-- Shortest ISO 639 code
|
|
-- TODO: returns the extended, too!
|
|
<|>
|
|
try (do {lang <- count 4 letter; notFollowedBy alphaNum; return lang}) -- reserved for future use.
|
|
<|> (countBetween 5 8 letter) -- registered language subtag. TODO: return the value!
|
|
return value
|
|
|
|
-- RFC, section 2.2.2: "Although the ABNF production 'extlang' permits
|
|
-- up to three extended language tags"
|
|
extlang = do
|
|
value <- count 3 letter
|
|
return value
|
|
|
|
script = do
|
|
value <- count 4 letter -- ISO 15924 code
|
|
return value
|
|
|
|
region = do
|
|
count 2 letter -- ISO 3166 code
|
|
<|> count 3 digit -- United Nations M.49 code
|
|
|
|
variant = do
|
|
do {first <- digit ; rest <- countBetween 3 7 alphaNum; return (first:rest)}
|
|
<|> do {first <- letter ; rest <- countBetween 4 7 alphaNum; return (first:rest)}
|
|
|
|
extension = do
|
|
theSingleton <- singleton
|
|
theValues <- many1 (try (do {dash ; extensionName}))
|
|
return (theSingleton, theValues)
|
|
|
|
extensionName = do
|
|
countBetween 2 8 alphaNum
|
|
|
|
singleton = satisfy (alphaNumNotX)
|
|
|
|
privateuse = do
|
|
charX
|
|
values <- many1 (do {dash; countBetween 1 8 alphaNum})
|
|
eof
|
|
return (Priv values)
|
|
|
|
-- Irregular tags only
|
|
registeredIrregular = try (istring "en-GB-oed") <|> try (istring "i-ami") <|> try (istring "i-bnn") <|> try (istring "i-default")
|
|
<|> try (istring "i-enochian") <|> try (istring "i-hak") <|> try (istring "i-klingon") <|> try (istring "i-lux")
|
|
<|> try (istring "i-mingo") <|> try (istring "i-navajo") <|> try (istring "i-pwn") <|> try (istring "i-tao")
|
|
<|> try (istring "i-tay") <|> try (istring "i-tsu") <|> try (istring "sgn-BE-fr") <|>try ( istring "sgn-BE-nl")
|
|
<|> try (istring "sgn-CH-de") <|> try (istring "no-bok")
|
|
<|> try (istring"no-nyn")
|
|
<|> try (istring"zh-cmn")
|
|
<|> try (istring"zh-cmn-Hans")
|
|
<|> try (istring"zh-cmn-Hant")
|
|
<|> try (istring"zh-gan")
|
|
<|> try (istring"zh-min")
|
|
<|> try (istring"zh-min-nan")
|
|
<|> try (istring"zh-wuu")
|
|
<|> try (istring"zh-yue")
|
|
|
|
irregular = do
|
|
value <- registeredIrregular
|
|
return (GF value)
|
|
|
|
tag = do
|
|
value <- languageTag
|
|
return value
|
|
|
|
-- Utilities
|
|
testTag :: String -> Bool
|
|
testTag input
|
|
= case (parse languageTag "" input) of
|
|
Left err -> False
|
|
Right x -> True
|
|
|
|
getTag :: String -> Either String Tag
|
|
getTag input =
|
|
case (parse tag "" input) of
|
|
Left err -> Left (input ++ " is NOT well-formed: " ++ (show err) ++
|
|
"\n")
|
|
Right theTag -> Right theTag
|