{-# LANGUAGE NoMonomorphismRestriction,FlexibleContexts #-} module Grammar (module Text.ParserCombinators.Parsec, tag, testTag, getTag) where import Text.ParserCombinators.Parsec import Types import Parsers -- "Tags for Identifying Languages" RFC 5646 -- We use the names of the productions of the grammar in the RFC dash = char '-' "dash" charI = char 'i' <|> char 'I' charX = char 'x' <|> char 'X' alphaNumNotX c = c >= 'A' && c <= 'W' || c >= 'Y' && c <= 'Z' || c >= 'a' && c <= 'w' || c >= 'y' && c <= 'z' || c >= '0' && c <= '9' languageTag :: GenParser Char () Tag languageTag =do value <- try (langtag) <|> try (privateuse) <|> irregular return value langtag = do theLanguage <- language theExtlang <- facultative (try (do {dash ; value <- extlang ; notFollowedBy alphaNum; return value})); theScript <- facultative (try (do {dash ; value <- script ; notFollowedBy alphaNum; return value})); theRegion <- facultative (try (do {dash ; value <- region ; notFollowedBy alphaNum; return value})); theVariants <- many (try (do {dash ; value <- variant ; notFollowedBy alphaNum; return value})); theExtensions <- many (try (do {dash ; value <- extension; notFollowedBy alphaNum; return value})) -- TODO: use a context-sensitive grammar (see Parsec -- documentation) to enforce "2.2.6. Extension Subtags" which -- says that an extension cannot be repeated. optional (do {dash; privateuse}) eof return (Tag theLanguage theExtlang theScript theRegion theVariants theExtensions) language = do value <- do try (do {lang <- countBetween 2 3 letter; notFollowedBy alphaNum; return lang}) -- Shortest ISO 639 code -- TODO: returns the extended, too! <|> try (do {lang <- count 4 letter; notFollowedBy alphaNum; return lang}) -- reserved for future use. <|> (countBetween 5 8 letter) -- registered language subtag. TODO: return the value! return value -- RFC, section 2.2.2: "Although the ABNF production 'extlang' permits -- up to three extended language tags" extlang = do value <- count 3 letter return value script = do value <- count 4 letter -- ISO 15924 code return value region = do count 2 letter -- ISO 3166 code <|> count 3 digit -- United Nations M.49 code variant = do do {first <- digit ; rest <- countBetween 3 7 alphaNum; return (first:rest)} <|> do {first <- letter ; rest <- countBetween 4 7 alphaNum; return (first:rest)} extension = do theSingleton <- singleton theValues <- many1 (try (do {dash ; extensionName})) return (theSingleton, theValues) extensionName = do countBetween 2 8 alphaNum singleton = satisfy (alphaNumNotX) privateuse = do charX values <- many1 (do {dash; countBetween 1 8 alphaNum}) eof return (Priv values) -- Irregular tags only registeredIrregular = try (istring "en-GB-oed") <|> try (istring "i-ami") <|> try (istring "i-bnn") <|> try (istring "i-default") <|> try (istring "i-enochian") <|> try (istring "i-hak") <|> try (istring "i-klingon") <|> try (istring "i-lux") <|> try (istring "i-mingo") <|> try (istring "i-navajo") <|> try (istring "i-pwn") <|> try (istring "i-tao") <|> try (istring "i-tay") <|> try (istring "i-tsu") <|> try (istring "sgn-BE-fr") <|>try ( istring "sgn-BE-nl") <|> try (istring "sgn-CH-de") <|> try (istring "no-bok") <|> try (istring"no-nyn") <|> try (istring"zh-cmn") <|> try (istring"zh-cmn-Hans") <|> try (istring"zh-cmn-Hant") <|> try (istring"zh-gan") <|> try (istring"zh-min") <|> try (istring"zh-min-nan") <|> try (istring"zh-wuu") <|> try (istring"zh-yue") irregular = do value <- registeredIrregular return (GF value) tag = do value <- languageTag return value -- Utilities testTag :: String -> Bool testTag input = case (parse languageTag "" input) of Left err -> False Right x -> True getTag :: String -> Either String Tag getTag input = case (parse tag "" input) of Left err -> Left (input ++ " is NOT well-formed: " ++ (show err) ++ "\n") Right theTag -> Right theTag