GaBuZoMeu/Grammar.hs

130 lines
4.9 KiB
Haskell
Raw Permalink Normal View History

2023-06-08 21:49:35 +02:00
{-# LANGUAGE NoMonomorphismRestriction,FlexibleContexts #-}
module Grammar (module Text.ParserCombinators.Parsec, tag, testTag,
getTag) where
import Text.ParserCombinators.Parsec
import Types
import Parsers
-- "Tags for Identifying Languages" RFC 5646
-- We use the names of the productions of the grammar in the RFC
dash = char '-' <?> "dash"
charI = char 'i' <|> char 'I'
charX = char 'x' <|> char 'X'
alphaNumNotX c =
c >= 'A' && c <= 'W' ||
c >= 'Y' && c <= 'Z' ||
c >= 'a' && c <= 'w' ||
c >= 'y' && c <= 'z' ||
c >= '0' && c <= '9'
languageTag :: GenParser Char () Tag
languageTag =do
value <- try (langtag) <|> try (privateuse) <|> irregular
return value
langtag = do
theLanguage <- language
theExtlang <- facultative (try (do {dash ; value <- extlang ; notFollowedBy alphaNum;
return value}));
theScript <- facultative (try (do {dash ; value <- script ; notFollowedBy alphaNum;
return value}));
theRegion <- facultative (try (do {dash ; value <- region ; notFollowedBy alphaNum;
return value}));
theVariants <- many (try (do {dash ; value <- variant ; notFollowedBy alphaNum;
return value}));
theExtensions <- many (try (do {dash ; value <- extension; notFollowedBy alphaNum;
return value}))
-- TODO: use a context-sensitive grammar (see Parsec
-- documentation) to enforce "2.2.6. Extension Subtags" which
-- says that an extension cannot be repeated.
optional (do {dash; privateuse})
eof
return (Tag theLanguage theExtlang theScript theRegion theVariants theExtensions)
language = do
value <- do
try (do {lang <- countBetween 2 3 letter; notFollowedBy alphaNum; return lang})
-- Shortest ISO 639 code
-- TODO: returns the extended, too!
-- <|>
-- try (do {lang <- count 4 letter; notFollowedBy alphaNum; return lang}) -- reserved for future use.
2023-06-08 21:49:35 +02:00
<|> (countBetween 5 8 letter) -- registered language subtag. TODO: return the value!
return value
-- RFC, section 2.2.2: "Although the ABNF production 'extlang' permits
-- up to three extended language tags"
extlang = do
value <- count 3 letter
return value
script = do
value <- count 4 letter -- ISO 15924 code
return value
region = do
count 2 letter -- ISO 3166 code
<|> count 3 digit -- United Nations M.49 code
variant = do
do {first <- digit ; rest <- countBetween 3 7 alphaNum; return (first:rest)}
<|> do {first <- letter ; rest <- countBetween 4 7 alphaNum; return (first:rest)}
extension = do
theSingleton <- singleton
theValues <- many1 (try (do {dash ; extensionName}))
return (theSingleton, theValues)
extensionName = do
countBetween 2 8 alphaNum
singleton = satisfy (alphaNumNotX)
privateuse = do
charX
values <- many1 (do {dash; countBetween 1 8 alphaNum})
eof
return (Priv values)
-- Irregular tags only
registeredIrregular = try (istring "en-GB-oed") <|> try (istring "i-ami") <|> try (istring "i-bnn") <|> try (istring "i-default")
<|> try (istring "i-enochian") <|> try (istring "i-hak") <|> try (istring "i-klingon") <|> try (istring "i-lux")
<|> try (istring "i-mingo") <|> try (istring "i-navajo") <|> try (istring "i-pwn") <|> try (istring "i-tao")
<|> try (istring "i-tay") <|> try (istring "i-tsu") <|> try (istring "sgn-BE-fr") <|>try ( istring "sgn-BE-nl")
<|> try (istring "sgn-CH-de") <|> try (istring "no-bok")
<|> try (istring"no-nyn")
<|> try (istring"zh-cmn")
<|> try (istring"zh-cmn-Hans")
<|> try (istring"zh-cmn-Hant")
<|> try (istring"zh-gan")
<|> try (istring"zh-min")
<|> try (istring"zh-min-nan")
<|> try (istring"zh-wuu")
<|> try (istring"zh-yue")
irregular = do
value <- registeredIrregular
return (GF value)
tag = do
value <- languageTag
return value
-- Utilities
testTag :: String -> Bool
testTag input
= case (parse languageTag "" input) of
Left err -> False
Right x -> True
getTag :: String -> Either String Tag
getTag input =
case (parse tag "" input) of
Left err -> Left (input ++ " is NOT well-formed: " ++ (show err) ++
"\n")
Right theTag -> Right theTag