-- | This module contains the token data type for the intermediate language.
module FreeC.Frontend.IR.Token
  ( -- * Tokens
    Token(..)
  , mkIdentToken
  , mkSymbolToken
    -- * Special Symbols
  , specialSymbols
    -- * Keywords
  , Keyword(..)
  , keywords
  ) where

import           Data.Char        ( isUpper )
import           Data.Tuple.Extra ( (&&&) )

import           FreeC.Pretty

-------------------------------------------------------------------------------
-- Tokens                                                                    --
-------------------------------------------------------------------------------
-- | The token classes of the intermediate language.
--
--   In the description of the grammar for the token classes below, we are
--   using the following character classes.
--
--   > <lower>   ::= "a" | … | "z" | <any lowercase Unicode letter>
--   > <upper>   ::= "A" | … | "Z" | <any upper- or titlecase Unicode letter>
--   >
--   > <numeric> ::= <digit> | <any Unicode numeric character>
--   > <digit>   ::= "0" | … | "9"
--   > <octit>   ::= "0" | … | "7"
--   > <hexit>   ::= "0" | … | "9" | "a" | … | "f" | "A" | … | "F"
--   >
--   > <symbol>  ::= <any Unicode symbol or punctuation>
--   > <graphic> ::= <lower> | <upper> | <symbol> | <numeric>
--   > <space>   ::= " "
--
--   == Identifiers
--
--   Identifiers can contain letters (both upper- and lowercase), digits,
--   underscores and apostrophes. However, identifiers must not start with an
--   digit or apostrophe. Identifiers starting with an uppercase letter
--   identify constructors.
--
--   > <identletter> ::= <lower> | <upper> | <numeric> | "_" | "'"
--   > <varid>  ::= (<lower> | "_") { <identletter> }
--   > <conid>  ::= <upper> { <identletter> }
--
--   == Symbolic names
--
--   Symbolic names are sequences of arbitrary Unicode symbol and punctuation
--   characters wrapped in parenthesis where no parenthesis are allowed in the
--   name itself.
--
--   Constructor symbols are either empty or start with one of @"["@, @":"@
--   or @","@. The @"["@ and @","@ start characters are needed in addition
--   to @":"@ (which is the start character of constructor symbols in Haskell)
--   such that the empty list constructor and tuple constructors are recognized
--   correctly.
--
--   > <namesymbol>  ::= <symbol> \ ( "(" | ")" )
--   > <consymstart> ::= "[" | ":" | ","
--   >
--   > <varsym> ::= "(" (<namesymbol> \ <consymstart>) { <namesymbol> } ")"
--   > <consym> ::= "(" [ <consymstart> { <namesymbol> } ] ")"
--
--   == Integers
--
--   Integer tokens have an optional sign and can be in decimal, octal or
--   hexadecimal notation. The prefixes of octal and hexadecimal integers
--   as well as the digits (or "hexits") of hexadecimal integers are case
--   insensitive.
--
--   > <decimal>     ::= <digit> { <digit> }
--   > <octal>       ::= <octit> { <octit> }
--   > <hexadecimal> ::= <hexit> { <hexit> }
--   >
--   > <integer>   ::= [ "+" | "-" ] <natural>
--   > <natural>   ::= <decimal>
--   >               | "0o" <octal>       | "0O" <octal>
--   >               | "0x" <hexadecimal> | "0X" <hexadecimal>
--
--   == Strings
--
--   Strings are wrapped in double quotes. They can contain arbitrary Unicode
--   letters, digits, symbols and punctuation characters as well as spaces
--   and escape sequences.
--
--   > <escape> ::= <any Haskell escape sequence>
--   > <gap>    ::= "\" { <arbitrary Unicode whitespace> } "\"
--   >
--   > <string> ::= '"' { <graphic> \ ( '"' | "\" )
--   >                  | <space>
--   >                  | <escape>
--   >                  | <gap>
--   >                  } '"'
data Token
  = ConIdent String  -- ^ @<conid>@ a constructor identifier.
  | ConSymbol String -- ^ @<consym>@ a constructor symbol.
  | VarIdent String  -- ^ @<varid>@ a variable identifier.
  | VarSymbol String -- ^ @<varsym>@ a variable symbol.
  | Keyword Keyword  -- ^ A t'Keyword'.
  | IntToken Integer -- ^ @<integer>@ an integer literal.
  | StrToken String  -- ^ @<string>@ a string literal.
  | At               -- ^ @"\@"@
  | Comma            -- ^ @","@
  | Dot              -- ^ @"."@
  | DoubleColon      -- ^ @"::"@
  | Equals           -- ^ @"="@
  | Lambda           -- ^ @"\\"@
  | LBrace           -- ^ @"{"@
  | LParen           -- ^ @"("@
  | Pipe             -- ^ @"|"@
  | RBrace           -- ^ @"}"@
  | RParen           -- ^ @")"@
  | RArrow           -- ^ @"->"@
  | Semi             -- ^ @";"@
  | Bang             -- ^ @"!"@
 deriving ( Eq, Show )

-- | Constructs a 'ConIdent', 'VarIdent', 'Keyword' for the given identifier
--   or keyword depending.
--
--   Constructors and variables can be told apart by the case of their first
--   character. Constructors start with upper case letters and variables with
--   lower case letters or an underscore.
--
--   If the given string occurs in 'keywords', the corresponding keyword
--   token is returned instead.
mkIdentToken :: String -> Token
mkIdentToken ident = case lookup ident keywords of
  Nothing      | isUpper (head ident) -> ConIdent ident
               | otherwise -> VarIdent ident
  Just keyword -> Keyword keyword

-- | Constructs a 'ConSymbol' or 'VarSymbol' for the given symbolic name.
--
--   The given string should not include the parenthesis around symbols.
--
--   Constructor symbols must be empty or start with one of @"["@, @":"@ or
--   @","@.
mkSymbolToken :: String -> Token
mkSymbolToken ""              = ConSymbol ""
mkSymbolToken sym@(first : _) | first `elem` ['[', ':', ','] = ConSymbol sym
mkSymbolToken sym             = VarSymbol sym

-- | Pretty prints a token.
instance Pretty Token where
  pretty (ConIdent ident)  = prettyString ident
  pretty (ConSymbol sym)
    = prettyString "(" <> prettyString sym <> prettyString ")"
  pretty (VarIdent ident)  = prettyString ident
  pretty (VarSymbol sym)
    = prettyString "(" <> prettyString sym <> prettyString ")"
  pretty (Keyword keyword) = pretty keyword
  pretty (IntToken value)  = pretty value
  pretty (StrToken value)  = prettyString (show value)
  pretty At                = prettyString "@"
  pretty Comma             = prettyString ","
  pretty Dot               = prettyString "."
  pretty DoubleColon       = prettyString "::"
  pretty Equals            = prettyString "="
  pretty Lambda            = prettyString "\\"
  pretty LBrace            = prettyString "{"
  pretty LParen            = prettyString "("
  pretty Pipe              = prettyString "|"
  pretty RBrace            = prettyString "}"
  pretty RParen            = prettyString ")"
  pretty RArrow            = prettyString "->"
  pretty Semi              = prettyString ";"
  pretty Bang              = prettyString "!"

-------------------------------------------------------------------------------
-- Special Symbols                                                           --
-------------------------------------------------------------------------------
-- | Symbols that cannot be used as symbolic names.
--
--   Since the intermediate language is only parsed in tests, this
--   constraints only the identifiers that can be used in tests.
--   If the IR AST is generated by the frontend, identifiers are
--   allowed to collide with keywords.
specialSymbols :: [(String, Token)]
specialSymbols = map (showPretty &&& id)
  [ At
  , Comma
  , Dot
  , DoubleColon
  , Equals
  , Lambda
  , LBrace
  , LParen
  , Pipe
  , RBrace
  , RParen
  , RArrow
  , Semi
  , Bang
  ]

-------------------------------------------------------------------------------
-- Keywords                                                                  --
-------------------------------------------------------------------------------
-- Data type for keyword 'Token's.
data Keyword
  = CASE      -- ^ @"case"@
  | DATA      -- ^ @"data"@
  | ELSE      -- ^ @"else"@
  | ERROR     -- ^ @"error"@
  | FORALL    -- ^ @"forall"@
  | IF        -- ^ @"if"@
  | IMPORT    -- ^ @"import"@
  | IN        -- ^ @"in"@
  | LET       -- ^ @"let"@
  | MODULE    -- ^ @"module"@
  | OF        -- ^ @"of"@
  | THEN      -- ^ @"then"@
  | TRACE     -- ^ @"trace"@
  | TYPE      -- ^ @"type"@
  | UNDEFINED -- ^ @"undefined"@
  | WHERE     -- ^ @"where"@
 deriving ( Eq, Show )

-- | Maps reserved words that cannot be used as identifiers to the
--   corresponding 'Keyword' tokens.
--
--   Since the intermediate language is only parsed in tests, this
--   constraints only the identifiers that can be used in tests.
--   If the IR AST is generated by the frontend, identifiers are
--   allowed to collide with keywords.
keywords :: [(String, Keyword)]
keywords = map (showPretty &&& id)
  [ CASE
  , DATA
  , ELSE
  , ERROR
  , FORALL
  , IF
  , IMPORT
  , IN
  , LET
  , MODULE
  , OF
  , THEN
  , TRACE
  , TYPE
  , UNDEFINED
  , WHERE
  ]

-- | Pretty prints a keyword.
instance Pretty Keyword where
  pretty CASE      = prettyString "case"
  pretty DATA      = prettyString "data"
  pretty ELSE      = prettyString "else"
  pretty ERROR     = prettyString "error"
  pretty FORALL    = prettyString "forall"
  pretty IF        = prettyString "if"
  pretty IMPORT    = prettyString "import"
  pretty IN        = prettyString "in"
  pretty LET       = prettyString "let"
  pretty MODULE    = prettyString "module"
  pretty OF        = prettyString "of"
  pretty THEN      = prettyString "then"
  pretty TRACE     = prettyString "trace"
  pretty TYPE      = prettyString "type"
  pretty UNDEFINED = prettyString "undefined"
  pretty WHERE     = prettyString "where"