-- | This module contains a scanner for the intermediate language that takes -- the source code and converts it into a token stream. -- -- We convert the source code to a token stream such that -- "FreeC.Frontend.IR.Parser" does not have to handle whitespace explicitly. module FreeC.Frontend.IR.Scanner ( TokenWithPos(..), scan ) where import Data.Char ( isNumber, isPunctuation, isSymbol ) import Text.Parsec ( (<|>), Parsec ) import qualified Text.Parsec as Parsec import qualified Text.Parsec.Token as Parsec import FreeC.Frontend.IR.Token import FreeC.IR.SrcSpan import FreeC.Monad.Reporter import FreeC.Pretty import FreeC.Util.Parsec import FreeC.Util.Predicate ( (.&&.), (.||.) ) ------------------------------------------------------------------------------- -- Type Synonyms -- ------------------------------------------------------------------------------- -- | Type of parsers for IR lexeme of type @a@. type Scanner a = Parsec String () a -- | A 'Token' and its position in the source code. data TokenWithPos = TokenWithPos { getTokenPos :: Parsec.SourcePos, getToken :: Token } -- | We need a show instance for tokens with positions such that the parser -- can print unexpected tokens. instance Show TokenWithPos where show = showPretty . getToken -- | Converts the given scanner for a token to a scanner for the same token -- that attaches source location information. tokenWithPos :: Scanner Token -> Scanner TokenWithPos tokenWithPos scanner = TokenWithPos <$> Parsec.getPosition <*> scanner ------------------------------------------------------------------------------- -- Character Classes -- ------------------------------------------------------------------------------- -- | Scanner for a lowercase character. -- -- > <lower> ::= "a" | … | "z" | <any lowercase Unicode letter> lowerScanner :: Scanner Char lowerScanner = Parsec.lower -- | Scanner for an uppercase character. -- -- > <upper> ::= "A" | … | "Z" | <any upper- or titlecase Unicode letter> upperScanner :: Scanner Char upperScanner = Parsec.upper -- | Scanner for an Unicode numeric character. -- -- > <numeric> ::= <digit> | <any Unicode numeric character> numericScanner :: Scanner Char numericScanner = Parsec.satisfy isNumber ------------------------------------------------------------------------------- -- Language Definition -- ------------------------------------------------------------------------------- -- | Block comments start with @"{- "@ and can be nested. -- -- Block comments start and end with a space such that we are still -- able to parser pragmas which start with @"{-#"@ and end with @"#-}"@ blockCommentStart :: String blockCommentStart = "{- " -- | Block comments end with @" -}"@ and can be nested. -- -- Block comments start and end with a space such that we are still -- able to parser pragmas which start with @"{-#"@ and end with @"#-}"@ blockCommentEnd :: String blockCommentEnd = " -}" -- | Line comments start with @"-- "@ and span the remaining line. lineCommentStart :: String lineCommentStart = "-- " -- | Valid start characters of variable identifiers -- (see 'VarIdent' for the definition of @<varid>@). -- -- It matches the start of the identifier only, i.e., @<lower> | "_"@. -- The remaining characters are scanned by 'identLetter'. varIdentStart :: Scanner Char varIdentStart = lowerScanner <|> Parsec.char '_' -- | Valid start characters of constructor identifiers -- (see 'ConIdent' for the definition of @<conid>@). -- -- It matches the start of the identifier only, i.e., @<upper>@. -- The remaining characters are scanned by 'identLetter'. conIdentStart :: Scanner Char conIdentStart = upperScanner -- | Valid non-start characters of identifiers. -- -- This scanner is used for both @<varid>@s and @<conid>@s -- (see 'VarIdent' and 'ConIdent' respectively). -- -- It matches only one character at a time and only the characters after -- the first letter. -- -- > <identletter> ::= <lower> | <upper> | <numeric> | "_" | "'" -- -- The start of identifiers is scanned by 'varIdentStart' and 'conIdentStart' -- respectively. identLetter :: Scanner Char identLetter = lowerScanner <|> upperScanner <|> numericScanner <|> Parsec.oneOf "_'" -- | Valid characters in symbolic names (i.e., in @<varsym>@ and @<consym>@, -- see also VarIdent and 'ConIdent'). -- -- All Unicode symbol and punctuation characters except for parenthesis -- are allowed in symbolic names. Parenthesis are not allowed since the -- symbolic names are wrapped in parenthesis themselves. -- -- > <symbol> ::= <any Unicode symbol or punctuation> -- > <namesymbol> ::= <symbol> \ ( "(" | ")" ) nameSymbolChar :: Scanner Char nameSymbolChar = Parsec.satisfy ((isSymbol .||. isPunctuation) .&&. (`notElem` ['(', ')'])) -- | Language definition for the intermediate language. -- -- Contains the parameters for the 'tokenParser' for the IR. languageDef :: Parsec.LanguageDef () languageDef = Parsec.LanguageDef { Parsec.commentStart = blockCommentStart , Parsec.commentEnd = blockCommentEnd , Parsec.commentLine = lineCommentStart , Parsec.nestedComments = True , Parsec.identStart = varIdentStart <|> conIdentStart , Parsec.identLetter = identLetter , Parsec.opStart = nameSymbolChar , Parsec.opLetter = nameSymbolChar , Parsec.reservedNames = [] -- Keywords are handled by 'identScanner'. , Parsec.reservedOpNames = [] -- Handled by order in 'tokenScanner'. , Parsec.caseSensitive = True } ------------------------------------------------------------------------------- -- Generated Lexical Parsers -- ------------------------------------------------------------------------------- -- | Contains lexical parsers for the intermediate language. tokenParser :: Parsec.TokenParser () tokenParser = Parsec.makeTokenParser languageDef -- | Scanner for zero or more whitespace characters or comments. whitespaceScanner :: Scanner () whitespaceScanner = Parsec.whiteSpace tokenParser -- | Scanner for 'ConIdent' and 'VarIdent' tokens. identScanner :: Scanner Token identScanner = mkIdentToken <$> Parsec.identifier tokenParser -- | Scanner for 'ConSymbol' and 'VarSymbol' tokens. symbolScanner :: Scanner Token symbolScanner = Parsec.between (Parsec.char '(') (Parsec.char ')') (mkSymbolToken <$> Parsec.option "" (Parsec.operator tokenParser)) -- | Scanner for 'IntToken's. integerScanner :: Scanner Token integerScanner = IntToken <$> Parsec.integer tokenParser -- | Scanner for 'StrToken's. stringScanner :: Scanner Token stringScanner = StrToken <$> Parsec.stringLiteral tokenParser -- | Scanners for tokens listed in 'specialSymbols'. specialSymbolScanners :: [Scanner Token] specialSymbolScanners = map (\(symbol, token) -> Parsec.symbol tokenParser symbol >> return token) specialSymbols -- | Scanner for a single 'Token'. tokenScanner :: Scanner TokenWithPos tokenScanner = tokenWithPos $ Parsec.choice $ map Parsec.try $ identScanner : symbolScanner : integerScanner : stringScanner : specialSymbolScanners -- | A scanner for zero or more 'Token's. -- -- Whitespaces and comments before and between tokens are ignored. tokenListScanner :: Scanner [TokenWithPos] tokenListScanner = whitespaceScanner *> Parsec.many (Parsec.lexeme tokenParser tokenScanner) <* Parsec.eof -- | Converts the given IR source code to a stream of IR tokens. -- -- Reports a fatal error if there are unknown tokens. scan :: MonadReporter r => SrcFile -> r [TokenWithPos] scan srcFile = runParsecOrFail srcFile (srcFileContents srcFile) tokenListScanner