lex.llm

;**********************************************************************
; lex.llm
;
; Lexer implementation.
;
; Copyright 2009-2010 by Dustin Laurence.  Distributed under the terms of
; the LGPLv3.
;
;**********************************************************************

#include "system.llh"
#include "lex.llh"

;**********************************************************************
; Private lexer declarations
;
;**********************************************************************


    ; --- Lexeme buffer state -------------------------------------------

    ; Capacity of the lexeme buffer in characters; should be large enough.
    #define LEXEMEBUFFER_SIZE 1024

    ; Storage for the lexeme currently being accumulated.
    @lexemeBuffer = internal global [LEXEMEBUFFER_SIZE x %c_char] zeroinitializer

    ; Index of the next position to be written in @lexemeBuffer.
    @pBufferIndex = internal global %c_size_t 0

    ; --- Exception reporting -------------------------------------------

    ; Written by GetValidChar before it unwinds and read back by
    ; ReadLexeme.  This declaration is maintained by hand for now :-(
    @exceptionCode = external global %Word

    ; --- Forward declarations of the private helpers -------------------

    declare NILCC %c_int   @GetValidChar()
    declare NILCC void     @AppendLexemeChar(%c_int %char_int)
    declare NILCC void     @ClearLexemeBuffer()
    declare NILCC %c_char* @GetLexemePtr()

;**********************************************************************
; ReadLexeme (public)
;
; The back end of the nil tokenizer.  A very simple state machine
; implemented directly in code, which is an instructive example of a
; complex function written in IR.  As the code is already complex enough
; and verges on being "write only" code, ReadLexeme only identifies
; significant character sequences (lexemes) in the source, while the
; actual construction of the corresponding token required by the parser
; is broken out into NextToken().
;
; There is a great deal of redundancy in the code, but it is not to be
; simplified away until the lexer is thoroughly debugged.
;
; Yes, it's hard to read.  It's a state machine, what do you want?  It
; was hard to write, and why should you have it any easier? :-)
;
;**********************************************************************

; ReadLexeme takes no arguments and returns one %Token, a two-field
; aggregate {%c_int, %Word}.  Three shapes are produced below:
;
;   {NIL_EXCEPTION, code}  GetValidChar unwound; code is the value loaded
;                          from @exceptionCode.
;   {char, 0}              a single-character lexeme (newline, single
;                          quote, left or right parenthesis).
;   {NIL_SYMBOL, ptr}      a multi-character lexeme; ptr is the address
;                          of the first element of @lexemeBuffer cast to
;                          %Word, and the text is NUL-terminated.
;
; The write index of @lexemeBuffer is reset at the top of every call, so
; a returned symbol pointer refers to storage that the next call reuses.
;
; The machine has two states, kept in %pState:
;   0 = StartState  (nothing accumulated yet)
;   1 = TokenState  (inside a multi-character lexeme)
define NILCC %Token
@ReadLexeme()
{
    ; Reminder: for optimization purposes all alloca calls should happen
    ; in the first block

    ; FIXME: The state variable should be a label address computed with
    ; blockaddress() when blockaddress() and indirectbr are supported
    ; (apparently in LLVM 2.7).
    %pState = alloca %Word
    store %Word 0, %Word* %pState

    ; Start at the beginning of the buffer
    call NILCC void @ClearLexemeBuffer()

    br label %Loop

; One iteration per input character: fetch it, then dispatch on the state.
Loop:
    ;Try
        %currentChar = invoke NILCC %c_int @GetValidChar()
            to     label %DoState
            unwind label %Catch

        ; Reached when GetValidChar unwinds.  The code it left in
        ; @exceptionCode becomes the second field of the returned token.
        ; NOTE(review): this path is taken regardless of the current
        ; state, so an unwind while in TokenState returns the exception
        ; token without accepting the characters accumulated so far.
        ; Confirm that callers expect this.
        Catch:
            %exceptionType = load %Word* @exceptionCode
            %exception = insertvalue %Token {%c_int NIL_EXCEPTION, %Word 0},
                                                   %Word %exceptionType, 1
            ret %Token %exception

    ; Dispatch on the current state; any value other than 0 or 1 is an
    ; internal error.
    DoState:
            %state = load %Word* %pState
            switch %Word %state, label %Next_Error [

                %Word 0, label %StartState
                %Word 1, label %TokenState
            ]

        ; Invalid state value: report it, then spin on this block.
        Next_Error:
            cant_happen()
            br label %Next_Error

    ; State 0: no lexeme started yet.
    StartState:
            ; Throw away whitespace
            ; (every case that branches back to Loop is discarded; note
            ; that newline is not discarded but returned via AcceptChar).
            ; Any character not listed starts a multi-character lexeme.
            switch %c_int %currentChar, label %ToToken [

                %c_int NIL_EOF,        label %AcceptException
                %c_int NIL_IOERROR,    label %AcceptException
                %c_int NIL_BADCHAR,    label %AcceptException

                %c_int ASCII_TAB,      label %Loop
                %c_int ASCII_NEWLINE,  label %AcceptChar
                %c_int ASCII_VTAB,     label %Loop
                %c_int ASCII_FORMFEED, label %Loop
                %c_int ASCII_CR,       label %Loop
                %c_int ASCII_SPACE,    label %Loop
                %c_int ASCII_DEL,      label %Loop

                ; Characters that do magic in the parser
                %c_int ASCII_SQUOTE,   label %AcceptChar
                %c_int ASCII_LPAREN,   label %AcceptChar
                %c_int ASCII_RPAREN,   label %AcceptChar
            ]

        ; First character of a multi-character lexeme: store it and
        ; switch to state 1 for the following characters.
        ToToken:
            call NILCC void @AppendLexemeChar(%c_int %currentChar)
            store %Word 1, %Word* %pState
            br label %Loop

    ; State 1: inside a lexeme.  The listed characters end it; anything
    ; else is appended.  Unlike StartState, the single quote is not
    ; listed here, so it is appended like an ordinary character.
    TokenState:

            switch %c_int %currentChar, label %AppendChar [

                %c_int NIL_EOF,        label %AcceptComposite
                %c_int NIL_IOERROR,    label %AcceptComposite
                %c_int NIL_BADCHAR,    label %AcceptComposite

                %c_int ASCII_TAB,      label %AcceptComposite
                %c_int ASCII_NEWLINE,  label %AcceptComposite
                %c_int ASCII_VTAB,     label %AcceptComposite
                %c_int ASCII_FORMFEED, label %AcceptComposite
                %c_int ASCII_CR,       label %AcceptComposite
                %c_int ASCII_SPACE,    label %AcceptComposite
                %c_int ASCII_DEL,      label %AcceptComposite

                %c_int ASCII_LPAREN,   label %AcceptComposite
                %c_int ASCII_RPAREN,   label %AcceptComposite
            ]

    ; Ordinary lexeme character: accumulate it and keep reading.
    AppendChar:
        call NILCC void @AppendLexemeChar(%c_int %currentChar)
        br label %Loop

; Target of the NIL_EOF, NIL_IOERROR and NIL_BADCHAR cases in StartState.
; GetValidChar as defined in this file unwinds for those conditions
; instead of returning them, hence the remark below.
AcceptException:
    ; Shouldn't be possible now
    cant_happen()
    br label %AcceptException

; Single-character lexeme: the character itself goes in the first field
; of the token and the second field stays 0.
AcceptChar:
    %char = insertvalue %Token {%c_int 0, %Word 0},
                                               %c_int %currentChar, 0
    ret %Token %char

; End of a multi-character lexeme.
AcceptComposite:
    ; Push back the non-Symbol character we found for the next call
    call ccc void @ungetchar_asserted(%c_int %currentChar)

    ; Add a null-terminator for the convenience of NextToken().
    call NILCC void @AppendLexemeChar(%c_int 0)

    ; Address of the first buffer element, carried in the token as a %Word.
    %lexemePtr = getelementptr [LEXEMEBUFFER_SIZE x %c_char]* @lexemeBuffer,
                                                            i64 0, i64 0
    %strWord = ptrtoint %c_char* %lexemePtr to %Word

    ; Choose the token kind from the state that was active.  Only state 1
    ; is handled here, and this block is entered only from TokenState.
    switch %Word %state, label %Accept_Error [

        %Word 1, label %AcceptSymbol
    ]

; NOTE(review): unlike the other error blocks this one does not loop on
; itself; it falls into the Next_Error loop after reporting.
Accept_Error:
    cant_happen()
    br label %Next_Error

; Return the symbol token; its second field is the buffer address
; computed in AcceptComposite.
AcceptSymbol:
    %symbolStruct = insertvalue %Token {%c_int NIL_SYMBOL, %Word undef},
                                                       %Word %strWord, 1
    ret %Token %symbolStruct
}

;**********************************************************************
; LexemeBuffer
;
; A small module that makes it easy for the lexer to accumulate up a
; token one character at a time, but didn't seem worth breaking out
; into its own file.
;
; FIXME: With some macro hackery, we could use this to define a more
; general buffer "class" and re-use it to get rid of stdio and call the
; low-level I/O functions directly.
;
;**********************************************************************

;**********************************************************************
; ClearLexemeBuffer
;
;**********************************************************************

define NILCC void
@ClearLexemeBuffer()
{
    ; Rewind the write index to the first slot.  The buffer contents are
    ; left as they are; later appends simply overwrite them.
    store %c_size_t 0, %c_size_t* @pBufferIndex
    ret void
}

;**********************************************************************
; AppendLexemeChar
;
;**********************************************************************

define NILCC void
@AppendLexemeChar(%c_int %char_int)
{
    #ifndef NDEBUG
    ; Debug builds only: the argument must lie within 0-255.
    %nonNegative = icmp sge %c_int %char_int, 0
    assert(%nonNegative)
    %eightBits = icmp slt %c_int %char_int, 256
    assert(%eightBits)
    #endif

    ; Locate the slot the write index currently designates.
    ; FIXME: would like this to use the "inbounds" keyword
    %writeIndex = load %c_size_t* @pBufferIndex
    %slot = getelementptr [LEXEMEBUFFER_SIZE x %c_char]* @lexemeBuffer,
                                        i64 0,
                                        %c_size_t %writeIndex

    ; Narrow the argument to a single character and store it there.
    %byte = trunc %c_int %char_int to %c_char
    store %c_char %byte, %c_char* %slot

    ; Advance the index unless it already designates the last slot.  Once
    ; the last slot is reached the index stays put, so further appends
    ; overwrite that slot and over-long lexemes are truncated to the
    ; buffer size rather than overflowing it.
    %lastIndex = sub %c_size_t LEXEMEBUFFER_SIZE, 1 ; really a constant
    %canAdvance = icmp ult %c_size_t %writeIndex, %lastIndex
    %advanced = add %c_size_t %writeIndex, 1
    %newIndex = select i1 %canAdvance, %c_size_t %advanced,
                                       %c_size_t %writeIndex
    store %c_size_t %newIndex, %c_size_t* @pBufferIndex

    ret void
}

;**********************************************************************
; Lexer-specific I/O
;
;**********************************************************************

;**********************************************************************
; GetValidChar (private)
;
; Simplifies the lexer by letting us filter out illegal characters.
; Returns the ASCII value for a legal character.  End-of-file, an I/O
; error, or a bad character never returns normally: the matching code
; (NIL_EOF, NIL_IOERROR or NIL_BADCHAR) is stored in @exceptionCode and
; the function unwinds to the caller's handler.
;
;**********************************************************************

define NILCC %c_int
@GetValidChar()
{
    ; Fetch one character and classify it.  Anything not listed in the
    ; switch goes through the range check below.
    %ch = call ccc %c_int @getchar_checked()
    switch %c_int %ch, label %CheckRange [
        ; Conditions reported by the reader
        %c_int IO_EOF,         label %ThrowEOF
        %c_int IO_ERROR,       label %ThrowIOError

        ; Control characters that are passed through unchanged
        %c_int ASCII_TAB,      label %ReturnChar
        %c_int ASCII_NEWLINE,  label %ReturnChar
        %c_int ASCII_VTAB,     label %ReturnChar
        %c_int ASCII_FORMFEED, label %ReturnChar
        %c_int ASCII_CR,       label %ReturnChar
    ]

; Accept only values that are at least ASCII_SPACE and strictly below
; ASCII_DEL (signed comparisons); everything else is a bad character.
CheckRange:
    %belowDel = icmp slt %c_int %ch, ASCII_DEL
    %fromSpace = icmp sge %c_int %ch, ASCII_SPACE
    %inRange = and i1 %belowDel, %fromSpace
    br i1 %inRange, label %ReturnChar, label %ThrowBadChar

ReturnChar:
    ret %c_int %ch

; Exceptional conditions: record the Nil-wide code in @exceptionCode,
; then unwind instead of returning a value.

ThrowEOF:
    store %Word NIL_EOF, %Word* @exceptionCode
    unwind

ThrowIOError:
    store %Word NIL_IOERROR, %Word* @exceptionCode
    unwind

ThrowBadChar:
    store %Word NIL_BADCHAR, %Word* @exceptionCode
    unwind
}