# This Source Code Form is subject to the terms of the Mozilla Public License, # v. 2.0. If a copy of the MPL was not distributed with this file, You can # obtain one at https://mozilla.org/MPL/2.0/. .global lex_next, classification, transitions, keywords, byte_keywords .include "boot/definitions.inc" .section .rodata # # Classification table assigns each possible character to a group (class). All # characters of the same group a handled equivalently. # # Classification: # .equ CLASS_INVALID, 0x00 .equ CLASS_DIGIT, 0x01 .equ CLASS_CHARACTER, 0x02 .equ CLASS_SPACE, 0x03 .equ CLASS_COLON, 0x04 .equ CLASS_EQUALS, 0x05 .equ CLASS_LEFT_PAREN, 0x06 .equ CLASS_RIGHT_PAREN, 0x07 .equ CLASS_ASTERISK, 0x08 .equ CLASS_UNDERSCORE, 0x09 .equ CLASS_SINGLE, 0x0a .equ CLASS_HEX, 0x0b .equ CLASS_ZERO, 0x0c .equ CLASS_X, 0x0d .equ CLASS_EOF, 0x0e .equ CLASS_DOT, 0x0f .equ CLASS_MINUS, 0x10 .equ CLASS_QUOTE, 0x11 .equ CLASS_GREATER, 0x12 .equ CLASS_LESS, 0x13 .equ CLASS_COUNT, 20 .type classification, @object classification: .byte CLASS_EOF # 00 NUL .byte CLASS_INVALID # 01 SOH .byte CLASS_INVALID # 02 STX .byte CLASS_INVALID # 03 ETX .byte CLASS_INVALID # 04 EOT .byte CLASS_INVALID # 05 ENQ .byte CLASS_INVALID # 06 ACK .byte CLASS_INVALID # 07 BEL .byte CLASS_INVALID # 08 BS .byte CLASS_SPACE # 09 HT .byte CLASS_SPACE # 0A LF .byte CLASS_INVALID # 0B VT .byte CLASS_INVALID # 0C FF .byte CLASS_SPACE # 0D CR .byte CLASS_INVALID # 0E SO .byte CLASS_INVALID # 0F SI .byte CLASS_INVALID # 10 DLE .byte CLASS_INVALID # 11 DC1 .byte CLASS_INVALID # 12 DC2 .byte CLASS_INVALID # 13 DC3 .byte CLASS_INVALID # 14 DC4 .byte CLASS_INVALID # 15 NAK .byte CLASS_INVALID # 16 SYN .byte CLASS_INVALID # 17 ETB .byte CLASS_INVALID # 18 CAN .byte CLASS_INVALID # 19 EM .byte CLASS_INVALID # 1A SUB .byte CLASS_INVALID # 1B ESC .byte CLASS_INVALID # 1C FS .byte CLASS_INVALID # 1D GS .byte CLASS_INVALID # 1E RS .byte CLASS_INVALID # 1F US .byte CLASS_SPACE # 20 Space .byte CLASS_SINGLE # 21 ! .byte CLASS_QUOTE # 22 " .byte 0x00 # 23 # .byte 0x00 # 24 $ .byte CLASS_SINGLE # 25 % .byte CLASS_SINGLE # 26 & .byte CLASS_QUOTE # 27 ' .byte CLASS_LEFT_PAREN # 28 ( .byte CLASS_RIGHT_PAREN # 29 ) .byte CLASS_ASTERISK # 2A * .byte CLASS_SINGLE # 2B + .byte CLASS_SINGLE # 2C , .byte CLASS_MINUS # 2D - .byte CLASS_DOT # 2E . .byte CLASS_SINGLE # 2F / .byte CLASS_ZERO # 30 0 .byte CLASS_DIGIT # 31 1 .byte CLASS_DIGIT # 32 2 .byte CLASS_DIGIT # 33 3 .byte CLASS_DIGIT # 34 4 .byte CLASS_DIGIT # 35 5 .byte CLASS_DIGIT # 36 6 .byte CLASS_DIGIT # 37 7 .byte CLASS_DIGIT # 38 8 .byte CLASS_DIGIT # 39 9 .byte CLASS_COLON # 3A : .byte CLASS_SINGLE # 3B ; .byte CLASS_LESS # 3C < .byte CLASS_EQUALS # 3D = .byte CLASS_GREATER # 3E > .byte 0x00 # 3F ? .byte CLASS_SINGLE # 40 @ .byte CLASS_CHARACTER # 41 A .byte CLASS_CHARACTER # 42 B .byte CLASS_CHARACTER # 43 C .byte CLASS_CHARACTER # 44 D .byte CLASS_CHARACTER # 45 E .byte CLASS_CHARACTER # 46 F .byte CLASS_CHARACTER # 47 G .byte CLASS_CHARACTER # 48 H .byte CLASS_CHARACTER # 49 I .byte CLASS_CHARACTER # 4A J .byte CLASS_CHARACTER # 4B K .byte CLASS_CHARACTER # 4C L .byte CLASS_CHARACTER # 4D M .byte CLASS_CHARACTER # 4E N .byte CLASS_CHARACTER # 4F O .byte CLASS_CHARACTER # 50 P .byte CLASS_CHARACTER # 51 Q .byte CLASS_CHARACTER # 52 R .byte CLASS_CHARACTER # 53 S .byte CLASS_CHARACTER # 54 T .byte CLASS_CHARACTER # 55 U .byte CLASS_CHARACTER # 56 V .byte CLASS_CHARACTER # 57 W .byte CLASS_CHARACTER # 58 X .byte CLASS_CHARACTER # 59 Y .byte CLASS_CHARACTER # 5A Z .byte CLASS_SINGLE # 5B [ .byte 0x00 # 5C \ .byte CLASS_SINGLE # 5D ] .byte CLASS_SINGLE # 5E ^ .byte CLASS_UNDERSCORE # 5F _ .byte 0x00 # 60 ` .byte CLASS_HEX # 61 a .byte CLASS_HEX # 62 b .byte CLASS_HEX # 63 c .byte CLASS_HEX # 64 d .byte CLASS_HEX # 65 e .byte CLASS_HEX # 66 f .byte CLASS_CHARACTER # 67 g .byte CLASS_CHARACTER # 68 h .byte CLASS_CHARACTER # 69 i .byte CLASS_CHARACTER # 6A j .byte CLASS_CHARACTER # 6B k .byte CLASS_CHARACTER # 6C l .byte CLASS_CHARACTER # 6D m .byte CLASS_CHARACTER # 6E n .byte CLASS_CHARACTER # 6F o .byte CLASS_CHARACTER # 70 p .byte CLASS_CHARACTER # 71 q .byte CLASS_CHARACTER # 72 r .byte CLASS_CHARACTER # 73 s .byte CLASS_CHARACTER # 74 t .byte CLASS_CHARACTER # 75 u .byte CLASS_CHARACTER # 76 v .byte CLASS_CHARACTER # 77 w .byte CLASS_X # 78 x .byte CLASS_CHARACTER # 79 y .byte CLASS_CHARACTER # 7A z .byte 0x00 # 7B { .byte CLASS_SINGLE # 7C | .byte 0x00 # 7D } .byte CLASS_SINGLE # 7E ~ .byte CLASS_INVALID # 7F DEL # # Textual keywords in the language. # .equ KEYWORDS_COUNT, TOKEN_IDENTIFIER - 1 .type keywords, @object keywords: .word 7 .ascii "program" .word 6 .ascii "import" .word 5 .ascii "const" .word 3 .ascii "var" .word 2 .ascii "if" .word 4 .ascii "then" .word 5 .ascii "elsif" .word 4 .ascii "else" .word 5 .ascii "while" .word 2 .ascii "do" .word 4 .ascii "proc" .word 5 .ascii "begin" .word 3 .ascii "end" .word 4 .ascii "type" .word 6 .ascii "record" .word 5 .ascii "union" .word 4 .ascii "true" .word 5 .ascii "false" .word 3 .ascii "nil" .word 3 .ascii "xor" .word 2 .ascii "or" .word 6 .ascii "return" .word 4 .ascii "cast" .word 4 .ascii "goto" .word 4 .ascii "case" .word 2 .ascii "of" .type byte_keywords, @object byte_keywords: .ascii "&.,:;()[]^=+-*@" .equ BYTE_KEYWORDS_SIZE, . - byte_keywords .section .data # The transition table describes transitions from one state to another, given # a symbol (character class). # # The table has m rows and n columns, where m is the amount of states and n is # the amount of classes. So given the current state and a classified character # the table can be used to look up the next state. # # Each cell is a word long. # - The least significant byte of the word is a row number (beginning with 0). # It specifies the target state. "ff" means that this is an end state and no # transition is possible. # - The next byte is the action that should be performed when transitioning. # For the meaning of actions see labels in the lex_next function, which # handles each action. # .type transitions, @object transitions: # Invalid Digit Alpha Space : = ( ) # * _ Single Hex 0 x NUL . # - " or ' > < .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff .word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x06ff .word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff .word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier .word 0x08ff, 0x0103, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x0103, 0x00ff, 0x08ff, 0x08ff .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x03 Decimal .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff .word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff .word 0x0109, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff .word 0x06ff, 0x06ff, 0x06ff, 0x06ff # 0x06 Left paren .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less .word 0x08ff, 0x0108, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff .word 0x08ff, 0x00ff, 0x08ff, 0x0108, 0x0108, 0x00ff, 0x08ff, 0x08ff .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x08 Hexadecimal after 0x. .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment .word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 .word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 .word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String .word 0x08ff, 0x00ff, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x00ff, 0x010d, 0x08ff, 0x08ff .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x0c Leading zero .word 0x00ff, 0x0108, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff .word 0x00ff, 0x00ff, 0x00ff, 0x0108, 0x0108, 0x00ff, 0x00ff, 0x00ff .word 0x00ff, 0x00ff, 0x00ff, 0x00ff # 0x0d Starting hexadecimal .section .text # Returns the class from the classification table for the given character. # # Parameters: # a0 - Character. # # Sets a0 to the class number. .type classify, @function classify: la t0, classification add t0, t0, a0 # Character class pointer. lbu a0, (t0) # Character class. ret # Given the current state and a character class, calculates the next state. # Parameters: # a0 - Current state. # a1 - Character class. # # Sets a0 to the next state. .type lookup_state, @function lookup_state: li t0, CLASS_COUNT mul a0, a0, t0 # Transition row. add a0, a0, a1 # Transition column. li t0, 4 mul a0, a0, t0 # Multiply by the word size. la t0, transitions add t0, t0, a0 lw a0, (t0) # Next state. ret # Chains classify and lookup_state. # # Parameters: # a0 - Current state. # a1 - Character. # # Sets a0 to the next state based on the given character. .type _next_state, @function _next_state: # Prologue. addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) addi s0, sp, 16 sw a0, 4(sp) mv a0, a1 call classify mv a1, a0 lw a0, 4(sp) call lookup_state # Epilogue. lw ra, 12(sp) lw s0, 8(sp) addi sp, sp, 16 ret # Takes an identifier and checks whether it's a keyword. # # Parameters: # a0 - Token length. # a1 - Token pointer. # # Sets a0 to the appropriate token type. .type classify_identifier, @function classify_identifier: # Prologue. addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) addi s0, sp, 16 mv a2, a0 mv a3, a1 li a0, KEYWORDS_COUNT la a1, keywords call _strings_index bnez a0, .Lclassify_identifier_end li a0, TOKEN_IDENTIFIER .Lclassify_identifier_end: # Epilogue. lw ra, 12(sp) lw s0, 8(sp) addi sp, sp, 16 ret # Takes a symbol and determines its type. # # Parameters: # a0 - Token character. # # Sets a0 to the appropriate token type. .type classify_single, @function classify_single: # Prologue. addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) addi s0, sp, 16 mv a1, a0 li a2, BYTE_KEYWORDS_SIZE la a0, byte_keywords call _memchr la a1, byte_keywords sub a0, a0, a1 addi a0, a0, TOKEN_IDENTIFIER + 1 # Epilogue. lw ra, 12(sp) lw s0, 8(sp) addi sp, sp, 16 ret # Classified a symbol containing multiple characters (probably 2). # # Parameters: # a0 - Token length. # a1 - Token pointer. # # Sets a0 to the appropriate token type. .type classify_composite, @function classify_composite: lbu t0, 0(a1) li t1, ':' beq t0, t1, .Lclassify_composite_assign j .Lclassify_composite_end .Lclassify_composite_assign: li a0, TOKEN_ASSIGN j .Lclassify_composite_end .Lclassify_composite_end: ret # Initializes the classification table. # # Paramaters: # a0 - Source text pointer. # a1 - A pointer for output value, the token kind. 4 Bytes. # # Sets a0 to the position of the next token. .type lex_next, @function lex_next: # Prologue. addi sp, sp, -32 sw ra, 28(sp) sw s0, 24(sp) addi s0, sp, 32 sw s1, 20(sp) # Preserve s1 used for current source text position. mv s1, a0 sw a0, 12(sp) # Keeps a pointer to the beginning of a token. # 4(sp) and 8(sp) are reserved for the kind and length of the token if needed. sw s2, 16(sp) # Preserve s2 containing the current state. li s2, 0x00 # Initial, start state. sw a1, 0(sp) sw zero, (a1) # Initialize. .Llex_next_loop: mv a0, s2 lbu a1, (s1) call _next_state li t0, 0xff and s2, a0, t0 # Next state. li t0, 0xff00 and t1, a0, t0 # Transition action. srli t1, t1, 8 # Perform the provided action. li t0, 0x01 # Accumulate action. beq t1, t0, .Llex_next_accumulate li t0, 0x02 # Print action. beq t1, t0, .Llex_next_print li t0, 0x03 # Skip action. beq t1, t0, .Llex_next_skip li t0, 0x04 # Delimited string action. beq t1, t0, .Llex_next_comment li t0, 0x05 # Finalize identifier. beq t1, t0, .Llex_next_identifier li t0, 0x06 # Single character symbol action. beq t1, t0, .Llex_next_single li t0, 0x07 # An action for symbols containing multiple characters. beq t1, t0, .Llex_next_composite li t0, 0x08 # Integer action. beq t1, t0, .Llex_next_integer j .Llex_next_reject .Llex_next_reject: addi s1, s1, 1 j .Llex_next_end .Llex_next_accumulate: addi s1, s1, 1 j .Llex_next_loop .Llex_next_skip: addi s1, s1, 1 lw t0, 12(sp) addi t0, t0, 1 sw t0, 12(sp) j .Llex_next_loop .Llex_next_print: /* DEBUG addi a0, a0, 21 sw a0, 0(sp) addi a0, sp, 0 li a1, 1 call _write_error */ j .Llex_next_end .Llex_next_comment: addi s1, s1, 1 j .Llex_next_end .Llex_next_identifier: # An identifier can be a textual keyword. # Check the kind of the token and write it into the output parameter. lw a1, 12(sp) sub a0, s1, a1 sw a0, 8(sp) call classify_identifier sw a0, 4(sp) lw a0, 0(sp) addi a1, sp, 4 li a2, 12 call _memcpy j .Llex_next_end .Llex_next_single: lw a0, 12(sp) addi s1, a0, 1 lbu a0, (a0) call classify_single lw a1, 0(sp) sw a0, (a1) j .Llex_next_end .Llex_next_composite: addi s1, s1, 1 lw a1, 12(sp) sub a0, s1, a1 call classify_composite lw a1, 0(sp) sw a0, (a1) j .Llex_next_end .Llex_next_integer: lw t0, 0(sp) li t1, TOKEN_INTEGER sw t1, 0(t0) lw t1, 12(sp) sw t1, 8(t0) sub t1, s1, t1 sw t1, 4(t0) j .Llex_next_end .Llex_next_end: mv a0, s1 # Return the advanced text pointer. # Restore saved registers. lw s1, 20(sp) lw s2, 16(sp) # Epilogue. lw ra, 28(sp) lw s0, 24(sp) addi sp, sp, 32 ret