.global _tokenize, classification, transitions

.section .rodata

#
# Classification table assigns each possible character to a group (class). All
# characters of the same group are handled equivalently.
#
# Classification:
#
.equ CLASS_INVALID, 0x0
.equ CLASS_DIGIT, 0x01
.equ CLASS_CHARACTER, 0x02
.equ CLASS_SPACE, 0x03
.equ CLASS_COLON, 0x04
.equ CLASS_EQUALS, 0x05
.equ CLASS_LEFT_PAREN, 0x06
.equ CLASS_RIGHT_PAREN, 0x07
.equ CLASS_ASTERISK, 0x08
.equ CLASS_UNDERSCORE, 0x09
.equ CLASS_SINGLE, 0x0a
.equ CLASS_HEX, 0x0b
.equ CLASS_ZERO, 0x0c
.equ CLASS_X, 0x0d
.equ CLASS_EOF, 0x0e
.equ CLASS_DOT, 0x0f

.equ CLASS_COUNT, 16

# Number of entries in the classification table (7-bit ASCII).
.equ CLASSIFICATION_SIZE, 128

.type classification, @object
.size classification, CLASSIFICATION_SIZE
classification:
	.byte CLASS_EOF         # 00 NUL
	.byte CLASS_INVALID     # 01 SOH
	.byte CLASS_INVALID     # 02 STX
	.byte CLASS_INVALID     # 03 ETX
	.byte CLASS_INVALID     # 04 EOT
	.byte CLASS_INVALID     # 05 ENQ
	.byte CLASS_INVALID     # 06 ACK
	.byte CLASS_INVALID     # 07 BEL
	.byte CLASS_INVALID     # 08 BS
	.byte CLASS_SPACE       # 09 HT
	.byte CLASS_SPACE       # 0A LF
	.byte CLASS_INVALID     # 0B VT
	.byte CLASS_INVALID     # 0C FF
	.byte CLASS_SPACE       # 0D CR
	.byte CLASS_INVALID     # 0E SO
	.byte CLASS_INVALID     # 0F SI
	.byte CLASS_INVALID     # 10 DLE
	.byte CLASS_INVALID     # 11 DC1
	.byte CLASS_INVALID     # 12 DC2
	.byte CLASS_INVALID     # 13 DC3
	.byte CLASS_INVALID     # 14 DC4
	.byte CLASS_INVALID     # 15 NAK
	.byte CLASS_INVALID     # 16 SYN
	.byte CLASS_INVALID     # 17 ETB
	.byte CLASS_INVALID     # 18 CAN
	.byte CLASS_INVALID     # 19 EM
	.byte CLASS_INVALID     # 1A SUB
	.byte CLASS_INVALID     # 1B ESC
	.byte CLASS_INVALID     # 1C FS
	.byte CLASS_INVALID     # 1D GS
	.byte CLASS_INVALID     # 1E RS
	.byte CLASS_INVALID     # 1F US
	.byte CLASS_SPACE       # 20 Space
	.byte CLASS_SINGLE      # 21 !
	.byte CLASS_INVALID     # 22 "
	.byte CLASS_INVALID     # 23 #
	.byte CLASS_INVALID     # 24 $
	.byte CLASS_SINGLE      # 25 %
	.byte CLASS_SINGLE      # 26 &
	.byte CLASS_INVALID     # 27 '
	.byte CLASS_LEFT_PAREN  # 28 (
	.byte CLASS_RIGHT_PAREN # 29 )
	.byte CLASS_ASTERISK    # 2A *
	.byte CLASS_SINGLE      # 2B +
	.byte CLASS_SINGLE      # 2C ,
	.byte CLASS_INVALID     # 2D -
	.byte CLASS_DOT         # 2E .
	.byte CLASS_SINGLE      # 2F /
	.byte CLASS_ZERO        # 30 0
	.byte CLASS_DIGIT       # 31 1
	.byte CLASS_DIGIT       # 32 2
	.byte CLASS_DIGIT       # 33 3
	.byte CLASS_DIGIT       # 34 4
	.byte CLASS_DIGIT       # 35 5
	.byte CLASS_DIGIT       # 36 6
	.byte CLASS_DIGIT       # 37 7
	.byte CLASS_DIGIT       # 38 8
	.byte CLASS_DIGIT       # 39 9
	.byte CLASS_COLON       # 3A :
	.byte CLASS_SINGLE      # 3B ;
	.byte CLASS_INVALID     # 3C <
	.byte CLASS_EQUALS      # 3D =
	.byte CLASS_INVALID     # 3E >
	.byte CLASS_INVALID     # 3F ?
	.byte CLASS_SINGLE      # 40 @
	.byte CLASS_CHARACTER   # 41 A
	.byte CLASS_CHARACTER   # 42 B
	.byte CLASS_CHARACTER   # 43 C
	.byte CLASS_CHARACTER   # 44 D
	.byte CLASS_CHARACTER   # 45 E
	.byte CLASS_CHARACTER   # 46 F
	.byte CLASS_CHARACTER   # 47 G
	.byte CLASS_CHARACTER   # 48 H
	.byte CLASS_CHARACTER   # 49 I
	.byte CLASS_CHARACTER   # 4A J
	.byte CLASS_CHARACTER   # 4B K
	.byte CLASS_CHARACTER   # 4C L
	.byte CLASS_CHARACTER   # 4D M
	.byte CLASS_CHARACTER   # 4E N
	.byte CLASS_CHARACTER   # 4F O
	.byte CLASS_CHARACTER   # 50 P
	.byte CLASS_CHARACTER   # 51 Q
	.byte CLASS_CHARACTER   # 52 R
	.byte CLASS_CHARACTER   # 53 S
	.byte CLASS_CHARACTER   # 54 T
	.byte CLASS_CHARACTER   # 55 U
	.byte CLASS_CHARACTER   # 56 V
	.byte CLASS_CHARACTER   # 57 W
	.byte CLASS_CHARACTER   # 58 X
	.byte CLASS_CHARACTER   # 59 Y
	.byte CLASS_CHARACTER   # 5A Z
	.byte CLASS_SINGLE      # 5B [
	.byte CLASS_INVALID     # 5C \ (backslash)
	.byte CLASS_SINGLE      # 5D ]
	.byte CLASS_SINGLE      # 5E ^
	.byte CLASS_UNDERSCORE  # 5F _
	.byte CLASS_INVALID     # 60 `
	.byte CLASS_HEX         # 61 a
	.byte CLASS_HEX         # 62 b
	.byte CLASS_HEX         # 63 c
	.byte CLASS_HEX         # 64 d
	.byte CLASS_HEX         # 65 e
	.byte CLASS_HEX         # 66 f
	.byte CLASS_CHARACTER   # 67 g
	.byte CLASS_CHARACTER   # 68 h
	.byte CLASS_CHARACTER   # 69 i
	.byte CLASS_CHARACTER   # 6A j
	.byte CLASS_CHARACTER   # 6B k
	.byte CLASS_CHARACTER   # 6C l
	.byte CLASS_CHARACTER   # 6D m
	.byte CLASS_CHARACTER   # 6E n
	.byte CLASS_CHARACTER   # 6F o
	.byte CLASS_CHARACTER   # 70 p
	.byte CLASS_CHARACTER   # 71 q
	.byte CLASS_CHARACTER   # 72 r
	.byte CLASS_CHARACTER   # 73 s
	.byte CLASS_CHARACTER   # 74 t
	.byte CLASS_CHARACTER   # 75 u
	.byte CLASS_CHARACTER   # 76 v
	.byte CLASS_CHARACTER   # 77 w
	.byte CLASS_X           # 78 x
	.byte CLASS_CHARACTER   # 79 y
	.byte CLASS_CHARACTER   # 7A z
	.byte CLASS_INVALID     # 7B {
	.byte CLASS_SINGLE      # 7C |
	.byte CLASS_INVALID     # 7D }
	.byte CLASS_SINGLE      # 7E ~
	.byte CLASS_INVALID     # 7F DEL

.section .data

# The transition table describes transitions from one state to another, given
# a symbol (character class).
#
# The table has m rows and n columns, where m is the amount of states and n is
# the amount of classes. So given the current state and a classified character
# the table can be used to look up the next state.
#
# Each cell is a word (4 bytes) long.
# - The least significant byte of the word is a row number (beginning with 0).
#   It specifies the target state. "ff" means that this is an end state and no
#   transition is possible.
# - The next byte is the action that should be performed when transitioning.
#   Actions as dispatched by _analyze_token:
#     01 - accumulate: consume the character and continue.
#     02 - print: finish the token without consuming the character.
#     03 - skip: consume the character and move the token start past it.
#     04 - comment: consume the character and finish.
#     anything else (00 in this table) - report through _write_error and
#       reject the character.
#
# The size is derived from the table itself so it cannot drift from the number
# of rows.
#
.type transitions, @object
transitions:
	# Invalid Digit   Alpha   Space   :       =       (       )
	# *       _       Single  Hex     0       x       NUL     .
	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
	.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon

	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier

	.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk

	.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment

	.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment

	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token

	.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero

	.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff # 0d Hexadecimal

	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
.size transitions, . - transitions

.section .text

# Returns the class from the classification table for the given character.
# Values outside the table (>= CLASSIFICATION_SIZE, i.e. non-ASCII bytes) are
# reported as CLASS_INVALID instead of reading past the end of the table.
#
# Parameters:
# a0 - Character.
#
# Sets a0 to the class number.
.type _classify, @function
_classify:
	li t0, CLASSIFICATION_SIZE
	bgeu a0, t0, .Lclassify_out_of_range # Unsigned compare also rejects negative values.

	la t0, classification
	add t0, t0, a0 # Character class pointer.
	lbu a0, (t0) # Character class.
	ret

.Lclassify_out_of_range:
	li a0, CLASS_INVALID
	ret

# Given the current state and a character class, looks up the transition cell.
#
# Parameters:
# a0 - Current state (row number, must not be 0xff).
# a1 - Character class.
#
# Sets a0 to the transition cell: target state in the lowest byte, action in
# the byte above it.
.type _lookup_state, @function
_lookup_state:
	li t0, CLASS_COUNT
	mul a0, a0, t0 # Transition row.
	add a0, a0, a1 # Transition column.
	slli a0, a0, 2 # Multiply by the word size (4 bytes).

	la t0, transitions
	add t0, t0, a0
	lw a0, (t0) # Next state and action.
	ret

# Chains _classify and _lookup_state.
#
# Parameters:
# a0 - Current state.
# a1 - Character.
#
# Sets a0 to the transition cell for the given state and character.
.type _next_state, @function
_next_state:
	# Prologue.
	addi sp, sp, -16
	sw ra, 12(sp)
	sw s0, 8(sp)
	addi s0, sp, 16

	sw a0, 4(sp) # Keep the state while a0 is used for classification.
	mv a0, a1
	call _classify

	mv a1, a0
	lw a0, 4(sp)
	call _lookup_state

	# Epilogue.
	lw ra, 12(sp)
	lw s0, 8(sp)
	addi sp, sp, 16
	ret

# Runs the state machine over the source text, starting in the start state,
# until a transition finishes or rejects the current token.
#
# Parameters:
# a0 - Source text pointer.
#
# Sets a0 to the advanced source text pointer. The pointer is never moved past
# a NUL byte, so the caller can detect the end of the text.
#
# Frame layout (32 bytes, keeps sp 16-byte aligned for nested calls):
# 28(sp) - ra
# 24(sp) - s0
# 20(sp) - s1
# 16(sp) - s2
# 12(sp) - Pointer to the beginning of the token.
.type _analyze_token, @function
_analyze_token:
	# Prologue.
	addi sp, sp, -32
	sw ra, 28(sp)
	sw s0, 24(sp)
	addi s0, sp, 32

	sw s1, 20(sp) # Preserve s1 used for current source text position.
	mv s1, a0
	sw a0, 12(sp) # Keeps a pointer to the beginning of a token.
	sw s2, 16(sp) # Preserve s2 containing the current state.
	li s2, 0x00 # Initial, start state.

.Lanalyze_token_loop:
	mv a0, s2
	lbu a1, (s1)
	call _next_state

	andi s2, a0, 0xff # Next state.
	srli t1, a0, 8
	andi t1, t1, 0xff # Transition action.

	li t0, 0x01 # Accumulate action.
	beq t1, t0, .Lanalyze_token_accumulate

	li t0, 0x02 # Print action.
	beq t1, t0, .Lanalyze_token_print

	li t0, 0x03 # Skip action.
	beq t1, t0, .Lanalyze_token_skip

	li t0, 0x04 # Comment action.
	beq t1, t0, .Lanalyze_token_comment

	# Any other action: report the current position and reject the character.
	/* DEBUG
	mv s4, t1
	addi t1, t1, '0'
	sb t1, 0(sp)
	li t1, ' '
	sb t1, 1(sp)
	addi t1, s2, '0'
	sb t1, 2(sp)
	addi a0, sp, 0
	*/
	# NOTE(review): _write_error is defined elsewhere; the arguments look like
	# (text pointer, length) - confirm against its definition.
	mv a0, s1
	li a1, 3
	call _write_error
	/* mv t1, s4 DEBUG */

.Lanalyze_token_reject:
	# Do not step over the terminating NUL, otherwise the caller would continue
	# reading behind the end of the text.
	lbu t0, (s1)
	beqz t0, .Lanalyze_token_end
	addi s1, s1, 1
	j .Lanalyze_token_end

.Lanalyze_token_accumulate:
	addi s1, s1, 1
	j .Lanalyze_token_loop

.Lanalyze_token_skip:
	addi s1, s1, 1
	# The skipped character does not belong to the token; move its start.
	lw t0, 12(sp)
	addi t0, t0, 1
	sw t0, 12(sp)
	j .Lanalyze_token_loop

.Lanalyze_token_print:
	/* DEBUG
	lw a0, 12(sp)
	mv a1, s1
	sub a1, a1, a0
	call _write_error
	DEBUG */
	j .Lanalyze_token_end

.Lanalyze_token_comment:
	addi s1, s1, 1
	/* DEBUG
	lw a0, 12(sp)
	mv a1, s1
	sub a1, a1, a0
	call _write_error
	DEBUG */

.Lanalyze_token_end:
	mv a0, s1 # Return the advanced text pointer.

	# Restore saved registers.
	lw s1, 20(sp)
	lw s2, 16(sp)

	# Epilogue.
	lw ra, 28(sp)
	lw s0, 24(sp)
	addi sp, sp, 32
	ret

# Splits the NUL-terminated source text into tokens by calling _analyze_token
# repeatedly until the returned position points at the terminating NUL byte.
# Empty input is accepted and results in no _analyze_token call.
#
# Parameters:
# a0 - Source text pointer.
#
# Sets a0 to the position where the analysis stopped.
.type _tokenize, @function
_tokenize:
	# Prologue. The frame is 16 bytes to keep sp aligned for nested calls.
	addi sp, sp, -16
	sw ra, 12(sp)
	sw s0, 8(sp)
	addi s0, sp, 16

	j .Ltokenize_check # Test for the end of the text before the first token.

.Ltokenize_loop:
	call _analyze_token

.Ltokenize_check:
	lbu t0, (a0) # The text is a byte stream; test a single byte only.
	bnez t0, .Ltokenize_loop

	# Epilogue.
	lw ra, 12(sp)
	lw s0, 8(sp)
	addi sp, sp, 16
	ret