diff --git a/boot/stage1.s b/boot/stage1.s
index e591e21..9b118d5 100644
--- a/boot/stage1.s
+++ b/boot/stage1.s
@@ -1896,11 +1896,6 @@ _main:
 	sw s0, 0(sp)
 	addi s0, sp, 8
 
-	# Read the source from the standard input.
-	la a0, source_code
-	li a1, SOURCE_BUFFER_SIZE # Buffer size.
-	call _read_file
-
 	li s2, 1
 
 	# Epilogue.
@@ -1912,7 +1907,13 @@ _main:
 # Entry point.
 .type _start, @function
 _start:
-	call _tokenizer_initialize
+	# Read the source from the standard input.
+	la a0, source_code
+	li a1, SOURCE_BUFFER_SIZE # Buffer size.
+	call _read_file
+
+	mv a0, s1
+	call _tokenize
 	call _main
 	call _compile
 
diff --git a/boot/states.txt b/boot/states.txt
deleted file mode 100644
index 20d5966..0000000
--- a/boot/states.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-- start
-digit: integer
-upper: identifier
-lower: identifier
-space: start
-invalid: error
-
-- identifier
-digit: identifier
-upper: identifier
-lower: identifier
-space: end
-invalid: end
-
-- integer:
-digit: integer
-upper: end
-lower: end
-space: end
-invalid: end
diff --git a/boot/tokenizer.s b/boot/tokenizer.s
index b1ac11f..5570031 100644
--- a/boot/tokenizer.s
+++ b/boot/tokenizer.s
@@ -1,190 +1,409 @@
-.global _tokenizer_initialize
+.global _tokenize, classification, transitions
 
 .section .rodata
+
 #
-# Classes:
+# Classification table assigns each possible character to a group (class). All
+# characters of the same group are handled equivalently.
 #
-# 0x00: Invalid
-# 0x01: Digit
-# 0x02: Character
-# 0x03: Space
-.type classes, @object
-.size classes, 128
-classes:
-	.byte 0x00 # 00 NUL
-	.byte 0x00 # 01 SOH
-	.byte 0x00 # 02 STX
-	.byte 0x00 # 03 ETX
-	.byte 0x00 # 04 EOT
-	.byte 0x00 # 05 ENQ
-	.byte 0x00 # 06 ACK
-	.byte 0x00 # 07 BEL
-	.byte 0x00 # 08 BS
-	.byte 0x00 # 09 HT
-	.byte 0x00 # 0A LF
-	.byte 0x00 # 0B VT
-	.byte 0x00 # 0C FF
-	.byte 0x00 # 0D CR
-	.byte 0x00 # 0E SO
-	.byte 0x00 # 0F SI
-	.byte 0x00 # 10 DLE
-	.byte 0x00 # 11 DC1
-	.byte 0x00 # 12 DC2
-	.byte 0x00 # 13 DC3
-	.byte 0x00 # 14 DC4
-	.byte 0x00 # 15 NAK
-	.byte 0x00 # 16 SYN
-	.byte 0x00 # 17 ETB
-	.byte 0x00 # 18 CAN
-	.byte 0x00 # 19 EM
-	.byte 0x00 # 1A SUB
-	.byte 0x00 # 1B ESC
-	.byte 0x00 # 1C FS
-	.byte 0x00 # 1D GS
-	.byte 0x00 # 1E RS
-	.byte 0x00 # 1F US
-	.byte 0x03 # 20 Space
-	.byte 0x00 # 21 !
-	.byte 0x00 # 22 "
-	.byte 0x00 # 23 #
-	.byte 0x00 # 24 $
-	.byte 0x00 # 25 %
-	.byte 0x00 # 26 &
-	.byte 0x00 # 27 '
-	.byte 0x00 # 28 (
-	.byte 0x00 # 29 )
-	.byte 0x00 # 2A *
-	.byte 0x00 # 2B +
-	.byte 0x00 # 2C ,
-	.byte 0x00 # 2D -
-	.byte 0x00 # 2E .
-	.byte 0x00 # 2F /
-	.byte 0x01 # 30 0
-	.byte 0x01 # 31 1
-	.byte 0x01 # 32 2
-	.byte 0x01 # 33 3
-	.byte 0x01 # 34 4
-	.byte 0x01 # 35 5
-	.byte 0x01 # 36 6
-	.byte 0x01 # 37 7
-	.byte 0x01 # 38 8
-	.byte 0x01 # 39 9
-	.byte 0x00 # 3A :
-	.byte 0x00 # 3B ;
-	.byte 0x00 # 3C <
-	.byte 0x00 # 3D =
-	.byte 0x00 # 3E >
-	.byte 0x00 # 3F ?
-	.byte 0x00 # 40 @
-	.byte 0x02 # 41 A
-	.byte 0x02 # 42 B
-	.byte 0x02 # 43 C
-	.byte 0x02 # 44 D
-	.byte 0x02 # 45 E
-	.byte 0x02 # 46 F
-	.byte 0x02 # 47 G
-	.byte 0x02 # 48 H
-	.byte 0x02 # 49 I
-	.byte 0x02 # 4A J
-	.byte 0x02 # 4B K
-	.byte 0x02 # 4C L
-	.byte 0x02 # 4D M
-	.byte 0x02 # 4E N
-	.byte 0x02 # 4F O
-	.byte 0x02 # 50 P
-	.byte 0x02 # 51 Q
-	.byte 0x02 # 52 R
-	.byte 0x02 # 53 S
-	.byte 0x02 # 54 T
-	.byte 0x02 # 55 U
-	.byte 0x02 # 56 V
-	.byte 0x02 # 57 W
-	.byte 0x02 # 58 X
-	.byte 0x02 # 59 Y
-	.byte 0x02 # 5A Z
-	.byte 0x00 # 5B [
-	.byte 0x00 # 5C \
-	.byte 0x00 # 5D ]
-	.byte 0x00 # 5E ^
-	.byte 0x00 # 5F _
-	.byte 0x00 # 60 `
-	.byte 0x02 # 61 a
-	.byte 0x02 # 62 b
-	.byte 0x02 # 63 c
-	.byte 0x02 # 64 d
-	.byte 0x02 # 65 e
-	.byte 0x02 # 66 f
-	.byte 0x02 # 67 g
-	.byte 0x02 # 68 h
-	.byte 0x02 # 69 i
-	.byte 0x02 # 6A j
-	.byte 0x02 # 6B k
-	.byte 0x02 # 6C l
-	.byte 0x02 # 6D m
-	.byte 0x02 # 6E n
-	.byte 0x02 # 6F o
-	.byte 0x02 # 70 p
-	.byte 0x02 # 71 q
-	.byte 0x02 # 72 r
-	.byte 0x02 # 73 s
-	.byte 0x02 # 74 t
-	.byte 0x02 # 75 u
-	.byte 0x02 # 76 v
-	.byte 0x02 # 77 w
-	.byte 0x02 # 78 x
-	.byte 0x02 # 79 y
-	.byte 0x02 # 7A z
-	.byte 0x00 # 7B {
-	.byte 0x00 # 7C |
-	.byte 0x00 # 7D }
-	.byte 0x00 # 7E ~
-	.byte 0x00 # 7F DEL
+# Classification:
+#
+.equ CLASS_INVALID, 0x00
+.equ CLASS_DIGIT, 0x01
+.equ CLASS_CHARACTER, 0x02
+.equ CLASS_SPACE, 0x03
+.equ CLASS_COLON, 0x04
+.equ CLASS_EQUALS, 0x05
+.equ CLASS_LEFT_PAREN, 0x06
+.equ CLASS_RIGHT_PAREN, 0x07
+.equ CLASS_ASTERISK, 0x08
+.equ CLASS_UNDERSCORE, 0x09
+.equ CLASS_SINGLE, 0x0a
+.equ CLASS_HEX, 0x0b
+.equ CLASS_ZERO, 0x0c
+.equ CLASS_X, 0x0d
+.equ CLASS_EOF, 0x0e
+.equ CLASS_DOT, 0x0f
+
+.equ CLASS_COUNT, 16
+
+.type classification, @object
+.size classification, 128
+classification:
+	.byte CLASS_EOF # 00 NUL
+	.byte CLASS_INVALID # 01 SOH
+	.byte CLASS_INVALID # 02 STX
+	.byte CLASS_INVALID # 03 ETX
+	.byte CLASS_INVALID # 04 EOT
+	.byte CLASS_INVALID # 05 ENQ
+	.byte CLASS_INVALID # 06 ACK
+	.byte CLASS_INVALID # 07 BEL
+	.byte CLASS_INVALID # 08 BS
+	.byte CLASS_SPACE # 09 HT
+	.byte CLASS_SPACE # 0A LF
+	.byte CLASS_INVALID # 0B VT
+	.byte CLASS_INVALID # 0C FF
+	.byte CLASS_SPACE # 0D CR
+	.byte CLASS_INVALID # 0E SO
+	.byte CLASS_INVALID # 0F SI
+	.byte CLASS_INVALID # 10 DLE
+	.byte CLASS_INVALID # 11 DC1
+	.byte CLASS_INVALID # 12 DC2
+	.byte CLASS_INVALID # 13 DC3
+	.byte CLASS_INVALID # 14 DC4
+	.byte CLASS_INVALID # 15 NAK
+	.byte CLASS_INVALID # 16 SYN
+	.byte CLASS_INVALID # 17 ETB
+	.byte CLASS_INVALID # 18 CAN
+	.byte CLASS_INVALID # 19 EM
+	.byte CLASS_INVALID # 1A SUB
+	.byte CLASS_INVALID # 1B ESC
+	.byte CLASS_INVALID # 1C FS
+	.byte CLASS_INVALID # 1D GS
+	.byte CLASS_INVALID # 1E RS
+	.byte CLASS_INVALID # 1F US
+	.byte CLASS_SPACE # 20 Space
+	.byte CLASS_SINGLE # 21 !
+	.byte CLASS_INVALID # 22 "
+	.byte CLASS_INVALID # 23 #
+	.byte CLASS_INVALID # 24 $
+	.byte CLASS_SINGLE # 25 %
+	.byte CLASS_SINGLE # 26 &
+	.byte CLASS_INVALID # 27 '
+	.byte CLASS_LEFT_PAREN # 28 (
+	.byte CLASS_RIGHT_PAREN # 29 )
+	.byte CLASS_ASTERISK # 2A *
+	.byte CLASS_SINGLE # 2B +
+	.byte CLASS_SINGLE # 2C ,
+	.byte CLASS_INVALID # 2D -
+	.byte CLASS_DOT # 2E .
+	.byte CLASS_SINGLE # 2F /
+	.byte CLASS_ZERO # 30 0
+	.byte CLASS_DIGIT # 31 1
+	.byte CLASS_DIGIT # 32 2
+	.byte CLASS_DIGIT # 33 3
+	.byte CLASS_DIGIT # 34 4
+	.byte CLASS_DIGIT # 35 5
+	.byte CLASS_DIGIT # 36 6
+	.byte CLASS_DIGIT # 37 7
+	.byte CLASS_DIGIT # 38 8
+	.byte CLASS_DIGIT # 39 9
+	.byte CLASS_COLON # 3A :
+	.byte CLASS_SINGLE # 3B ;
+	.byte CLASS_INVALID # 3C <
+	.byte CLASS_EQUALS # 3D =
+	.byte CLASS_INVALID # 3E >
+	.byte CLASS_INVALID # 3F ?
+	.byte CLASS_SINGLE # 40 @
+	.byte CLASS_CHARACTER # 41 A
+	.byte CLASS_CHARACTER # 42 B
+	.byte CLASS_CHARACTER # 43 C
+	.byte CLASS_CHARACTER # 44 D
+	.byte CLASS_CHARACTER # 45 E
+	.byte CLASS_CHARACTER # 46 F
+	.byte CLASS_CHARACTER # 47 G
+	.byte CLASS_CHARACTER # 48 H
+	.byte CLASS_CHARACTER # 49 I
+	.byte CLASS_CHARACTER # 4A J
+	.byte CLASS_CHARACTER # 4B K
+	.byte CLASS_CHARACTER # 4C L
+	.byte CLASS_CHARACTER # 4D M
+	.byte CLASS_CHARACTER # 4E N
+	.byte CLASS_CHARACTER # 4F O
+	.byte CLASS_CHARACTER # 50 P
+	.byte CLASS_CHARACTER # 51 Q
+	.byte CLASS_CHARACTER # 52 R
+	.byte CLASS_CHARACTER # 53 S
+	.byte CLASS_CHARACTER # 54 T
+	.byte CLASS_CHARACTER # 55 U
+	.byte CLASS_CHARACTER # 56 V
+	.byte CLASS_CHARACTER # 57 W
+	.byte CLASS_CHARACTER # 58 X
+	.byte CLASS_CHARACTER # 59 Y
+	.byte CLASS_CHARACTER # 5A Z
+	.byte CLASS_SINGLE # 5B [
+	.byte CLASS_INVALID # 5C \
+	.byte CLASS_SINGLE # 5D ]
+	.byte CLASS_SINGLE # 5E ^
+	.byte CLASS_UNDERSCORE # 5F _
+	.byte CLASS_INVALID # 60 `
+	.byte CLASS_HEX # 61 a
+	.byte CLASS_HEX # 62 b
+	.byte CLASS_HEX # 63 c
+	.byte CLASS_HEX # 64 d
+	.byte CLASS_HEX # 65 e
+	.byte CLASS_HEX # 66 f
+	.byte CLASS_CHARACTER # 67 g
+	.byte CLASS_CHARACTER # 68 h
+	.byte CLASS_CHARACTER # 69 i
+	.byte CLASS_CHARACTER # 6A j
+	.byte CLASS_CHARACTER # 6B k
+	.byte CLASS_CHARACTER # 6C l
+	.byte CLASS_CHARACTER # 6D m
+	.byte CLASS_CHARACTER # 6E n
+	.byte CLASS_CHARACTER # 6F o
+	.byte CLASS_CHARACTER # 70 p
+	.byte CLASS_CHARACTER # 71 q
+	.byte CLASS_CHARACTER # 72 r
+	.byte CLASS_CHARACTER # 73 s
+	.byte CLASS_CHARACTER # 74 t
+	.byte CLASS_CHARACTER # 75 u
+	.byte CLASS_CHARACTER # 76 v
+	.byte CLASS_CHARACTER # 77 w
+	.byte CLASS_X # 78 x
+	.byte CLASS_CHARACTER # 79 y
+	.byte CLASS_CHARACTER # 7A z
+	.byte CLASS_INVALID # 7B {
+	.byte CLASS_SINGLE # 7C |
+	.byte CLASS_INVALID # 7D }
+	.byte CLASS_SINGLE # 7E ~
+	.byte CLASS_INVALID # 7F DEL
 
 .section .data
 
-.section .bss
-.type class_names, @object
-.size class_names, 1024
-class_names: .zero 1024
+# The transition table describes transitions from one state to another, given
+# a symbol (character class).
+#
+# The table has m rows and n columns, where m is the amount of states and n is
+# the amount of classes. So given the current state and a classified character
+# the table can be used to look up the next state.
+#
+# Each cell is a word long.
+# - The least significant byte of the word is a row number (beginning with 0).
+#   It specifies the target state. "ff" means that this is an end state and no
+#   transition is possible.
+# - The next byte is the action that should be performed when transitioning.
+#   For the meaning of actions see labels in the _analyze_token function, which
+#   handles each action.
+#
+.type transitions, @object
+.size transitions, 15 * CLASS_COUNT * 4 # State count * CLASS_COUNT * word size.
+transitions:
+	#     Invalid Digit   Alpha   Space   :       =       (       )
+	#     *       _       Single  Hex     0       x       NUL     .
+	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
+	.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
+
+	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier
+
+	.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
+
+	.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
+	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
+
+	.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
+	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
+
+	.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
+
+	.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff # 0d Hexadecimal
+
+	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
 
 .section .text
 
+# Returns the class from the classification table for the given character.
+#
+# Parameters:
+# a0 - Character.
+#
+# Sets a0 to the class number.
+.type _classify, @function
+_classify:
+	la t0, classification
+	add t0, t0, a0 # Character class pointer.
+	lbu a0, (t0) # Character class.
+	ret
+
+# Given the current state and a character class, calculates the next state.
+#
+# Parameters:
+# a0 - Current state.
+# a1 - Character class.
+#
+# Sets a0 to the next state.
+.type _lookup_state, @function
+_lookup_state:
+	li t0, CLASS_COUNT
+	mul a0, a0, t0 # Transition row.
+	add a0, a0, a1 # Transition column.
+
+	li t0, 4
+	mul a0, a0, t0 # Multiply by the word size.
+
+	la t0, transitions
+	add t0, t0, a0
+	lw a0, (t0) # Next state.
+
+	ret
+
+# Chains _classify and _lookup_state.
+#
+# Parameters:
+# a0 - Current state.
+# a1 - Character.
+#
+# Sets a0 to the next state based on the given character.
+.type _next_state, @function
+_next_state:
+	# Prologue.
+	addi sp, sp, -16
+	sw ra, 12(sp)
+	sw s0, 8(sp)
+	addi s0, sp, 16
+
+	sw a0, 4(sp)
+	mv a0, a1
+	call _classify
+
+	mv a1, a0
+	lw a0, 4(sp)
+	call _lookup_state
+
+	# Epilogue.
+	lw ra, 12(sp)
+	lw s0, 8(sp)
+	addi sp, sp, 16
+	ret
+
-# Initializes the classification table.
+# Scans the source text from the given position. Characters are fed to the
+# transition table one at a time until a transition carries an action that ends
+# the scan: print, comment or an action that is not known.
 #
-# Paramaters:
-# a0 - Raw input for the classification table.
-.type _initialize_classes, @function
-_initialize_classes:
+# Parameters:
+# a0 - Source text pointer.
+#
+# Sets a0 to the source text position where the scan stopped.
+.type _analyze_token, @function
+_analyze_token:
 	# Prologue.
 	addi sp, sp, -24
 	sw ra, 20(sp)
 	sw s0, 16(sp)
 	addi s0, sp, 24
 
-	sw s1, 12(sp) # Preserve the s1 register used for the character counter.
-	li s1, 128 # 128 ASCII characters.
+	sw s1, 12(sp) # Preserve s1 used for current source text position.
+	mv s1, a0
+	sw a0, 4(sp) # Keeps a pointer to the beginning of a token.
 
-.Linitialize_classes_loop:
-	addi s1, s1, -1
+	sw s2, 8(sp) # Preserve s2 containing the current state.
+	li s2, 0x00 # Initial, start state.
 
-	la t0, classes
-	add t0, t0, s1
-	lbu t0, (t0)
-	li t1, 0x01
+.Lanalyze_token_loop:
+	mv a0, s2
+	lbu a1, (s1)
+	call _next_state
 
-	bne t0, t1, .Linitialize_classes_step
+	li t0, 0xff
+	and s2, a0, t0 # Next state.
 
-	/* DEBUG */
-	li a0, 0x69676964
-	sw a0, 8(sp) # Preserve the memory address.
-	addi a0, sp, 8
-	li a1, 4
+	li t0, 0xff00
+	and t1, a0, t0 # Transition action.
+	srli t1, t1, 8
+
+
+	li t0, 0x01 # Accumulate action.
+	beq t1, t0, .Lanalyze_token_accumulate
+
+	li t0, 0x02 # Print action.
+	beq t1, t0, .Lanalyze_token_print
+
+	li t0, 0x03 # Skip action.
+	beq t1, t0, .Lanalyze_token_skip
+
+	li t0, 0x04 # Comment action.
+	beq t1, t0, .Lanalyze_token_comment
+
+	/* DEBUG
+	mv s4, t1
+	addi t1, t1, '0'
+	sb t1, 0(sp)
+	li t1, ' '
+	sb t1, 1(sp)
+	addi t1, s2, '0'
+	sb t1, 2(sp)
+	addi a0, sp, 0 */
+	sw s1, 0(sp)
+	addi a0, s1, 0
+	li a1, 3
 	call _write_error
+	/* mv t1, s4
+	DEBUG */
 
-.Linitialize_classes_step:
-	bnez s1, .Linitialize_classes_loop
+	j .Lanalyze_token_reject
 
-	lw s1, 12(sp) # Restore the saved register.
+.Lanalyze_token_reject:
+	addi s1, s1, 1
+
+	j .Lanalyze_token_end
+
+.Lanalyze_token_accumulate:
+	addi s1, s1, 1
+
+	j .Lanalyze_token_loop
+
+.Lanalyze_token_skip:
+	addi s1, s1, 1
+	lw t0, 4(sp)
+	addi t0, t0, 1
+	sw t0, 4(sp)
+
+	j .Lanalyze_token_loop
+
+.Lanalyze_token_print:
+	/* DEBUG
+	lw a0, 4(sp)
+	mv a1, s1
+	sub a1, a1, a0
+	call _write_error
+	DEBUG */
+
+	j .Lanalyze_token_end
+
+.Lanalyze_token_comment:
+	addi s1, s1, 1
+
+	/* DEBUG
+	lw a0, 4(sp)
+	mv a1, s1
+	sub a1, a1, a0
+	call _write_error
+	DEBUG */
+
+	j .Lanalyze_token_end
+
+.Lanalyze_token_end:
+	mv a0, s1 # Return the advanced text pointer.
+
+	# Restore saved registers.
+	lw s1, 12(sp)
+	lw s2, 8(sp)
 
 	# Epilogue.
 	lw ra, 20(sp)
@@ -193,15 +412,23 @@ _initialize_classes:
 	ret
 
-# Initializes the lookup tables.
-.type _tokenizer_initialize, @function
-_tokenizer_initialize:
+# Runs _analyze_token repeatedly, each time continuing at the position the
+# previous call returned, until that position holds a NUL byte.
+#
+# Parameters:
+# a0 - Source text pointer.
+.type _tokenize, @function
+_tokenize:
 	# Prologue.
 	addi sp, sp, -8
 	sw ra, 4(sp)
 	sw s0, 0(sp)
 	addi s0, sp, 8
 
-	call _initialize_classes
+.Ltokenize_loop:
+	call _analyze_token
+
+	lbu t0, (a0) # The classification table treats NUL as the end of input.
+	bnez t0, .Ltokenize_loop
 
 	# Epilogue.
 	lw ra, 4(sp)