diff options
| author | Eugen Wissner <belka@caraus.de> | 2025-05-02 22:57:04 +0200 |
|---|---|---|
| committer | Eugen Wissner <belka@caraus.de> | 2025-05-02 22:57:04 +0200 |
| commit | dcfd6b1515679cfbc75de12a17352d9d1eddceaf (patch) | |
| tree | b50acc98325fffd5d1168a9c89d64ed54dd4217d /boot/tokenizer.s | |
| parent | 768821c68980968f7ab37ef71f4389c4fcee9235 (diff) | |
| download | elna-dcfd6b1515679cfbc75de12a17352d9d1eddceaf.tar.gz | |
Properly tokenize declaration sections
Diffstat (limited to 'boot/tokenizer.s')
| -rw-r--r-- | boot/tokenizer.s | 230 |
1 files changed, 162 insertions, 68 deletions
diff --git a/boot/tokenizer.s b/boot/tokenizer.s index 5570031..4315f66 100644 --- a/boot/tokenizer.s +++ b/boot/tokenizer.s @@ -1,4 +1,4 @@ -.global _tokenize, classification, transitions +.global _tokenize_next, classification, transitions, keywords .section .rodata @@ -24,8 +24,10 @@ .equ CLASS_X, 0x0d .equ CLASS_EOF, 0x0e .equ CLASS_DOT, 0x0f +.equ CLASS_MINUS, 0x10 +.equ CLASS_DOUBLE_QUOTE, 0x11 -.equ CLASS_COUNT, 16 +.equ CLASS_COUNT, 18 .type classification, @object .size classification, 128 @@ -64,7 +66,7 @@ classification: .byte CLASS_INVALID # 1F US .byte CLASS_SPACE # 20 Space .byte CLASS_SINGLE # 21 ! - .byte 0x00 # 22 " + .byte CLASS_DOUBLE_QUOTE # 22 " .byte 0x00 # 23 # .byte 0x00 # 24 $ .byte CLASS_SINGLE # 25 % @@ -75,7 +77,7 @@ classification: .byte CLASS_ASTERISK # 2A * .byte CLASS_SINGLE # 2B + .byte CLASS_SINGLE # 2C , - .byte 0x00 # 2D - + .byte CLASS_MINUS # 2D - .byte CLASS_DOT # 2E . .byte CLASS_SINGLE # 2F / .byte CLASS_ZERO # 30 0 @@ -159,6 +161,67 @@ classification: .byte CLASS_SINGLE # 7E ~ .byte CLASS_INVALID # 7F DEL +# +# Textual keywords in the language. +# +.equ KEYWORDS_COUNT, 21 + +.type keywords, @object +keywords: + .word 7 + .ascii "program" + .word 6 + .ascii "import" + .word 5 + .ascii "const" + .word 3 + .ascii "var" + .word 2 + .ascii "if" + .word 4 + .ascii "then" + .word 5 + .ascii "elsif" + .word 4 + .ascii "else" + .word 5 + .ascii "while" + .word 2 + .ascii "do" + .word 4 + .ascii "proc" + .word 5 + .ascii "begin" + .word 3 + .ascii "end" + .word 4 + .ascii "type" + .word 6 + .ascii "record" + .word 5 + .ascii "union" + .word 4 + .ascii "true" + .word 5 + .ascii "false" + .word 3 + .ascii "nil" + .word 3 + .ascii "xor" + .word 2 + .ascii "or" + .word 6 + .ascii "return" + .word 4 + .ascii "cast" + .word 5 + .ascii "defer" + .word 4 + .ascii "case" + .word 2 + .ascii "of" +.size keywords, . - keywords + .section .data # The transition table describes transitions from one state to another, given @@ -173,58 +236,82 @@ classification: # It specifies the target state. "ff" means that this is an end state and no # transition is possible. # - The next byte is the action that should be performed when transitioning. -# For the meaning of actions see labels in the _analyze_token function, which +# For the meaning of actions see labels in the _tokenize_next function, which # handles each action. # .type transitions, @object -.size transitions, 13 * CLASS_COUNT # state count * CLASS_COUNT +.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT transitions: # Invalid Digit Alpha Space : = ( ) # * _ Single Hex 0 x NUL . + # - " .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107 .word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start + .word 0x010f, 0x0110 .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon + .word 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier + .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff + .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier + .word 0x05ff, 0x05ff .word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk + .word 0x02ff, 0x02ff .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment + .word 0x0109, 0x0109 .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment + .word 0x0109, 0x0109 .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token + .word 0x02ff, 0x02ff .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero + .word 0x02ff, 0x02ff .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal + .word 0x00ff, 0x02ff .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x2ff, 0x02ff # 0e Dot + .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot + .word 0x02ff, 0x02ff + + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff + + .word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string. + .word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 + .word 0x0110, 0x04ff .section .text @@ -292,12 +379,42 @@ _next_state: addi sp, sp, 16 ret +# Takes an identifier and checks whether it's a keyword. +# +# Parameters: +# a0 - Token length. +# a1 - Token pointer. +# +# Sets a0 to the appropriate token type. +.type _classify_identifier, @function +_classify_identifier: + # Prologue. + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 + + mv a2, a0 + mv a3, a1 + li a0, KEYWORDS_COUNT + la a1, keywords + call _strings_index + + # Epilogue. + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 + ret + # Initializes the classification table. # # Paramaters: # a0 - Source text pointer. -.type _analyze_token, @function -_analyze_token: +# a1 - A pointer for output value, the token kind. 4 Bytes. +# +# Sets a0 to the position of the next token. +.type _tokenize_next, @function +_tokenize_next: # Prologue. addi sp, sp, -24 sw ra, 20(sp) @@ -311,7 +428,10 @@ _analyze_token: sw s2, 8(sp) # Preserve s2 containing the current state. li s2, 0x00 # Initial, start state. -.Lanalyze_token_loop: + sw a1, 0(sp) + sw zero, (a1) # Initialize. + +.Ltokenize_next_loop: mv a0, s2 lbu a1, (s1) call _next_state @@ -323,56 +443,43 @@ _analyze_token: and t1, a0, t0 # Transition action. srli t1, t1, 8 - + # Perform the provided action. li t0, 0x01 # Accumulate action. - beq t1, t0, .Lanalyze_token_accumulate + beq t1, t0, .Ltokenize_next_accumulate li t0, 0x02 # Print action. - beq t1, t0, .Lanalyze_token_print + beq t1, t0, .Ltokenize_next_print li t0, 0x03 # Skip action. - beq t1, t0, .Lanalyze_token_skip + beq t1, t0, .Ltokenize_next_skip li t0, 0x04 # Comment action. - beq t1, t0, .Lanalyze_token_comment + beq t1, t0, .Ltokenize_next_comment - /* DEBUG - mv s4, t1 - addi t1, t1, '0' - sb t1, 0(sp) - li t1, ' ' - sb t1, 1(sp) - addi t1, s2, '0' - sb t1, 2(sp) - addi a0, sp, 0 */ - sw s1, 0(sp) - addi a0, s1, 0 - li a1, 3 - call _write_error - /* mv t1, s4 - DEBUG */ + li t0, 0x05 # Finalize identifier. + beq t1, t0, .Ltokenize_next_identifier - j .Lanalyze_token_reject + j .Ltokenize_next_reject -.Lanalyze_token_reject: +.Ltokenize_next_reject: addi s1, s1, 1 - j .Lanalyze_token_end + j .Ltokenize_next_end -.Lanalyze_token_accumulate: +.Ltokenize_next_accumulate: addi s1, s1, 1 - j .Lanalyze_token_loop + j .Ltokenize_next_loop -.Lanalyze_token_skip: +.Ltokenize_next_skip: addi s1, s1, 1 lw t0, 4(sp) addi t0, t0, 1 sw t0, 4(sp) - j .Lanalyze_token_loop + j .Ltokenize_next_loop -.Lanalyze_token_print: +.Ltokenize_next_print: /* DEBUG lw a0, 4(sp) mv a1, s1 @@ -380,9 +487,9 @@ _analyze_token: call _write_error DEBUG */ - j .Lanalyze_token_end + j .Ltokenize_next_end -.Lanalyze_token_comment: +.Ltokenize_next_comment: addi s1, s1, 1 /* DEBUG @@ -392,9 +499,20 @@ _analyze_token: call _write_error DEBUG */ - j .Lanalyze_token_end + j .Ltokenize_next_end + +.Ltokenize_next_identifier: + # An identifier can be a textual keyword. + # Check the kind of the token and write it into the output parameter. + lw a1, 4(sp) + sub a0, s1, a1 + call _classify_identifier + lw a1, 0(sp) + sw a0, (a1) + + j .Ltokenize_next_end -.Lanalyze_token_end: +.Ltokenize_next_end: mv a0, s1 # Return the advanced text pointer. # Restore saved registers. @@ -406,27 +524,3 @@ _analyze_token: lw s0, 16(sp) addi sp, sp, 24 ret - -# Initializes the lookup tables. -# -# Parameters: -# a0 - Source text pointer. -.type _tokenize, @function -_tokenize: - # Prologue. - addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 - -.Ltokenize_loop: - call _analyze_token - - lw t0, (a0) - bnez t0, .Ltokenize_loop - - # Epilogue. - lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 - ret |
