diff options
Diffstat (limited to 'boot/tokenizer.s')
| -rw-r--r-- | boot/tokenizer.s | 207 |
1 files changed, 138 insertions, 69 deletions
diff --git a/boot/tokenizer.s b/boot/tokenizer.s index 4315f66..67b2602 100644 --- a/boot/tokenizer.s +++ b/boot/tokenizer.s @@ -1,4 +1,10 @@ -.global _tokenize_next, classification, transitions, keywords +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + +.global _tokenize_next, classification, transitions, keywords, byte_keywords + +.include "boot/definitions.inc" .section .rodata @@ -8,7 +14,7 @@ # # Classification: # -.equ CLASS_INVALID, 0x0 +.equ CLASS_INVALID, 0x00 .equ CLASS_DIGIT, 0x01 .equ CLASS_CHARACTER, 0x02 .equ CLASS_SPACE, 0x03 @@ -25,9 +31,11 @@ .equ CLASS_EOF, 0x0e .equ CLASS_DOT, 0x0f .equ CLASS_MINUS, 0x10 -.equ CLASS_DOUBLE_QUOTE, 0x11 +.equ CLASS_QUOTE, 0x11 +.equ CLASS_GREATER, 0x12 +.equ CLASS_LESS, 0x13 -.equ CLASS_COUNT, 18 +.equ CLASS_COUNT, 20 .type classification, @object .size classification, 128 @@ -66,12 +74,12 @@ classification: .byte CLASS_INVALID # 1F US .byte CLASS_SPACE # 20 Space .byte CLASS_SINGLE # 21 ! - .byte CLASS_DOUBLE_QUOTE # 22 " + .byte CLASS_QUOTE # 22 " .byte 0x00 # 23 # .byte 0x00 # 24 $ .byte CLASS_SINGLE # 25 % .byte CLASS_SINGLE # 26 & - .byte 0x00 # 27 ' + .byte CLASS_QUOTE # 27 ' .byte CLASS_LEFT_PAREN # 28 ( .byte CLASS_RIGHT_PAREN # 29 ) .byte CLASS_ASTERISK # 2A * @@ -92,9 +100,9 @@ classification: .byte CLASS_DIGIT # 39 9 .byte CLASS_COLON # 3A : .byte CLASS_SINGLE # 3B ; - .byte 0x00 # 3C < + .byte CLASS_LESS # 3C < .byte CLASS_EQUALS # 3D = - .byte 0x00 # 3E > + .byte CLASS_GREATER # 3E > .byte 0x00 # 3F ? .byte CLASS_SINGLE # 40 @ .byte CLASS_CHARACTER # 41 A @@ -220,7 +228,10 @@ keywords: .ascii "case" .word 2 .ascii "of" -.size keywords, . - keywords + +.type byte_keywords, @object +byte_keywords: .ascii "&.,:;()[]^=+-*@" +.equ BYTE_KEYWORDS_SIZE, . - byte_keywords .section .data @@ -240,78 +251,66 @@ keywords: # handles each action. # .type transitions, @object -.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT +.size transitions, 14 * CLASS_COUNT # state count * CLASS_COUNT transitions: # Invalid Digit Alpha Space : = ( ) # * _ Single Hex 0 x NUL . - # - " - .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107 - .word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start - .word 0x010f, 0x0110 + # - " or ' > < + .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff + .word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x0108 + .word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff - .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier - .word 0x05ff, 0x05ff + .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff + .word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier .word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x03 Integer + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals - .word 0x02ff, 0x02ff + .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff + .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff + .word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren - .word 0x02ff, 0x02ff + .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x06 Left paren .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren - .word 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less + + .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x08 Dot .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 - .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment - .word 0x0109, 0x0109 + .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 + .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff - .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment - .word 0x0109, 0x0109 + .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 + .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token - .word 0x02ff, 0x02ff + .word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 + .word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 + .word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x0c Zero .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal - .word 0x00ff, 0x02ff - - .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot - .word 0x02ff, 0x02ff - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff - - .word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string. - .word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 - .word 0x0110, 0x04ff + .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff + .word 0x00ff, 0x02ff, 0x02ff, 0x02ff # 0x0d Hexadecimal .section .text @@ -406,6 +405,57 @@ _classify_identifier: addi sp, sp, 16 ret +# Takes a symbol and determines its type. +# +# Parameters: +# a0 - Token character. +# +# Sets a0 to the appropriate token type. +.type _classify_single, @function +_classify_single: + # Prologue. + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 + + mv a1, a0 + li a2, BYTE_KEYWORDS_SIZE + la a0, byte_keywords + call _memchr + + la a1, byte_keywords + sub a0, a0, a1 + addi a0, a0, 27 + + # Epilogue. + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 + ret + +# Classified a symbol containing multiple characters (probably 2). +# +# Parameters: +# a0 - Token length. +# a1 - Token pointer. +# +# Sets a0 to the appropriate token type. +.type _classify_composite, @function +_classify_composite: + lbu t0, 0(a1) + li t1, ':' + beq t0, t1, .Lclassify_composite_assign + + j .Lclassify_composite_end + +.Lclassify_composite_assign: + li a0, TOKEN_ASSIGN + j .Lclassify_composite_end + +.Lclassify_composite_end: + ret + # Initializes the classification table. # # Paramaters: @@ -453,12 +503,18 @@ _tokenize_next: li t0, 0x03 # Skip action. beq t1, t0, .Ltokenize_next_skip - li t0, 0x04 # Comment action. + li t0, 0x04 # Delimited string action. beq t1, t0, .Ltokenize_next_comment li t0, 0x05 # Finalize identifier. beq t1, t0, .Ltokenize_next_identifier + li t0, 0x06 # Single character symbol action. + beq t1, t0, .Ltokenize_next_single + + li t0, 0x07 # An action for symbols containing multiple characters. + beq t1, t0, .Ltokenize_next_composite + j .Ltokenize_next_reject .Ltokenize_next_reject: @@ -481,24 +537,17 @@ _tokenize_next: .Ltokenize_next_print: /* DEBUG - lw a0, 4(sp) - mv a1, s1 - sub a1, a1, a0 - call _write_error - DEBUG */ + addi a0, a0, 21 + sw a0, 0(sp) + addi a0, sp, 0 + li a1, 1 + call _write_error */ j .Ltokenize_next_end .Ltokenize_next_comment: addi s1, s1, 1 - /* DEBUG - lw a0, 4(sp) - mv a1, s1 - sub a1, a1, a0 - call _write_error - DEBUG */ - j .Ltokenize_next_end .Ltokenize_next_identifier: @@ -512,6 +561,26 @@ _tokenize_next: j .Ltokenize_next_end +.Ltokenize_next_single: + lw a0, 4(sp) + addi s1, a0, 1 + lbu a0, (a0) + call _classify_single + lw a1, 0(sp) + sw a0, (a1) + + j .Ltokenize_next_end + +.Ltokenize_next_composite: + addi s1, s1, 1 + lw a1, 4(sp) + sub a0, s1, a1 + call _classify_composite + lw a1, 0(sp) + sw a0, (a1) + + j .Ltokenize_next_end + .Ltokenize_next_end: mv a0, s1 # Return the advanced text pointer. |
