Properly tokenize declaration sections

This commit is contained in:
2025-05-02 22:57:04 +02:00
parent 768821c689
commit dcfd6b1515
5 changed files with 357 additions and 165 deletions

View File

@@ -1,4 +1,4 @@
.global _tokenize, classification, transitions
.global _tokenize_next, classification, transitions, keywords
.section .rodata
@@ -24,8 +24,10 @@
.equ CLASS_X, 0x0d
.equ CLASS_EOF, 0x0e
.equ CLASS_DOT, 0x0f
.equ CLASS_MINUS, 0x10
.equ CLASS_DOUBLE_QUOTE, 0x11
.equ CLASS_COUNT, 16
.equ CLASS_COUNT, 18
.type classification, @object
.size classification, 128
@@ -64,7 +66,7 @@ classification:
.byte CLASS_INVALID # 1F US
.byte CLASS_SPACE # 20 Space
.byte CLASS_SINGLE # 21 !
.byte 0x00 # 22 "
.byte CLASS_DOUBLE_QUOTE # 22 "
.byte 0x00 # 23 #
.byte 0x00 # 24 $
.byte CLASS_SINGLE # 25 %
@@ -75,7 +77,7 @@ classification:
.byte CLASS_ASTERISK # 2A *
.byte CLASS_SINGLE # 2B +
.byte CLASS_SINGLE # 2C ,
.byte 0x00 # 2D -
.byte CLASS_MINUS # 2D -
.byte CLASS_DOT # 2E .
.byte CLASS_SINGLE # 2F /
.byte CLASS_ZERO # 30 0
@@ -159,6 +161,67 @@ classification:
.byte CLASS_SINGLE # 7E ~
.byte CLASS_INVALID # 7F DEL
#
#
# Textual keywords in the language.
#
# Layout: each entry is a word-sized length prefix followed by the
# keyword's ASCII characters (no padding, no NUL terminator).
# _classify_identifier passes KEYWORDS_COUNT together with this table
# to _strings_index, so the count must equal the number of entries.
#
# FIX(review): the table holds 26 entries, but KEYWORDS_COUNT was 21;
# with the old count the last five keywords ("return", "cast", "defer",
# "case", "of") could never be matched by a count-bounded lookup.
.equ KEYWORDS_COUNT, 26
.type keywords, @object
keywords:
.word 7
.ascii "program"
.word 6
.ascii "import"
.word 5
.ascii "const"
.word 3
.ascii "var"
.word 2
.ascii "if"
.word 4
.ascii "then"
.word 5
.ascii "elsif"
.word 4
.ascii "else"
.word 5
.ascii "while"
.word 2
.ascii "do"
.word 4
.ascii "proc"
.word 5
.ascii "begin"
.word 3
.ascii "end"
.word 4
.ascii "type"
.word 6
.ascii "record"
.word 5
.ascii "union"
.word 4
.ascii "true"
.word 5
.ascii "false"
.word 3
.ascii "nil"
.word 3
.ascii "xor"
.word 2
.ascii "or"
.word 6
.ascii "return"
.word 4
.ascii "cast"
.word 5
.ascii "defer"
.word 4
.ascii "case"
.word 2
.ascii "of"
.size keywords, . - keywords
.section .data
# The transition table describes transitions from one state to another, given
@@ -173,58 +236,82 @@ classification:
# It specifies the target state. "ff" means that this is an end state and no
# transition is possible.
# - The next byte is the action that should be performed when transitioning.
# For the meaning of actions see labels in the _analyze_token function, which
# For the meaning of actions see labels in the _tokenize_next function, which
# handles each action.
#
# State-transition table of the tokenizer DFA.
# Each state occupies CLASS_COUNT word-sized cells, one per character
# class; cell format: low byte = target state (0xff = end state),
# high byte = action code dispatched in the tokenizer loop.
#
# NOTE(review): this span is a rendered diff; several lines appear in
# both their before and after versions with no +/- markers — e.g. the
# two .size directives below, and the two row groups labelled
# "02 Identifier" and "0e Dot". Only one of each pair belongs in the
# real file; confirm against the repository before reuse.
.type transitions, @object
.size transitions, 13 * CLASS_COUNT # state count * CLASS_COUNT
.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT
transitions:
# Column order (one cell per character class):
# Invalid Digit Alpha Space : = ( )
# * _ Single Hex 0 x NUL .
# - "
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
.word 0x010f, 0x0110
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
.word 0x02ff, 0x02ff
# NOTE(review): next two rows are the pre-change "02 Identifier"
# (action 0x02 on every terminator); the pair after them is the
# post-change version (action 0x05 = finalize identifier / keyword).
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier
.word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier
.word 0x05ff, 0x05ff
.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
.word 0x02ff, 0x02ff
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
.word 0x0109, 0x0109
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
.word 0x0109, 0x0109
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
.word 0x02ff, 0x02ff
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
.word 0x02ff, 0x02ff
# NOTE(review): "0x2ff" in the next row is missing a digit compared to
# 0x02ff used everywhere else — numerically identical, but the width
# should be normalized for consistency.
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal
.word 0x00ff, 0x02ff
# NOTE(review): the two "0e Dot" rows below are the before (contains
# the same 0x2ff width glitch) and after versions of the same row.
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x2ff, 0x02ff # 0e Dot
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff
.word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string.
.word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110
.word 0x0110, 0x04ff
.section .text
@@ -292,12 +379,42 @@ _next_state:
addi sp, sp, 16
ret
# Takes an identifier and checks whether it's a keyword.
#
# Parameters:
# a0 - Token length.
# a1 - Token pointer.
#
# Sets a0 to the appropriate token type.
# NOTE(review): the "token type" is whatever _strings_index returns —
# presumably the index of the matching entry in the keywords table (or
# a sentinel when no keyword matches); confirm against _strings_index.
.type _classify_identifier, @function
_classify_identifier:
# Prologue.
addi sp, sp, -16
sw ra, 12(sp)
sw s0, 8(sp)
addi s0, sp, 16
# Shuffle arguments into the _strings_index convention:
# a0 = entry count, a1 = table, a2 = string length, a3 = string.
# The moves must precede the li/la below, which clobber a0/a1.
mv a2, a0
mv a3, a1
li a0, KEYWORDS_COUNT
la a1, keywords
call _strings_index
# Epilogue. _strings_index's result is passed through in a0.
lw ra, 12(sp)
lw s0, 8(sp)
addi sp, sp, 16
ret
# Initializes the classification table.
#
# Parameters:
# a0 - Source text pointer.
.type _analyze_token, @function
_analyze_token:
# a1 - A pointer for output value, the token kind. 4 Bytes.
#
# Sets a0 to the position of the next token.
.type _tokenize_next, @function
_tokenize_next:
# Prologue.
addi sp, sp, -24
sw ra, 20(sp)
@@ -311,7 +428,10 @@ _analyze_token:
sw s2, 8(sp) # Preserve s2 containing the current state.
li s2, 0x00 # Initial, start state.
.Lanalyze_token_loop:
sw a1, 0(sp)
sw zero, (a1) # Initialize.
.Ltokenize_next_loop:
mv a0, s2
lbu a1, (s1)
call _next_state
@@ -323,56 +443,43 @@ _analyze_token:
and t1, a0, t0 # Transition action.
srli t1, t1, 8
# Perform the provided action.
li t0, 0x01 # Accumulate action.
beq t1, t0, .Lanalyze_token_accumulate
beq t1, t0, .Ltokenize_next_accumulate
li t0, 0x02 # Print action.
beq t1, t0, .Lanalyze_token_print
beq t1, t0, .Ltokenize_next_print
li t0, 0x03 # Skip action.
beq t1, t0, .Lanalyze_token_skip
beq t1, t0, .Ltokenize_next_skip
li t0, 0x04 # Comment action.
beq t1, t0, .Lanalyze_token_comment
beq t1, t0, .Ltokenize_next_comment
/* DEBUG
mv s4, t1
addi t1, t1, '0'
sb t1, 0(sp)
li t1, ' '
sb t1, 1(sp)
addi t1, s2, '0'
sb t1, 2(sp)
addi a0, sp, 0 */
sw s1, 0(sp)
addi a0, s1, 0
li a1, 3
call _write_error
/* mv t1, s4
DEBUG */
li t0, 0x05 # Finalize identifier.
beq t1, t0, .Ltokenize_next_identifier
j .Lanalyze_token_reject
j .Ltokenize_next_reject
.Lanalyze_token_reject:
.Ltokenize_next_reject:
addi s1, s1, 1
j .Lanalyze_token_end
j .Ltokenize_next_end
.Lanalyze_token_accumulate:
.Ltokenize_next_accumulate:
addi s1, s1, 1
j .Lanalyze_token_loop
j .Ltokenize_next_loop
.Lanalyze_token_skip:
.Ltokenize_next_skip:
addi s1, s1, 1
lw t0, 4(sp)
addi t0, t0, 1
sw t0, 4(sp)
j .Lanalyze_token_loop
j .Ltokenize_next_loop
.Lanalyze_token_print:
.Ltokenize_next_print:
/* DEBUG
lw a0, 4(sp)
mv a1, s1
@@ -380,9 +487,9 @@ _analyze_token:
call _write_error
DEBUG */
j .Lanalyze_token_end
j .Ltokenize_next_end
.Lanalyze_token_comment:
.Ltokenize_next_comment:
addi s1, s1, 1
/* DEBUG
@@ -392,9 +499,20 @@ _analyze_token:
call _write_error
DEBUG */
j .Lanalyze_token_end
j .Ltokenize_next_end
.Lanalyze_token_end:
.Ltokenize_next_identifier:
# An identifier can be a textual keyword.
# Check the kind of the token and write it into the output parameter.
lw a1, 4(sp)
sub a0, s1, a1
call _classify_identifier
lw a1, 0(sp)
sw a0, (a1)
j .Ltokenize_next_end
.Ltokenize_next_end:
mv a0, s1 # Return the advanced text pointer.
# Restore saved registers.
@@ -406,27 +524,3 @@ _analyze_token:
lw s0, 16(sp)
addi sp, sp, 24
ret
# Tokenizes the whole source text by repeatedly advancing to the next
# token until the end of input is reached.
#
# Parameters:
# a0 - Source text pointer (NUL-terminated).
.type _tokenize, @function
_tokenize:
# Prologue. Use a 16-byte frame: the RISC-V calling convention requires
# sp to stay 16-byte aligned across calls (the old 8-byte frame broke
# that invariant).
addi sp, sp, -16
sw ra, 12(sp)
sw s0, 8(sp)
addi s0, sp, 16

.Ltokenize_loop:
call _analyze_token
# a0 now points at the next unconsumed character. Test the terminator
# with a byte load: a word load (lw) at an arbitrary byte offset can
# trap on misaligned addresses and reads three bytes past the NUL.
lbu t0, (a0)
bnez t0, .Ltokenize_loop

# Epilogue.
lw ra, 12(sp)
lw s0, 8(sp)
addi sp, sp, 16
ret