Fix strings search looping

2025-05-04 23:49:39 +02:00
parent 0a0bc4e1f2
commit df1c0486c5
4 changed files with 243 additions and 350 deletions
--- a/boot/tokenizer.s
+++ b/boot/tokenizer.s
@@ -38,7 +38,6 @@
 .equ CLASS_COUNT, 20

 .type classification, @object
-.size classification, 128
 classification:
 	.byte CLASS_EOF # 00 NUL
 	.byte CLASS_INVALID # 01 SOH
@@ -172,7 +171,7 @@ classification:
 #
 # Textual keywords in the language.
 #
-.equ KEYWORDS_COUNT, 21
+.equ KEYWORDS_COUNT, TOKEN_IDENTIFIER - 1

 .type keywords, @object
 keywords:
@@ -222,8 +221,8 @@ keywords:
 	.ascii "return"
 	.word 4
 	.ascii "cast"
-	.word 5
-	.ascii "defer"
+	.word 4
+	.ascii "goto"
 	.word 4
 	.ascii "case"
 	.word 2
@@ -251,13 +250,12 @@ byte_keywords: .ascii "&.,:;()[]^=+-*@"
 #   handles each action.
 #
 .type transitions, @object
-.size transitions, 14 * CLASS_COUNT # state count * CLASS_COUNT
 transitions:
 	#     Invalid Digit   Alpha   Space   :       =       (       )     
 	#     *       _       Single  Hex     0       x       NUL     .
 	#     -       " or '  >       <
 	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff
-	.word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x0108
+	.word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x06ff
 	.word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start

 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff
@@ -280,9 +278,9 @@ transitions:
 	.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
 	.word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus

-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x06 Left paren
+	.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
+	.word 0x0109, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
+	.word 0x06ff, 0x06ff, 0x06ff, 0x06ff # 0x06 Left paren

 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
@@ -399,6 +397,10 @@ _classify_identifier:
 	la a1, keywords
 	call _strings_index

+	bnez a0, .Lclassify_identifier_end
+	li a0, TOKEN_IDENTIFIER
+
+.Lclassify_identifier_end:
 	# Epilogue.
 	lw ra, 12(sp)
 	lw s0, 8(sp)
@@ -426,7 +428,7 @@ _classify_single:

 	la a1, byte_keywords
 	sub a0, a0, a1
-	addi a0, a0, 27
+	addi a0, a0, TOKEN_IDENTIFIER + 1

 	# Epilogue.
 	lw ra, 12(sp)
@@ -466,16 +468,17 @@ _classify_composite:
 .type _tokenize_next, @function
 _tokenize_next:
 	# Prologue.
-	addi sp, sp, -24
-	sw ra, 20(sp)
-	sw s0, 16(sp)
-	addi s0, sp, 24
+	addi sp, sp, -32
+	sw ra, 28(sp)
+	sw s0, 24(sp)
+	addi s0, sp, 32

-	sw s1, 12(sp) # Preserve s1 used for current source text position.
+	sw s1, 20(sp) # Preserve s1 used for current source text position.
 	mv s1, a0
-	sw a0, 4(sp) # Keeps a pointer to the beginning of a token.
+	sw a0, 12(sp) # Keeps a pointer to the beginning of a token.
+	# 4(sp) and 8(sp) are reserved for the kind and length of the token if needed.

-	sw s2, 8(sp) # Preserve s2 containing the current state.
+	sw s2, 16(sp) # Preserve s2 containing the current state.
 	li s2, 0x00 # Initial, start state.

 	sw a1, 0(sp)
@@ -529,9 +532,9 @@ _tokenize_next:

 .Ltokenize_next_skip:
 	addi s1, s1, 1
-	lw t0, 4(sp)
+	lw t0, 12(sp)
 	addi t0, t0, 1
-	sw t0, 4(sp)
+	sw t0, 12(sp)

 	j .Ltokenize_next_loop

@@ -553,16 +556,20 @@ _tokenize_next:
 .Ltokenize_next_identifier:
 	# An identifier can be a textual keyword.
 	# Classify the token, then copy its kind, length and start pointer into the output parameter.
-	lw a1, 4(sp)
+	lw a1, 12(sp)
 	sub a0, s1, a1
+	sw a0, 8(sp)
 	call _classify_identifier
-	lw a1, 0(sp)
-	sw a0, (a1)
+	sw a0, 4(sp)
+	lw a0, 0(sp)
+	addi a1, sp, 4
+	li a2, 12
+	call _memcpy

 	j .Ltokenize_next_end

 .Ltokenize_next_single:
-	lw a0, 4(sp)
+	lw a0, 12(sp)
 	addi s1, a0, 1
 	lbu a0, (a0)
 	call _classify_single
@@ -573,7 +580,7 @@ _tokenize_next:

 .Ltokenize_next_composite:
 	addi s1, s1, 1
-	lw a1, 4(sp)
+	lw a1, 12(sp)
 	sub a0, s1, a1
 	call _classify_composite
 	lw a1, 0(sp)
@@ -585,11 +592,11 @@ _tokenize_next:
 	mv a0, s1 # Return the advanced text pointer.

 	# Restore saved registers.
-	lw s1, 12(sp)
-	lw s2, 8(sp)
+	lw s1, 20(sp)
+	lw s2, 16(sp)

 	# Epilogue.
-	lw ra, 20(sp)
-	lw s0, 16(sp)
-	addi sp, sp, 24
+	lw ra, 28(sp)
+	lw s0, 24(sp)
+	addi sp, sp, 32
 	ret