Tokenize single character symbols

author: Eugen Wissner <belka@caraus.de> 2025-05-03 23:35:41 +0200
committer: Eugen Wissner <belka@caraus.de> 2025-05-03 23:35:41 +0200
commit: 0a0bc4e1f2e263cdda83bd78a2a77ff68ae2334e (patch)
tree: 23e6b113d741761a2e897e4138a96c7f7701d348 /boot/tokenizer.s
parent: dcfd6b1515679cfbc75de12a17352d9d1eddceaf (diff)
download: elna-0a0bc4e1f2e263cdda83bd78a2a77ff68ae2334e.tar.gz
1 files changed, 138 insertions, 69 deletions
diff --git a/boot/tokenizer.s b/boot/tokenizer.s
index 4315f66..67b2602 100644
--- a/boot/tokenizer.s
+++ b/boot/tokenizer.s
@@ -1,4 +1,10 @@
-.global _tokenize_next, classification, transitions, keywords
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+
+.global _tokenize_next, classification, transitions, keywords, byte_keywords
+
+.include "boot/definitions.inc"
 
 .section .rodata
 
@@ -8,7 +14,7 @@
 #
 # Classification:
 #
-.equ CLASS_INVALID, 0x0
+.equ CLASS_INVALID, 0x00
 .equ CLASS_DIGIT, 0x01
 .equ CLASS_CHARACTER, 0x02
 .equ CLASS_SPACE, 0x03
@@ -25,9 +31,11 @@
 .equ CLASS_EOF, 0x0e
 .equ CLASS_DOT, 0x0f
 .equ CLASS_MINUS, 0x10
-.equ CLASS_DOUBLE_QUOTE, 0x11
+.equ CLASS_QUOTE, 0x11
+.equ CLASS_GREATER, 0x12
+.equ CLASS_LESS, 0x13
 
-.equ CLASS_COUNT, 18
+.equ CLASS_COUNT, 20
 
 .type classification, @object
 .size classification, 128
@@ -66,12 +74,12 @@ classification:
 	.byte CLASS_INVALID # 1F US
 	.byte CLASS_SPACE # 20 Space
 	.byte CLASS_SINGLE # 21 !
-	.byte CLASS_DOUBLE_QUOTE # 22 "
+	.byte CLASS_QUOTE # 22 "
 	.byte 0x00 # 23 #
 	.byte 0x00 # 24 $
 	.byte CLASS_SINGLE # 25 %
 	.byte CLASS_SINGLE # 26 &
-	.byte 0x00 # 27 '
+	.byte CLASS_QUOTE # 27 '
 	.byte CLASS_LEFT_PAREN # 28 (
 	.byte CLASS_RIGHT_PAREN # 29 )
 	.byte CLASS_ASTERISK # 2A *
@@ -92,9 +100,9 @@ classification:
 	.byte CLASS_DIGIT # 39 9
 	.byte CLASS_COLON # 3A :
 	.byte CLASS_SINGLE # 3B ;
-	.byte 0x00 # 3C <
+	.byte CLASS_LESS # 3C <
 	.byte CLASS_EQUALS # 3D =
-	.byte 0x00 # 3E >
+	.byte CLASS_GREATER # 3E >
 	.byte 0x00 # 3F ?
 	.byte CLASS_SINGLE # 40 @
 	.byte CLASS_CHARACTER # 41 A
@@ -220,7 +228,10 @@ keywords:
 	.ascii "case"
 	.word 2
 	.ascii "of"
-.size keywords, . - keywords
+
+.type byte_keywords, @object
+byte_keywords: .ascii "&.,:;()[]^=+-*@" # Order matters: _classify_single turns a character's position here into its token type.
+.equ BYTE_KEYWORDS_SIZE, . - byte_keywords # Length of the byte_keywords string in bytes.
 
 .section .data
 
@@ -240,78 +251,66 @@ keywords:
 #   handles each action.
 #
 .type transitions, @object
-.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT
+.size transitions, 14 * CLASS_COUNT * 4 # state count * CLASS_COUNT * bytes per .word entry
 transitions:
 	#     Invalid Digit   Alpha   Space   :       =       (       )     
 	#     *       _       Single  Hex     0       x       NUL     .
-	#     -       "
-	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
-	.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
-	.word 0x010f, 0x0110
+	#     -       " or '  >       <
+	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff
+	.word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x0108
+	.word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start
 
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
-	.word 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon
 
 	.word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
-	.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier
-	.word 0x05ff, 0x05ff
+	.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff
+	.word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier
 
 	.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
-	.word 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x03 Integer
 
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff
 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
-	.word 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater
 
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals
-	.word 0x02ff, 0x02ff
+	.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
+	.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
+	.word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus
 
 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
-	.word 0x02ff, 0x02ff
+	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x06 Left paren
 
 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
-	.word 0x02ff, 0x02ff
-
 	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
-	.word 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less
+
+	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x08 Dot
 
 	.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
-	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
-	.word 0x0109, 0x0109
+	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
+	.word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment
 
 	.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
-	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
-	.word 0x0109, 0x0109
+	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
+	.word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment
 
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
-	.word 0x02ff, 0x02ff
+	.word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
+	.word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
+	.word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String
 
 	.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
-	.word 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x0c Zero
 
 	.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal
-	.word 0x00ff, 0x02ff
-
-	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
-	.word 0x02ff, 0x02ff
-
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus
-	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
-	.word 0x02ff, 0x02ff
-
-	.word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string.
-	.word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110
-	.word 0x0110, 0x04ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff
+	.word 0x00ff, 0x02ff, 0x02ff, 0x02ff # 0x0d Hexadecimal
 
 .section .text
 
@@ -406,6 +405,57 @@ _classify_identifier:
 	addi sp, sp, 16
 	ret
 
+# Takes a symbol and determines its type by looking it up in byte_keywords.
+#
+# Parameters:
+# a0 - Token character.
+#
+# Sets a0 to the appropriate token type: (_memchr result - byte_keywords) + 27.
+.type _classify_single, @function
+_classify_single:
+	# Prologue.
+	addi sp, sp, -16
+	sw ra, 12(sp)
+	sw s0, 8(sp)
+	addi s0, sp, 16
+
+	mv a1, a0 # The character is handed to _memchr in a1.
+	li a2, BYTE_KEYWORDS_SIZE # Number of bytes in byte_keywords.
+	la a0, byte_keywords
+	call _memchr # NOTE(review): presumably returns a pointer to the match like C memchr - confirm.
+
+	la a1, byte_keywords
+	sub a0, a0, a1 # Distance of the _memchr result from the start of the table.
+	addi a0, a0, 27 # NOTE(review): 27 is a magic base, and a failed lookup is not checked - confirm both.
+
+	# Epilogue.
+	lw ra, 12(sp)
+	lw s0, 8(sp)
+	addi sp, sp, 16
+	ret
+
+# Classifies a symbol containing multiple characters (probably 2).
+#
+# Parameters:
+# a0 - Token length.
+# a1 - Token pointer.
+#
+# Sets a0 to TOKEN_ASSIGN if the token starts with ':', otherwise leaves a0 unchanged.
+.type _classify_composite, @function
+_classify_composite:
+	lbu t0, 0(a1) # Only the first character of the token is inspected.
+	li t1, ':'
+	beq t0, t1, .Lclassify_composite_assign
+
+	j .Lclassify_composite_end # NOTE(review): a0 still holds the token length here - confirm this is intended.
+
+.Lclassify_composite_assign:
+	li a0, TOKEN_ASSIGN
+	j .Lclassify_composite_end
+
+.Lclassify_composite_end:
+	ret
+
 # Initializes the classification table.
 #
 # Paramaters:
@@ -453,12 +503,18 @@ _tokenize_next:
 	li t0, 0x03 # Skip action.
 	beq t1, t0, .Ltokenize_next_skip
 
-	li t0, 0x04 # Comment action.
+	li t0, 0x04 # Delimited string action.
 	beq t1, t0, .Ltokenize_next_comment
 
 	li t0, 0x05 # Finalize identifier.
 	beq t1, t0, .Ltokenize_next_identifier
 
+	li t0, 0x06 # Single character symbol action.
+	beq t1, t0, .Ltokenize_next_single
+
+	li t0, 0x07 # An action for symbols containing multiple characters.
+	beq t1, t0, .Ltokenize_next_composite
+
 	j .Ltokenize_next_reject
 
 .Ltokenize_next_reject:
@@ -481,24 +537,17 @@ _tokenize_next:
 
 .Ltokenize_next_print:
 	/* DEBUG
-	lw a0, 4(sp)
-	mv a1, s1
-	sub a1, a1, a0
-	call _write_error
-	DEBUG */
+	addi a0, a0, 21
+	sw a0, 0(sp)
+	addi a0, sp, 0
+	li a1, 1
+	call _write_error */
 
 	j .Ltokenize_next_end
 
 .Ltokenize_next_comment:
 	addi s1, s1, 1
 
-	/* DEBUG
-	lw a0, 4(sp)
-	mv a1, s1
-	sub a1, a1, a0
-	call _write_error
-	DEBUG */
-
 	j .Ltokenize_next_end
 
 .Ltokenize_next_identifier:
@@ -512,6 +561,26 @@ _tokenize_next:
 
 	j .Ltokenize_next_end
 
+.Ltokenize_next_single:
+	lw a0, 4(sp)
+	addi s1, a0, 1
+	lbu a0, (a0)
+	call _classify_single
+	lw a1, 0(sp)
+	sw a0, (a1)
+
+	j .Ltokenize_next_end
+
+.Ltokenize_next_composite:
+	addi s1, s1, 1
+	lw a1, 4(sp)
+	sub a0, s1, a1
+	call _classify_composite
+	lw a1, 0(sp)
+	sw a0, (a1)
+
+	j .Ltokenize_next_end
+
 .Ltokenize_next_end:
 	mv a0, s1 # Return the advanced text pointer.
author	Eugen Wissner <belka@caraus.de>	2025-05-03 23:35:41 +0200
committer	Eugen Wissner <belka@caraus.de>	2025-05-03 23:35:41 +0200
commit	0a0bc4e1f2e263cdda83bd78a2a77ff68ae2334e (patch)
tree	23e6b113d741761a2e897e4138a96c7f7701d348 /boot/tokenizer.s
parent	dcfd6b1515679cfbc75de12a17352d9d1eddceaf (diff)
download	elna-0a0bc4e1f2e263cdda83bd78a2a77ff68ae2334e.tar.gz