Create tokenization tables

2025-05-01 23:37:40 +02:00
parent f3a8b2626a
commit 768821c689
3 changed files with 381 additions and 178 deletions
--- a/boot/stage1.s
+++ b/boot/stage1.s
@@ -1896,11 +1896,6 @@ _main:
 	sw s0, 0(sp)
 	addi s0, sp, 8

-	# Read the source from the standard input.
-	la a0, source_code
-	li a1, SOURCE_BUFFER_SIZE # Buffer size.
-	call _read_file
-
 	li s2, 1

 	# Epilogue.
@@ -1912,7 +1907,13 @@ _main:
 # Entry point.
 .type _start, @function
 _start:
-	call _tokenizer_initialize
+	# Read the source from the standard input.
+	la a0, source_code
+	li a1, SOURCE_BUFFER_SIZE # Buffer size.
+	call _read_file
+
+	mv a0, s1
+	call _tokenize
 	call _main
 	call _compile

--- a/boot/states.txt
+++ b/boot/states.txt
@@ -1,20 +0,0 @@
- start
-digit: integer
-upper: identifier
-lower: identifier
-space: start
-invalid: error
-
- identifier
-digit: identifier
-upper: identifier
-lower: identifier
-space: end
-invalid: end
-
- integer:
-digit: integer
-upper: end
-lower: end
-space: end
-invalid: end
--- a/boot/tokenizer.s
+++ b/boot/tokenizer.s
@@ -1,190 +1,405 @@
-.global _tokenizer_initialize
+.global _tokenize, classification, transitions

 .section .rodata
+
 #
-# Classes:
+# Classification table assigns each possible character to a group (class). All
+# characters of the same group are handled equivalently.
 #
-# 0x00: Invalid
-# 0x01: Digit
-# 0x02: Character
-# 0x03: Space
-.type classes, @object
-.size classes, 128
-classes:
-	.byte 0x00 # 00 NUL
-	.byte 0x00 # 01 SOH
-	.byte 0x00 # 02 STX
-	.byte 0x00 # 03 ETX
-	.byte 0x00 # 04 EOT
-	.byte 0x00 # 05 ENQ
-	.byte 0x00 # 06 ACK
-	.byte 0x00 # 07 BEL
-	.byte 0x00 # 08 BS
-	.byte 0x00 # 09 HT
-	.byte 0x00 # 0A LF
-	.byte 0x00 # 0B VT
-	.byte 0x00 # 0C FF
-	.byte 0x00 # 0D CR
-	.byte 0x00 # 0E SO
-	.byte 0x00 # 0F SI
-	.byte 0x00 # 10 DLE
-	.byte 0x00 # 11 DC1
-	.byte 0x00 # 12 DC2
-	.byte 0x00 # 13 DC3
-	.byte 0x00 # 14 DC4
-	.byte 0x00 # 15 NAK
-	.byte 0x00 # 16 SYN
-	.byte 0x00 # 17 ETB
-	.byte 0x00 # 18 CAN
-	.byte 0x00 # 19 EM
-	.byte 0x00 # 1A SUB
-	.byte 0x00 # 1B ESC
-	.byte 0x00 # 1C FS
-	.byte 0x00 # 1D GS
-	.byte 0x00 # 1E RS
-	.byte 0x00 # 1F US
-	.byte 0x03 # 20 Space
-	.byte 0x00 # 21 !
+# Character classes. The class number selects the column of the transition
+# table.
+#
+.equ CLASS_INVALID, 0x00 # Any character not listed below, e.g. most control characters.
+.equ CLASS_DIGIT, 0x01 # 1-9
+.equ CLASS_CHARACTER, 0x02 # Letters A-Z, g-w, y and z.
+.equ CLASS_SPACE, 0x03 # Tab, line feed, carriage return and space.
+.equ CLASS_COLON, 0x04 # :
+.equ CLASS_EQUALS, 0x05 # =
+.equ CLASS_LEFT_PAREN, 0x06 # (
+.equ CLASS_RIGHT_PAREN, 0x07 # )
+.equ CLASS_ASTERISK, 0x08 # *
+.equ CLASS_UNDERSCORE, 0x09 # _
+.equ CLASS_SINGLE, 0x0a # ! % & + , / ; @ [ ] ^ | ~
+.equ CLASS_HEX, 0x0b # Letters a-f.
+.equ CLASS_ZERO, 0x0c # 0
+.equ CLASS_X, 0x0d # x
+.equ CLASS_EOF, 0x0e # NUL
+.equ CLASS_DOT, 0x0f # .
+
+.equ CLASS_COUNT, 16 # Number of classes defined above.
+
+.type classification, @object
+.size classification, 128
+classification:
+	.byte CLASS_EOF # 00 NUL
+	.byte CLASS_INVALID # 01 SOH
+	.byte CLASS_INVALID # 02 STX
+	.byte CLASS_INVALID # 03 ETX
+	.byte CLASS_INVALID # 04 EOT
+	.byte CLASS_INVALID # 05 ENQ
+	.byte CLASS_INVALID # 06 ACK
+	.byte CLASS_INVALID # 07 BEL
+	.byte CLASS_INVALID # 08 BS
+	.byte CLASS_SPACE # 09 HT
+	.byte CLASS_SPACE # 0A LF
+	.byte CLASS_INVALID # 0B VT
+	.byte CLASS_INVALID # 0C FF
+	.byte CLASS_SPACE # 0D CR
+	.byte CLASS_INVALID # 0E SO
+	.byte CLASS_INVALID # 0F SI
+	.byte CLASS_INVALID # 10 DLE
+	.byte CLASS_INVALID # 11 DC1
+	.byte CLASS_INVALID # 12 DC2
+	.byte CLASS_INVALID # 13 DC3
+	.byte CLASS_INVALID # 14 DC4
+	.byte CLASS_INVALID # 15 NAK
+	.byte CLASS_INVALID # 16 SYN
+	.byte CLASS_INVALID # 17 ETB
+	.byte CLASS_INVALID # 18 CAN
+	.byte CLASS_INVALID # 19 EM
+	.byte CLASS_INVALID # 1A SUB
+	.byte CLASS_INVALID # 1B ESC
+	.byte CLASS_INVALID # 1C FS
+	.byte CLASS_INVALID # 1D GS
+	.byte CLASS_INVALID # 1E RS
+	.byte CLASS_INVALID # 1F US
+	.byte CLASS_SPACE # 20 Space
+	.byte CLASS_SINGLE # 21 !
 	.byte 0x00 # 22 "
 	.byte 0x00 # 23 #
 	.byte 0x00 # 24 $
-	.byte 0x00 # 25 %
-	.byte 0x00 # 26 &
+	.byte CLASS_SINGLE # 25 %
+	.byte CLASS_SINGLE # 26 &
 	.byte 0x00 # 27 '
-	.byte 0x00 # 28 (
-	.byte 0x00 # 29 )
-	.byte 0x00 # 2A *
-	.byte 0x00 # 2B +
-	.byte 0x00 # 2C ,
+	.byte CLASS_LEFT_PAREN # 28 (
+	.byte CLASS_RIGHT_PAREN # 29 )
+	.byte CLASS_ASTERISK # 2A *
+	.byte CLASS_SINGLE # 2B +
+	.byte CLASS_SINGLE # 2C ,
 	.byte 0x00 # 2D -
-	.byte 0x00 # 2E .
-	.byte 0x00 # 2F /
-	.byte 0x01 # 30 0
-	.byte 0x01 # 31 1
-	.byte 0x01 # 32 2
-	.byte 0x01 # 33 3
-	.byte 0x01 # 34 4
-	.byte 0x01 # 35 5
-	.byte 0x01 # 36 6
-	.byte 0x01 # 37 7
-	.byte 0x01 # 38 8
-	.byte 0x01 # 39 9
-	.byte 0x00 # 3A :
-	.byte 0x00 # 3B ;
+	.byte CLASS_DOT # 2E .
+	.byte CLASS_SINGLE # 2F /
+	.byte CLASS_ZERO # 30 0
+	.byte CLASS_DIGIT # 31 1
+	.byte CLASS_DIGIT # 32 2
+	.byte CLASS_DIGIT # 33 3
+	.byte CLASS_DIGIT # 34 4
+	.byte CLASS_DIGIT # 35 5
+	.byte CLASS_DIGIT # 36 6
+	.byte CLASS_DIGIT # 37 7
+	.byte CLASS_DIGIT # 38 8
+	.byte CLASS_DIGIT # 39 9
+	.byte CLASS_COLON # 3A :
+	.byte CLASS_SINGLE # 3B ;
 	.byte 0x00 # 3C <
-	.byte 0x00 # 3D =
+	.byte CLASS_EQUALS # 3D =
 	.byte 0x00 # 3E >
 	.byte 0x00 # 3F ?
-	.byte 0x00 # 40 @
-	.byte 0x02 # 41 A
-	.byte 0x02 # 42 B
-	.byte 0x02 # 43 C
-	.byte 0x02 # 44 D
-	.byte 0x02 # 45 E
-	.byte 0x02 # 46 F
-	.byte 0x02 # 47 G
-	.byte 0x02 # 48 H
-	.byte 0x02 # 49 I
-	.byte 0x02 # 4A J
-	.byte 0x02 # 4B K
-	.byte 0x02 # 4C L
-	.byte 0x02 # 4D M
-	.byte 0x02 # 4E N
-	.byte 0x02 # 4F O
-	.byte 0x02 # 50 P
-	.byte 0x02 # 51 Q
-	.byte 0x02 # 52 R
-	.byte 0x02 # 53 S
-	.byte 0x02 # 54 T
-	.byte 0x02 # 55 U
-	.byte 0x02 # 56 V
-	.byte 0x02 # 57 W
-	.byte 0x02 # 58 X
-	.byte 0x02 # 59 Y
-	.byte 0x02 # 5A Z
-	.byte 0x00 # 5B [
+	.byte CLASS_SINGLE # 40 @
+	.byte CLASS_CHARACTER # 41 A
+	.byte CLASS_CHARACTER # 42 B
+	.byte CLASS_CHARACTER # 43 C
+	.byte CLASS_CHARACTER # 44 D
+	.byte CLASS_CHARACTER # 45 E
+	.byte CLASS_CHARACTER # 46 F
+	.byte CLASS_CHARACTER # 47 G
+	.byte CLASS_CHARACTER # 48 H
+	.byte CLASS_CHARACTER # 49 I
+	.byte CLASS_CHARACTER # 4A J
+	.byte CLASS_CHARACTER # 4B K
+	.byte CLASS_CHARACTER # 4C L
+	.byte CLASS_CHARACTER # 4D M
+	.byte CLASS_CHARACTER # 4E N
+	.byte CLASS_CHARACTER # 4F O
+	.byte CLASS_CHARACTER # 50 P
+	.byte CLASS_CHARACTER # 51 Q
+	.byte CLASS_CHARACTER # 52 R
+	.byte CLASS_CHARACTER # 53 S
+	.byte CLASS_CHARACTER # 54 T
+	.byte CLASS_CHARACTER # 55 U
+	.byte CLASS_CHARACTER # 56 V
+	.byte CLASS_CHARACTER # 57 W
+	.byte CLASS_CHARACTER # 58 X
+	.byte CLASS_CHARACTER # 59 Y
+	.byte CLASS_CHARACTER # 5A Z
+	.byte CLASS_SINGLE # 5B [
 	.byte 0x00 # 5C \
-	.byte 0x00 # 5D ]
-	.byte 0x00 # 5E ^
-	.byte 0x00 # 5F _
+	.byte CLASS_SINGLE # 5D ]
+	.byte CLASS_SINGLE # 5E ^
+	.byte CLASS_UNDERSCORE # 5F _
 	.byte 0x00 # 60 `
-	.byte 0x02 # 61 a
-	.byte 0x02 # 62 b
-	.byte 0x02 # 63 c
-	.byte 0x02 # 64 d
-	.byte 0x02 # 65 e
-	.byte 0x02 # 66 f
-	.byte 0x02 # 67 g
-	.byte 0x02 # 68 h
-	.byte 0x02 # 69 i
-	.byte 0x02 # 6A j
-	.byte 0x02 # 6B k
-	.byte 0x02 # 6C l
-	.byte 0x02 # 6D m
-	.byte 0x02 # 6E n
-	.byte 0x02 # 6F o
-	.byte 0x02 # 70 p
-	.byte 0x02 # 71 q
-	.byte 0x02 # 72 r
-	.byte 0x02 # 73 s
-	.byte 0x02 # 74 t
-	.byte 0x02 # 75 u
-	.byte 0x02 # 76 v
-	.byte 0x02 # 77 w
-	.byte 0x02 # 78 x
-	.byte 0x02 # 79 y
-	.byte 0x02 # 7A z
+	.byte CLASS_HEX # 61 a
+	.byte CLASS_HEX # 62 b
+	.byte CLASS_HEX # 63 c
+	.byte CLASS_HEX # 64 d
+	.byte CLASS_HEX # 65 e
+	.byte CLASS_HEX # 66 f
+	.byte CLASS_CHARACTER # 67 g
+	.byte CLASS_CHARACTER # 68 h
+	.byte CLASS_CHARACTER # 69 i
+	.byte CLASS_CHARACTER # 6A j
+	.byte CLASS_CHARACTER # 6B k
+	.byte CLASS_CHARACTER # 6C l
+	.byte CLASS_CHARACTER # 6D m
+	.byte CLASS_CHARACTER # 6E n
+	.byte CLASS_CHARACTER # 6F o
+	.byte CLASS_CHARACTER # 70 p
+	.byte CLASS_CHARACTER # 71 q
+	.byte CLASS_CHARACTER # 72 r
+	.byte CLASS_CHARACTER # 73 s
+	.byte CLASS_CHARACTER # 74 t
+	.byte CLASS_CHARACTER # 75 u
+	.byte CLASS_CHARACTER # 76 v
+	.byte CLASS_CHARACTER # 77 w
+	.byte CLASS_X # 78 x
+	.byte CLASS_CHARACTER # 79 y
+	.byte CLASS_CHARACTER # 7A z
 	.byte 0x00 # 7B {
-	.byte 0x00 # 7C |
+	.byte CLASS_SINGLE # 7C |
 	.byte 0x00 # 7D }
-	.byte 0x00 # 7E ~
-	.byte 0x00 # 7F DEL
+	.byte CLASS_SINGLE # 7E ~
+	.byte CLASS_INVALID # 7F DEL

 .section .data

-.section .bss
-.type class_names, @object
-.size class_names, 1024
-class_names: .zero 1024
+# The transition table describes transitions from one state to another, given
+# a symbol (character class).
+#
+# The table has m rows and n columns, where m is the amount of states and n is
+# the amount of classes. So given the current state and a classified character
+# the table can be used to look up the next state.
+#
+# Each cell is a word long.
+# - The least significant byte of the word is a row number (beginning with 0).
+#   It specifies the target state. "ff" means that this is an end state and no
+#   transition is possible.
+# - The next byte is the action that should be performed when transitioning.
+#   The actions are handled by the labels in the _analyze_token function:
+#   01 accumulate, 02 print, 03 skip, 04 comment. Any other value (00 in this
+#   table) takes the reject path.
+#
+# The object size is given in bytes: 15 states, CLASS_COUNT cells per state,
+# 4 bytes per cell.
+#
+.type transitions, @object
+.size transitions, 15 * CLASS_COUNT * 4 # State count * CLASS_COUNT * word size.
+transitions:
+	#     Invalid Digit   Alpha   Space   :       =       (       )
+	#     *       _       Single  Hex     0       x       NUL     .
+	.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
+	.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
+
+	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier
+
+	.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
+
+	.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
+	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
+
+	.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
+	.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
+
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
+
+	.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
+
+	.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff # 0d Hexadecimal
+
+	.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
+	.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot

 .section .text

+# Returns the class from the classification table for the given character.
+#
+# The table only has entries for the 128 ASCII characters. Any larger value is
+# reported as CLASS_INVALID instead of reading past the end of the table.
+#
+# Parameters:
+# a0 - Character (compared as an unsigned value).
+#
+# Sets a0 to the class number.
+.type _classify, @function
+_classify:
+	li t0, 128 # Number of entries in the classification table.
+	bgeu a0, t0, .Lclassify_invalid
+
+	la t0, classification
+	add t0, t0, a0 # Character class pointer.
+	lbu a0, (t0) # Character class.
+	ret
+
+.Lclassify_invalid:
+	li a0, CLASS_INVALID
+	ret
+
+# Fetches the transition table cell for a state and a character class.
+#
+# Parameters:
+# a0 - Current state (row of the transition table).
+# a1 - Character class (column of the transition table).
+#
+# Sets a0 to the cell that was found: the next state in the least significant
+# byte and the transition action in the byte above it.
+.type _lookup_state, @function
+_lookup_state:
+	li t0, CLASS_COUNT
+	mul a0, a0, t0 # Index of the first cell of the row.
+	add a0, a0, a1 # Index of the cell within the whole table.
+	slli a0, a0, 2 # Byte offset, each cell is a 4-byte word.
+
+	la t0, transitions
+	add a0, t0, a0 # Cell address.
+	lw a0, (a0)
+
+	ret
+
+# Classifies a character with _classify and passes the resulting class, along
+# with the current state, to _lookup_state.
+#
+# Parameters:
+# a0 - Current state.
+# a1 - Character.
+#
+# Sets a0 to whatever _lookup_state returns for the state and the class of the
+# given character.
+.type _next_state, @function
+_next_state:
+	# Prologue.
+	addi sp, sp, -16
+	sw s0, 8(sp)
+	sw ra, 12(sp)
+	addi s0, sp, 16
+
+	# a0 is needed for the _classify argument and result, so the current
+	# state is parked on the stack in the meantime.
+	sw a0, 0(sp)
+	mv a0, a1
+	call _classify
+
+	mv a1, a0 # Character class.
+	lw a0, 0(sp) # Current state.
+	call _lookup_state
+
+	# Epilogue.
+	lw s0, 8(sp)
+	lw ra, 12(sp)
+	addi sp, sp, 16
+	ret
+
 # Initializes the classification table.
 #
 # Paramaters:
-# a0 - Raw input for the classification table.
-.type _initialize_classes, @function
-_initialize_classes:
+# a0 - Source text pointer.
+.type _analyze_token, @function
+_analyze_token:
 	# Prologue.
 	addi sp, sp, -24
 	sw ra, 20(sp)
 	sw s0, 16(sp)
 	addi s0, sp, 24

-	sw s1, 12(sp) # Preserve the s1 register used for the character counter.
-	li s1, 128 # 128 ASCII characters.
+	sw s1, 12(sp) # Preserve s1 used for current source text position.
+	mv s1, a0
+	sw a0, 4(sp) # Keeps a pointer to the beginning of a token.

-.Linitialize_classes_loop:
-	addi s1, s1, -1
+	sw s2, 8(sp) # Preserve s2 containing the current state.
+	li s2, 0x00 # Initial, start state.

-	la t0, classes
-	add t0, t0, s1
-	lbu t0, (t0)
-	li t1, 0x01
+.Lanalyze_token_loop:
+	mv a0, s2
+	lbu a1, (s1)
+	call _next_state

-	bne t0, t1, .Linitialize_classes_step
+	li t0, 0xff
+	and s2, a0, t0 # Next state.

-	/* DEBUG */
-	li a0, 0x69676964
-	sw a0, 8(sp) # Preserve the memory address.
-	addi a0, sp, 8
-	li a1, 4
+	li t0, 0xff00
+	and t1, a0, t0 # Transition action.
+	srli t1, t1, 8
+
+
+	li t0, 0x01 # Accumulate action.
+	beq t1, t0, .Lanalyze_token_accumulate
+
+	li t0, 0x02 # Print action.
+	beq t1, t0, .Lanalyze_token_print
+
+	li t0, 0x03 # Skip action.
+	beq t1, t0, .Lanalyze_token_skip
+
+	li t0, 0x04 # Comment action.
+	beq t1, t0, .Lanalyze_token_comment
+
+	/* DEBUG
+	mv s4, t1
+	addi t1, t1, '0'
+	sb t1, 0(sp)
+	li t1, ' '
+	sb t1, 1(sp)
+	addi t1, s2, '0'
+	sb t1, 2(sp)
+	addi a0, sp, 0 */
+	sw s1, 0(sp)
+	addi a0, s1, 0
+	li a1, 3
 	call _write_error
+	/* mv t1, s4
+	DEBUG */

-.Linitialize_classes_step:
-	bnez s1, .Linitialize_classes_loop
+	j .Lanalyze_token_reject

-	lw s1, 12(sp) # Restore the saved register.
+.Lanalyze_token_reject:
+	addi s1, s1, 1
+
+	j .Lanalyze_token_end
+
+.Lanalyze_token_accumulate:
+	addi s1, s1, 1
+
+	j .Lanalyze_token_loop
+
+.Lanalyze_token_skip:
+	addi s1, s1, 1
+	lw t0, 4(sp)
+	addi t0, t0, 1
+	sw t0, 4(sp)
+
+	j .Lanalyze_token_loop
+
+.Lanalyze_token_print:
+	/* DEBUG
+	lw a0, 4(sp)
+	mv a1, s1
+	sub a1, a1, a0
+	call _write_error
+	DEBUG */
+
+	j .Lanalyze_token_end
+
+.Lanalyze_token_comment:
+	addi s1, s1, 1
+
+	/* DEBUG
+	lw a0, 4(sp)
+	mv a1, s1
+	sub a1, a1, a0
+	call _write_error
+	DEBUG */
+
+	j .Lanalyze_token_end
+
+.Lanalyze_token_end:
+	mv a0, s1 # Return the advanced text pointer.
+
+	# Restore saved registers.
+	lw s1, 12(sp)
+	lw s2, 8(sp)

 	# Epilogue.
 	lw ra, 20(sp)
@@ -193,15 +408,22 @@ _initialize_classes:
 	ret

 # Initializes the lookup tables.
-.type _tokenizer_initialize, @function
-_tokenizer_initialize:
+#
+# Parameters:
+# a0 - Source text pointer.
+.type _tokenize, @function
+_tokenize:
 	# Prologue.
 	addi sp, sp, -8
 	sw ra, 4(sp)
 	sw s0, 0(sp)
 	addi s0, sp, 8

-	call _initialize_classes
+.Ltokenize_loop:
+	call _analyze_token # Leaves the advanced source text pointer in a0.
+
+	# Continue until the pointer rests on a NUL byte. Only the byte under the
+	# pointer is examined: a word load would also take the three following
+	# bytes into account and may be misaligned, because a token can end at any
+	# address.
+	lbu t0, (a0)
+	bnez t0, .Ltokenize_loop

 	# Epilogue.
 	lw ra, 4(sp)