Create tokenization tables

This commit is contained in:
Eugen Wissner 2025-05-01 23:37:40 +02:00
parent f3a8b2626a
commit 768821c689
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
3 changed files with 381 additions and 178 deletions

View File

@ -1896,11 +1896,6 @@ _main:
sw s0, 0(sp) sw s0, 0(sp)
addi s0, sp, 8 addi s0, sp, 8
# Read the source from the standard input.
la a0, source_code
li a1, SOURCE_BUFFER_SIZE # Buffer size.
call _read_file
li s2, 1 li s2, 1
# Epilogue. # Epilogue.
@ -1912,7 +1907,13 @@ _main:
# Entry point. # Entry point.
.type _start, @function .type _start, @function
_start: _start:
call _tokenizer_initialize # Read the source from the standard input.
la a0, source_code
li a1, SOURCE_BUFFER_SIZE # Buffer size.
call _read_file
mv a0, s1
call _tokenize
call _main call _main
call _compile call _compile

View File

@ -1,20 +0,0 @@
- start
digit: integer
upper: identifier
lower: identifier
space: start
invalid: error
- identifier
digit: identifier
upper: identifier
lower: identifier
space: end
invalid: end
- integer:
digit: integer
upper: end
lower: end
space: end
invalid: end

View File

@ -1,190 +1,405 @@
.global _tokenizer_initialize .global _tokenize, classification, transitions
.section .rodata .section .rodata
# #
# Classes: # Classification table assigns each possible character to a group (class). All
# characters of the same group are handled equivalently.
# #
# 0x00: Invalid # Classification:
# 0x01: Digit #
# 0x02: Character .equ CLASS_INVALID, 0x0
# 0x03: Space .equ CLASS_DIGIT, 0x01
.type classes, @object .equ CLASS_CHARACTER, 0x02
.size classes, 128 .equ CLASS_SPACE, 0x03
classes: .equ CLASS_COLON, 0x04
.byte 0x00 # 00 NUL .equ CLASS_EQUALS, 0x05
.byte 0x00 # 01 SOH .equ CLASS_LEFT_PAREN, 0x06
.byte 0x00 # 02 STX .equ CLASS_RIGHT_PAREN, 0x07
.byte 0x00 # 03 ETX .equ CLASS_ASTERISK, 0x08
.byte 0x00 # 04 EOT .equ CLASS_UNDERSCORE, 0x09
.byte 0x00 # 05 ENQ .equ CLASS_SINGLE, 0x0a
.byte 0x00 # 06 ACK .equ CLASS_HEX, 0x0b
.byte 0x00 # 07 BEL .equ CLASS_ZERO, 0x0c
.byte 0x00 # 08 BS .equ CLASS_X, 0x0d
.byte 0x00 # 09 HT .equ CLASS_EOF, 0x0e
.byte 0x00 # 0A LF .equ CLASS_DOT, 0x0f
.byte 0x00 # 0B VT
.byte 0x00 # 0C FF .equ CLASS_COUNT, 16
.byte 0x00 # 0D CR
.byte 0x00 # 0E SO .type classification, @object
.byte 0x00 # 0F SI .size classification, 128
.byte 0x00 # 10 DLE classification:
.byte 0x00 # 11 DC1 .byte CLASS_EOF # 00 NUL
.byte 0x00 # 12 DC2 .byte CLASS_INVALID # 01 SOH
.byte 0x00 # 13 DC3 .byte CLASS_INVALID # 02 STX
.byte 0x00 # 14 DC4 .byte CLASS_INVALID # 03 ETX
.byte 0x00 # 15 NAK .byte CLASS_INVALID # 04 EOT
.byte 0x00 # 16 SYN .byte CLASS_INVALID # 05 ENQ
.byte 0x00 # 17 ETB .byte CLASS_INVALID # 06 ACK
.byte 0x00 # 18 CAN .byte CLASS_INVALID # 07 BEL
.byte 0x00 # 19 EM .byte CLASS_INVALID # 08 BS
.byte 0x00 # 1A SUB .byte CLASS_SPACE # 09 HT
.byte 0x00 # 1B ESC .byte CLASS_SPACE # 0A LF
.byte 0x00 # 1C FS .byte CLASS_INVALID # 0B VT
.byte 0x00 # 1D GS .byte CLASS_INVALID # 0C FF
.byte 0x00 # 1E RS .byte CLASS_SPACE # 0D CR
.byte 0x00 # 1F US .byte CLASS_INVALID # 0E SO
.byte 0x03 # 20 Space .byte CLASS_INVALID # 0F SI
.byte 0x00 # 21 ! .byte CLASS_INVALID # 10 DLE
.byte CLASS_INVALID # 11 DC1
.byte CLASS_INVALID # 12 DC2
.byte CLASS_INVALID # 13 DC3
.byte CLASS_INVALID # 14 DC4
.byte CLASS_INVALID # 15 NAK
.byte CLASS_INVALID # 16 SYN
.byte CLASS_INVALID # 17 ETB
.byte CLASS_INVALID # 18 CAN
.byte CLASS_INVALID # 19 EM
.byte CLASS_INVALID # 1A SUB
.byte CLASS_INVALID # 1B ESC
.byte CLASS_INVALID # 1C FS
.byte CLASS_INVALID # 1D GS
.byte CLASS_INVALID # 1E RS
.byte CLASS_INVALID # 1F US
.byte CLASS_SPACE # 20 Space
.byte CLASS_SINGLE # 21 !
.byte 0x00 # 22 " .byte 0x00 # 22 "
.byte 0x00 # 23 # .byte 0x00 # 23 #
.byte 0x00 # 24 $ .byte 0x00 # 24 $
.byte 0x00 # 25 % .byte CLASS_SINGLE # 25 %
.byte 0x00 # 26 & .byte CLASS_SINGLE # 26 &
.byte 0x00 # 27 ' .byte 0x00 # 27 '
.byte 0x00 # 28 ( .byte CLASS_LEFT_PAREN # 28 (
.byte 0x00 # 29 ) .byte CLASS_RIGHT_PAREN # 29 )
.byte 0x00 # 2A * .byte CLASS_ASTERISK # 2A *
.byte 0x00 # 2B + .byte CLASS_SINGLE # 2B +
.byte 0x00 # 2C , .byte CLASS_SINGLE # 2C ,
.byte 0x00 # 2D - .byte 0x00 # 2D -
.byte 0x00 # 2E . .byte CLASS_DOT # 2E .
.byte 0x00 # 2F / .byte CLASS_SINGLE # 2F /
.byte 0x01 # 30 0 .byte CLASS_ZERO # 30 0
.byte 0x01 # 31 1 .byte CLASS_DIGIT # 31 1
.byte 0x01 # 32 2 .byte CLASS_DIGIT # 32 2
.byte 0x01 # 33 3 .byte CLASS_DIGIT # 33 3
.byte 0x01 # 34 4 .byte CLASS_DIGIT # 34 4
.byte 0x01 # 35 5 .byte CLASS_DIGIT # 35 5
.byte 0x01 # 36 6 .byte CLASS_DIGIT # 36 6
.byte 0x01 # 37 7 .byte CLASS_DIGIT # 37 7
.byte 0x01 # 38 8 .byte CLASS_DIGIT # 38 8
.byte 0x01 # 39 9 .byte CLASS_DIGIT # 39 9
.byte 0x00 # 3A : .byte CLASS_COLON # 3A :
.byte 0x00 # 3B ; .byte CLASS_SINGLE # 3B ;
.byte 0x00 # 3C < .byte 0x00 # 3C <
.byte 0x00 # 3D = .byte CLASS_EQUALS # 3D =
.byte 0x00 # 3E > .byte 0x00 # 3E >
.byte 0x00 # 3F ? .byte 0x00 # 3F ?
.byte 0x00 # 40 @ .byte CLASS_SINGLE # 40 @
.byte 0x02 # 41 A .byte CLASS_CHARACTER # 41 A
.byte 0x02 # 42 B .byte CLASS_CHARACTER # 42 B
.byte 0x02 # 43 C .byte CLASS_CHARACTER # 43 C
.byte 0x02 # 44 D .byte CLASS_CHARACTER # 44 D
.byte 0x02 # 45 E .byte CLASS_CHARACTER # 45 E
.byte 0x02 # 46 F .byte CLASS_CHARACTER # 46 F
.byte 0x02 # 47 G .byte CLASS_CHARACTER # 47 G
.byte 0x02 # 48 H .byte CLASS_CHARACTER # 48 H
.byte 0x02 # 49 I .byte CLASS_CHARACTER # 49 I
.byte 0x02 # 4A J .byte CLASS_CHARACTER # 4A J
.byte 0x02 # 4B K .byte CLASS_CHARACTER # 4B K
.byte 0x02 # 4C L .byte CLASS_CHARACTER # 4C L
.byte 0x02 # 4D M .byte CLASS_CHARACTER # 4D M
.byte 0x02 # 4E N .byte CLASS_CHARACTER # 4E N
.byte 0x02 # 4F O .byte CLASS_CHARACTER # 4F O
.byte 0x02 # 50 P .byte CLASS_CHARACTER # 50 P
.byte 0x02 # 51 Q .byte CLASS_CHARACTER # 51 Q
.byte 0x02 # 52 R .byte CLASS_CHARACTER # 52 R
.byte 0x02 # 53 S .byte CLASS_CHARACTER # 53 S
.byte 0x02 # 54 T .byte CLASS_CHARACTER # 54 T
.byte 0x02 # 55 U .byte CLASS_CHARACTER # 55 U
.byte 0x02 # 56 V .byte CLASS_CHARACTER # 56 V
.byte 0x02 # 57 W .byte CLASS_CHARACTER # 57 W
.byte 0x02 # 58 X .byte CLASS_CHARACTER # 58 X
.byte 0x02 # 59 Y .byte CLASS_CHARACTER # 59 Y
.byte 0x02 # 5A Z .byte CLASS_CHARACTER # 5A Z
.byte 0x00 # 5B [ .byte CLASS_SINGLE # 5B [
.byte 0x00 # 5C \ .byte 0x00 # 5C \
.byte 0x00 # 5D ] .byte CLASS_SINGLE # 5D ]
.byte 0x00 # 5E ^ .byte CLASS_SINGLE # 5E ^
.byte 0x00 # 5F _ .byte CLASS_UNDERSCORE # 5F _
.byte 0x00 # 60 ` .byte 0x00 # 60 `
.byte 0x02 # 61 a .byte CLASS_HEX # 61 a
.byte 0x02 # 62 b .byte CLASS_HEX # 62 b
.byte 0x02 # 63 c .byte CLASS_HEX # 63 c
.byte 0x02 # 64 d .byte CLASS_HEX # 64 d
.byte 0x02 # 65 e .byte CLASS_HEX # 65 e
.byte 0x02 # 66 f .byte CLASS_HEX # 66 f
.byte 0x02 # 67 g .byte CLASS_CHARACTER # 67 g
.byte 0x02 # 68 h .byte CLASS_CHARACTER # 68 h
.byte 0x02 # 69 i .byte CLASS_CHARACTER # 69 i
.byte 0x02 # 6A j .byte CLASS_CHARACTER # 6A j
.byte 0x02 # 6B k .byte CLASS_CHARACTER # 6B k
.byte 0x02 # 6C l .byte CLASS_CHARACTER # 6C l
.byte 0x02 # 6D m .byte CLASS_CHARACTER # 6D m
.byte 0x02 # 6E n .byte CLASS_CHARACTER # 6E n
.byte 0x02 # 6F o .byte CLASS_CHARACTER # 6F o
.byte 0x02 # 70 p .byte CLASS_CHARACTER # 70 p
.byte 0x02 # 71 q .byte CLASS_CHARACTER # 71 q
.byte 0x02 # 72 r .byte CLASS_CHARACTER # 72 r
.byte 0x02 # 73 s .byte CLASS_CHARACTER # 73 s
.byte 0x02 # 74 t .byte CLASS_CHARACTER # 74 t
.byte 0x02 # 75 u .byte CLASS_CHARACTER # 75 u
.byte 0x02 # 76 v .byte CLASS_CHARACTER # 76 v
.byte 0x02 # 77 w .byte CLASS_CHARACTER # 77 w
.byte 0x02 # 78 x .byte CLASS_X # 78 x
.byte 0x02 # 79 y .byte CLASS_CHARACTER # 79 y
.byte 0x02 # 7A z .byte CLASS_CHARACTER # 7A z
.byte 0x00 # 7B { .byte 0x00 # 7B {
.byte 0x00 # 7C | .byte CLASS_SINGLE # 7C |
.byte 0x00 # 7D } .byte 0x00 # 7D }
.byte 0x00 # 7E ~ .byte CLASS_SINGLE # 7E ~
.byte 0x00 # 7F DEL .byte CLASS_INVALID # 7F DEL
.section .data .section .data
.section .bss # The transition table describes transitions from one state to another, given
.type class_names, @object # a symbol (character class).
.size class_names, 1024 #
class_names: .zero 1024 # The table has m rows and n columns, where m is the amount of states and n is
# the amount of classes. So given the current state and a classified character
# the table can be used to look up the next state.
#
# Each cell is a word long.
# - The least significant byte of the word is a row number (beginning with 0).
# It specifies the target state. "ff" means that this is an end state and no
# transition is possible.
# - The next byte is the action that should be performed when transitioning.
# For the meaning of actions see labels in the _analyze_token function, which
# handles each action.
#
# Transition table: one row per state (00..0e), one 32-bit cell per character
# class (CLASS_COUNT cells per row). Each row is written as two .word lines of
# eight cells each; the column order is given by the two header lines below.
#
# Cell layout, as decoded by _analyze_token:
# - bits 0..7:  target state (row number), 0xff for an end state.
# - bits 8..15: action (0x01 accumulate, 0x02 print, 0x03 skip, 0x04 comment;
#   any other value takes the error path in _analyze_token).
#
# The cells are read with lw, so the table is aligned to the word size.
.balign 4
.type transitions, @object
transitions:
# Invalid Digit Alpha Space : = ( )
# * _ Single Hex 0 x NUL .
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier
.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff # 0d Hexadecimal
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
# Size in bytes (state count * CLASS_COUNT * word size), derived from the data
# above so that it cannot get out of sync when rows are added.
.size transitions, . - transitions
.section .text .section .text
# Returns the class from the classification table for the given character.
#
# The classification table has 128 entries, one per 7-bit ASCII code. Values
# outside of that range (for example bytes 0x80..0xff loaded with lbu) are
# reported as CLASS_INVALID instead of reading past the end of the table.
#
# Parameters:
# a0 - Character.
#
# Sets a0 to the class number. Uses t0 as a scratch register.
.type _classify, @function
_classify:
    li t0, 128 # Number of entries in the classification table.
    bgeu a0, t0, .Lclassify_invalid # Unsigned compare, so negative values are rejected too.

    la t0, classification
    add t0, t0, a0 # Character class pointer.
    lbu a0, (t0) # Character class.
    ret

.Lclassify_invalid:
    li a0, CLASS_INVALID
    ret
# Looks up the transition table cell for a state and a character class.
#
# The cell index is "state * CLASS_COUNT + class"; every cell is one word
# (4 bytes) wide.
#
# Parameters:
# a0 - Current state.
# a1 - Character class.
#
# Sets a0 to the complete table cell, that is the word read from the
# transitions table. Uses t0 as a scratch register.
.type _lookup_state, @function
_lookup_state:
    li t0, CLASS_COUNT
    mul t0, a0, t0 # First cell of the row belonging to the state.
    add t0, t0, a1 # Cell index inside the table.
    slli t0, t0, 2 # Cell index to byte offset (word size is 4).

    la a0, transitions
    add a0, a0, t0 # Cell address.
    lw a0, (a0) # Table cell.
    ret
# Classifies a character and looks up the transition for the resulting class.
#
# Parameters:
# a0 - Current state.
# a1 - Character.
#
# Sets a0 to the value returned by _lookup_state for the current state and
# the class of the given character.
.type _next_state, @function
_next_state:
    # Prologue: 16 byte frame with ra, the caller's s0 and one local.
    addi sp, sp, -16
    sw s0, 8(sp)
    sw ra, 12(sp)
    addi s0, sp, 16

    # a0 carries both the argument and the result of _classify, so the
    # current state is parked in the frame during that call.
    sw a0, -12(s0)
    mv a0, a1 # Character to classify.
    call _classify

    mv a1, a0 # Character class.
    lw a0, -12(s0) # Current state.
    call _lookup_state

    # Epilogue.
    lw s0, 8(sp)
    lw ra, 12(sp)
    addi sp, sp, 16
    ret
# Initializes the classification table. # Initializes the classification table.
# #
# Parameters: # Parameters:
# a0 - Raw input for the classification table. # a0 - Source text pointer.
.type _initialize_classes, @function .type _analyze_token, @function
_initialize_classes: _analyze_token:
# Prologue. # Prologue.
addi sp, sp, -24 addi sp, sp, -24
sw ra, 20(sp) sw ra, 20(sp)
sw s0, 16(sp) sw s0, 16(sp)
addi s0, sp, 24 addi s0, sp, 24
sw s1, 12(sp) # Preserve the s1 register used for the character counter. sw s1, 12(sp) # Preserve s1 used for current source text position.
li s1, 128 # 128 ASCII characters. mv s1, a0
sw a0, 4(sp) # Keeps a pointer to the beginning of a token.
.Linitialize_classes_loop: sw s2, 8(sp) # Preserve s2 containing the current state.
addi s1, s1, -1 li s2, 0x00 # Initial, start state.
la t0, classes .Lanalyze_token_loop:
add t0, t0, s1 mv a0, s2
lbu t0, (t0) lbu a1, (s1)
li t1, 0x01 call _next_state
bne t0, t1, .Linitialize_classes_step li t0, 0xff
and s2, a0, t0 # Next state.
/* DEBUG */ li t0, 0xff00
li a0, 0x69676964 and t1, a0, t0 # Transition action.
sw a0, 8(sp) # Preserve the memory address. srli t1, t1, 8
addi a0, sp, 8
li a1, 4
li t0, 0x01 # Accumulate action.
beq t1, t0, .Lanalyze_token_accumulate
li t0, 0x02 # Print action.
beq t1, t0, .Lanalyze_token_print
li t0, 0x03 # Skip action.
beq t1, t0, .Lanalyze_token_skip
li t0, 0x04 # Comment action.
beq t1, t0, .Lanalyze_token_comment
/* DEBUG
mv s4, t1
addi t1, t1, '0'
sb t1, 0(sp)
li t1, ' '
sb t1, 1(sp)
addi t1, s2, '0'
sb t1, 2(sp)
addi a0, sp, 0 */
sw s1, 0(sp)
addi a0, s1, 0
li a1, 3
call _write_error call _write_error
/* mv t1, s4
DEBUG */
.Linitialize_classes_step: j .Lanalyze_token_reject
bnez s1, .Linitialize_classes_loop
lw s1, 12(sp) # Restore the saved register. .Lanalyze_token_reject:
addi s1, s1, 1
j .Lanalyze_token_end
.Lanalyze_token_accumulate:
addi s1, s1, 1
j .Lanalyze_token_loop
.Lanalyze_token_skip:
addi s1, s1, 1
lw t0, 4(sp)
addi t0, t0, 1
sw t0, 4(sp)
j .Lanalyze_token_loop
.Lanalyze_token_print:
/* DEBUG
lw a0, 4(sp)
mv a1, s1
sub a1, a1, a0
call _write_error
DEBUG */
j .Lanalyze_token_end
.Lanalyze_token_comment:
addi s1, s1, 1
/* DEBUG
lw a0, 4(sp)
mv a1, s1
sub a1, a1, a0
call _write_error
DEBUG */
j .Lanalyze_token_end
.Lanalyze_token_end:
mv a0, s1 # Return the advanced text pointer.
# Restore saved registers.
lw s1, 12(sp)
lw s2, 8(sp)
# Epilogue. # Epilogue.
lw ra, 20(sp) lw ra, 20(sp)
@ -193,15 +408,22 @@ _initialize_classes:
ret ret
# Initializes the lookup tables. # Initializes the lookup tables.
.type _tokenizer_initialize, @function #
_tokenizer_initialize: # Parameters:
# a0 - Source text pointer.
.type _tokenize, @function
_tokenize:
# Prologue. # Prologue.
addi sp, sp, -8 addi sp, sp, -8
sw ra, 4(sp) sw ra, 4(sp)
sw s0, 0(sp) sw s0, 0(sp)
addi s0, sp, 8 addi s0, sp, 8
call _initialize_classes .Ltokenize_loop:
call _analyze_token
lw t0, (a0)
bnez t0, .Ltokenize_loop
# Epilogue. # Epilogue.
lw ra, 4(sp) lw ra, 4(sp)