Properly tokenize declaration sections
This commit is contained in:
230
boot/tokenizer.s
230
boot/tokenizer.s
@@ -1,4 +1,4 @@
|
||||
.global _tokenize, classification, transitions
|
||||
.global _tokenize_next, classification, transitions, keywords
|
||||
|
||||
.section .rodata
|
||||
|
||||
@@ -24,8 +24,10 @@
|
||||
.equ CLASS_X, 0x0d
|
||||
.equ CLASS_EOF, 0x0e
|
||||
.equ CLASS_DOT, 0x0f
|
||||
.equ CLASS_MINUS, 0x10
|
||||
.equ CLASS_DOUBLE_QUOTE, 0x11
|
||||
|
||||
.equ CLASS_COUNT, 16
|
||||
.equ CLASS_COUNT, 18
|
||||
|
||||
.type classification, @object
|
||||
.size classification, 128
|
||||
@@ -64,7 +66,7 @@ classification:
|
||||
.byte CLASS_INVALID # 1F US
|
||||
.byte CLASS_SPACE # 20 Space
|
||||
.byte CLASS_SINGLE # 21 !
|
||||
.byte 0x00 # 22 "
|
||||
.byte CLASS_DOUBLE_QUOTE # 22 "
|
||||
.byte 0x00 # 23 #
|
||||
.byte 0x00 # 24 $
|
||||
.byte CLASS_SINGLE # 25 %
|
||||
@@ -75,7 +77,7 @@ classification:
|
||||
.byte CLASS_ASTERISK # 2A *
|
||||
.byte CLASS_SINGLE # 2B +
|
||||
.byte CLASS_SINGLE # 2C ,
|
||||
.byte 0x00 # 2D -
|
||||
.byte CLASS_MINUS # 2D -
|
||||
.byte CLASS_DOT # 2E .
|
||||
.byte CLASS_SINGLE # 2F /
|
||||
.byte CLASS_ZERO # 30 0
|
||||
@@ -159,6 +161,67 @@ classification:
|
||||
.byte CLASS_SINGLE # 7E ~
|
||||
.byte CLASS_INVALID # 7F DEL
|
||||
|
||||
#
|
||||
# Textual keywords in the language.
|
||||
#
|
||||
.equ KEYWORDS_COUNT, 26
|
||||
|
||||
.type keywords, @object
|
||||
keywords:
|
||||
.word 7
|
||||
.ascii "program"
|
||||
.word 6
|
||||
.ascii "import"
|
||||
.word 5
|
||||
.ascii "const"
|
||||
.word 3
|
||||
.ascii "var"
|
||||
.word 2
|
||||
.ascii "if"
|
||||
.word 4
|
||||
.ascii "then"
|
||||
.word 5
|
||||
.ascii "elsif"
|
||||
.word 4
|
||||
.ascii "else"
|
||||
.word 5
|
||||
.ascii "while"
|
||||
.word 2
|
||||
.ascii "do"
|
||||
.word 4
|
||||
.ascii "proc"
|
||||
.word 5
|
||||
.ascii "begin"
|
||||
.word 3
|
||||
.ascii "end"
|
||||
.word 4
|
||||
.ascii "type"
|
||||
.word 6
|
||||
.ascii "record"
|
||||
.word 5
|
||||
.ascii "union"
|
||||
.word 4
|
||||
.ascii "true"
|
||||
.word 5
|
||||
.ascii "false"
|
||||
.word 3
|
||||
.ascii "nil"
|
||||
.word 3
|
||||
.ascii "xor"
|
||||
.word 2
|
||||
.ascii "or"
|
||||
.word 6
|
||||
.ascii "return"
|
||||
.word 4
|
||||
.ascii "cast"
|
||||
.word 5
|
||||
.ascii "defer"
|
||||
.word 4
|
||||
.ascii "case"
|
||||
.word 2
|
||||
.ascii "of"
|
||||
.size keywords, . - keywords
|
||||
|
||||
.section .data
|
||||
|
||||
# The transition table describes transitions from one state to another, given
|
||||
@@ -173,58 +236,82 @@ classification:
|
||||
# It specifies the target state. "ff" means that this is an end state and no
|
||||
# transition is possible.
|
||||
# - The next byte is the action that should be performed when transitioning.
|
||||
# For the meaning of actions see labels in the _analyze_token function, which
|
||||
# For the meaning of actions see labels in the _tokenize_next function, which
|
||||
# handles each action.
|
||||
#
|
||||
.type transitions, @object
|
||||
.size transitions, 13 * CLASS_COUNT # state count * CLASS_COUNT
|
||||
.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT
|
||||
transitions:
|
||||
# Invalid Digit Alpha Space : = ( )
|
||||
# * _ Single Hex 0 x NUL .
|
||||
# - "
|
||||
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
|
||||
.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
|
||||
.word 0x010f, 0x0110
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier
|
||||
.word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
|
||||
.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier
|
||||
.word 0x05ff, 0x05ff
|
||||
|
||||
.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
|
||||
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
|
||||
.word 0x0109, 0x0109
|
||||
|
||||
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
|
||||
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
|
||||
.word 0x0109, 0x0109
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff # 0d Hexadecimal
|
||||
.word 0x00ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x2ff, 0x02ff # 0e Dot
|
||||
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string.
|
||||
.word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110
|
||||
.word 0x0110, 0x04ff
|
||||
|
||||
.section .text
|
||||
|
||||
@@ -292,12 +379,42 @@ _next_state:
|
||||
addi sp, sp, 16
|
||||
ret
|
||||
|
||||
# Takes an identifier and checks whether it's a keyword.
|
||||
#
|
||||
# Parameters:
|
||||
# a0 - Token length.
|
||||
# a1 - Token pointer.
|
||||
#
|
||||
# Sets a0 to the appropriate token type.
|
||||
.type _classify_identifier, @function
|
||||
_classify_identifier:
|
||||
# Prologue.
|
||||
addi sp, sp, -16
|
||||
sw ra, 12(sp)
|
||||
sw s0, 8(sp)
|
||||
addi s0, sp, 16
|
||||
|
||||
mv a2, a0
|
||||
mv a3, a1
|
||||
li a0, KEYWORDS_COUNT
|
||||
la a1, keywords
|
||||
call _strings_index
|
||||
|
||||
# Epilogue.
|
||||
lw ra, 12(sp)
|
||||
lw s0, 8(sp)
|
||||
addi sp, sp, 16
|
||||
ret
|
||||
|
||||
# Initializes the classification table.
|
||||
#
|
||||
# Parameters:
|
||||
# a0 - Source text pointer.
|
||||
.type _analyze_token, @function
|
||||
_analyze_token:
|
||||
# a1 - A pointer for output value, the token kind. 4 Bytes.
|
||||
#
|
||||
# Sets a0 to the position of the next token.
|
||||
.type _tokenize_next, @function
|
||||
_tokenize_next:
|
||||
# Prologue.
|
||||
addi sp, sp, -24
|
||||
sw ra, 20(sp)
|
||||
@@ -311,7 +428,10 @@ _analyze_token:
|
||||
sw s2, 8(sp) # Preserve s2 containing the current state.
|
||||
li s2, 0x00 # Initial, start state.
|
||||
|
||||
.Lanalyze_token_loop:
|
||||
sw a1, 0(sp)
|
||||
sw zero, (a1) # Initialize.
|
||||
|
||||
.Ltokenize_next_loop:
|
||||
mv a0, s2
|
||||
lbu a1, (s1)
|
||||
call _next_state
|
||||
@@ -323,56 +443,43 @@ _analyze_token:
|
||||
and t1, a0, t0 # Transition action.
|
||||
srli t1, t1, 8
|
||||
|
||||
|
||||
# Perform the provided action.
|
||||
li t0, 0x01 # Accumulate action.
|
||||
beq t1, t0, .Lanalyze_token_accumulate
|
||||
beq t1, t0, .Ltokenize_next_accumulate
|
||||
|
||||
li t0, 0x02 # Print action.
|
||||
beq t1, t0, .Lanalyze_token_print
|
||||
beq t1, t0, .Ltokenize_next_print
|
||||
|
||||
li t0, 0x03 # Skip action.
|
||||
beq t1, t0, .Lanalyze_token_skip
|
||||
beq t1, t0, .Ltokenize_next_skip
|
||||
|
||||
li t0, 0x04 # Comment action.
|
||||
beq t1, t0, .Lanalyze_token_comment
|
||||
beq t1, t0, .Ltokenize_next_comment
|
||||
|
||||
/* DEBUG
|
||||
mv s4, t1
|
||||
addi t1, t1, '0'
|
||||
sb t1, 0(sp)
|
||||
li t1, ' '
|
||||
sb t1, 1(sp)
|
||||
addi t1, s2, '0'
|
||||
sb t1, 2(sp)
|
||||
addi a0, sp, 0 */
|
||||
sw s1, 0(sp)
|
||||
addi a0, s1, 0
|
||||
li a1, 3
|
||||
call _write_error
|
||||
/* mv t1, s4
|
||||
DEBUG */
|
||||
li t0, 0x05 # Finalize identifier.
|
||||
beq t1, t0, .Ltokenize_next_identifier
|
||||
|
||||
j .Lanalyze_token_reject
|
||||
j .Ltokenize_next_reject
|
||||
|
||||
.Lanalyze_token_reject:
|
||||
.Ltokenize_next_reject:
|
||||
addi s1, s1, 1
|
||||
|
||||
j .Lanalyze_token_end
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Lanalyze_token_accumulate:
|
||||
.Ltokenize_next_accumulate:
|
||||
addi s1, s1, 1
|
||||
|
||||
j .Lanalyze_token_loop
|
||||
j .Ltokenize_next_loop
|
||||
|
||||
.Lanalyze_token_skip:
|
||||
.Ltokenize_next_skip:
|
||||
addi s1, s1, 1
|
||||
lw t0, 4(sp)
|
||||
addi t0, t0, 1
|
||||
sw t0, 4(sp)
|
||||
|
||||
j .Lanalyze_token_loop
|
||||
j .Ltokenize_next_loop
|
||||
|
||||
.Lanalyze_token_print:
|
||||
.Ltokenize_next_print:
|
||||
/* DEBUG
|
||||
lw a0, 4(sp)
|
||||
mv a1, s1
|
||||
@@ -380,9 +487,9 @@ _analyze_token:
|
||||
call _write_error
|
||||
DEBUG */
|
||||
|
||||
j .Lanalyze_token_end
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Lanalyze_token_comment:
|
||||
.Ltokenize_next_comment:
|
||||
addi s1, s1, 1
|
||||
|
||||
/* DEBUG
|
||||
@@ -392,9 +499,20 @@ _analyze_token:
|
||||
call _write_error
|
||||
DEBUG */
|
||||
|
||||
j .Lanalyze_token_end
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Lanalyze_token_end:
|
||||
.Ltokenize_next_identifier:
|
||||
# An identifier can be a textual keyword.
|
||||
# Check the kind of the token and write it into the output parameter.
|
||||
lw a1, 4(sp)
|
||||
sub a0, s1, a1
|
||||
call _classify_identifier
|
||||
lw a1, 0(sp)
|
||||
sw a0, (a1)
|
||||
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Ltokenize_next_end:
|
||||
mv a0, s1 # Return the advanced text pointer.
|
||||
|
||||
# Restore saved registers.
|
||||
@@ -406,27 +524,3 @@ _analyze_token:
|
||||
lw s0, 16(sp)
|
||||
addi sp, sp, 24
|
||||
ret
|
||||
|
||||
# Initializes the lookup tables.
|
||||
#
|
||||
# Parameters:
|
||||
# a0 - Source text pointer.
|
||||
.type _tokenize, @function
|
||||
_tokenize:
|
||||
# Prologue.
|
||||
addi sp, sp, -8
|
||||
sw ra, 4(sp)
|
||||
sw s0, 0(sp)
|
||||
addi s0, sp, 8
|
||||
|
||||
.Ltokenize_loop:
|
||||
call _analyze_token
|
||||
|
||||
lw t0, (a0)
|
||||
bnez t0, .Ltokenize_loop
|
||||
|
||||
# Epilogue.
|
||||
lw ra, 4(sp)
|
||||
lw s0, 0(sp)
|
||||
addi sp, sp, 8
|
||||
ret
|
||||
|
Reference in New Issue
Block a user