Tokenize single character symbols
This commit is contained in:
223
boot/tokenizer.s
223
boot/tokenizer.s
@@ -1,4 +1,10 @@
|
||||
.global _tokenize_next, classification, transitions, keywords
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public License,
|
||||
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
|
||||
# obtain one at https://mozilla.org/MPL/2.0/.
|
||||
|
||||
.global _tokenize_next, classification, transitions, keywords, byte_keywords
|
||||
|
||||
.include "boot/definitions.inc"
|
||||
|
||||
.section .rodata
|
||||
|
||||
@@ -8,7 +14,7 @@
|
||||
#
|
||||
# Classification:
|
||||
#
|
||||
.equ CLASS_INVALID, 0x0
|
||||
.equ CLASS_INVALID, 0x00
|
||||
.equ CLASS_DIGIT, 0x01
|
||||
.equ CLASS_CHARACTER, 0x02
|
||||
.equ CLASS_SPACE, 0x03
|
||||
@@ -25,9 +31,11 @@
|
||||
.equ CLASS_EOF, 0x0e
|
||||
.equ CLASS_DOT, 0x0f
|
||||
.equ CLASS_MINUS, 0x10
|
||||
.equ CLASS_DOUBLE_QUOTE, 0x11
|
||||
.equ CLASS_QUOTE, 0x11
|
||||
.equ CLASS_GREATER, 0x12
|
||||
.equ CLASS_LESS, 0x13
|
||||
|
||||
.equ CLASS_COUNT, 18
|
||||
.equ CLASS_COUNT, 20
|
||||
|
||||
.type classification, @object
|
||||
.size classification, 128
|
||||
@@ -66,12 +74,12 @@ classification:
|
||||
.byte CLASS_INVALID # 1F US
|
||||
.byte CLASS_SPACE # 20 Space
|
||||
.byte CLASS_SINGLE # 21 !
|
||||
.byte CLASS_DOUBLE_QUOTE # 22 "
|
||||
.byte CLASS_QUOTE # 22 "
|
||||
.byte 0x00 # 23 #
|
||||
.byte 0x00 # 24 $
|
||||
.byte CLASS_SINGLE # 25 %
|
||||
.byte CLASS_SINGLE # 26 &
|
||||
.byte 0x00 # 27 '
|
||||
.byte CLASS_QUOTE # 27 '
|
||||
.byte CLASS_LEFT_PAREN # 28 (
|
||||
.byte CLASS_RIGHT_PAREN # 29 )
|
||||
.byte CLASS_ASTERISK # 2A *
|
||||
@@ -92,9 +100,9 @@ classification:
|
||||
.byte CLASS_DIGIT # 39 9
|
||||
.byte CLASS_COLON # 3A :
|
||||
.byte CLASS_SINGLE # 3B ;
|
||||
.byte 0x00 # 3C <
|
||||
.byte CLASS_LESS # 3C <
|
||||
.byte CLASS_EQUALS # 3D =
|
||||
.byte 0x00 # 3E >
|
||||
.byte CLASS_GREATER # 3E >
|
||||
.byte 0x00 # 3F ?
|
||||
.byte CLASS_SINGLE # 40 @
|
||||
.byte CLASS_CHARACTER # 41 A
|
||||
@@ -220,7 +228,10 @@ keywords:
|
||||
.ascii "case"
|
||||
.word 2
|
||||
.ascii "of"
|
||||
.size keywords, . - keywords
|
||||
|
||||
.type byte_keywords, @object
|
||||
byte_keywords: .ascii "&.,:;()[]^=+-*@"
|
||||
.equ BYTE_KEYWORDS_SIZE, . - byte_keywords
|
||||
|
||||
.section .data
|
||||
|
||||
@@ -240,78 +251,66 @@ keywords:
|
||||
# handles each action.
|
||||
#
|
||||
.type transitions, @object
|
||||
.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT
|
||||
.size transitions, 14 * CLASS_COUNT # state count * CLASS_COUNT
|
||||
transitions:
|
||||
# Invalid Digit Alpha Space : = ( )
|
||||
# * _ Single Hex 0 x NUL .
|
||||
# - "
|
||||
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
|
||||
.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
|
||||
.word 0x010f, 0x0110
|
||||
# - " or ' > <
|
||||
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff
|
||||
.word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x0108
|
||||
.word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
|
||||
.word 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon
|
||||
|
||||
.word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
|
||||
.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier
|
||||
.word 0x05ff, 0x05ff
|
||||
.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff
|
||||
.word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier
|
||||
|
||||
.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
|
||||
.word 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x03 Integer
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater
|
||||
|
||||
.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
|
||||
.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
|
||||
.word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
|
||||
.word 0x02ff, 0x02ff
|
||||
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x06 Left paren
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
|
||||
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
|
||||
.word 0x0109, 0x0109
|
||||
|
||||
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
|
||||
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
|
||||
.word 0x0109, 0x0109
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
|
||||
.word 0x02ff, 0x02ff
|
||||
|
||||
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal
|
||||
.word 0x00ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less
|
||||
|
||||
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
|
||||
.word 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x08 Dot
|
||||
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff
|
||||
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
|
||||
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
|
||||
.word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment
|
||||
|
||||
.word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string.
|
||||
.word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110
|
||||
.word 0x0110, 0x04ff
|
||||
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
|
||||
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
|
||||
.word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment
|
||||
|
||||
.word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
|
||||
.word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
|
||||
.word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String
|
||||
|
||||
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x0c Zero
|
||||
|
||||
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
|
||||
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff
|
||||
.word 0x00ff, 0x02ff, 0x02ff, 0x02ff # 0x0d Hexadecimal
|
||||
|
||||
.section .text
|
||||
|
||||
@@ -406,6 +405,57 @@ _classify_identifier:
|
||||
addi sp, sp, 16
|
||||
ret
|
||||
|
||||
# Maps a single character symbol to its token type.
#
# The character is searched for in the byte_keywords table with _memchr and
# the token type is computed as 27 plus the offset of the match from the
# start of that table.
#
# Parameters:
# a0 - Token character.
#
# Sets a0 to the appropriate token type.
#
# NOTE(review): _memchr is defined outside this view; presumably it follows
# the C memchr contract (a0 = buffer, a1 = byte, a2 = length, returns the
# address of the match in a0) - confirm. The result is not checked for a
# "not found" value, so verify that every character reaching this procedure
# is present in byte_keywords.
.type _classify_single, @function
_classify_single:
    # Prologue: 16-byte frame holding ra and the caller's frame pointer.
    addi sp, sp, -16
    sw s0, 8(sp)
    sw ra, 12(sp)
    addi s0, sp, 16

    # Arguments for _memchr: table address, wanted byte, table length.
    li a2, BYTE_KEYWORDS_SIZE
    mv a1, a0                   # Move the character before a0 is overwritten.
    la a0, byte_keywords
    call _memchr

    # a0 = 27 + (match address - table address).
    addi a0, a0, 27
    la a1, byte_keywords
    sub a0, a0, a1

    # Epilogue.
    lw s0, 8(sp)
    lw ra, 12(sp)
    addi sp, sp, 16
    ret
|
||||
|
||||
# Classifies a symbol containing multiple characters (probably 2).
#
# Only the first character of the token is inspected.
#
# Parameters:
# a0 - Token length.
# a1 - Token pointer.
#
# Sets a0 to TOKEN_ASSIGN if the token starts with ':'. For any other first
# character a0 is returned exactly as it was passed in. Overwrites t0 and t1.
.type _classify_composite, @function
_classify_composite:
    li t1, ':'
    lbu t0, 0(a1)                           # First character of the token.
    bne t0, t1, .Lclassify_composite_end    # Not ':' - leave a0 untouched.

    li a0, TOKEN_ASSIGN

.Lclassify_composite_end:
    ret
|
||||
|
||||
# Initializes the classification table.
|
||||
#
|
||||
# Parameters:
|
||||
@@ -453,12 +503,18 @@ _tokenize_next:
|
||||
li t0, 0x03 # Skip action.
|
||||
beq t1, t0, .Ltokenize_next_skip
|
||||
|
||||
li t0, 0x04 # Comment action.
|
||||
li t0, 0x04 # Delimited string action.
|
||||
beq t1, t0, .Ltokenize_next_comment
|
||||
|
||||
li t0, 0x05 # Finalize identifier.
|
||||
beq t1, t0, .Ltokenize_next_identifier
|
||||
|
||||
li t0, 0x06 # Single character symbol action.
|
||||
beq t1, t0, .Ltokenize_next_single
|
||||
|
||||
li t0, 0x07 # An action for symbols containing multiple characters.
|
||||
beq t1, t0, .Ltokenize_next_composite
|
||||
|
||||
j .Ltokenize_next_reject
|
||||
|
||||
.Ltokenize_next_reject:
|
||||
@@ -481,24 +537,17 @@ _tokenize_next:
|
||||
|
||||
.Ltokenize_next_print:
|
||||
/* DEBUG
|
||||
lw a0, 4(sp)
|
||||
mv a1, s1
|
||||
sub a1, a1, a0
|
||||
call _write_error
|
||||
DEBUG */
|
||||
addi a0, a0, 21
|
||||
sw a0, 0(sp)
|
||||
addi a0, sp, 0
|
||||
li a1, 1
|
||||
call _write_error */
|
||||
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Ltokenize_next_comment:
|
||||
addi s1, s1, 1
|
||||
|
||||
/* DEBUG
|
||||
lw a0, 4(sp)
|
||||
mv a1, s1
|
||||
sub a1, a1, a0
|
||||
call _write_error
|
||||
DEBUG */
|
||||
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Ltokenize_next_identifier:
|
||||
@@ -512,6 +561,26 @@ _tokenize_next:
|
||||
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Ltokenize_next_single:
|
||||
lw a0, 4(sp)
|
||||
addi s1, a0, 1
|
||||
lbu a0, (a0)
|
||||
call _classify_single
|
||||
lw a1, 0(sp)
|
||||
sw a0, (a1)
|
||||
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Ltokenize_next_composite:
|
||||
addi s1, s1, 1
|
||||
lw a1, 4(sp)
|
||||
sub a0, s1, a1
|
||||
call _classify_composite
|
||||
lw a1, 0(sp)
|
||||
sw a0, (a1)
|
||||
|
||||
j .Ltokenize_next_end
|
||||
|
||||
.Ltokenize_next_end:
|
||||
mv a0, s1 # Return the advanced text pointer.
|
||||
|
||||
|
Reference in New Issue
Block a user