Properly tokenize declaration sections

This commit is contained in:
Eugen Wissner 2025-05-02 22:57:04 +02:00
parent 768821c689
commit dcfd6b1515
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
5 changed files with 357 additions and 165 deletions

View File

@ -36,11 +36,17 @@ end
directory 'build' directory 'build'
desc 'Initial stage' Dir.glob('boot/*.s').each do |assembly_source|
file 'build/stage1' => ['boot/stage1.s', 'boot/common-boot.s', 'boot/tokenizer.s', 'build'] do |t| target_object = Pathname.new('build') + Pathname.new(assembly_source).basename.sub_ext('.o')
source = t.prerequisites.filter { |prerequisite| prerequisite.end_with? '.s' }
sh CROSS_GCC, '-nostdlib', '-o', t.name, *source file target_object.to_s => [assembly_source, 'build'] do |t|
sh CROSS_GCC, '-c', '-o', t.name, assembly_source
end
end
desc 'Initial stage'
file 'build/stage1' => ['build/tokenizer.o', 'build/stage1.o', 'build/common-boot.o'] do |t|
sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites
end end
file 'build/stage2a.s' => ['build/stage1', 'boot/stage2.elna'] do |t| file 'build/stage2a.s' => ['build/stage1', 'boot/stage2.elna'] do |t|
@ -51,8 +57,10 @@ file 'build/stage2a.s' => ['build/stage1', 'boot/stage2.elna'] do |t|
end end
end end
file 'build/stage2a' => ['build/stage2a.s', 'boot/common-boot.s'] do |t| ['build/stage2a', 'build/stage2b'].each do |exe|
sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites file exe => [exe.ext('.s'), 'build/common-boot.o'] do |t|
sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites
end
end end
file 'build/stage2b.s' => ['build/stage2a', 'boot/stage2.elna'] do |t| file 'build/stage2b.s' => ['build/stage2a', 'boot/stage2.elna'] do |t|
@ -62,7 +70,3 @@ file 'build/stage2b.s' => ['build/stage2a', 'boot/stage2.elna'] do |t|
assemble_stage output, exe, source assemble_stage output, exe, source
end end
end end
file 'build/stage2b' => ['build/stage2b.s', 'boot/common-boot.s'] do |t|
sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites
end

View File

@ -2,6 +2,7 @@
.global _write_out, _read_file, _write_error, _put_char, _printi .global _write_out, _read_file, _write_error, _put_char, _printi
.global _get, _memcmp, _memchr, _memmem, _memcpy .global _get, _memcmp, _memchr, _memmem, _memcpy
.global _divide_by_zero_error, _exit .global _divide_by_zero_error, _exit
.global _strings_index
.section .rodata .section .rodata
@ -424,3 +425,69 @@ _memcpy:
.Lmemcpy_end: .Lmemcpy_end:
mv a0, t0 mv a0, t0
ret ret
# Searches for a string in a string array.
#
# Each array element is a 4-byte length followed immediately by that many
# bytes of string data (no padding), so elements are walked with
# "pointer += 4 + length".
#
# Parameters:
#   a0 - Number of elements in the string array.
#   a1 - String array.
#   a2 - Needle length.
#   a3 - Needle.
#
# Sets a0 to the 1-based index of the needle in the haystack or to 0 if the
# element could not be found.
.type _strings_index, @function
_strings_index:
	# Prologue.
	addi sp, sp, -32
	sw ra, 28(sp)
	sw s0, 24(sp)
	addi s0, sp, 32

	# Move the arguments into saved registers so they survive the _memcmp
	# calls below.
	sw s1, 20(sp)
	mv s1, a0 # s1 = Remaining element count.
	sw s2, 16(sp)
	mv s2, a1 # s2 = Current element (points at its length word).
	sw s3, 12(sp)
	mv s3, a2 # s3 = Needle length.
	sw s4, 8(sp)
	mv s4, a3 # s4 = Needle pointer.
	sw s5, 4(sp)
	li s5, 0 # Index counter.

.Lstrings_index_loop:
	addi s5, s5, 1
	beqz s1, .Lstrings_index_missing
	lw a2, (s2) # Read the length of the current element in the haystack.
	bne a2, s3, .Lstrings_index_next # Lengths don't match, skip the iteration.
	addi a0, s2, 4 # Element data starts right after its length word.
	mv a1, s4
	call _memcmp
	beqz a0, .Lstrings_index_end

.Lstrings_index_next:
	# Reload the element length: a2 is an argument register, so _memcmp may
	# have clobbered it on the path through the comparison above.
	lw a2, (s2)
	addi s2, s2, 4
	add s2, s2, a2 # Advance past the length word and the string bytes.
	addi s1, s1, -1
	j .Lstrings_index_loop

.Lstrings_index_missing:
	li s5, 0 # Not found: report index 0.

.Lstrings_index_end:
	mv a0, s5
	# Restore saved registers.
	lw s1, 20(sp)
	lw s2, 16(sp)
	lw s3, 12(sp)
	lw s4, 8(sp)
	lw s5, 4(sp)

	# Epilogue.
	lw ra, 28(sp)
	lw s0, 24(sp)
	addi sp, sp, 32 # addi, not add: the immediate form is the actual instruction.
	ret

28
boot/definitions.inc Normal file
View File

@ -0,0 +1,28 @@
# Token kind constants.
#
# Each constant must match the 1-based index of the corresponding entry in
# the keywords array in tokenizer.s (_strings_index returns that index).
.equ TOKEN_PROGRAM, 1
.equ TOKEN_IMPORT, 2
.equ TOKEN_CONST, 3
.equ TOKEN_VAR, 4
.equ TOKEN_IF, 5
.equ TOKEN_THEN, 6
.equ TOKEN_ELSIF, 7
.equ TOKEN_ELSE, 8
.equ TOKEN_WHILE, 9
.equ TOKEN_DO, 10
.equ TOKEN_PROC, 11
.equ TOKEN_BEGIN, 12
.equ TOKEN_END, 13
.equ TOKEN_TYPE, 14
.equ TOKEN_RECORD, 15
.equ TOKEN_UNION, 16 # Was a duplicate TOKEN_RECORD; keywords index 16 is "union".
.equ TOKEN_TRUE, 17
.equ TOKEN_FALSE, 18 # Was misspelled TOKEN_FASE; keywords index 18 is "false".
.equ TOKEN_NIL, 19
.equ TOKEN_XOR, 20
.equ TOKEN_OR, 21
.equ TOKEN_RETURN, 22
.equ TOKEN_CAST, 23
.equ TOKEN_DEFER, 24
.equ TOKEN_CASE, 25
.equ TOKEN_OF, 26

View File

@ -4,6 +4,8 @@
# s1 - Contains the current position in the source text. # s1 - Contains the current position in the source text.
# s2 - Label counter. # s2 - Label counter.
.include "boot/definitions.inc"
.equ SOURCE_BUFFER_SIZE, 81920 .equ SOURCE_BUFFER_SIZE, 81920
.section .rodata .section .rodata
@ -55,20 +57,41 @@ source_code: .zero SOURCE_BUFFER_SIZE
.type _compile_import, @function .type _compile_import, @function
_compile_import: _compile_import:
# Prologue. # Prologue.
addi sp, sp, -8 addi sp, sp, -16
sw ra, 4(sp) sw ra, 12(sp)
sw s0, 0(sp) sw s0, 8(sp)
addi s0, sp, 8 addi s0, sp, 16
addi s1, s1, 6 .Lcompile_import_loop:
call _skip_comment
call _skip_spaces call _skip_spaces
call _read_token
add s1, s1, a0 # Skip the imported module name.
mv a0, s1
addi a1, sp, 0
call _tokenize_next
li t0, TOKEN_IMPORT
lw t1, 0(sp)
bne t0, t1, .Lcompile_import_end
# a0 is set from the previous _tokenize_next call. Skip the module name.
addi a1, sp, 0
call _tokenize_next
mv s1, a0
/* DEBUG
lw t0, 0(sp)
addi t0, t0, '0'
sw t0, 4(sp)
addi a0, sp, 4
li a1, 1
call _write_error*/
j .Lcompile_import_loop
.Lcompile_import_end:
# Epilogue. # Epilogue.
lw ra, 4(sp) lw ra, 12(sp)
lw s0, 0(sp) lw s0, 8(sp)
addi sp, sp, 8 addi sp, sp, 16
ret ret
.type _build_binary_expression, @function .type _build_binary_expression, @function
@ -943,40 +966,54 @@ _compile_assembly:
addi sp, sp, 16 addi sp, sp, 16
ret ret
.type _compile_program, @function .type _compile_module_declaration, @function
_compile_program: _compile_module_declaration:
# Prologue. # Prologue.
addi sp, sp, -8 addi sp, sp, -16
sw ra, 4(sp) sw ra, 12(sp)
sw s0, 0(sp) sw s0, 8(sp)
addi s0, sp, 8 addi s0, sp, 16
la a0, global_start la a0, global_start
li a1, GLOBAL_START_SIZE li a1, GLOBAL_START_SIZE
call _write_out call _write_out
addi s1, s1, 8 # program\n. # Skip "program".
call _skip_comment
mv a0, s1
addi a1, sp, 0
call _tokenize_next
mv s1, a0
# Epilogue. # Epilogue.
lw ra, 4(sp) lw ra, 12(sp)
lw s0, 0(sp) lw s0, 8(sp)
addi sp, sp, 8 addi sp, sp, 16
ret ret
.type _compile_constant_section, @function .type _compile_constant_section, @function
_compile_constant_section: _compile_constant_section:
# Prologue. # Prologue.
addi sp, sp, -8 addi sp, sp, -16
sw ra, 4(sp) sw ra, 12(sp)
sw s0, 0(sp) sw s0, 8(sp)
addi s0, sp, 8 addi s0, sp, 16
call _skip_comment
call _skip_spaces
mv a0, s1
addi a1, sp, 0
call _tokenize_next
li t0, TOKEN_CONST
lw t1, 0(sp)
bne t0, t1, .Lcompile_constant_section_end
mv s1, a0
la a0, section_rodata la a0, section_rodata
li a1, SECTION_RODATA_SIZE li a1, SECTION_RODATA_SIZE
call _write_out call _write_out
addi s1, s1, 6 # const\n.
.Lcompile_constant_section_item: .Lcompile_constant_section_item:
call _skip_spaces call _skip_spaces
lbu a0, (s1) lbu a0, (s1)
@ -988,9 +1025,9 @@ _compile_constant_section:
.Lcompile_constant_section_end: .Lcompile_constant_section_end:
# Epilogue. # Epilogue.
lw ra, 4(sp) lw ra, 12(sp)
lw s0, 0(sp) lw s0, 8(sp)
addi sp, sp, 8 addi sp, sp, 16
ret ret
.type _compile_constant, @function .type _compile_constant, @function
@ -1040,17 +1077,23 @@ _compile_constant:
.type _compile_variable_section, @function .type _compile_variable_section, @function
_compile_variable_section: _compile_variable_section:
# Prologue. # Prologue.
addi sp, sp, -8 addi sp, sp, -16
sw ra, 4(sp) sw ra, 12(sp)
sw s0, 0(sp) sw s0, 8(sp)
addi s0, sp, 8 addi s0, sp, 16
mv a0, s1
addi a1, sp, 0
call _tokenize_next
li t0, TOKEN_VAR
lw t1, 0(sp)
bne t0, t1, .Lcompile_variable_section_end
mv s1, a0
la a0, section_bss la a0, section_bss
li a1, SECTION_BSS_SIZE li a1, SECTION_BSS_SIZE
call _write_out call _write_out
addi s1, s1, 4 # var\n.
.Lcompile_variable_section_item: .Lcompile_variable_section_item:
call _skip_spaces call _skip_spaces
lbu a0, (s1) lbu a0, (s1)
@ -1062,9 +1105,9 @@ _compile_variable_section:
.Lcompile_variable_section_end: .Lcompile_variable_section_end:
# Epilogue. # Epilogue.
lw ra, 4(sp) lw ra, 12(sp)
lw s0, 0(sp) lw s0, 8(sp)
addi sp, sp, 8 addi sp, sp, 16
ret ret
.type _compile_variable, @function .type _compile_variable, @function
@ -1589,30 +1632,6 @@ _compile_line:
li t1, '(' li t1, '('
beq t0, t1, .Lcompile_line_comment beq t0, t1, .Lcompile_line_comment
li t0, 0x676f7270 # prog
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_program
li t0, 0x736e6f63 # cons
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_const
li t0, 0x0a726176 # var\n
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_var
li t0, 0x636f7270 # proc li t0, 0x636f7270 # proc
sw t0, 12(sp) sw t0, 12(sp)
mv a0, s1 mv a0, s1
@ -1647,14 +1666,6 @@ _compile_line:
call _is_register_identifier call _is_register_identifier
bnez a0, .Lcompile_line_identifier bnez a0, .Lcompile_line_identifier
li t0, 0x6f706d69 # impo
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_import
li t0, 0x6f746f67 # goto li t0, 0x6f746f67 # goto
sw t0, 12(sp) sw t0, 12(sp)
mv a0, s1 mv a0, s1
@ -1704,10 +1715,6 @@ _compile_line:
call _compile_goto call _compile_goto
j .Lcompile_line_section j .Lcompile_line_section
.Lcompile_line_import:
call _compile_import
j .Lcompile_line_section
.Lcompile_line_identifier: .Lcompile_line_identifier:
call _compile_identifier call _compile_identifier
j .Lcompile_line_section j .Lcompile_line_section
@ -1725,10 +1732,6 @@ _compile_line:
li a0, 1 li a0, 1
j .Lcompile_line_end j .Lcompile_line_end
.Lcompile_line_const:
call _compile_constant_section
j .Lcompile_line_section
.Lcompile_line_procedure: .Lcompile_line_procedure:
lw a1, 16(sp) lw a1, 16(sp)
bnez a1, .Lcompile_line_compile_procedure bnez a1, .Lcompile_line_compile_procedure
@ -1738,14 +1741,6 @@ _compile_line:
li a0, 1 li a0, 1
j .Lcompile_line_end j .Lcompile_line_end
.Lcompile_line_var:
call _compile_variable_section
j .Lcompile_line_section
.Lcompile_line_program:
call _compile_program
j .Lcompile_line_section
.Lcompile_line_comment: .Lcompile_line_comment:
lw a0, 20(sp) lw a0, 20(sp)
call _skip_comment call _skip_comment
@ -1864,6 +1859,11 @@ _compile:
sw zero, 4(sp) # Whether the text section header was already emitted. sw zero, 4(sp) # Whether the text section header was already emitted.
call _compile_module_declaration
call _compile_import
call _compile_constant_section
call _compile_variable_section
.Lcompile_do: .Lcompile_do:
lbu t0, (s1) # t0 = Current character. lbu t0, (s1) # t0 = Current character.
beqz t0, .Lcompile_end # Exit the loop on the NUL character. beqz t0, .Lcompile_end # Exit the loop on the NUL character.
@ -1913,7 +1913,6 @@ _start:
call _read_file call _read_file
mv a0, s1 mv a0, s1
call _tokenize
call _main call _main
call _compile call _compile

View File

@ -1,4 +1,4 @@
.global _tokenize, classification, transitions .global _tokenize_next, classification, transitions, keywords
.section .rodata .section .rodata
@ -24,8 +24,10 @@
.equ CLASS_X, 0x0d .equ CLASS_X, 0x0d
.equ CLASS_EOF, 0x0e .equ CLASS_EOF, 0x0e
.equ CLASS_DOT, 0x0f .equ CLASS_DOT, 0x0f
.equ CLASS_MINUS, 0x10
.equ CLASS_DOUBLE_QUOTE, 0x11
.equ CLASS_COUNT, 16 .equ CLASS_COUNT, 18
.type classification, @object .type classification, @object
.size classification, 128 .size classification, 128
@ -64,7 +66,7 @@ classification:
.byte CLASS_INVALID # 1F US .byte CLASS_INVALID # 1F US
.byte CLASS_SPACE # 20 Space .byte CLASS_SPACE # 20 Space
.byte CLASS_SINGLE # 21 ! .byte CLASS_SINGLE # 21 !
.byte 0x00 # 22 " .byte CLASS_DOUBLE_QUOTE # 22 "
.byte 0x00 # 23 # .byte 0x00 # 23 #
.byte 0x00 # 24 $ .byte 0x00 # 24 $
.byte CLASS_SINGLE # 25 % .byte CLASS_SINGLE # 25 %
@ -75,7 +77,7 @@ classification:
.byte CLASS_ASTERISK # 2A * .byte CLASS_ASTERISK # 2A *
.byte CLASS_SINGLE # 2B + .byte CLASS_SINGLE # 2B +
.byte CLASS_SINGLE # 2C , .byte CLASS_SINGLE # 2C ,
.byte 0x00 # 2D - .byte CLASS_MINUS # 2D -
.byte CLASS_DOT # 2E . .byte CLASS_DOT # 2E .
.byte CLASS_SINGLE # 2F / .byte CLASS_SINGLE # 2F /
.byte CLASS_ZERO # 30 0 .byte CLASS_ZERO # 30 0
@ -159,6 +161,67 @@ classification:
.byte CLASS_SINGLE # 7E ~ .byte CLASS_SINGLE # 7E ~
.byte CLASS_INVALID # 7F DEL .byte CLASS_INVALID # 7F DEL
#
# Textual keywords in the language.
#
# Layout: each entry is a 4-byte length followed by the unpadded keyword
# bytes, as walked by _strings_index. The 1-based entry positions must stay
# in sync with the TOKEN_* constants in boot/definitions.inc.
#
# The table contains 26 entries ("program" … "of"); the count must cover all
# of them, otherwise the trailing keywords are never matched and fall back
# to being treated as plain identifiers (index 0).
#
# NOTE(review): entries are unpadded, so the length words after odd-length
# keywords are unaligned — confirm the target tolerates unaligned lw loads.
.equ KEYWORDS_COUNT, 26
.type keywords, @object
keywords:
.word 7
.ascii "program"
.word 6
.ascii "import"
.word 5
.ascii "const"
.word 3
.ascii "var"
.word 2
.ascii "if"
.word 4
.ascii "then"
.word 5
.ascii "elsif"
.word 4
.ascii "else"
.word 5
.ascii "while"
.word 2
.ascii "do"
.word 4
.ascii "proc"
.word 5
.ascii "begin"
.word 3
.ascii "end"
.word 4
.ascii "type"
.word 6
.ascii "record"
.word 5
.ascii "union"
.word 4
.ascii "true"
.word 5
.ascii "false"
.word 3
.ascii "nil"
.word 3
.ascii "xor"
.word 2
.ascii "or"
.word 6
.ascii "return"
.word 4
.ascii "cast"
.word 5
.ascii "defer"
.word 4
.ascii "case"
.word 2
.ascii "of"
.size keywords, . - keywords
.section .data .section .data
# The transition table describes transitions from one state to another, given # The transition table describes transitions from one state to another, given
@ -173,58 +236,82 @@ classification:
# It specifies the target state. "ff" means that this is an end state and no # It specifies the target state. "ff" means that this is an end state and no
# transition is possible. # transition is possible.
# - The next byte is the action that should be performed when transitioning. # - The next byte is the action that should be performed when transitioning.
# For the meaning of actions see labels in the _analyze_token function, which # For the meaning of actions see labels in the _tokenize_next function, which
# handles each action. # handles each action.
# #
.type transitions, @object .type transitions, @object
.size transitions, 13 * CLASS_COUNT # state count * CLASS_COUNT .size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT
transitions: transitions:
# Invalid Digit Alpha Space : = ( ) # Invalid Digit Alpha Space : = ( )
# * _ Single Hex 0 x NUL . # * _ Single Hex 0 x NUL .
# - "
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107 .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107
.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start .word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start
.word 0x010f, 0x0110
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon
.word 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier
.word 0x05ff, 0x05ff
.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Equals
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
.word 0x02ff, 0x02ff
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
.word 0x0109, 0x0109
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
.word 0x0109, 0x0109
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
.word 0x02ff, 0x02ff
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
.word 0x02ff, 0x02ff
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal
.word 0x00ff, 0x02ff
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x2ff, 0x02ff # 0e Dot .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff
.word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string.
.word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110
.word 0x0110, 0x04ff
.section .text .section .text
@ -292,12 +379,42 @@ _next_state:
addi sp, sp, 16 addi sp, sp, 16
ret ret
# Takes an identifier and checks whether it's a keyword.
#
# Parameters:
#   a0 - Token length.
#   a1 - Token pointer.
#
# Sets a0 to the appropriate token type: the 1-based keyword index from
# _strings_index, or 0 when the identifier is not a keyword.
.type _classify_identifier, @function
_classify_identifier:
	# Prologue.
	addi sp, sp, -16
	sw ra, 12(sp)
	sw s0, 8(sp)
	addi s0, sp, 16

	# _strings_index expects the needle in a2/a3 and the haystack in a0/a1,
	# so shift the incoming arguments over before loading the keyword table.
	mv a3, a1
	mv a2, a0
	la a1, keywords
	li a0, KEYWORDS_COUNT
	call _strings_index # The result in a0 is returned as-is.

	# Epilogue.
	lw ra, 12(sp)
	lw s0, 8(sp)
	addi sp, sp, 16
	ret
# Initializes the classification table. # Initializes the classification table.
# #
# Parameters: # Parameters:
# a0 - Source text pointer. # a0 - Source text pointer.
.type _analyze_token, @function # a1 - A pointer for output value, the token kind. 4 Bytes.
_analyze_token: #
# Sets a0 to the position of the next token.
.type _tokenize_next, @function
_tokenize_next:
# Prologue. # Prologue.
addi sp, sp, -24 addi sp, sp, -24
sw ra, 20(sp) sw ra, 20(sp)
@ -311,7 +428,10 @@ _analyze_token:
sw s2, 8(sp) # Preserve s2 containing the current state. sw s2, 8(sp) # Preserve s2 containing the current state.
li s2, 0x00 # Initial, start state. li s2, 0x00 # Initial, start state.
.Lanalyze_token_loop: sw a1, 0(sp)
sw zero, (a1) # Initialize.
.Ltokenize_next_loop:
mv a0, s2 mv a0, s2
lbu a1, (s1) lbu a1, (s1)
call _next_state call _next_state
@ -323,56 +443,43 @@ _analyze_token:
and t1, a0, t0 # Transition action. and t1, a0, t0 # Transition action.
srli t1, t1, 8 srli t1, t1, 8
# Perform the provided action.
li t0, 0x01 # Accumulate action. li t0, 0x01 # Accumulate action.
beq t1, t0, .Lanalyze_token_accumulate beq t1, t0, .Ltokenize_next_accumulate
li t0, 0x02 # Print action. li t0, 0x02 # Print action.
beq t1, t0, .Lanalyze_token_print beq t1, t0, .Ltokenize_next_print
li t0, 0x03 # Skip action. li t0, 0x03 # Skip action.
beq t1, t0, .Lanalyze_token_skip beq t1, t0, .Ltokenize_next_skip
li t0, 0x04 # Comment action. li t0, 0x04 # Comment action.
beq t1, t0, .Lanalyze_token_comment beq t1, t0, .Ltokenize_next_comment
/* DEBUG li t0, 0x05 # Finalize identifier.
mv s4, t1 beq t1, t0, .Ltokenize_next_identifier
addi t1, t1, '0'
sb t1, 0(sp)
li t1, ' '
sb t1, 1(sp)
addi t1, s2, '0'
sb t1, 2(sp)
addi a0, sp, 0 */
sw s1, 0(sp)
addi a0, s1, 0
li a1, 3
call _write_error
/* mv t1, s4
DEBUG */
j .Lanalyze_token_reject j .Ltokenize_next_reject
.Lanalyze_token_reject: .Ltokenize_next_reject:
addi s1, s1, 1 addi s1, s1, 1
j .Lanalyze_token_end j .Ltokenize_next_end
.Lanalyze_token_accumulate: .Ltokenize_next_accumulate:
addi s1, s1, 1 addi s1, s1, 1
j .Lanalyze_token_loop j .Ltokenize_next_loop
.Lanalyze_token_skip: .Ltokenize_next_skip:
addi s1, s1, 1 addi s1, s1, 1
lw t0, 4(sp) lw t0, 4(sp)
addi t0, t0, 1 addi t0, t0, 1
sw t0, 4(sp) sw t0, 4(sp)
j .Lanalyze_token_loop j .Ltokenize_next_loop
.Lanalyze_token_print: .Ltokenize_next_print:
/* DEBUG /* DEBUG
lw a0, 4(sp) lw a0, 4(sp)
mv a1, s1 mv a1, s1
@ -380,9 +487,9 @@ _analyze_token:
call _write_error call _write_error
DEBUG */ DEBUG */
j .Lanalyze_token_end j .Ltokenize_next_end
.Lanalyze_token_comment: .Ltokenize_next_comment:
addi s1, s1, 1 addi s1, s1, 1
/* DEBUG /* DEBUG
@ -392,9 +499,20 @@ _analyze_token:
call _write_error call _write_error
DEBUG */ DEBUG */
j .Lanalyze_token_end j .Ltokenize_next_end
.Lanalyze_token_end: .Ltokenize_next_identifier:
# An identifier can be a textual keyword.
# Check the kind of the token and write it into the output parameter.
lw a1, 4(sp)
sub a0, s1, a1
call _classify_identifier
lw a1, 0(sp)
sw a0, (a1)
j .Ltokenize_next_end
.Ltokenize_next_end:
mv a0, s1 # Return the advanced text pointer. mv a0, s1 # Return the advanced text pointer.
# Restore saved registers. # Restore saved registers.
@ -406,27 +524,3 @@ _analyze_token:
lw s0, 16(sp) lw s0, 16(sp)
addi sp, sp, 24 addi sp, sp, 24
ret ret
# Tokenizes the whole source text by calling _analyze_token in a loop until
# the end of the input is reached.
#
# Parameters:
# a0 - Source text pointer.
.type _tokenize, @function
_tokenize:
# Prologue.
addi sp, sp, -8
sw ra, 4(sp)
sw s0, 0(sp)
addi s0, sp, 8
.Ltokenize_loop:
call _analyze_token # Returns the advanced text position in a0.
# NOTE(review): word-sized load for an end-of-input check — presumably a
# single NUL terminator is meant, which lbu would test; confirm whether the
# buffer guarantees a full zero word at the end.
lw t0, (a0)
bnez t0, .Ltokenize_loop # Continue while the terminator is not reached.
# Epilogue.
lw ra, 4(sp)
lw s0, 0(sp)
addi sp, sp, 8
ret