From dcfd6b1515679cfbc75de12a17352d9d1eddceaf Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Fri, 2 May 2025 22:57:04 +0200 Subject: [PATCH] Properly tokenize declaration sections --- Rakefile | 24 +++-- boot/common-boot.s | 67 +++++++++++++ boot/definitions.inc | 28 ++++++ boot/stage1.s | 173 ++++++++++++++++---------------- boot/tokenizer.s | 230 ++++++++++++++++++++++++++++++------------- 5 files changed, 357 insertions(+), 165 deletions(-) create mode 100644 boot/definitions.inc diff --git a/Rakefile b/Rakefile index 4398ef8..2bc2683 100644 --- a/Rakefile +++ b/Rakefile @@ -36,11 +36,17 @@ end directory 'build' -desc 'Initial stage' -file 'build/stage1' => ['boot/stage1.s', 'boot/common-boot.s', 'boot/tokenizer.s', 'build'] do |t| - source = t.prerequisites.filter { |prerequisite| prerequisite.end_with? '.s' } +Dir.glob('boot/*.s').each do |assembly_source| + target_object = Pathname.new('build') + Pathname.new(assembly_source).basename.sub_ext('.o') - sh CROSS_GCC, '-nostdlib', '-o', t.name, *source + file target_object.to_s => [assembly_source, 'build'] do |t| + sh CROSS_GCC, '-c', '-o', t.name, assembly_source + end +end + +desc 'Initial stage' +file 'build/stage1' => ['build/tokenizer.o', 'build/stage1.o', 'build/common-boot.o'] do |t| + sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites end file 'build/stage2a.s' => ['build/stage1', 'boot/stage2.elna'] do |t| @@ -51,8 +57,10 @@ file 'build/stage2a.s' => ['build/stage1', 'boot/stage2.elna'] do |t| end end -file 'build/stage2a' => ['build/stage2a.s', 'boot/common-boot.s'] do |t| - sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites +['build/stage2a', 'build/stage2b'].each do |exe| + file exe => [exe.ext('.s'), 'build/common-boot.o'] do |t| + sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites + end end file 'build/stage2b.s' => ['build/stage2a', 'boot/stage2.elna'] do |t| @@ -62,7 +70,3 @@ file 'build/stage2b.s' => ['build/stage2a', 'boot/stage2.elna'] do |t| assemble_stage output, exe, source end end - -file 'build/stage2b' => ['build/stage2b.s', 'boot/common-boot.s'] do |t| - sh CROSS_GCC, '-nostdlib', '-o', t.name, *t.prerequisites -end diff --git a/boot/common-boot.s b/boot/common-boot.s index 0cf31f1..26dad8d 100644 --- a/boot/common-boot.s +++ b/boot/common-boot.s @@ -2,6 +2,7 @@ .global _write_out, _read_file, _write_error, _put_char, _printi .global _get, _memcmp, _memchr, _memmem, _memcpy .global _divide_by_zero_error, _exit +.global _strings_index .section .rodata @@ -424,3 +425,69 @@ _memcpy: .Lmemcpy_end: mv a0, t0 ret + +# Searches for a string in a string array. +# +# Parameters: +# a0 - Number of elements in the string array. +# a1 - String array. +# a2 - Needle length. +# a3 - Needle. +# +# Sets a0 to the 1-based index of the needle in the haystack or to 0 if the +# element could not be found. +.type _strings_index, @function +_strings_index: + # Prologue. + addi sp, sp, -32 + sw ra, 28(sp) + sw s0, 24(sp) + addi s0, sp, 32 + + sw s1, 20(sp) + mv s1, a0 + sw s2, 16(sp) + mv s2, a1 + sw s3, 12(sp) + mv s3, a2 + sw s4, 8(sp) + mv s4, a3 + sw s5, 4(sp) + li s5, 0 # Index counter. + +.Lstrings_index_loop: + addi s5, s5, 1 + beqz s1, .Lstrings_index_missing + + lw a2, (s2) # Read the length of the current element in the haystack. + bne a2, s3, .Lstrings_index_next # Lengths don't match, skip the iteration. 
+
+    addi a0, s2, 4
+    mv a1, s4
+    call _memcmp
+
+    beqz a0, .Lstrings_index_end
+
+.Lstrings_index_next:
+    addi s2, s2, 4
+    add s2, s2, a2
+    addi s1, s1, -1
+    j .Lstrings_index_loop
+
+.Lstrings_index_missing:
+    li s5, 0
+
+.Lstrings_index_end:
+    mv a0, s5
+
+    lw s1, 20(sp)
+    lw s2, 16(sp)
+    lw s3, 12(sp)
+    lw s4, 8(sp)
+    lw s5, 4(sp)
+
+    # Epilogue.
+    lw ra, 28(sp)
+    lw s0, 24(sp)
+    addi sp, sp, 32
+    ret
diff --git a/boot/definitions.inc b/boot/definitions.inc
new file mode 100644
index 0000000..0e2f54e
--- /dev/null
+++ b/boot/definitions.inc
@@ -0,0 +1,28 @@
+# The constant should match the index in the keywords array in tokenizer.s.
+
+.equ TOKEN_PROGRAM, 1
+.equ TOKEN_IMPORT, 2
+.equ TOKEN_CONST, 3
+.equ TOKEN_VAR, 4
+.equ TOKEN_IF, 5
+.equ TOKEN_THEN, 6
+.equ TOKEN_ELSIF, 7
+.equ TOKEN_ELSE, 8
+.equ TOKEN_WHILE, 9
+.equ TOKEN_DO, 10
+.equ TOKEN_PROC, 11
+.equ TOKEN_BEGIN, 12
+.equ TOKEN_END, 13
+.equ TOKEN_TYPE, 14
+.equ TOKEN_RECORD, 15
+.equ TOKEN_UNION, 16
+.equ TOKEN_TRUE, 17
+.equ TOKEN_FALSE, 18
+.equ TOKEN_NIL, 19
+.equ TOKEN_XOR, 20
+.equ TOKEN_OR, 21
+.equ TOKEN_RETURN, 22
+.equ TOKEN_CAST, 23
+.equ TOKEN_DEFER, 24
+.equ TOKEN_CASE, 25
+.equ TOKEN_OF, 26
diff --git a/boot/stage1.s b/boot/stage1.s
index 9b118d5..9ab072d 100644
--- a/boot/stage1.s
+++ b/boot/stage1.s
@@ -4,6 +4,8 @@
 # s1 - Contains the current position in the source text.
 # s2 - Label counter.
 
+.include "boot/definitions.inc"
+
 .equ SOURCE_BUFFER_SIZE, 81920
 
 .section .rodata
 
@@ -55,20 +57,41 @@ source_code: .zero SOURCE_BUFFER_SIZE
 .type _compile_import, @function
 _compile_import:
     # Prologue.
-    addi sp, sp, -8
-    sw ra, 4(sp)
-    sw s0, 0(sp)
-    addi s0, sp, 8
+    addi sp, sp, -16
+    sw ra, 12(sp)
+    sw s0, 8(sp)
+    addi s0, sp, 16
 
-    addi s1, s1, 6
+.Lcompile_import_loop:
+    call _skip_comment
     call _skip_spaces
-    call _read_token
-    add s1, s1, a0 # Skip the imported module name.
+
+    mv a0, s1
+    addi a1, sp, 0
+    call _tokenize_next
+    li t0, TOKEN_IMPORT
+    lw t1, 0(sp)
+    bne t0, t1, .Lcompile_import_end
 
+    # a0 is set from the previous _tokenize_next call. Skip the module name.
+    addi a1, sp, 0
+    call _tokenize_next
+    mv s1, a0
+
+    /* DEBUG
+    lw t0, 0(sp)
+    addi t0, t0, '0'
+    sw t0, 4(sp)
+    addi a0, sp, 4
+    li a1, 1
+    call _write_error*/
+
+    j .Lcompile_import_loop
+
+.Lcompile_import_end:
     # Epilogue.
-    lw ra, 4(sp)
-    lw s0, 0(sp)
-    addi sp, sp, 8
+    lw ra, 12(sp)
+    lw s0, 8(sp)
+    addi sp, sp, 16
     ret
 
 .type _build_binary_expression, @function
@@ -943,40 +966,54 @@ _compile_assembly:
     addi sp, sp, 16
     ret
 
-.type _compile_program, @function
-_compile_program:
+.type _compile_module_declaration, @function
+_compile_module_declaration:
     # Prologue.
-    addi sp, sp, -8
-    sw ra, 4(sp)
-    sw s0, 0(sp)
-    addi s0, sp, 8
+    addi sp, sp, -16
+    sw ra, 12(sp)
+    sw s0, 8(sp)
+    addi s0, sp, 16
 
     la a0, global_start
     li a1, GLOBAL_START_SIZE
     call _write_out
 
-    addi s1, s1, 8 # program\n.
+    # Skip "program".
+    call _skip_comment
+    mv a0, s1
+    addi a1, sp, 0
+    call _tokenize_next
+    mv s1, a0
 
     # Epilogue.
-    lw ra, 4(sp)
-    lw s0, 0(sp)
-    addi sp, sp, 8
+    lw ra, 12(sp)
+    lw s0, 8(sp)
+    addi sp, sp, 16
     ret
 
 .type _compile_constant_section, @function
_compile_constant_section:
     # Prologue.
- addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 + + call _skip_comment + call _skip_spaces + + mv a0, s1 + addi a1, sp, 0 + call _tokenize_next + li t0, TOKEN_CONST + lw t1, 0(sp) + bne t0, t1, .Lcompile_constant_section_end + mv s1, a0 la a0, section_rodata li a1, SECTION_RODATA_SIZE call _write_out - addi s1, s1, 6 # const\n. - .Lcompile_constant_section_item: call _skip_spaces lbu a0, (s1) @@ -988,9 +1025,9 @@ _compile_constant_section: .Lcompile_constant_section_end: # Epilogue. - lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 ret .type _compile_constant, @function @@ -1040,17 +1077,23 @@ _compile_constant: .type _compile_variable_section, @function _compile_variable_section: # Prologue. - addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 + + mv a0, s1 + addi a1, sp, 0 + call _tokenize_next + li t0, TOKEN_VAR + lw t1, 0(sp) + bne t0, t1, .Lcompile_variable_section_end + mv s1, a0 la a0, section_bss li a1, SECTION_BSS_SIZE call _write_out - addi s1, s1, 4 # var\n. - .Lcompile_variable_section_item: call _skip_spaces lbu a0, (s1) @@ -1062,9 +1105,9 @@ _compile_variable_section: .Lcompile_variable_section_end: # Epilogue. - lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 ret .type _compile_variable, @function @@ -1589,30 +1632,6 @@ _compile_line: li t1, '(' beq t0, t1, .Lcompile_line_comment - li t0, 0x676f7270 # prog - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_program - - li t0, 0x736e6f63 # cons - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_const - - li t0, 0x0a726176 # var\n - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_var - li t0, 0x636f7270 # proc sw t0, 12(sp) mv a0, s1 @@ -1647,14 +1666,6 @@ _compile_line: call _is_register_identifier bnez a0, .Lcompile_line_identifier - li t0, 0x6f706d69 # impo - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_import - li t0, 0x6f746f67 # goto sw t0, 12(sp) mv a0, s1 @@ -1704,10 +1715,6 @@ _compile_line: call _compile_goto j .Lcompile_line_section -.Lcompile_line_import: - call _compile_import - j .Lcompile_line_section - .Lcompile_line_identifier: call _compile_identifier j .Lcompile_line_section @@ -1725,10 +1732,6 @@ _compile_line: li a0, 1 j .Lcompile_line_end -.Lcompile_line_const: - call _compile_constant_section - j .Lcompile_line_section - .Lcompile_line_procedure: lw a1, 16(sp) bnez a1, .Lcompile_line_compile_procedure @@ -1738,14 +1741,6 @@ _compile_line: li a0, 1 j .Lcompile_line_end -.Lcompile_line_var: - call _compile_variable_section - j .Lcompile_line_section - -.Lcompile_line_program: - call _compile_program - j .Lcompile_line_section - .Lcompile_line_comment: lw a0, 20(sp) call _skip_comment @@ -1864,6 +1859,11 @@ _compile: sw zero, 4(sp) # Whether the text section header was already emitted. + call _compile_module_declaration + call _compile_import + call _compile_constant_section + call _compile_variable_section + .Lcompile_do: lbu t0, (s1) # t0 = Current character. beqz t0, .Lcompile_end # Exit the loop on the NUL character. 
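Aside, not part of the patch: the hunks above rework the driver so that _compile consumes the module declaration, the imports, the constant section and the variable section up front, and each section helper only advances the cursor when _tokenize_next reports the expected keyword. A rough C model of that control flow (illustrative names only; _tokenize_next is replaced by a simplified stand-in) might look like this:

/* Token codes mirror boot/definitions.inc; 0 stands for "not a keyword",
 * which is what _tokenize_next writes into the output slot by default. */
#include <stdio.h>
#include <string.h>

enum { TOKEN_NONE = 0, TOKEN_PROGRAM = 1, TOKEN_IMPORT = 2,
       TOKEN_CONST = 3, TOKEN_VAR = 4 };

/* Stand-in for _tokenize_next: a0 is the source cursor, a1 the output slot
 * for the token kind, and the return value is the advanced cursor. */
static const char *tokenize_next(const char *source, int *kind)
{
    while (*source == ' ' || *source == '\n') ++source;
    const char *start = source;
    while (*source != '\0' && *source != ' ' && *source != '\n') ++source;
    size_t length = (size_t)(source - start);

    *kind = TOKEN_NONE;
    if (length == 7 && memcmp(start, "program", 7) == 0) *kind = TOKEN_PROGRAM;
    if (length == 6 && memcmp(start, "import", 6) == 0) *kind = TOKEN_IMPORT;
    if (length == 5 && memcmp(start, "const", 5) == 0) *kind = TOKEN_CONST;
    if (length == 3 && memcmp(start, "var", 3) == 0) *kind = TOKEN_VAR;
    return source;
}

/* Mirrors _compile_constant_section/_compile_variable_section: peek at the
 * next token and leave the cursor untouched when the section is absent. */
static const char *compile_section(const char *position, int expected, const char *name)
{
    int kind = TOKEN_NONE;
    const char *next = tokenize_next(position, &kind);
    if (kind != expected)
        return position;
    printf("emitting %s section\n", name);
    return next;
}

int main(void)
{
    const char *cursor = "program const var";
    int kind = TOKEN_NONE;

    cursor = tokenize_next(cursor, &kind);            /* _compile_module_declaration */
    cursor = compile_section(cursor, TOKEN_IMPORT, "import");
    cursor = compile_section(cursor, TOKEN_CONST, ".rodata");
    cursor = compile_section(cursor, TOKEN_VAR, ".bss");
    return 0;
}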
@@ -1913,7 +1913,6 @@ _start:
     call _read_file
 
     mv a0, s1
-    call _tokenize
     call _main
     call _compile
 
diff --git a/boot/tokenizer.s b/boot/tokenizer.s
index 5570031..4315f66 100644
--- a/boot/tokenizer.s
+++ b/boot/tokenizer.s
@@ -1,4 +1,4 @@
-.global _tokenize, classification, transitions
+.global _tokenize_next, classification, transitions, keywords
 
 .section .rodata
 
@@ -24,8 +24,10 @@
 .equ CLASS_X, 0x0d
 .equ CLASS_EOF, 0x0e
 .equ CLASS_DOT, 0x0f
+.equ CLASS_MINUS, 0x10
+.equ CLASS_DOUBLE_QUOTE, 0x11
 
-.equ CLASS_COUNT, 16
+.equ CLASS_COUNT, 18
 
 .type classification, @object
 .size classification, 128
@@ -64,7 +66,7 @@ classification:
     .byte CLASS_INVALID # 1F US
     .byte CLASS_SPACE # 20 Space
     .byte CLASS_SINGLE # 21 !
-    .byte 0x00 # 22 "
+    .byte CLASS_DOUBLE_QUOTE # 22 "
     .byte 0x00 # 23 #
     .byte 0x00 # 24 $
     .byte CLASS_SINGLE # 25 %
@@ -75,7 +77,7 @@ classification:
     .byte CLASS_ASTERISK # 2A *
     .byte CLASS_SINGLE # 2B +
     .byte CLASS_SINGLE # 2C ,
-    .byte 0x00 # 2D -
+    .byte CLASS_MINUS # 2D -
     .byte CLASS_DOT # 2E .
     .byte CLASS_SINGLE # 2F /
     .byte CLASS_ZERO # 30 0
@@ -159,6 +161,67 @@ classification:
     .byte CLASS_SINGLE # 7E ~
     .byte CLASS_INVALID # 7F DEL
 
+#
+# Textual keywords in the language.
+#
+.equ KEYWORDS_COUNT, 26
+
+.type keywords, @object
+keywords:
+    .word 7
+    .ascii "program"
+    .word 6
+    .ascii "import"
+    .word 5
+    .ascii "const"
+    .word 3
+    .ascii "var"
+    .word 2
+    .ascii "if"
+    .word 4
+    .ascii "then"
+    .word 5
+    .ascii "elsif"
+    .word 4
+    .ascii "else"
+    .word 5
+    .ascii "while"
+    .word 2
+    .ascii "do"
+    .word 4
+    .ascii "proc"
+    .word 5
+    .ascii "begin"
+    .word 3
+    .ascii "end"
+    .word 4
+    .ascii "type"
+    .word 6
+    .ascii "record"
+    .word 5
+    .ascii "union"
+    .word 4
+    .ascii "true"
+    .word 5
+    .ascii "false"
+    .word 3
+    .ascii "nil"
+    .word 3
+    .ascii "xor"
+    .word 2
+    .ascii "or"
+    .word 6
+    .ascii "return"
+    .word 4
+    .ascii "cast"
+    .word 5
+    .ascii "defer"
+    .word 4
+    .ascii "case"
+    .word 2
+    .ascii "of"
+.size keywords, . - keywords
+
 .section .data
 
 # The transition table describes transitions from one state to another, given
@@ -173,58 +236,82 @@ classification:
 # It specifies the target state. "ff" means that this is an end state and no
 # transition is possible.
 # - The next byte is the action that should be performed when transitioning.
-# For the meaning of actions see labels in the _analyze_token function, which
+# For the meaning of actions see labels in the _tokenize_next function, which
 # handles each action.
 #
 .type transitions, @object
-.size transitions, 13 * CLASS_COUNT # state count * CLASS_COUNT
+.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT
 transitions:
     # Invalid Digit Alpha Space : = ( ) # * _ Single Hex 0 x NUL .
+ # - " .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107 .word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start + .word 0x010f, 0x0110 .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon + .word 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 02 Identifier + .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff + .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier + .word 0x05ff, 0x05ff .word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren + .word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk + .word 0x02ff, 0x02ff .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment + .word 0x0109, 0x0109 .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment + .word 0x0109, 0x0109 .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token + .word 0x02ff, 0x02ff .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero + .word 0x02ff, 0x02ff .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal + .word 0x00ff, 0x02ff .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x2ff, 0x02ff # 0e Dot + .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot + .word 0x02ff, 0x02ff + + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff + + .word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string. + .word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 + .word 0x0110, 0x04ff .section .text @@ -292,12 +379,42 @@ _next_state: addi sp, sp, 16 ret +# Takes an identifier and checks whether it's a keyword. +# +# Parameters: +# a0 - Token length. +# a1 - Token pointer. +# +# Sets a0 to the appropriate token type. 
+.type _classify_identifier, @function +_classify_identifier: + # Prologue. + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 + + mv a2, a0 + mv a3, a1 + li a0, KEYWORDS_COUNT + la a1, keywords + call _strings_index + + # Epilogue. + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 + ret + # Initializes the classification table. # # Paramaters: # a0 - Source text pointer. -.type _analyze_token, @function -_analyze_token: +# a1 - A pointer for output value, the token kind. 4 Bytes. +# +# Sets a0 to the position of the next token. +.type _tokenize_next, @function +_tokenize_next: # Prologue. addi sp, sp, -24 sw ra, 20(sp) @@ -311,7 +428,10 @@ _analyze_token: sw s2, 8(sp) # Preserve s2 containing the current state. li s2, 0x00 # Initial, start state. -.Lanalyze_token_loop: + sw a1, 0(sp) + sw zero, (a1) # Initialize. + +.Ltokenize_next_loop: mv a0, s2 lbu a1, (s1) call _next_state @@ -323,56 +443,43 @@ _analyze_token: and t1, a0, t0 # Transition action. srli t1, t1, 8 - + # Perform the provided action. li t0, 0x01 # Accumulate action. - beq t1, t0, .Lanalyze_token_accumulate + beq t1, t0, .Ltokenize_next_accumulate li t0, 0x02 # Print action. - beq t1, t0, .Lanalyze_token_print + beq t1, t0, .Ltokenize_next_print li t0, 0x03 # Skip action. - beq t1, t0, .Lanalyze_token_skip + beq t1, t0, .Ltokenize_next_skip li t0, 0x04 # Comment action. - beq t1, t0, .Lanalyze_token_comment + beq t1, t0, .Ltokenize_next_comment - /* DEBUG - mv s4, t1 - addi t1, t1, '0' - sb t1, 0(sp) - li t1, ' ' - sb t1, 1(sp) - addi t1, s2, '0' - sb t1, 2(sp) - addi a0, sp, 0 */ - sw s1, 0(sp) - addi a0, s1, 0 - li a1, 3 - call _write_error - /* mv t1, s4 - DEBUG */ + li t0, 0x05 # Finalize identifier. + beq t1, t0, .Ltokenize_next_identifier - j .Lanalyze_token_reject + j .Ltokenize_next_reject -.Lanalyze_token_reject: +.Ltokenize_next_reject: addi s1, s1, 1 - j .Lanalyze_token_end + j .Ltokenize_next_end -.Lanalyze_token_accumulate: +.Ltokenize_next_accumulate: addi s1, s1, 1 - j .Lanalyze_token_loop + j .Ltokenize_next_loop -.Lanalyze_token_skip: +.Ltokenize_next_skip: addi s1, s1, 1 lw t0, 4(sp) addi t0, t0, 1 sw t0, 4(sp) - j .Lanalyze_token_loop + j .Ltokenize_next_loop -.Lanalyze_token_print: +.Ltokenize_next_print: /* DEBUG lw a0, 4(sp) mv a1, s1 @@ -380,9 +487,9 @@ _analyze_token: call _write_error DEBUG */ - j .Lanalyze_token_end + j .Ltokenize_next_end -.Lanalyze_token_comment: +.Ltokenize_next_comment: addi s1, s1, 1 /* DEBUG @@ -392,9 +499,20 @@ _analyze_token: call _write_error DEBUG */ - j .Lanalyze_token_end + j .Ltokenize_next_end -.Lanalyze_token_end: +.Ltokenize_next_identifier: + # An identifier can be a textual keyword. + # Check the kind of the token and write it into the output parameter. + lw a1, 4(sp) + sub a0, s1, a1 + call _classify_identifier + lw a1, 0(sp) + sw a0, (a1) + + j .Ltokenize_next_end + +.Ltokenize_next_end: mv a0, s1 # Return the advanced text pointer. # Restore saved registers. @@ -406,27 +524,3 @@ _analyze_token: lw s0, 16(sp) addi sp, sp, 24 ret - -# Initializes the lookup tables. -# -# Parameters: -# a0 - Source text pointer. -.type _tokenize, @function -_tokenize: - # Prologue. - addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 - -.Ltokenize_loop: - call _analyze_token - - lw t0, (a0) - bnez t0, .Ltokenize_loop - - # Epilogue. - lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 - ret
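Aside, not part of the patch: _classify_identifier delegates to the new _strings_index helper, which walks the keywords object as a packed sequence of (32-bit length, bytes) records and returns a 1-based hit index, so 0 can keep meaning "plain identifier" and a hit lines up with the TOKEN_* constants in boot/definitions.inc. A rough C model of that lookup (illustrative names; the byte layout assumes a little-endian target such as RV32) could read:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t strings_index(uint32_t count, const unsigned char *haystack,
                              uint32_t needle_length, const char *needle)
{
    for (uint32_t index = 1; count != 0; ++index, --count) {
        uint32_t element_length;
        memcpy(&element_length, haystack, sizeof element_length);

        if (element_length == needle_length &&
            memcmp(haystack + 4, needle, needle_length) == 0)
            return index;                /* 1-based position of the match. */

        haystack += 4 + element_length;  /* Skip length prefix and payload. */
    }
    return 0;                            /* Needle not present. */
}

int main(void)
{
    /* Two records, mimicking the keywords layout:
     * .word 5; .ascii "const"; .word 3; .ascii "var" (little-endian lengths). */
    static const unsigned char table[] = {
        5, 0, 0, 0, 'c', 'o', 'n', 's', 't',
        3, 0, 0, 0, 'v', 'a', 'r'
    };
    printf("%u\n", (unsigned)strings_index(2, table, 3, "var"));  /* 2 */
    printf("%u\n", (unsigned)strings_index(2, table, 2, "if"));   /* 0 */
    return 0;
}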