From 40701008f04f2242ab69bfb4bc4f376e6b75429a Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Tue, 6 May 2025 23:58:46 +0200 Subject: [PATCH] Replace _read_token with the lexer --- boot/stage1.s | 242 ++++++++++++++--------------------------------- boot/stage2.elna | 89 +---------------- boot/tokenizer.s | 36 +++---- 3 files changed, 91 insertions(+), 276 deletions(-) diff --git a/boot/stage1.s b/boot/stage1.s index b32815d..525da11 100644 --- a/boot/stage1.s +++ b/boot/stage1.s @@ -409,11 +409,15 @@ _build_expression: mv a0, s1 addi a1, sp, 24 call _tokenize_next + sw a0, 20(sp) - call _skip_spaces - call _read_token - sw s1, 32(sp) - sw a0, 28(sp) + /* DEBUG + lw a0, 32(sp) + lw a1, 28(sp) + call _write_error + lw a0, 28(sp) + li a1, 8 + call _write_error */ lw a0, 24(sp) @@ -423,14 +427,15 @@ _build_expression: li t0, TOKEN_AT beq a0, t0, .Lbuild_expression_address - lbu a0, (s1) - call _is_digit - bnez a0, .Lbuild_expression_literal + li t0, TOKEN_INTEGER + beq a0, t0, .Lbuild_expression_literal - lbu a0, (s1) + lw a0, 32(sp) + lbu a0, (a0) li t0, '_' beq a0, t0, .Lbuild_expression_call + lw s1, 32(sp) lw a0, 28(sp) lw a1, 36(sp) call _compile_identifier_expression @@ -438,7 +443,7 @@ _build_expression: j .Lbuild_expression_advance .Lbuild_expression_negate: - addi s1, s1, 1 # Skip the -. + lw s1, 20(sp) # Skip the -. mv a0, zero call _build_expression @@ -446,7 +451,7 @@ _build_expression: li a1, ASM_NEG_A0_SIZE call _write_out - j .Lbuild_expression_advance + j .Lbuild_expression_end .Lbuild_expression_address: lw t1, 36(sp) @@ -463,10 +468,10 @@ _build_expression: li a1, 13 call _write_out - addi s1, s1, 1 # Skip @. - call _read_token - sw s1, 32(sp) - sw a0, 28(sp) + lw a0, 20(sp) # Skip @. + addi a1, sp, 24 + call _tokenize_next + mv s1, a0 lw a0, 32(sp) lw a1, 28(sp) @@ -477,13 +482,16 @@ _build_expression: li a0, '\n' call _put_char - j .Lbuild_expression_advance + j .Lbuild_expression_end .Lbuild_expression_call: + lw a0, 20(sp) + addi a1, sp, 8 + call _tokenize_next + mv s1, a0 + lw a0, 32(sp) lw a1, 28(sp) - add s1, s1, a1 - addi s1, s1, 1 call _compile_call j .Lbuild_expression_end @@ -509,8 +517,7 @@ _build_expression: j .Lbuild_expression_advance .Lbuild_expression_advance: - lw a0, 28(sp) - add s1, s1, a0 + lw s1, 20(sp) .Lbuild_expression_end: # Epilogue. @@ -747,102 +754,6 @@ _compile_call: addi sp, sp, 32 ret -# Reads a token and returns its length in a0. -# _read_token doesn't change s1, it finds the length of the token s1 is pointing to. -.type _read_token, @function -_read_token: - # Prologue. - addi sp, sp, -16 - sw ra, 12(sp) - sw s0, 8(sp) - addi s0, sp, 16 - - lbu t0, (s1) # t0 = Current character. - sw zero, 4(sp) - - li t1, '.' - beq t0, t1, .Ltoken_character_single - - li t1, ',' - beq t0, t1, .Ltoken_character_single - - li t1, ':' - beq t0, t1, .Ltoken_character_colon - - li t1, ';' - beq t0, t1, .Ltoken_character_single - - li t1, '(' - beq t0, t1, .Ltoken_character_single - - li t1, ')' - beq t0, t1, .Ltoken_character_single - - li t1, '[' - beq t0, t1, .Ltoken_character_single - - li t1, ']' - beq t0, t1, .Ltoken_character_single - - li t1, '^' - beq t0, t1, .Ltoken_character_single - - li t1, '&' - beq t0, t1, .Ltoken_character_single - - li t1, '=' - beq t0, t1, .Ltoken_character_single - - li t1, '+' - beq t0, t1, .Ltoken_character_single - - li t1, '-' - beq t0, t1, .Ltoken_character_single - - li t1, '*' - beq t0, t1, .Ltoken_character_single - - li t1, '@' - beq t0, t1, .Ltoken_character_single -# Expect an identifier or a number. -.Ltoken_character_loop_do: - lw t6, 4(sp) - add t1, s1, t6 - lbu a0, (t1) # a0 = Current character. - - call _is_alnum - - beqz a0, .Ltoken_character_end - lw t6, 4(sp) - addi t6, t6, 1 - sw t6, 4(sp) - j .Ltoken_character_loop_do - -.Ltoken_character_single: - lw t6, 4(sp) - addi t6, t6, 1 - sw t6, 4(sp) - j .Ltoken_character_end - -.Ltoken_character_colon: - lbu t0, 1(s1) # t0 = The character after the colon. - lw t6, 4(sp) - addi t6, t6, 1 - sw t6, 4(sp) - - li t1, '=' - beq t0, t1, .Ltoken_character_single - j .Ltoken_character_end - -.Ltoken_character_end: - lw a0, 4(sp) - - # Epilogue. - lw ra, 12(sp) - lw s0, 8(sp) - addi sp, sp, 16 - ret - # Skips the spaces till the next non space character. .type _skip_spaces, @function _skip_spaces: @@ -911,15 +822,14 @@ _skip_comment: addi sp, sp, 16 ret -# Parameters: -# a0 - Line length. +# Walks through the procedure definitions. .type _compile_procedure_section, @function _compile_procedure_section: # Prologue. - addi sp, sp, -24 - sw ra, 20(sp) - sw s0, 16(sp) - addi s0, sp, 24 + addi sp, sp, -32 + sw ra, 28(sp) + sw s0, 24(sp) + addi s0, sp, 32 .Lcompile_procedure_section_loop: call _skip_spaces @@ -938,9 +848,9 @@ _compile_procedure_section: .Lcompile_procedure_section_end: # Epilogue. - lw ra, 20(sp) - lw s0, 16(sp) - addi sp, sp, 24 + lw ra, 28(sp) + lw s0, 24(sp) + addi sp, sp, 32 ret .type _compile_module_declaration, @function @@ -970,10 +880,10 @@ _compile_module_declaration: .type _compile_constant_section, @function _compile_constant_section: # Prologue. - addi sp, sp, -24 - sw ra, 20(sp) - sw s0, 16(sp) - addi s0, sp, 24 + addi sp, sp, -32 + sw ra, 28(sp) + sw s0, 24(sp) + addi s0, sp, 32 mv a0, s1 addi a1, sp, 4 @@ -988,19 +898,24 @@ _compile_constant_section: call _write_out .Lcompile_constant_section_item: - call _skip_spaces - lbu a0, (s1) - call _is_upper - beqz a0, .Lcompile_constant_section_end + mv a0, s1 + addi a1, sp, 12 + call _tokenize_next + + lw t0, 12(sp) + li t1, TOKEN_IDENTIFIER + + bne t0, t1, .Lcompile_constant_section_end + lw s1, 20(sp) call _compile_constant j .Lcompile_constant_section_item .Lcompile_constant_section_end: # Epilogue. - lw ra, 20(sp) - lw s0, 16(sp) - addi sp, sp, 24 + lw ra, 28(sp) + lw s0, 24(sp) + addi sp, sp, 32 ret .type _compile_constant, @function @@ -1042,7 +957,6 @@ _compile_constant: li a0, '\n' call _put_char - call _skip_spaces # Epilogue. lw ra, 28(sp) @@ -1053,10 +967,10 @@ _compile_constant: .type _compile_variable_section, @function _compile_variable_section: # Prologue. - addi sp, sp, -24 - sw ra, 20(sp) - sw s0, 16(sp) - addi s0, sp, 24 + addi sp, sp, -32 + sw ra, 28(sp) + sw s0, 24(sp) + addi s0, sp, 32 mv a0, s1 addi a1, sp, 4 @@ -1071,19 +985,23 @@ _compile_variable_section: call _write_out .Lcompile_variable_section_item: - call _skip_spaces - lbu a0, (s1) - call _is_lower - beqz a0, .Lcompile_variable_section_end + mv a0, s1 + addi a1, sp, 12 + call _tokenize_next + lw t0, 12(sp) + li t1, TOKEN_IDENTIFIER + + bne t0, t1, .Lcompile_variable_section_end + lw s1, 20(sp) # Advance to the beginning of the variable name. call _compile_variable j .Lcompile_variable_section_item .Lcompile_variable_section_end: # Epilogue. - lw ra, 20(sp) - lw s0, 16(sp) - addi sp, sp, 24 + lw ra, 28(sp) + lw s0, 24(sp) + addi sp, sp, 32 ret # Compile a global variable. @@ -1111,18 +1029,6 @@ _compile_variable: call _tokenize_next # Skip the type. mv s1, a0 - /* DEBUG - lw a0, 24(sp) - add a0, a0, '0' - sw a0, 24(sp) - addi a0, sp, 24 - li a1, 1 - call _write_error - lw a0, 28(sp) - li a1, 8 - call _write_error - */ - # .type identifier, @object la a0, asm_type li a1, ASM_TYPE_SIZE @@ -1478,10 +1384,6 @@ _compile_statement: j .Lcompile_statement_end .Lcompile_statement_end: - sw a0, 12(sp) - call _skip_comment - lw a0, 12(sp) - # Epilogue. lw ra, 28(sp) lw s0, 24(sp) @@ -1492,19 +1394,19 @@ _compile_statement: .type _compile_text_section, @function _compile_text_section: # Prologue. - addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 + addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 la a0, section_text li a1, SECTION_TEXT_SIZE call _write_out # Epilogue. - lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 ret .type _compile_entry_point, @function diff --git a/boot/stage2.elna b/boot/stage2.elna index 50f1303..62bd307 100644 --- a/boot/stage2.elna +++ b/boot/stage2.elna @@ -183,7 +183,6 @@ begin goto .Lcompile_identifier_expression_end end - (* Global identifier. *); loca8 := 0x6120616c; _write_out(@loca8, 4); loca8 := 0x00202c00 or loca80; @@ -208,11 +207,6 @@ begin .Lcompile_identifier_expression_end end -(* -Evalutes an expression and saves the result in a0. - -a0 - X in aX, the register number to save the result. -*) proc _build_expression() var loca0, loca20, loca28, loca8: Word @@ -305,13 +299,6 @@ begin .Lbuild_expression_end end -(* -Compiles an lvalue. - -Parameters: -a0 - Pointer to the identifier. -a1 - Identifier length. -*) proc _compile_designator_expression(loca84: ^Byte, loca80: Word) var loca0: Word @@ -352,12 +339,6 @@ begin .Lcompile_designator_expression_end end -(* -Compiles a statement beginning with an identifier. - -Left values should be variables named "loca n", where n is the offset -of the variable on the stack, like loca8 or loca4. -*) proc _compile_identifier() var loca0, loca16, loca8: Word @@ -393,13 +374,6 @@ begin .Lcompile_identifier_end end -(* -Compiles a procedure call. Expects s1 to point to the first argument. -a0 - Pointer to the procedure name. -a1 - Length of the procedure name. - -Returns the procedure result in a0. -*) proc _compile_call(loca84: ^Byte, loca80: Word) var loca0, loca4, loca12: Word @@ -422,10 +396,6 @@ begin loca0 := 0x202c30; _write_out(@loca0, 3); - (* - Only 6 arguments are supported with a0-a5. - Save all arguments on the stack so they aren't overriden afterwards. - *) loca0 := -4 * loca12; loca0 := loca0 + 60; _printi(loca0); @@ -450,10 +420,7 @@ begin loca12 := 0; .Lcompile_call_restore; - (* - Just go through all a0-a5 registers and read them from stack. - If this stack value contains garbage, the procedure just shouldn't use it. - *) + loca0 := 0x6120776c; _write_out(@loca0, 4); loca4 := 0x36202c30; @@ -514,10 +481,6 @@ begin _advance(1) end -(* -Reads a token and returns its length in a0. -_read_token doesn't change s1, it finds the length of the token s1 is pointing to. -*) proc _read_token() var loca0, loca4: Word @@ -612,7 +575,6 @@ begin return loca4 end -(* Skips the spaces till the next non space character. *) proc _skip_spaces() var loca0: Byte @@ -643,10 +605,6 @@ begin .Lspace_loop_end end -(* -Parameters: - a0 - Line length. -*) proc _skip_comment(loca84: Word) var loca0: ^Byte @@ -684,10 +642,6 @@ begin .Lskip_comment_end end -(* -Parameters: - a0 - Line length. -*) proc _compile_assembly(loca84: Word) var loca0: ^Byte begin @@ -846,7 +800,6 @@ begin loca0 := 0x0a74; _write_out(@loca0, 2); - (* .size identifier, size *); loca0 := 0x7a69732e; _write_out(@loca0, 4); loca0 := 0x2065; @@ -882,7 +835,6 @@ begin loca20 := _current(); _advance(loca16); - (* .type identifier, @function *); loca0 := 0x7079742e; _write_out(@loca0, 4); loca0 := 0x2065; @@ -910,10 +862,6 @@ begin loca12 := 0x6e; loca8 := 0x69676562; - (* - Skip all declarations until we find the "begin" keyword, denoting the - beginning of the procedure body. - *) .Lcompile_procedure_begin; _skip_spaces(); loca0 := _read_token(); @@ -1056,15 +1004,6 @@ begin _write_out(@loca0, 4) end -(* -Compares two string, which of one has a length, the other one is null-terminated. - - a0 - The address of the token string. - a1 - The length of the string in a0. - a2 - The address of the null-terminated string. - -If the strings match sets a0 to 0, otherwise sets it to 1. -*) proc _token_compare(loca84: ^Byte, loca80: Word, loca76: ^Byte) var loca0: Bool @@ -1074,10 +1013,6 @@ begin .Ltoken_compare_loop; loca4 := _front(loca76); - (* - Will only be 0 if the current character in the null terminated string is \0 and the remaining length of the - another string is 0. - *) loca8 := loca4 or loca80; if loca8 = 0 then goto .Ltoken_compare_equal @@ -1135,7 +1070,6 @@ begin _put_char(0x0a) end -(* a0 - Line length. *) proc _compile_label(loca84: Word) var loca0: Word @@ -1187,7 +1121,6 @@ begin _write_out(@loca12, 4); _put_char(0x20); - (* Write the label *); _write_out(@loca16, 4); _printi(s2); @@ -1213,20 +1146,10 @@ begin loca12 := 0x0a3a0a3a; _write_out(@loca12, 2); - (* Increment the label counter. *); s2 := s2 + 1; _advance(4) end -(* -Parameters: - a0 - Line length. - a1 - Whether the section header was already emitted. If not it should be - emitted before any code is written. - -Returns 1 in a0 if the parsed line contained a text section element such a -procedure or the program entry point. Otherwise sets a0 to 0. -*) proc _compile_line(loca84: Word, loca80: Bool) var loca0: Char @@ -1407,7 +1330,6 @@ begin return loca8 end -(* Prints ".section .text" and exits. *) proc _compile_text_section() var loca0: Word begin @@ -1466,7 +1388,6 @@ begin _skip_spaces() end -(* Finds the end of the line and returns its length in a0. *) proc _read_line() var loca0: ^Byte @@ -1520,23 +1441,16 @@ begin .Lcompile_end end -(* Returns the pointer to the current position in the source text in a0. *) proc _current() begin return s1 end -(* a0 is the number of bytes to advance in the source text. *) proc _advance(loca84: Word) begin s1 := s1 + loca84 end -(* -a0 - Pointer to an array to get the first element. - -Returns the first character in the remaining source text. -*) proc _front(loca84: ^Word) begin return _get(loca84) & 0xff @@ -1549,7 +1463,6 @@ begin s2 := 1 end -(* Entry point. *) begin _main(); _compile() diff --git a/boot/tokenizer.s b/boot/tokenizer.s index bf8e443..647a3b6 100644 --- a/boot/tokenizer.s +++ b/boot/tokenizer.s @@ -268,7 +268,7 @@ transitions: .word 0x08ff, 0x0103, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x0103, 0x00ff, 0x08ff, 0x08ff - .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x03 Integer + .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x03 Decimal .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff @@ -286,9 +286,9 @@ transitions: .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less - .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x08 Dot + .word 0x08ff, 0x0108, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff + .word 0x08ff, 0x00ff, 0x08ff, 0x0108, 0x0108, 0x00ff, 0x08ff, 0x08ff + .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x08 Hexadecimal after 0x. .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 @@ -302,13 +302,13 @@ transitions: .word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 .word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String - .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x0c Zero + .word 0x08ff, 0x00ff, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff + .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x00ff, 0x010d, 0x08ff, 0x08ff + .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x0c Leading zero - .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff - .word 0x00ff, 0x02ff, 0x02ff, 0x02ff # 0x0d Hexadecimal + .word 0x00ff, 0x0108, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff + .word 0x00ff, 0x00ff, 0x00ff, 0x0108, 0x0108, 0x00ff, 0x00ff, 0x00ff + .word 0x00ff, 0x00ff, 0x00ff, 0x00ff # 0x0d Starting hexadecimal .section .text @@ -592,14 +592,14 @@ _tokenize_next: j .Ltokenize_next_end .Ltokenize_next_integer: - lw a1, 12(sp) - sub a0, s1, a1 - sw a0, 8(sp) - sw a0, 4(sp) - lw a0, 0(sp) - addi a1, sp, 4 - li a2, 12 - call _memcpy + lw t0, 0(sp) + li t1, TOKEN_INTEGER + sw t1, 0(t0) + lw t1, 12(sp) + sw t1, 8(t0) + sub t1, s1, t1 + sw t1, 4(t0) + j .Ltokenize_next_end .Ltokenize_next_end: