From 0a0bc4e1f2e263cdda83bd78a2a77ff68ae2334e Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Sat, 3 May 2025 23:35:41 +0200 Subject: [PATCH] Tokenize single character symbols --- Gemfile | 3 + Rakefile | 3 + boot/common-boot.s | 4 + boot/definitions.inc | 24 +++ boot/stage1.s | 369 +++++++++++++------------------------------ boot/tokenizer.s | 223 +++++++++++++++++--------- 6 files changed, 291 insertions(+), 335 deletions(-) diff --git a/Gemfile b/Gemfile index 235744c..97bdde4 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,6 @@ +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. # frozen_string_literal: true source 'https://rubygems.org' diff --git a/Rakefile b/Rakefile index 2bc2683..71b3293 100644 --- a/Rakefile +++ b/Rakefile @@ -1,3 +1,6 @@ +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. # frozen_string_literal: true require 'open3' diff --git a/boot/common-boot.s b/boot/common-boot.s index 26dad8d..a6fb04e 100644 --- a/boot/common-boot.s +++ b/boot/common-boot.s @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + .global _is_alpha, _is_digit, _is_alnum, _is_upper, _is_lower .global _write_out, _read_file, _write_error, _put_char, _printi .global _get, _memcmp, _memchr, _memmem, _memcpy diff --git a/boot/definitions.inc b/boot/definitions.inc index 0e2f54e..4d8ab9c 100644 --- a/boot/definitions.inc +++ b/boot/definitions.inc @@ -1,3 +1,7 @@ +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. 
If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + # The constant should match the index in the keywords array in tokenizer.s. .equ TOKEN_PROGRAM, 1 @@ -26,3 +30,23 @@ .equ TOKEN_DEFER, 24 .equ TOKEN_CASE, 25 .equ TOKEN_OF, 26 + +# The constant should match the character index in the byte_keywords string. + +.equ TOKEN_AND, 27 +.equ TOKEN_DOT, 28 +.equ TOKEN_COMMA, 29 +.equ TOKEN_COLON, 30 +.equ TOKEN_SEMICOLON, 31 +.equ TOKEN_LEFT_PAREN, 32 +.equ TOKEN_RIGHT_PAREN, 33 +.equ TOKEN_LEFT_BRACKET, 34 +.equ TOKEN_RIGHT_BRACKET, 35 +.equ TOKEN_HAT, 36 +.equ TOKEN_EQUALS, 37 +.equ TOKEN_PLUS, 38 +.equ TOKEN_MINUS, 39 +.equ TOKEN_ASTERISK, 40 +.equ TOKEN_AT, 41 + +.equ TOKEN_ASSIGN, 42 diff --git a/boot/stage1.s b/boot/stage1.s index 9ab072d..6761bb2 100644 --- a/boot/stage1.s +++ b/boot/stage1.s @@ -1,6 +1,10 @@ +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + .global _start # Program entry point. -# Global variables or registers. +# Registers used as global variables: # s1 - Contains the current position in the source text. # s2 - Label counter. @@ -42,6 +46,10 @@ asm_neg_a0: .ascii "neg a0, a0\n" .equ ASM_NEG_A0_SIZE, . - asm_neg_a0 asm_type: .ascii ".type " .equ ASM_TYPE_SIZE, . - asm_type +asm_type_function: .ascii ", @function\n" +.equ ASM_TYPE_FUNCTION_SIZE, . - asm_type_function +asm_type_object: .ascii ", @object\n" +.equ ASM_TYPE_OBJECT_SIZE, . - asm_type_object asm_restore_parameters: .ascii "lw a0, 60(sp)\nlw a1, 56(sp)\nlw a2, 52(sp)\nlw a3, 48(sp)\nlw a4, 44(sp)\nlw a5, 40(sp)\n" .equ ASM_RESTORE_PARAMETERS_SIZE, . 
- asm_restore_parameters @@ -77,14 +85,6 @@ _compile_import: call _tokenize_next mv s1, a0 - /* DEBUG - lw t0, 0(sp) - addi t0, t0, '0' - sw t0, 4(sp) - addi a0, sp, 4 - li a1, 1 - call _write_error*/ - j .Lcompile_import_loop .Lcompile_import_end: @@ -104,63 +104,35 @@ _build_binary_expression: li a0, 0 call _build_expression - call _skip_spaces - call _read_token - sw a0, 20(sp) - li t0, '&' - sw t0, 16(sp) mv a0, s1 - lw a1, 20(sp) - addi a2, sp, 16 - call _token_compare - beqz a0, .L_build_binary_expression_and + addi a1, sp, 16 + call _tokenize_next + lw t0, 16(sp) - li t0, 0x726f # or - sw t0, 16(sp) - mv a0, s1 - lw a1, 20(sp) - addi a2, sp, 16 - call _token_compare - beqz a0, .L_build_binary_expression_or + li t1, TOKEN_AND + beq t0, t1, .L_build_binary_expression_and - li t0, '=' - sw t0, 16(sp) - mv a0, s1 - lw a1, 20(sp) - addi a2, sp, 16 - call _token_compare - beqz a0, .L_build_binary_expression_equal + li t1, TOKEN_OR + beq t0, t1, .L_build_binary_expression_or - li t0, '+' - sw t0, 16(sp) - mv a0, s1 - lw a1, 20(sp) - addi a2, sp, 16 - call _token_compare - beqz a0, .L_build_binary_expression_plus + li t1, TOKEN_PLUS + beq t0, t1, .L_build_binary_expression_plus - li t0, '-' - sw t0, 16(sp) - mv a0, s1 - lw a1, 20(sp) - addi a2, sp, 16 - call _token_compare - beqz a0, .L_build_binary_expression_minus + li t1, TOKEN_EQUALS + beq t0, t1, .L_build_binary_expression_equal - li t0, '*' - sw t0, 16(sp) - mv a0, s1 - lw a1, 20(sp) - addi a2, sp, 16 - call _token_compare - beqz a0, .L_build_binary_expression_product + li t1, TOKEN_ASTERISK + beq t0, t1, .L_build_binary_expression_product + + li t1, TOKEN_MINUS + beq t0, t1, .L_build_binary_expression_minus j .Lbuild_binary_expression_end .L_build_binary_expression_equal: - addi s1, s1, 1 # Skip =. + mv s1, a0 # Skip =. li a0, 1 call _build_expression la a0, asm_sub_a0_a1 @@ -174,7 +146,12 @@ _build_binary_expression: j .Lbuild_binary_expression_end .L_build_binary_expression_and: - addi s1, s1, 1 # Skip &. 
+ /* DEBUG + addi a0, s1, 0 + li a1, 4 + call _write_error */ + + mv s1, a0 # Skip &. li a0, 1 call _build_expression la a0, asm_and_a0_a1 @@ -184,7 +161,7 @@ _build_binary_expression: j .Lbuild_binary_expression_end .L_build_binary_expression_or: - addi s1, s1, 2 # Skip or. + mv s1, a0 # Skip or. li a0, 1 call _build_expression la a0, asm_or_a0_a1 @@ -194,7 +171,7 @@ _build_binary_expression: j .Lbuild_binary_expression_end .L_build_binary_expression_plus: - addi s1, s1, 1 # Skip +. + mv s1, a0 # Skip +. li a0, 1 call _build_expression la a0, asm_add_a0_a1 @@ -204,7 +181,7 @@ _build_binary_expression: j .Lbuild_binary_expression_end .L_build_binary_expression_minus: - addi s1, s1, 1 # Skip -. + mv s1, a0 # Skip -. li a0, 1 call _build_expression la a0, asm_sub_a0_a1 @@ -214,7 +191,7 @@ _build_binary_expression: j .Lbuild_binary_expression_end .L_build_binary_expression_product: - addi s1, s1, 1 # Skip *. + mv s1, a0 # Skip *. li a0, 1 call _build_expression la a0, asm_mul_a0_a1 @@ -937,29 +914,31 @@ _skip_comment: # Parameters: # a0 - Line length. -.type _compile_assembly, @function -_compile_assembly: +.type _compile_procedure_section, @function +_compile_procedure_section: # Prologue. addi sp, sp, -16 sw ra, 12(sp) sw s0, 8(sp) addi s0, sp, 16 - sw a0, 4(sp) # a0 - Line length. +.Lcompile_procedure_section_loop: + call _skip_spaces + call _skip_comment + call _skip_spaces - # Write the source to the standard output. mv a0, s1 - lw a1, 4(sp) - call _write_out + addi a1, sp, 0 + call _tokenize_next + li t0, TOKEN_PROC + lw t1, 0(sp) + bne t0, t1, .Lcompile_procedure_section_end - lw t0, 4(sp) - add s1, s1, t0 + call _compile_procedure - li a0, '\n' - call _put_char - - addi s1, s1, 1 # Skip the new line. + j .Lcompile_procedure_section_loop +.Lcompile_procedure_section_end: # Epilogue. 
lw ra, 12(sp) lw s0, 8(sp) @@ -1038,15 +1017,19 @@ _compile_constant: sw s0, 8(sp) addi s0, sp, 16 - call _read_token + mv a0, s1 + addi a1, sp, 0 + call _tokenize_next - mv a1, a0 # The identifier length from _read_token should be in a1. - mv a0, s1 # Save the identifier pointer before advancing it. - add s1, s1, a1 + sub a1, a0, s1 # a0 is the identifier end returned by _tokenize_next; a1 = identifier length. + mv a0, s1 # Save the identifier pointer before advancing it. + add s1, s1, a1 call _write_out - call _skip_spaces - addi s1, s1, 2 # Skip the assignment sign. + mv a0, s1 + addi a1, sp, 0 + call _tokenize_next + mv s1, a0 # Skip the assignment sign. # : .long li t0, 0x20676e6f # ong_ @@ -1154,42 +1137,10 @@ _compile_variable: lw a1, 24(sp) call _write_out - li t0, 0x0a74 # t\n - sw t0, 12(sp) - li t0, 0x63656a62 # bjec - sw t0, 8(sp) - li t0, 0x6f40202c # , @o - sw t0, 4(sp) - addi a0, sp, 4 - li a1, 10 + la a0, asm_type_object + li a1, ASM_TYPE_OBJECT_SIZE call _write_out - # .size identifier, size - li t0, 0x2065 # e_ - sw t0, 12(sp) - li t0, 0x7a69732e # .siz - sw t0, 8(sp) - addi a0, sp, 8 - li a1, 6 - call _write_out - - lw a0, 28(sp) - lw a1, 24(sp) - call _write_out - - li t0, 0x202c # ,_ - sw t0, 12(sp) - addi a0, sp, 12 - li a1, 2 - call _write_out - - lw a0, 20(sp) - lw a1, 16(sp) - call _write_out - - li a0, '\n' - call _put_char - # identifier: .zero size lw a0, 28(sp) lw a1, 24(sp) @@ -1239,14 +1190,8 @@ _compile_procedure: lw a1, 16(sp) call _write_out - li t0, 0x0a6e6f69 # ion\n - sw t0, 12(sp) - li t0, 0x74636e75 # unct - sw t0, 8(sp) - li t0, 0x6640202c # , @f - sw t0, 4(sp) - addi a0, sp, 4 - li a1, 12 + la a0, asm_type_function + li a1, ASM_TYPE_FUNCTION_SIZE call _write_out lw a0, 20(sp) @@ -1356,7 +1301,7 @@ _compile_procedure: beqz a0, .Lcompile_procedure_end lw a0, 12(sp) - call _compile_line + call _compile_statement j .Lcompile_procedure_body .Lcompile_procedure_end: @@ -1577,7 +1522,7 @@ _compile_if: call _read_line li a1, 1 - call _compile_line +
call _compile_statement j .Lcompile_if_loop @@ -1614,8 +1559,8 @@ _compile_if: # # Returns 1 in a0 if the parsed line contained a text section element such a # procedure or the program entry point. Otherwise sets a0 to 0. -.type _compile_line, @function -_compile_line: +.type _compile_statement, @function +_compile_statement: # Prologue. addi sp, sp, -32 sw ra, 28(sp) @@ -1626,45 +1571,17 @@ _compile_line: sw a0, 20(sp) sw a1, 16(sp) - beqz a0, .Lcompile_line_empty # Skip an empty line. - - lbu t0, (s1) - li t1, '(' - beq t0, t1, .Lcompile_line_comment - - li t0, 0x636f7270 # proc - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_procedure - - li t0, 0x69676562 # begi - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_begin - - li t0, 0x2e646e65 # end. - sw t0, 12(sp) - mv a0, s1 - addi a1, sp, 12 - li a2, 4 - call _memcmp - beqz a0, .Lcompile_line_exit + call _skip_comment mv a0, s1 lw a1, 20(sp) call _is_local_identifier - bnez a0, .Lcompile_line_identifier + bnez a0, .Lcompile_statement_identifier mv a0, s1 li a1, 2 call _is_register_identifier - bnez a0, .Lcompile_line_identifier + bnez a0, .Lcompile_statement_identifier li t0, 0x6f746f67 # goto sw t0, 12(sp) @@ -1672,7 +1589,7 @@ _compile_line: addi a1, sp, 12 li a2, 4 call _memcmp - beqz a0, .Lcompile_line_goto + beqz a0, .Lcompile_statement_goto li t0, 0x75746572 # retu sw t0, 12(sp) @@ -1680,7 +1597,7 @@ _compile_line: addi a1, sp, 12 li a2, 4 call _memcmp - beqz a0, .Lcompile_line_return + beqz a0, .Lcompile_statement_return li t0, 0x6669 # if sw t0, 12(sp) @@ -1688,77 +1605,42 @@ _compile_line: addi a1, sp, 12 li a2, 2 call _memcmp - beqz a0, .Lcompile_line_if + beqz a0, .Lcompile_statement_if lbu t0, (s1) li t1, '.' 
- beq t0, t1, .Lcompile_line_label + beq t0, t1, .Lcompile_statement_label li t1, '_' - beq t0, t1, .Lcompile_line_identifier + beq t0, t1, .Lcompile_statement_identifier - j .Lcompile_line_unchanged # Else. + j .Lcompile_statement_empty # Else. -.Lcompile_line_if: +.Lcompile_statement_if: call _compile_if - j .Lcompile_line_section + j .Lcompile_statement_end -.Lcompile_line_label: +.Lcompile_statement_label: lw a0, 20(sp) call _compile_label - j .Lcompile_line_section + j .Lcompile_statement_end -.Lcompile_line_return: +.Lcompile_statement_return: call _compile_return - j .Lcompile_line_section + j .Lcompile_statement_end -.Lcompile_line_goto: +.Lcompile_statement_goto: call _compile_goto - j .Lcompile_line_section + j .Lcompile_statement_end -.Lcompile_line_identifier: +.Lcompile_statement_identifier: call _compile_identifier - j .Lcompile_line_section + j .Lcompile_statement_end -.Lcompile_line_exit: - call _compile_exit - j .Lcompile_line_section - -.Lcompile_line_begin: - lw a1, 16(sp) - bnez a1, .Lcompile_line_compile_entry - call _compile_text_section -.Lcompile_line_compile_entry: - call _compile_entry_point - li a0, 1 - j .Lcompile_line_end - -.Lcompile_line_procedure: - lw a1, 16(sp) - bnez a1, .Lcompile_line_compile_procedure - call _compile_text_section -.Lcompile_line_compile_procedure: - call _compile_procedure - li a0, 1 - j .Lcompile_line_end - -.Lcompile_line_comment: - lw a0, 20(sp) - call _skip_comment - j .Lcompile_line_section - -.Lcompile_line_empty: +.Lcompile_statement_empty: addi s1, s1, 1 - j .Lcompile_line_section + j .Lcompile_statement_end -.Lcompile_line_unchanged: - lw a0, 20(sp) - call _compile_assembly - j .Lcompile_line_section - -.Lcompile_line_section: - mv a0, zero - -.Lcompile_line_end: +.Lcompile_statement_end: sw a0, 12(sp) call _skip_spaces call _skip_comment @@ -1804,20 +1686,25 @@ _compile_entry_point: addi s1, s1, 6 # Skip begin\n. - # Epilogue. 
- lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 - ret + # Compile statements until a line that starts with "end." is reached. +.Lcompile_entry_point_body: + call _skip_spaces + call _read_line + sw a0, 12(sp) # NOTE(review): the epilogue removed above restored ra from 4(sp) and released 8 bytes; if the prologue still reserves only 8 bytes, 8(sp) and 12(sp) lie outside this frame - confirm. + li t0, 0x2e646e65 # end. + sw t0, 8(sp) + mv a0, s1 + addi a1, sp, 8 + li a2, 4 + call _memcmp -.type _compile_exit, @function -_compile_exit: - # Prologue. - addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 + beqz a0, .Lcompile_entry_point_end + lw a0, 12(sp) + call _compile_statement + j .Lcompile_entry_point_body + +.Lcompile_entry_point_end: la a0, asm_exit li a1, ASM_EXIT_SIZE call _write_out @@ -1857,30 +1744,13 @@ _compile: sw s0, 8(sp) addi s0, sp, 16 - sw zero, 4(sp) # Whether the text section header was already emitted. - call _compile_module_declaration call _compile_import call _compile_constant_section call _compile_variable_section - -.Lcompile_do: - lbu t0, (s1) # t0 = Current character. - beqz t0, .Lcompile_end # Exit the loop on the NUL character. - - call _skip_spaces - call _read_line - lw a1, 4(sp) - call _compile_line - - beqz a0, .Lcompile_do - # Update whether the text section header was already emitted. - lw t0, 4(sp) - or t0, t0, a0 - sw t0, 4(sp) - - j .Lcompile_do -.Lcompile_end: + call _compile_text_section + call _compile_procedure_section + call _compile_entry_point # Epilogue. lw ra, 12(sp) @@ -1888,22 +1758,6 @@ _compile: addi sp, sp, 16 ret -.type _main, @function -_main: - # Prologue. - addi sp, sp, -8 - sw ra, 4(sp) - sw s0, 0(sp) - addi s0, sp, 8 - - li s2, 1 - - # Epilogue. - lw ra, 4(sp) - lw s0, 0(sp) - addi sp, sp, 8 - ret - # Entry point. .type _start, @function _start: @@ -1912,8 +1766,7 @@ _start: li a1, SOURCE_BUFFER_SIZE # Buffer size. call _read_file - mv a0, s1 - call _main + li s2, 1 call _compile # Call exit.
diff --git a/boot/tokenizer.s b/boot/tokenizer.s index 4315f66..67b2602 100644 --- a/boot/tokenizer.s +++ b/boot/tokenizer.s @@ -1,4 +1,10 @@ -.global _tokenize_next, classification, transitions, keywords +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. + +.global _tokenize_next, classification, transitions, keywords, byte_keywords + +.include "boot/definitions.inc" .section .rodata @@ -8,7 +14,7 @@ # # Classification: # -.equ CLASS_INVALID, 0x0 +.equ CLASS_INVALID, 0x00 .equ CLASS_DIGIT, 0x01 .equ CLASS_CHARACTER, 0x02 .equ CLASS_SPACE, 0x03 @@ -25,9 +31,11 @@ .equ CLASS_EOF, 0x0e .equ CLASS_DOT, 0x0f .equ CLASS_MINUS, 0x10 -.equ CLASS_DOUBLE_QUOTE, 0x11 +.equ CLASS_QUOTE, 0x11 +.equ CLASS_GREATER, 0x12 +.equ CLASS_LESS, 0x13 -.equ CLASS_COUNT, 18 +.equ CLASS_COUNT, 20 .type classification, @object .size classification, 128 @@ -66,12 +74,12 @@ classification: .byte CLASS_INVALID # 1F US .byte CLASS_SPACE # 20 Space .byte CLASS_SINGLE # 21 ! - .byte CLASS_DOUBLE_QUOTE # 22 " + .byte CLASS_QUOTE # 22 " .byte 0x00 # 23 # .byte 0x00 # 24 $ .byte CLASS_SINGLE # 25 % .byte CLASS_SINGLE # 26 & - .byte 0x00 # 27 ' + .byte CLASS_QUOTE # 27 ' .byte CLASS_LEFT_PAREN # 28 ( .byte CLASS_RIGHT_PAREN # 29 ) .byte CLASS_ASTERISK # 2A * @@ -92,9 +100,9 @@ classification: .byte CLASS_DIGIT # 39 9 .byte CLASS_COLON # 3A : .byte CLASS_SINGLE # 3B ; - .byte 0x00 # 3C < + .byte CLASS_LESS # 3C < .byte CLASS_EQUALS # 3D = - .byte 0x00 # 3E > + .byte CLASS_GREATER # 3E > .byte 0x00 # 3F ? .byte CLASS_SINGLE # 40 @ .byte CLASS_CHARACTER # 41 A @@ -220,7 +228,10 @@ keywords: .ascii "case" .word 2 .ascii "of" -.size keywords, . - keywords + +.type byte_keywords, @object +byte_keywords: .ascii "&.,:;()[]^=+-*@" +.equ BYTE_KEYWORDS_SIZE, . - byte_keywords .section .data @@ -240,78 +251,66 @@ keywords: # handles each action. 
# .type transitions, @object -.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT +.size transitions, 14 * CLASS_COUNT * 4 # state count * CLASS_COUNT entries * 4 bytes per .word (RISC-V) transitions: # Invalid Digit Alpha Space : = ( ) # * _ Single Hex 0 x NUL . - # - " - .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107 - .word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start - .word 0x010f, 0x0110 + # - " or ' > < + .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff + .word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x0108 + .word 0x0105, 0x010b, 0x0104, 0x0107 # 0x00 Start (a quote enters the String state 0x0b) - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff - .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier - .word 0x05ff, 0x05ff + .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff + .word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier .word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer - .word 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x03 Integer + + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater + + .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff + .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff + .word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05
Minus .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign - .word 0x02ff, 0x02ff + .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x06 Left paren .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals - .word 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren - .word 0x02ff, 0x02ff - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren - .word 0x02ff, 0x02ff - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk - .word 0x02ff, 0x02ff - - .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 - .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment - .word 0x0109, 0x0109 - - .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff - .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment - .word 0x0109, 0x0109 - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token - .word 0x02ff, 0x02ff - - .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero - .word 0x02ff, 0x02ff - - .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal - .word 0x00ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff # classes 0x08-0x13, so the row has CLASS_COUNT entries; 0x07
Less .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot - .word 0x02ff, 0x02ff + .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x08 Dot - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff + .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 + .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 + .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment - .word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string. - .word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 - .word 0x0110, 0x04ff + .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff + .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 + .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment + + .word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b + .word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b + .word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String (stays in 0x0b until the closing quote) + + .word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x0c Zero + + .word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff + .word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x02ff, 0x02ff + .word 0x00ff, 0x02ff, 0x02ff, 0x02ff # 0x0d Hexadecimal .section .text @@ -406,6 +405,57 @@ _classify_identifier: addi sp, sp, 16 ret +# Looks a single-character symbol up in byte_keywords and determines its type. +# +# Parameters: +# a0 - Token character. +# +# Sets a0 to TOKEN_AND plus the character's index in byte_keywords, or to 0 if it is not listed there. +.type _classify_single, @function +_classify_single: + # Prologue.
+ addi sp, sp, -16 + sw ra, 12(sp) + sw s0, 8(sp) + addi s0, sp, 16 + + mv a1, a0 + li a2, BYTE_KEYWORDS_SIZE + la a0, byte_keywords + call _memchr + beqz a0, .Lclassify_single_end # Not in byte_keywords; assumes _memchr returns 0 on a miss - confirm. + la a1, byte_keywords + sub a0, a0, a1 + addi a0, a0, TOKEN_AND # TOKEN_AND is the type of the first byte_keywords character. +.Lclassify_single_end: + # Epilogue. + lw ra, 12(sp) + lw s0, 8(sp) + addi sp, sp, 16 + ret + +# Classifies a symbol containing multiple characters (probably 2). +# +# Parameters: +# a0 - Token length. +# a1 - Token pointer. +# +# Sets a0 to the appropriate token type, or to 0 if the symbol is not recognized. +.type _classify_composite, @function +_classify_composite: + lbu t0, 0(a1) + li t1, ':' + beq t0, t1, .Lclassify_composite_assign + + li a0, 0 # Unknown symbol: do not leak the token length as a token type. + j .Lclassify_composite_end + +.Lclassify_composite_assign: + li a0, TOKEN_ASSIGN + +.Lclassify_composite_end: + ret + # Initializes the classification table. # # Paramaters: @@ -453,12 +503,18 @@ _tokenize_next: li t0, 0x03 # Skip action. beq t1, t0, .Ltokenize_next_skip - li t0, 0x04 # Comment action. + li t0, 0x04 # Delimited string action. beq t1, t0, .Ltokenize_next_comment li t0, 0x05 # Finalize identifier. beq t1, t0, .Ltokenize_next_identifier + li t0, 0x06 # Single character symbol action. + beq t1, t0, .Ltokenize_next_single + + li t0, 0x07 # An action for symbols containing multiple characters.
+ beq t1, t0, .Ltokenize_next_composite + j .Ltokenize_next_reject .Ltokenize_next_reject: @@ -481,24 +537,17 @@ _tokenize_next: .Ltokenize_next_print: /* DEBUG - lw a0, 4(sp) - mv a1, s1 - sub a1, a1, a0 - call _write_error - DEBUG */ + addi a0, a0, 21 + sw a0, 0(sp) + addi a0, sp, 0 + li a1, 1 + call _write_error */ j .Ltokenize_next_end .Ltokenize_next_comment: addi s1, s1, 1 - /* DEBUG - lw a0, 4(sp) - mv a1, s1 - sub a1, a1, a0 - call _write_error - DEBUG */ - j .Ltokenize_next_end .Ltokenize_next_identifier: @@ -512,6 +561,26 @@ _tokenize_next: j .Ltokenize_next_end +.Ltokenize_next_single: + lw a0, 4(sp) + addi s1, a0, 1 + lbu a0, (a0) + call _classify_single + lw a1, 0(sp) + sw a0, (a1) + + j .Ltokenize_next_end + +.Ltokenize_next_composite: + addi s1, s1, 1 + lw a1, 4(sp) + sub a0, s1, a1 + call _classify_composite + lw a1, 0(sp) + sw a0, (a1) + + j .Ltokenize_next_end + .Ltokenize_next_end: mv a0, s1 # Return the advanced text pointer.