Tokenize single character symbols

This commit is contained in:
Eugen Wissner 2025-05-03 23:35:41 +02:00
parent dcfd6b1515
commit 0a0bc4e1f2
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
6 changed files with 291 additions and 335 deletions

View File

@ -1,3 +1,6 @@
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
# frozen_string_literal: true # frozen_string_literal: true
source 'https://rubygems.org' source 'https://rubygems.org'

View File

@ -1,3 +1,6 @@
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
# frozen_string_literal: true # frozen_string_literal: true
require 'open3' require 'open3'

View File

@ -1,3 +1,7 @@
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
.global _is_alpha, _is_digit, _is_alnum, _is_upper, _is_lower .global _is_alpha, _is_digit, _is_alnum, _is_upper, _is_lower
.global _write_out, _read_file, _write_error, _put_char, _printi .global _write_out, _read_file, _write_error, _put_char, _printi
.global _get, _memcmp, _memchr, _memmem, _memcpy .global _get, _memcmp, _memchr, _memmem, _memcpy

View File

@ -1,3 +1,7 @@
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
# The constant should match the index in the keywords array in tokenizer.s. # The constant should match the index in the keywords array in tokenizer.s.
.equ TOKEN_PROGRAM, 1 .equ TOKEN_PROGRAM, 1
@ -26,3 +30,23 @@
.equ TOKEN_DEFER, 24 .equ TOKEN_DEFER, 24
.equ TOKEN_CASE, 25 .equ TOKEN_CASE, 25
.equ TOKEN_OF, 26 .equ TOKEN_OF, 26
# Token types for single character symbols. Each value equals TOKEN_AND plus
# the index of the character in the byte_keywords string defined in the
# tokenizer, so the order of these constants must stay in sync with the order
# of the characters in that string.
.equ TOKEN_AND, 27
.equ TOKEN_DOT, 28
.equ TOKEN_COMMA, 29
.equ TOKEN_COLON, 30
.equ TOKEN_SEMICOLON, 31
.equ TOKEN_LEFT_PAREN, 32
.equ TOKEN_RIGHT_PAREN, 33
.equ TOKEN_LEFT_BRACKET, 34
.equ TOKEN_RIGHT_BRACKET, 35
.equ TOKEN_HAT, 36
.equ TOKEN_EQUALS, 37
.equ TOKEN_PLUS, 38
.equ TOKEN_MINUS, 39
.equ TOKEN_ASTERISK, 40
.equ TOKEN_AT, 41
# Two character symbol; it has no entry in byte_keywords and is assigned by
# the composite symbol classifier instead.
.equ TOKEN_ASSIGN, 42

View File

@ -1,6 +1,10 @@
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
.global _start # Program entry point. .global _start # Program entry point.
# Global variables or registers. # Registers used as global variables:
# s1 - Contains the current position in the source text. # s1 - Contains the current position in the source text.
# s2 - Label counter. # s2 - Label counter.
@ -42,6 +46,10 @@ asm_neg_a0: .ascii "neg a0, a0\n"
.equ ASM_NEG_A0_SIZE, . - asm_neg_a0 .equ ASM_NEG_A0_SIZE, . - asm_neg_a0
asm_type: .ascii ".type " asm_type: .ascii ".type "
.equ ASM_TYPE_SIZE, . - asm_type .equ ASM_TYPE_SIZE, . - asm_type
asm_type_function: .ascii ", @function\n"
.equ ASM_TYPE_FUNCTION_SIZE, . - asm_type_function
asm_type_object: .ascii ", @object\n"
.equ ASM_TYPE_OBJECT_SIZE, . - asm_type_object
asm_restore_parameters: asm_restore_parameters:
.ascii "lw a0, 60(sp)\nlw a1, 56(sp)\nlw a2, 52(sp)\nlw a3, 48(sp)\nlw a4, 44(sp)\nlw a5, 40(sp)\n" .ascii "lw a0, 60(sp)\nlw a1, 56(sp)\nlw a2, 52(sp)\nlw a3, 48(sp)\nlw a4, 44(sp)\nlw a5, 40(sp)\n"
.equ ASM_RESTORE_PARAMETERS_SIZE, . - asm_restore_parameters .equ ASM_RESTORE_PARAMETERS_SIZE, . - asm_restore_parameters
@ -77,14 +85,6 @@ _compile_import:
call _tokenize_next call _tokenize_next
mv s1, a0 mv s1, a0
/* DEBUG
lw t0, 0(sp)
addi t0, t0, '0'
sw t0, 4(sp)
addi a0, sp, 4
li a1, 1
call _write_error*/
j .Lcompile_import_loop j .Lcompile_import_loop
.Lcompile_import_end: .Lcompile_import_end:
@ -104,63 +104,35 @@ _build_binary_expression:
li a0, 0 li a0, 0
call _build_expression call _build_expression
call _skip_spaces call _skip_spaces
call _read_token
sw a0, 20(sp)
li t0, '&'
sw t0, 16(sp)
mv a0, s1 mv a0, s1
lw a1, 20(sp) addi a1, sp, 16
addi a2, sp, 16 call _tokenize_next
call _token_compare lw t0, 16(sp)
beqz a0, .L_build_binary_expression_and
li t0, 0x726f # or li t1, TOKEN_AND
sw t0, 16(sp) beq t0, t1, .L_build_binary_expression_and
mv a0, s1
lw a1, 20(sp)
addi a2, sp, 16
call _token_compare
beqz a0, .L_build_binary_expression_or
li t0, '=' li t1, TOKEN_OR
sw t0, 16(sp) beq t0, t1, .L_build_binary_expression_or
mv a0, s1
lw a1, 20(sp)
addi a2, sp, 16
call _token_compare
beqz a0, .L_build_binary_expression_equal
li t0, '+' li t1, TOKEN_PLUS
sw t0, 16(sp) beq t0, t1, .L_build_binary_expression_plus
mv a0, s1
lw a1, 20(sp)
addi a2, sp, 16
call _token_compare
beqz a0, .L_build_binary_expression_plus
li t0, '-' li t1, TOKEN_EQUALS
sw t0, 16(sp) beq t0, t1, .L_build_binary_expression_equal
mv a0, s1
lw a1, 20(sp)
addi a2, sp, 16
call _token_compare
beqz a0, .L_build_binary_expression_minus
li t0, '*' li t1, TOKEN_ASTERISK
sw t0, 16(sp) beq t0, t1, .L_build_binary_expression_product
mv a0, s1
lw a1, 20(sp) li t1, TOKEN_MINUS
addi a2, sp, 16 beq t0, t1, .L_build_binary_expression_minus
call _token_compare
beqz a0, .L_build_binary_expression_product
j .Lbuild_binary_expression_end j .Lbuild_binary_expression_end
.L_build_binary_expression_equal: .L_build_binary_expression_equal:
addi s1, s1, 1 # Skip =. mv s1, a0 # Skip =.
li a0, 1 li a0, 1
call _build_expression call _build_expression
la a0, asm_sub_a0_a1 la a0, asm_sub_a0_a1
@ -174,7 +146,12 @@ _build_binary_expression:
j .Lbuild_binary_expression_end j .Lbuild_binary_expression_end
.L_build_binary_expression_and: .L_build_binary_expression_and:
addi s1, s1, 1 # Skip &. /* DEBUG
addi a0, s1, 0
li a1, 4
call _write_error */
mv s1, a0 # Skip &.
li a0, 1 li a0, 1
call _build_expression call _build_expression
la a0, asm_and_a0_a1 la a0, asm_and_a0_a1
@ -184,7 +161,7 @@ _build_binary_expression:
j .Lbuild_binary_expression_end j .Lbuild_binary_expression_end
.L_build_binary_expression_or: .L_build_binary_expression_or:
addi s1, s1, 2 # Skip or. mv s1, a0 # Skip or.
li a0, 1 li a0, 1
call _build_expression call _build_expression
la a0, asm_or_a0_a1 la a0, asm_or_a0_a1
@ -194,7 +171,7 @@ _build_binary_expression:
j .Lbuild_binary_expression_end j .Lbuild_binary_expression_end
.L_build_binary_expression_plus: .L_build_binary_expression_plus:
addi s1, s1, 1 # Skip +. mv s1, a0 # Skip +.
li a0, 1 li a0, 1
call _build_expression call _build_expression
la a0, asm_add_a0_a1 la a0, asm_add_a0_a1
@ -204,7 +181,7 @@ _build_binary_expression:
j .Lbuild_binary_expression_end j .Lbuild_binary_expression_end
.L_build_binary_expression_minus: .L_build_binary_expression_minus:
addi s1, s1, 1 # Skip -. mv s1, a0 # Skip -.
li a0, 1 li a0, 1
call _build_expression call _build_expression
la a0, asm_sub_a0_a1 la a0, asm_sub_a0_a1
@ -214,7 +191,7 @@ _build_binary_expression:
j .Lbuild_binary_expression_end j .Lbuild_binary_expression_end
.L_build_binary_expression_product: .L_build_binary_expression_product:
addi s1, s1, 1 # Skip *. mv s1, a0 # Skip *.
li a0, 1 li a0, 1
call _build_expression call _build_expression
la a0, asm_mul_a0_a1 la a0, asm_mul_a0_a1
@ -937,29 +914,31 @@ _skip_comment:
# Parameters: # Parameters:
# a0 - Line length. # a0 - Line length.
.type _compile_assembly, @function .type _compile_procedure_section, @function
_compile_assembly: _compile_procedure_section:
# Prologue. # Prologue.
addi sp, sp, -16 addi sp, sp, -16
sw ra, 12(sp) sw ra, 12(sp)
sw s0, 8(sp) sw s0, 8(sp)
addi s0, sp, 16 addi s0, sp, 16
sw a0, 4(sp) # a0 - Line length. .Lcompile_procedure_section_loop:
call _skip_spaces
call _skip_comment
call _skip_spaces
# Write the source to the standard output.
mv a0, s1 mv a0, s1
lw a1, 4(sp) addi a1, sp, 0
call _write_out call _tokenize_next
li t0, TOKEN_PROC
lw t1, 0(sp)
bne t0, t1, .Lcompile_procedure_section_end
lw t0, 4(sp) call _compile_procedure
add s1, s1, t0
li a0, '\n' j .Lcompile_procedure_section_loop
call _put_char
addi s1, s1, 1 # Skip the new line.
.Lcompile_procedure_section_end:
# Epilogue. # Epilogue.
lw ra, 12(sp) lw ra, 12(sp)
lw s0, 8(sp) lw s0, 8(sp)
@ -1038,15 +1017,19 @@ _compile_constant:
sw s0, 8(sp) sw s0, 8(sp)
addi s0, sp, 16 addi s0, sp, 16
call _read_token mv a0, s1
addi a1, sp, 0
call _tokenize_next
mv a1, a0 # The identifier length from _read_token should be in a1. sub a1, a0, s1 # The identifier end from _tokenize_next should be in a0.
mv a0, s1 # Save the identifier pointer before advancing it. mv a0, s1
add s1, s1, a1 add s1, s1, a1 # Save the identifier pointer before advancing it.
call _write_out call _write_out
call _skip_spaces mv a0, s1
addi s1, s1, 2 # Skip the assignment sign. addi a1, sp, 0
call _tokenize_next
mv s1, a0 # Skip the assignment sign.
# : .long # : .long
li t0, 0x20676e6f # ong_ li t0, 0x20676e6f # ong_
@ -1154,42 +1137,10 @@ _compile_variable:
lw a1, 24(sp) lw a1, 24(sp)
call _write_out call _write_out
li t0, 0x0a74 # t\n la a0, asm_type_object
sw t0, 12(sp) li a1, ASM_TYPE_OBJECT_SIZE
li t0, 0x63656a62 # bjec
sw t0, 8(sp)
li t0, 0x6f40202c # , @o
sw t0, 4(sp)
addi a0, sp, 4
li a1, 10
call _write_out call _write_out
# .size identifier, size
li t0, 0x2065 # e_
sw t0, 12(sp)
li t0, 0x7a69732e # .siz
sw t0, 8(sp)
addi a0, sp, 8
li a1, 6
call _write_out
lw a0, 28(sp)
lw a1, 24(sp)
call _write_out
li t0, 0x202c # ,_
sw t0, 12(sp)
addi a0, sp, 12
li a1, 2
call _write_out
lw a0, 20(sp)
lw a1, 16(sp)
call _write_out
li a0, '\n'
call _put_char
# identifier: .zero size # identifier: .zero size
lw a0, 28(sp) lw a0, 28(sp)
lw a1, 24(sp) lw a1, 24(sp)
@ -1239,14 +1190,8 @@ _compile_procedure:
lw a1, 16(sp) lw a1, 16(sp)
call _write_out call _write_out
li t0, 0x0a6e6f69 # ion\n la a0, asm_type_function
sw t0, 12(sp) li a1, ASM_TYPE_FUNCTION_SIZE
li t0, 0x74636e75 # unct
sw t0, 8(sp)
li t0, 0x6640202c # , @f
sw t0, 4(sp)
addi a0, sp, 4
li a1, 12
call _write_out call _write_out
lw a0, 20(sp) lw a0, 20(sp)
@ -1356,7 +1301,7 @@ _compile_procedure:
beqz a0, .Lcompile_procedure_end beqz a0, .Lcompile_procedure_end
lw a0, 12(sp) lw a0, 12(sp)
call _compile_line call _compile_statement
j .Lcompile_procedure_body j .Lcompile_procedure_body
.Lcompile_procedure_end: .Lcompile_procedure_end:
@ -1577,7 +1522,7 @@ _compile_if:
call _read_line call _read_line
li a1, 1 li a1, 1
call _compile_line call _compile_statement
j .Lcompile_if_loop j .Lcompile_if_loop
@ -1614,8 +1559,8 @@ _compile_if:
# #
# Returns 1 in a0 if the parsed line contained a text section element such a # Returns 1 in a0 if the parsed line contained a text section element such a
# procedure or the program entry point. Otherwise sets a0 to 0. # procedure or the program entry point. Otherwise sets a0 to 0.
.type _compile_line, @function .type _compile_statement, @function
_compile_line: _compile_statement:
# Prologue. # Prologue.
addi sp, sp, -32 addi sp, sp, -32
sw ra, 28(sp) sw ra, 28(sp)
@ -1626,45 +1571,17 @@ _compile_line:
sw a0, 20(sp) sw a0, 20(sp)
sw a1, 16(sp) sw a1, 16(sp)
beqz a0, .Lcompile_line_empty # Skip an empty line. call _skip_comment
lbu t0, (s1)
li t1, '('
beq t0, t1, .Lcompile_line_comment
li t0, 0x636f7270 # proc
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_procedure
li t0, 0x69676562 # begi
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_begin
li t0, 0x2e646e65 # end.
sw t0, 12(sp)
mv a0, s1
addi a1, sp, 12
li a2, 4
call _memcmp
beqz a0, .Lcompile_line_exit
mv a0, s1 mv a0, s1
lw a1, 20(sp) lw a1, 20(sp)
call _is_local_identifier call _is_local_identifier
bnez a0, .Lcompile_line_identifier bnez a0, .Lcompile_statement_identifier
mv a0, s1 mv a0, s1
li a1, 2 li a1, 2
call _is_register_identifier call _is_register_identifier
bnez a0, .Lcompile_line_identifier bnez a0, .Lcompile_statement_identifier
li t0, 0x6f746f67 # goto li t0, 0x6f746f67 # goto
sw t0, 12(sp) sw t0, 12(sp)
@ -1672,7 +1589,7 @@ _compile_line:
addi a1, sp, 12 addi a1, sp, 12
li a2, 4 li a2, 4
call _memcmp call _memcmp
beqz a0, .Lcompile_line_goto beqz a0, .Lcompile_statement_goto
li t0, 0x75746572 # retu li t0, 0x75746572 # retu
sw t0, 12(sp) sw t0, 12(sp)
@ -1680,7 +1597,7 @@ _compile_line:
addi a1, sp, 12 addi a1, sp, 12
li a2, 4 li a2, 4
call _memcmp call _memcmp
beqz a0, .Lcompile_line_return beqz a0, .Lcompile_statement_return
li t0, 0x6669 # if li t0, 0x6669 # if
sw t0, 12(sp) sw t0, 12(sp)
@ -1688,77 +1605,42 @@ _compile_line:
addi a1, sp, 12 addi a1, sp, 12
li a2, 2 li a2, 2
call _memcmp call _memcmp
beqz a0, .Lcompile_line_if beqz a0, .Lcompile_statement_if
lbu t0, (s1) lbu t0, (s1)
li t1, '.' li t1, '.'
beq t0, t1, .Lcompile_line_label beq t0, t1, .Lcompile_statement_label
li t1, '_' li t1, '_'
beq t0, t1, .Lcompile_line_identifier beq t0, t1, .Lcompile_statement_identifier
j .Lcompile_line_unchanged # Else. j .Lcompile_statement_empty # Else.
.Lcompile_line_if: .Lcompile_statement_if:
call _compile_if call _compile_if
j .Lcompile_line_section j .Lcompile_statement_end
.Lcompile_line_label: .Lcompile_statement_label:
lw a0, 20(sp) lw a0, 20(sp)
call _compile_label call _compile_label
j .Lcompile_line_section j .Lcompile_statement_end
.Lcompile_line_return: .Lcompile_statement_return:
call _compile_return call _compile_return
j .Lcompile_line_section j .Lcompile_statement_end
.Lcompile_line_goto: .Lcompile_statement_goto:
call _compile_goto call _compile_goto
j .Lcompile_line_section j .Lcompile_statement_end
.Lcompile_line_identifier: .Lcompile_statement_identifier:
call _compile_identifier call _compile_identifier
j .Lcompile_line_section j .Lcompile_statement_end
.Lcompile_line_exit: .Lcompile_statement_empty:
call _compile_exit
j .Lcompile_line_section
.Lcompile_line_begin:
lw a1, 16(sp)
bnez a1, .Lcompile_line_compile_entry
call _compile_text_section
.Lcompile_line_compile_entry:
call _compile_entry_point
li a0, 1
j .Lcompile_line_end
.Lcompile_line_procedure:
lw a1, 16(sp)
bnez a1, .Lcompile_line_compile_procedure
call _compile_text_section
.Lcompile_line_compile_procedure:
call _compile_procedure
li a0, 1
j .Lcompile_line_end
.Lcompile_line_comment:
lw a0, 20(sp)
call _skip_comment
j .Lcompile_line_section
.Lcompile_line_empty:
addi s1, s1, 1 addi s1, s1, 1
j .Lcompile_line_section j .Lcompile_statement_end
.Lcompile_line_unchanged: .Lcompile_statement_end:
lw a0, 20(sp)
call _compile_assembly
j .Lcompile_line_section
.Lcompile_line_section:
mv a0, zero
.Lcompile_line_end:
sw a0, 12(sp) sw a0, 12(sp)
call _skip_spaces call _skip_spaces
call _skip_comment call _skip_comment
@ -1804,20 +1686,25 @@ _compile_entry_point:
addi s1, s1, 6 # Skip begin\n. addi s1, s1, 6 # Skip begin\n.
# Epilogue. # Generate the body of the procedure.
lw ra, 4(sp) .Lcompile_entry_point_body:
lw s0, 0(sp) call _skip_spaces
addi sp, sp, 8 call _read_line
ret sw a0, 12(sp)
li t0, 0x2e646e65 # end
sw t0, 8(sp)
mv a0, s1
addi a1, sp, 8
li a2, 4
call _memcmp
.type _compile_exit, @function beqz a0, .Lcompile_entry_point_end
_compile_exit:
# Prologue.
addi sp, sp, -8
sw ra, 4(sp)
sw s0, 0(sp)
addi s0, sp, 8
lw a0, 12(sp)
call _compile_statement
j .Lcompile_entry_point_body
.Lcompile_entry_point_end:
la a0, asm_exit la a0, asm_exit
li a1, ASM_EXIT_SIZE li a1, ASM_EXIT_SIZE
call _write_out call _write_out
@ -1857,30 +1744,13 @@ _compile:
sw s0, 8(sp) sw s0, 8(sp)
addi s0, sp, 16 addi s0, sp, 16
sw zero, 4(sp) # Whether the text section header was already emitted.
call _compile_module_declaration call _compile_module_declaration
call _compile_import call _compile_import
call _compile_constant_section call _compile_constant_section
call _compile_variable_section call _compile_variable_section
call _compile_text_section
.Lcompile_do: call _compile_procedure_section
lbu t0, (s1) # t0 = Current character. call _compile_entry_point
beqz t0, .Lcompile_end # Exit the loop on the NUL character.
call _skip_spaces
call _read_line
lw a1, 4(sp)
call _compile_line
beqz a0, .Lcompile_do
# Update whether the text section header was already emitted.
lw t0, 4(sp)
or t0, t0, a0
sw t0, 4(sp)
j .Lcompile_do
.Lcompile_end:
# Epilogue. # Epilogue.
lw ra, 12(sp) lw ra, 12(sp)
@ -1888,22 +1758,6 @@ _compile:
addi sp, sp, 16 addi sp, sp, 16
ret ret
.type _main, @function
_main:
# Prologue.
addi sp, sp, -8
sw ra, 4(sp)
sw s0, 0(sp)
addi s0, sp, 8
li s2, 1
# Epilogue.
lw ra, 4(sp)
lw s0, 0(sp)
addi sp, sp, 8
ret
# Entry point. # Entry point.
.type _start, @function .type _start, @function
_start: _start:
@ -1912,8 +1766,7 @@ _start:
li a1, SOURCE_BUFFER_SIZE # Buffer size. li a1, SOURCE_BUFFER_SIZE # Buffer size.
call _read_file call _read_file
mv a0, s1 li s2, 1
call _main
call _compile call _compile
# Call exit. # Call exit.

View File

@ -1,4 +1,10 @@
.global _tokenize_next, classification, transitions, keywords # This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
.global _tokenize_next, classification, transitions, keywords, byte_keywords
.include "boot/definitions.inc"
.section .rodata .section .rodata
@ -8,7 +14,7 @@
# #
# Classification: # Classification:
# #
.equ CLASS_INVALID, 0x0 .equ CLASS_INVALID, 0x00
.equ CLASS_DIGIT, 0x01 .equ CLASS_DIGIT, 0x01
.equ CLASS_CHARACTER, 0x02 .equ CLASS_CHARACTER, 0x02
.equ CLASS_SPACE, 0x03 .equ CLASS_SPACE, 0x03
@ -25,9 +31,11 @@
.equ CLASS_EOF, 0x0e .equ CLASS_EOF, 0x0e
.equ CLASS_DOT, 0x0f .equ CLASS_DOT, 0x0f
.equ CLASS_MINUS, 0x10 .equ CLASS_MINUS, 0x10
.equ CLASS_DOUBLE_QUOTE, 0x11 .equ CLASS_QUOTE, 0x11
.equ CLASS_GREATER, 0x12
.equ CLASS_LESS, 0x13
.equ CLASS_COUNT, 18 .equ CLASS_COUNT, 20
.type classification, @object .type classification, @object
.size classification, 128 .size classification, 128
@ -66,12 +74,12 @@ classification:
.byte CLASS_INVALID # 1F US .byte CLASS_INVALID # 1F US
.byte CLASS_SPACE # 20 Space .byte CLASS_SPACE # 20 Space
.byte CLASS_SINGLE # 21 ! .byte CLASS_SINGLE # 21 !
.byte CLASS_DOUBLE_QUOTE # 22 " .byte CLASS_QUOTE # 22 "
.byte 0x00 # 23 # .byte 0x00 # 23 #
.byte 0x00 # 24 $ .byte 0x00 # 24 $
.byte CLASS_SINGLE # 25 % .byte CLASS_SINGLE # 25 %
.byte CLASS_SINGLE # 26 & .byte CLASS_SINGLE # 26 &
.byte 0x00 # 27 ' .byte CLASS_QUOTE # 27 '
.byte CLASS_LEFT_PAREN # 28 ( .byte CLASS_LEFT_PAREN # 28 (
.byte CLASS_RIGHT_PAREN # 29 ) .byte CLASS_RIGHT_PAREN # 29 )
.byte CLASS_ASTERISK # 2A * .byte CLASS_ASTERISK # 2A *
@ -92,9 +100,9 @@ classification:
.byte CLASS_DIGIT # 39 9 .byte CLASS_DIGIT # 39 9
.byte CLASS_COLON # 3A : .byte CLASS_COLON # 3A :
.byte CLASS_SINGLE # 3B ; .byte CLASS_SINGLE # 3B ;
.byte 0x00 # 3C < .byte CLASS_LESS # 3C <
.byte CLASS_EQUALS # 3D = .byte CLASS_EQUALS # 3D =
.byte 0x00 # 3E > .byte CLASS_GREATER # 3E >
.byte 0x00 # 3F ? .byte 0x00 # 3F ?
.byte CLASS_SINGLE # 40 @ .byte CLASS_SINGLE # 40 @
.byte CLASS_CHARACTER # 41 A .byte CLASS_CHARACTER # 41 A
@ -220,7 +228,10 @@ keywords:
.ascii "case" .ascii "case"
.word 2 .word 2
.ascii "of" .ascii "of"
.size keywords, . - keywords
# Single character symbols. The position of a character in this string plus
# TOKEN_AND gives its token type, so the order has to match the TOKEN_AND ..
# TOKEN_AT constants.
.type byte_keywords, @object
byte_keywords: .ascii "&.,:;()[]^=+-*@"
.equ BYTE_KEYWORDS_SIZE, . - byte_keywords
.size byte_keywords, BYTE_KEYWORDS_SIZE
.section .data .section .data
@ -240,78 +251,66 @@ keywords:
# handles each action. # handles each action.
# #
.type transitions, @object .type transitions, @object
.size transitions, 17 * CLASS_COUNT # state count * CLASS_COUNT .size transitions, 14 * CLASS_COUNT # state count * CLASS_COUNT
transitions: transitions:
# Invalid Digit Alpha Space : = ( ) # Invalid Digit Alpha Space : = ( )
# * _ Single Hex 0 x NUL . # * _ Single Hex 0 x NUL .
# - " # - " or ' > <
.word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x0105, 0x0106, 0x0107 .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff
.word 0x0108, 0x0102, 0x010b, 0x0102, 0x010c, 0x0102, 0x00ff, 0x010e # 00 Start .word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x0108
.word 0x010f, 0x0110 .word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x0104, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 01 Colon .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon
.word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
.word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff # 02 Identifier .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff
.word 0x05ff, 0x05ff .word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier
.word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x0103, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff # 03 Integer .word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x0103, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x03 Integer
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater
.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
.word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
.word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 04 Assign .word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x06 Left paren
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 05 Eauals
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x0109, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 06 Left paren .word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 07 Right paren
.word 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 08 Asterisk
.word 0x02ff, 0x02ff
.word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 09 Comment
.word 0x0109, 0x0109
.word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
.word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 # 0a Closing comment
.word 0x0109, 0x0109
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0b Single character token
.word 0x02ff, 0x02ff
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff # 0c Zero
.word 0x02ff, 0x02ff
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff # 0d Hexadecimal
.word 0x00ff, 0x02ff
.word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x02ff, 0x0102, 0x0102, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff # 0e Dot .word 0x02ff, 0x0102, 0x02ff, 0x0102, 0x0102, 0x0102, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x08 Dot
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0f Minus .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
.word 0x02ff, 0x02ff .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment
.word 0x00ff, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 # 10 Starting string. .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
.word 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110, 0x0110 .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
.word 0x0110, 0x04ff .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment
.word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
.word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
.word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String
.word 0x02ff, 0x00ff, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x00ff, 0x00ff, 0x010d, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x0c Zero
.word 0x02ff, 0x010d, 0x00ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
.word 0x02ff, 0x02ff, 0x02ff, 0x010d, 0x010d, 0x00ff, 0x2ff, 0x02ff
.word 0x00ff, 0x02ff, 0x02ff, 0x02ff # 0x0d Hexadecimal
.section .text .section .text
@ -406,6 +405,57 @@ _classify_identifier:
addi sp, sp, 16 addi sp, sp, 16
ret ret
# Takes a single character symbol and determines its token type.
#
# The character is searched for in byte_keywords. The token type is the index
# of the character in that string plus TOKEN_AND, the first single character
# token constant.
#
# Parameters:
# a0 - Token character.
#
# Sets a0 to the appropriate token type. If _memchr reports no match by
# returning 0, a0 stays 0 instead of being turned into a meaningless value.
# NOTE(review): assumes _memchr returns 0 for a missing byte and that 0 is not
# a valid token type - confirm against _memchr and the token constants.
.type _classify_single, @function
_classify_single:
	# Prologue.
	addi sp, sp, -16
	sw ra, 12(sp)
	sw s0, 8(sp)
	addi s0, sp, 16

	mv a1, a0 # The character to look for.
	li a2, BYTE_KEYWORDS_SIZE
	la a0, byte_keywords
	call _memchr
	beqz a0, .Lclassify_single_end # Characters such as ! or % are not listed.

	la a1, byte_keywords
	sub a0, a0, a1 # Index of the character in byte_keywords.
	addi a0, a0, TOKEN_AND

.Lclassify_single_end:
	# Epilogue.
	lw ra, 12(sp)
	lw s0, 8(sp)
	addi sp, sp, 16
	ret
# Classifies a symbol consisting of multiple characters (probably 2).
#
# Only the first character is inspected: a leading colon yields TOKEN_ASSIGN.
#
# Parameters:
# a0 - Token length.
# a1 - Token pointer.
#
# Sets a0 to the appropriate token type.
# NOTE(review): for any other leading character a0 is returned unchanged, that
# is, it still holds the token length rather than a token type.
.type _classify_composite, @function
_classify_composite:
	lbu t1, 0(a1) # First character of the symbol.
	li t0, ':'
	bne t1, t0, .Lclassify_composite_end

	li a0, TOKEN_ASSIGN

.Lclassify_composite_end:
	ret
# Initializes the classification table. # Initializes the classification table.
# #
# Paramaters: # Paramaters:
@ -453,12 +503,18 @@ _tokenize_next:
li t0, 0x03 # Skip action. li t0, 0x03 # Skip action.
beq t1, t0, .Ltokenize_next_skip beq t1, t0, .Ltokenize_next_skip
li t0, 0x04 # Comment action. li t0, 0x04 # Delimited string action.
beq t1, t0, .Ltokenize_next_comment beq t1, t0, .Ltokenize_next_comment
li t0, 0x05 # Finalize identifier. li t0, 0x05 # Finalize identifier.
beq t1, t0, .Ltokenize_next_identifier beq t1, t0, .Ltokenize_next_identifier
li t0, 0x06 # Single character symbol action.
beq t1, t0, .Ltokenize_next_single
li t0, 0x07 # An action for symbols containing multiple characters.
beq t1, t0, .Ltokenize_next_composite
j .Ltokenize_next_reject j .Ltokenize_next_reject
.Ltokenize_next_reject: .Ltokenize_next_reject:
@ -481,24 +537,17 @@ _tokenize_next:
.Ltokenize_next_print: .Ltokenize_next_print:
/* DEBUG /* DEBUG
lw a0, 4(sp) addi a0, a0, 21
mv a1, s1 sw a0, 0(sp)
sub a1, a1, a0 addi a0, sp, 0
call _write_error li a1, 1
DEBUG */ call _write_error */
j .Ltokenize_next_end j .Ltokenize_next_end
.Ltokenize_next_comment: .Ltokenize_next_comment:
addi s1, s1, 1 addi s1, s1, 1
/* DEBUG
lw a0, 4(sp)
mv a1, s1
sub a1, a1, a0
call _write_error
DEBUG */
j .Ltokenize_next_end j .Ltokenize_next_end
.Ltokenize_next_identifier: .Ltokenize_next_identifier:
@ -512,6 +561,26 @@ _tokenize_next:
j .Ltokenize_next_end j .Ltokenize_next_end
.Ltokenize_next_single:
lw a0, 4(sp)
addi s1, a0, 1
lbu a0, (a0)
call _classify_single
lw a1, 0(sp)
sw a0, (a1)
j .Ltokenize_next_end
.Ltokenize_next_composite:
addi s1, s1, 1
lw a1, 4(sp)
sub a0, s1, a1
call _classify_composite
lw a1, 0(sp)
sw a0, (a1)
j .Ltokenize_next_end
.Ltokenize_next_end: .Ltokenize_next_end:
mv a0, s1 # Return the advanced text pointer. mv a0, s1 # Return the advanced text pointer.