summaryrefslogtreecommitdiff
path: root/boot/tokenizer.s
diff options
context:
space:
mode:
authorEugen Wissner <belka@caraus.de>2025-08-28 22:45:42 +0200
committerEugen Wissner <belka@caraus.de>2025-08-30 01:29:00 +0200
commit627975775c941130975ce0f9dbef08c723e69794 (patch)
tree0f7f4c2c6d83420262d89ff720e6435a67410c9a /boot/tokenizer.s
parente614d43ea9af078301d538fcddb19e83eed7e879 (diff)
downloadelna-627975775c941130975ce0f9dbef08c723e69794.tar.gz
Start over
Diffstat (limited to 'boot/tokenizer.s')
-rw-r--r--boot/tokenizer.s616
1 files changed, 0 insertions, 616 deletions
diff --git a/boot/tokenizer.s b/boot/tokenizer.s
deleted file mode 100644
index 2c7f2a3..0000000
--- a/boot/tokenizer.s
+++ /dev/null
@@ -1,616 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public License,
-# v. 2.0. If a copy of the MPL was not distributed with this file, You can
-# obtain one at https://mozilla.org/MPL/2.0/.
-
-.global lex_next, classification, transitions, keywords, byte_keywords
-
-.include "boot/definitions.inc"
-
-.section .rodata
-
-#
-# Classification table assigns each possible character to a group (class). All
-# characters of the same group a handled equivalently.
-#
-# Classification:
-#
-.equ CLASS_INVALID, 0x00
-.equ CLASS_DIGIT, 0x01
-.equ CLASS_CHARACTER, 0x02
-.equ CLASS_SPACE, 0x03
-.equ CLASS_COLON, 0x04
-.equ CLASS_EQUALS, 0x05
-.equ CLASS_LEFT_PAREN, 0x06
-.equ CLASS_RIGHT_PAREN, 0x07
-.equ CLASS_ASTERISK, 0x08
-.equ CLASS_UNDERSCORE, 0x09
-.equ CLASS_SINGLE, 0x0a
-.equ CLASS_HEX, 0x0b
-.equ CLASS_ZERO, 0x0c
-.equ CLASS_X, 0x0d
-.equ CLASS_EOF, 0x0e
-.equ CLASS_DOT, 0x0f
-.equ CLASS_MINUS, 0x10
-.equ CLASS_QUOTE, 0x11
-.equ CLASS_GREATER, 0x12
-.equ CLASS_LESS, 0x13
-
-.equ CLASS_COUNT, 20
-
-.type classification, @object
-classification:
- .byte CLASS_EOF # 00 NUL
- .byte CLASS_INVALID # 01 SOH
- .byte CLASS_INVALID # 02 STX
- .byte CLASS_INVALID # 03 ETX
- .byte CLASS_INVALID # 04 EOT
- .byte CLASS_INVALID # 05 ENQ
- .byte CLASS_INVALID # 06 ACK
- .byte CLASS_INVALID # 07 BEL
- .byte CLASS_INVALID # 08 BS
- .byte CLASS_SPACE # 09 HT
- .byte CLASS_SPACE # 0A LF
- .byte CLASS_INVALID # 0B VT
- .byte CLASS_INVALID # 0C FF
- .byte CLASS_SPACE # 0D CR
- .byte CLASS_INVALID # 0E SO
- .byte CLASS_INVALID # 0F SI
- .byte CLASS_INVALID # 10 DLE
- .byte CLASS_INVALID # 11 DC1
- .byte CLASS_INVALID # 12 DC2
- .byte CLASS_INVALID # 13 DC3
- .byte CLASS_INVALID # 14 DC4
- .byte CLASS_INVALID # 15 NAK
- .byte CLASS_INVALID # 16 SYN
- .byte CLASS_INVALID # 17 ETB
- .byte CLASS_INVALID # 18 CAN
- .byte CLASS_INVALID # 19 EM
- .byte CLASS_INVALID # 1A SUB
- .byte CLASS_INVALID # 1B ESC
- .byte CLASS_INVALID # 1C FS
- .byte CLASS_INVALID # 1D GS
- .byte CLASS_INVALID # 1E RS
- .byte CLASS_INVALID # 1F US
- .byte CLASS_SPACE # 20 Space
- .byte CLASS_SINGLE # 21 !
- .byte CLASS_QUOTE # 22 "
- .byte 0x00 # 23 #
- .byte 0x00 # 24 $
- .byte CLASS_SINGLE # 25 %
- .byte CLASS_SINGLE # 26 &
- .byte CLASS_QUOTE # 27 '
- .byte CLASS_LEFT_PAREN # 28 (
- .byte CLASS_RIGHT_PAREN # 29 )
- .byte CLASS_ASTERISK # 2A *
- .byte CLASS_SINGLE # 2B +
- .byte CLASS_SINGLE # 2C ,
- .byte CLASS_MINUS # 2D -
- .byte CLASS_DOT # 2E .
- .byte CLASS_SINGLE # 2F /
- .byte CLASS_ZERO # 30 0
- .byte CLASS_DIGIT # 31 1
- .byte CLASS_DIGIT # 32 2
- .byte CLASS_DIGIT # 33 3
- .byte CLASS_DIGIT # 34 4
- .byte CLASS_DIGIT # 35 5
- .byte CLASS_DIGIT # 36 6
- .byte CLASS_DIGIT # 37 7
- .byte CLASS_DIGIT # 38 8
- .byte CLASS_DIGIT # 39 9
- .byte CLASS_COLON # 3A :
- .byte CLASS_SINGLE # 3B ;
- .byte CLASS_LESS # 3C <
- .byte CLASS_EQUALS # 3D =
- .byte CLASS_GREATER # 3E >
- .byte 0x00 # 3F ?
- .byte CLASS_SINGLE # 40 @
- .byte CLASS_CHARACTER # 41 A
- .byte CLASS_CHARACTER # 42 B
- .byte CLASS_CHARACTER # 43 C
- .byte CLASS_CHARACTER # 44 D
- .byte CLASS_CHARACTER # 45 E
- .byte CLASS_CHARACTER # 46 F
- .byte CLASS_CHARACTER # 47 G
- .byte CLASS_CHARACTER # 48 H
- .byte CLASS_CHARACTER # 49 I
- .byte CLASS_CHARACTER # 4A J
- .byte CLASS_CHARACTER # 4B K
- .byte CLASS_CHARACTER # 4C L
- .byte CLASS_CHARACTER # 4D M
- .byte CLASS_CHARACTER # 4E N
- .byte CLASS_CHARACTER # 4F O
- .byte CLASS_CHARACTER # 50 P
- .byte CLASS_CHARACTER # 51 Q
- .byte CLASS_CHARACTER # 52 R
- .byte CLASS_CHARACTER # 53 S
- .byte CLASS_CHARACTER # 54 T
- .byte CLASS_CHARACTER # 55 U
- .byte CLASS_CHARACTER # 56 V
- .byte CLASS_CHARACTER # 57 W
- .byte CLASS_CHARACTER # 58 X
- .byte CLASS_CHARACTER # 59 Y
- .byte CLASS_CHARACTER # 5A Z
- .byte CLASS_SINGLE # 5B [
- .byte 0x00 # 5C \
- .byte CLASS_SINGLE # 5D ]
- .byte CLASS_SINGLE # 5E ^
- .byte CLASS_UNDERSCORE # 5F _
- .byte 0x00 # 60 `
- .byte CLASS_HEX # 61 a
- .byte CLASS_HEX # 62 b
- .byte CLASS_HEX # 63 c
- .byte CLASS_HEX # 64 d
- .byte CLASS_HEX # 65 e
- .byte CLASS_HEX # 66 f
- .byte CLASS_CHARACTER # 67 g
- .byte CLASS_CHARACTER # 68 h
- .byte CLASS_CHARACTER # 69 i
- .byte CLASS_CHARACTER # 6A j
- .byte CLASS_CHARACTER # 6B k
- .byte CLASS_CHARACTER # 6C l
- .byte CLASS_CHARACTER # 6D m
- .byte CLASS_CHARACTER # 6E n
- .byte CLASS_CHARACTER # 6F o
- .byte CLASS_CHARACTER # 70 p
- .byte CLASS_CHARACTER # 71 q
- .byte CLASS_CHARACTER # 72 r
- .byte CLASS_CHARACTER # 73 s
- .byte CLASS_CHARACTER # 74 t
- .byte CLASS_CHARACTER # 75 u
- .byte CLASS_CHARACTER # 76 v
- .byte CLASS_CHARACTER # 77 w
- .byte CLASS_X # 78 x
- .byte CLASS_CHARACTER # 79 y
- .byte CLASS_CHARACTER # 7A z
- .byte 0x00 # 7B {
- .byte CLASS_SINGLE # 7C |
- .byte 0x00 # 7D }
- .byte CLASS_SINGLE # 7E ~
- .byte CLASS_INVALID # 7F DEL
-
-#
-# Textual keywords in the language.
-#
-.equ KEYWORDS_COUNT, TOKEN_IDENTIFIER - 1
-
-.type keywords, @object
-keywords:
- .word 7
- .ascii "program"
- .word 6
- .ascii "import"
- .word 5
- .ascii "const"
- .word 3
- .ascii "var"
- .word 2
- .ascii "if"
- .word 4
- .ascii "then"
- .word 5
- .ascii "elsif"
- .word 4
- .ascii "else"
- .word 5
- .ascii "while"
- .word 2
- .ascii "do"
- .word 4
- .ascii "proc"
- .word 5
- .ascii "begin"
- .word 3
- .ascii "end"
- .word 4
- .ascii "type"
- .word 6
- .ascii "record"
- .word 5
- .ascii "union"
- .word 4
- .ascii "true"
- .word 5
- .ascii "false"
- .word 3
- .ascii "nil"
- .word 3
- .ascii "xor"
- .word 2
- .ascii "or"
- .word 6
- .ascii "return"
- .word 4
- .ascii "cast"
- .word 4
- .ascii "goto"
- .word 4
- .ascii "case"
- .word 2
- .ascii "of"
-
-.type byte_keywords, @object
-byte_keywords: .ascii "&.,:;()[]^=+-*@"
-.equ BYTE_KEYWORDS_SIZE, . - byte_keywords
-
-.section .data
-
-# The transition table describes transitions from one state to another, given
-# a symbol (character class).
-#
-# The table has m rows and n columns, where m is the amount of states and n is
-# the amount of classes. So given the current state and a classified character
-# the table can be used to look up the next state.
-#
-# Each cell is a word long.
-# - The least significant byte of the word is a row number (beginning with 0).
-# It specifies the target state. "ff" means that this is an end state and no
-# transition is possible.
-# - The next byte is the action that should be performed when transitioning.
-# For the meaning of actions see labels in the lex_next function, which
-# handles each action.
-#
-.type transitions, @object
-transitions:
- # Invalid Digit Alpha Space : = ( )
- # * _ Single Hex 0 x NUL .
- # - " or ' > <
- .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff
- .word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x06ff
- .word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start
-
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon
-
- .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff
- .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff
- .word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier
-
- .word 0x08ff, 0x0103, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff
- .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x0103, 0x00ff, 0x08ff, 0x08ff
- .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x03 Decimal
-
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
- .word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater
-
- .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
- .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
- .word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus
-
- .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
- .word 0x0109, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff
- .word 0x06ff, 0x06ff, 0x06ff, 0x06ff # 0x06 Left paren
-
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
- .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff
- .word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less
-
- .word 0x08ff, 0x0108, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff
- .word 0x08ff, 0x00ff, 0x08ff, 0x0108, 0x0108, 0x00ff, 0x08ff, 0x08ff
- .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x08 Hexadecimal after 0x.
-
- .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109
- .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
- .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment
-
- .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff
- .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109
- .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment
-
- .word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
- .word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110
- .word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String
-
- .word 0x08ff, 0x00ff, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff
- .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x00ff, 0x010d, 0x08ff, 0x08ff
- .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x0c Leading zero
-
- .word 0x00ff, 0x0108, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff
- .word 0x00ff, 0x00ff, 0x00ff, 0x0108, 0x0108, 0x00ff, 0x00ff, 0x00ff
- .word 0x00ff, 0x00ff, 0x00ff, 0x00ff # 0x0d Starting hexadecimal
-
-.section .text
-
-# Returns the class from the classification table for the given character.
-#
-# Parameters:
-# a0 - Character.
-#
-# Sets a0 to the class number.
-.type classify, @function
-classify:
- la t0, classification
- add t0, t0, a0 # Character class pointer.
- lbu a0, (t0) # Character class.
- ret
-
-# Given the current state and a character class, calculates the next state.
-
-# Parameters:
-# a0 - Current state.
-# a1 - Character class.
-#
-# Sets a0 to the next state.
-.type lookup_state, @function
-lookup_state:
- li t0, CLASS_COUNT
- mul a0, a0, t0 # Transition row.
- add a0, a0, a1 # Transition column.
-
- li t0, 4
- mul a0, a0, t0 # Multiply by the word size.
-
- la t0, transitions
- add t0, t0, a0
- lw a0, (t0) # Next state.
-
- ret
-
-# Chains classify and lookup_state.
-#
-# Parameters:
-# a0 - Current state.
-# a1 - Character.
-#
-# Sets a0 to the next state based on the given character.
-.type _next_state, @function
-_next_state:
- # Prologue.
- addi sp, sp, -16
- sw ra, 12(sp)
- sw s0, 8(sp)
- addi s0, sp, 16
-
- sw a0, 4(sp)
- mv a0, a1
- call classify
-
- mv a1, a0
- lw a0, 4(sp)
- call lookup_state
-
- # Epilogue.
- lw ra, 12(sp)
- lw s0, 8(sp)
- addi sp, sp, 16
- ret
-
-# Takes an identifier and checks whether it's a keyword.
-#
-# Parameters:
-# a0 - Token length.
-# a1 - Token pointer.
-#
-# Sets a0 to the appropriate token type.
-.type classify_identifier, @function
-classify_identifier:
- # Prologue.
- addi sp, sp, -16
- sw ra, 12(sp)
- sw s0, 8(sp)
- addi s0, sp, 16
-
- mv a2, a0
- mv a3, a1
- li a0, KEYWORDS_COUNT
- la a1, keywords
- call _strings_index
-
- bnez a0, .Lclassify_identifier_end
- li a0, TOKEN_IDENTIFIER
-
-.Lclassify_identifier_end:
- # Epilogue.
- lw ra, 12(sp)
- lw s0, 8(sp)
- addi sp, sp, 16
- ret
-
-# Takes a symbol and determines its type.
-#
-# Parameters:
-# a0 - Token character.
-#
-# Sets a0 to the appropriate token type.
-.type classify_single, @function
-classify_single:
- # Prologue.
- addi sp, sp, -16
- sw ra, 12(sp)
- sw s0, 8(sp)
- addi s0, sp, 16
-
- mv a1, a0
- li a2, BYTE_KEYWORDS_SIZE
- la a0, byte_keywords
- call _memchr
-
- la a1, byte_keywords
- sub a0, a0, a1
- addi a0, a0, TOKEN_IDENTIFIER + 1
-
- # Epilogue.
- lw ra, 12(sp)
- lw s0, 8(sp)
- addi sp, sp, 16
- ret
-
-# Classified a symbol containing multiple characters (probably 2).
-#
-# Parameters:
-# a0 - Token length.
-# a1 - Token pointer.
-#
-# Sets a0 to the appropriate token type.
-.type classify_composite, @function
-classify_composite:
- lbu t0, 0(a1)
- li t1, ':'
- beq t0, t1, .Lclassify_composite_assign
-
- j .Lclassify_composite_end
-
-.Lclassify_composite_assign:
- li a0, TOKEN_ASSIGN
- j .Lclassify_composite_end
-
-.Lclassify_composite_end:
- ret
-
-# Initializes the classification table.
-#
-# Paramaters:
-# a0 - Source text pointer.
-# a1 - A pointer for output value, the token kind. 4 Bytes.
-#
-# Sets a0 to the position of the next token.
-.type lex_next, @function
-lex_next:
- # Prologue.
- addi sp, sp, -32
- sw ra, 28(sp)
- sw s0, 24(sp)
- addi s0, sp, 32
-
- sw s1, 20(sp) # Preserve s1 used for current source text position.
- mv s1, a0
- sw a0, 12(sp) # Keeps a pointer to the beginning of a token.
- # 4(sp) and 8(sp) are reserved for the kind and length of the token if needed.
-
- sw s2, 16(sp) # Preserve s2 containing the current state.
- li s2, 0x00 # Initial, start state.
-
- sw a1, 0(sp)
- sw zero, (a1) # Initialize.
-
-.Llex_next_loop:
- mv a0, s2
- lbu a1, (s1)
- call _next_state
-
- li t0, 0xff
- and s2, a0, t0 # Next state.
-
- li t0, 0xff00
- and t1, a0, t0 # Transition action.
- srli t1, t1, 8
-
- # Perform the provided action.
- li t0, 0x01 # Accumulate action.
- beq t1, t0, .Llex_next_accumulate
-
- li t0, 0x02 # Print action.
- beq t1, t0, .Llex_next_print
-
- li t0, 0x03 # Skip action.
- beq t1, t0, .Llex_next_skip
-
- li t0, 0x04 # Delimited string action.
- beq t1, t0, .Llex_next_comment
-
- li t0, 0x05 # Finalize identifier.
- beq t1, t0, .Llex_next_identifier
-
- li t0, 0x06 # Single character symbol action.
- beq t1, t0, .Llex_next_single
-
- li t0, 0x07 # An action for symbols containing multiple characters.
- beq t1, t0, .Llex_next_composite
-
- li t0, 0x08 # Integer action.
- beq t1, t0, .Llex_next_integer
-
- j .Llex_next_reject
-
-.Llex_next_reject:
- addi s1, s1, 1
-
- j .Llex_next_end
-
-.Llex_next_accumulate:
- addi s1, s1, 1
-
- j .Llex_next_loop
-
-.Llex_next_skip:
- addi s1, s1, 1
- lw t0, 12(sp)
- addi t0, t0, 1
- sw t0, 12(sp)
-
- j .Llex_next_loop
-
-.Llex_next_print:
- /* DEBUG
- addi a0, a0, 21
- sw a0, 0(sp)
- addi a0, sp, 0
- li a1, 1
- call _write_error */
-
- j .Llex_next_end
-
-.Llex_next_comment:
- addi s1, s1, 1
-
- j .Llex_next_end
-
-.Llex_next_identifier:
- # An identifier can be a textual keyword.
- # Check the kind of the token and write it into the output parameter.
- lw a1, 12(sp)
- sub a0, s1, a1
- sw a0, 8(sp)
- call classify_identifier
- sw a0, 4(sp)
- lw a0, 0(sp)
- addi a1, sp, 4
- li a2, 12
- call _memcpy
-
- j .Llex_next_end
-
-.Llex_next_single:
- lw a0, 12(sp)
- addi s1, a0, 1
- lbu a0, (a0)
- call classify_single
- lw a1, 0(sp)
- sw a0, (a1)
-
- j .Llex_next_end
-
-.Llex_next_composite:
- addi s1, s1, 1
- lw a1, 12(sp)
- sub a0, s1, a1
- call classify_composite
- lw a1, 0(sp)
- sw a0, (a1)
-
- j .Llex_next_end
-
-.Llex_next_integer:
- lw t0, 0(sp)
- li t1, TOKEN_INTEGER
- sw t1, 0(t0)
- lw t1, 12(sp)
- sw t1, 8(t0)
- sub t1, s1, t1
- sw t1, 4(t0)
-
- j .Llex_next_end
-
-.Llex_next_end:
- mv a0, s1 # Return the advanced text pointer.
-
- # Restore saved registers.
- lw s1, 20(sp)
- lw s2, 16(sp)
-
- # Epilogue.
- lw ra, 28(sp)
- lw s0, 24(sp)
- addi sp, sp, 32
- ret