diff options
| author | Eugen Wissner <belka@caraus.de> | 2025-08-28 22:45:42 +0200 |
|---|---|---|
| committer | Eugen Wissner <belka@caraus.de> | 2025-08-30 01:29:00 +0200 |
| commit | 627975775c941130975ce0f9dbef08c723e69794 (patch) | |
| tree | 0f7f4c2c6d83420262d89ff720e6435a67410c9a /boot/tokenizer.s | |
| parent | e614d43ea9af078301d538fcddb19e83eed7e879 (diff) | |
| download | elna-627975775c941130975ce0f9dbef08c723e69794.tar.gz | |
Start over
Diffstat (limited to 'boot/tokenizer.s')
| -rw-r--r-- | boot/tokenizer.s | 616 |
1 files changed, 0 insertions, 616 deletions
diff --git a/boot/tokenizer.s b/boot/tokenizer.s deleted file mode 100644 index 2c7f2a3..0000000 --- a/boot/tokenizer.s +++ /dev/null @@ -1,616 +0,0 @@ -# This Source Code Form is subject to the terms of the Mozilla Public License, -# v. 2.0. If a copy of the MPL was not distributed with this file, You can -# obtain one at https://mozilla.org/MPL/2.0/. - -.global lex_next, classification, transitions, keywords, byte_keywords - -.include "boot/definitions.inc" - -.section .rodata - -# -# Classification table assigns each possible character to a group (class). All -# characters of the same group a handled equivalently. -# -# Classification: -# -.equ CLASS_INVALID, 0x00 -.equ CLASS_DIGIT, 0x01 -.equ CLASS_CHARACTER, 0x02 -.equ CLASS_SPACE, 0x03 -.equ CLASS_COLON, 0x04 -.equ CLASS_EQUALS, 0x05 -.equ CLASS_LEFT_PAREN, 0x06 -.equ CLASS_RIGHT_PAREN, 0x07 -.equ CLASS_ASTERISK, 0x08 -.equ CLASS_UNDERSCORE, 0x09 -.equ CLASS_SINGLE, 0x0a -.equ CLASS_HEX, 0x0b -.equ CLASS_ZERO, 0x0c -.equ CLASS_X, 0x0d -.equ CLASS_EOF, 0x0e -.equ CLASS_DOT, 0x0f -.equ CLASS_MINUS, 0x10 -.equ CLASS_QUOTE, 0x11 -.equ CLASS_GREATER, 0x12 -.equ CLASS_LESS, 0x13 - -.equ CLASS_COUNT, 20 - -.type classification, @object -classification: - .byte CLASS_EOF # 00 NUL - .byte CLASS_INVALID # 01 SOH - .byte CLASS_INVALID # 02 STX - .byte CLASS_INVALID # 03 ETX - .byte CLASS_INVALID # 04 EOT - .byte CLASS_INVALID # 05 ENQ - .byte CLASS_INVALID # 06 ACK - .byte CLASS_INVALID # 07 BEL - .byte CLASS_INVALID # 08 BS - .byte CLASS_SPACE # 09 HT - .byte CLASS_SPACE # 0A LF - .byte CLASS_INVALID # 0B VT - .byte CLASS_INVALID # 0C FF - .byte CLASS_SPACE # 0D CR - .byte CLASS_INVALID # 0E SO - .byte CLASS_INVALID # 0F SI - .byte CLASS_INVALID # 10 DLE - .byte CLASS_INVALID # 11 DC1 - .byte CLASS_INVALID # 12 DC2 - .byte CLASS_INVALID # 13 DC3 - .byte CLASS_INVALID # 14 DC4 - .byte CLASS_INVALID # 15 NAK - .byte CLASS_INVALID # 16 SYN - .byte CLASS_INVALID # 17 ETB - .byte CLASS_INVALID # 18 CAN - .byte CLASS_INVALID # 19 EM - .byte CLASS_INVALID # 1A SUB - .byte CLASS_INVALID # 1B ESC - .byte CLASS_INVALID # 1C FS - .byte CLASS_INVALID # 1D GS - .byte CLASS_INVALID # 1E RS - .byte CLASS_INVALID # 1F US - .byte CLASS_SPACE # 20 Space - .byte CLASS_SINGLE # 21 ! - .byte CLASS_QUOTE # 22 " - .byte 0x00 # 23 # - .byte 0x00 # 24 $ - .byte CLASS_SINGLE # 25 % - .byte CLASS_SINGLE # 26 & - .byte CLASS_QUOTE # 27 ' - .byte CLASS_LEFT_PAREN # 28 ( - .byte CLASS_RIGHT_PAREN # 29 ) - .byte CLASS_ASTERISK # 2A * - .byte CLASS_SINGLE # 2B + - .byte CLASS_SINGLE # 2C , - .byte CLASS_MINUS # 2D - - .byte CLASS_DOT # 2E . - .byte CLASS_SINGLE # 2F / - .byte CLASS_ZERO # 30 0 - .byte CLASS_DIGIT # 31 1 - .byte CLASS_DIGIT # 32 2 - .byte CLASS_DIGIT # 33 3 - .byte CLASS_DIGIT # 34 4 - .byte CLASS_DIGIT # 35 5 - .byte CLASS_DIGIT # 36 6 - .byte CLASS_DIGIT # 37 7 - .byte CLASS_DIGIT # 38 8 - .byte CLASS_DIGIT # 39 9 - .byte CLASS_COLON # 3A : - .byte CLASS_SINGLE # 3B ; - .byte CLASS_LESS # 3C < - .byte CLASS_EQUALS # 3D = - .byte CLASS_GREATER # 3E > - .byte 0x00 # 3F ? - .byte CLASS_SINGLE # 40 @ - .byte CLASS_CHARACTER # 41 A - .byte CLASS_CHARACTER # 42 B - .byte CLASS_CHARACTER # 43 C - .byte CLASS_CHARACTER # 44 D - .byte CLASS_CHARACTER # 45 E - .byte CLASS_CHARACTER # 46 F - .byte CLASS_CHARACTER # 47 G - .byte CLASS_CHARACTER # 48 H - .byte CLASS_CHARACTER # 49 I - .byte CLASS_CHARACTER # 4A J - .byte CLASS_CHARACTER # 4B K - .byte CLASS_CHARACTER # 4C L - .byte CLASS_CHARACTER # 4D M - .byte CLASS_CHARACTER # 4E N - .byte CLASS_CHARACTER # 4F O - .byte CLASS_CHARACTER # 50 P - .byte CLASS_CHARACTER # 51 Q - .byte CLASS_CHARACTER # 52 R - .byte CLASS_CHARACTER # 53 S - .byte CLASS_CHARACTER # 54 T - .byte CLASS_CHARACTER # 55 U - .byte CLASS_CHARACTER # 56 V - .byte CLASS_CHARACTER # 57 W - .byte CLASS_CHARACTER # 58 X - .byte CLASS_CHARACTER # 59 Y - .byte CLASS_CHARACTER # 5A Z - .byte CLASS_SINGLE # 5B [ - .byte 0x00 # 5C \ - .byte CLASS_SINGLE # 5D ] - .byte CLASS_SINGLE # 5E ^ - .byte CLASS_UNDERSCORE # 5F _ - .byte 0x00 # 60 ` - .byte CLASS_HEX # 61 a - .byte CLASS_HEX # 62 b - .byte CLASS_HEX # 63 c - .byte CLASS_HEX # 64 d - .byte CLASS_HEX # 65 e - .byte CLASS_HEX # 66 f - .byte CLASS_CHARACTER # 67 g - .byte CLASS_CHARACTER # 68 h - .byte CLASS_CHARACTER # 69 i - .byte CLASS_CHARACTER # 6A j - .byte CLASS_CHARACTER # 6B k - .byte CLASS_CHARACTER # 6C l - .byte CLASS_CHARACTER # 6D m - .byte CLASS_CHARACTER # 6E n - .byte CLASS_CHARACTER # 6F o - .byte CLASS_CHARACTER # 70 p - .byte CLASS_CHARACTER # 71 q - .byte CLASS_CHARACTER # 72 r - .byte CLASS_CHARACTER # 73 s - .byte CLASS_CHARACTER # 74 t - .byte CLASS_CHARACTER # 75 u - .byte CLASS_CHARACTER # 76 v - .byte CLASS_CHARACTER # 77 w - .byte CLASS_X # 78 x - .byte CLASS_CHARACTER # 79 y - .byte CLASS_CHARACTER # 7A z - .byte 0x00 # 7B { - .byte CLASS_SINGLE # 7C | - .byte 0x00 # 7D } - .byte CLASS_SINGLE # 7E ~ - .byte CLASS_INVALID # 7F DEL - -# -# Textual keywords in the language. -# -.equ KEYWORDS_COUNT, TOKEN_IDENTIFIER - 1 - -.type keywords, @object -keywords: - .word 7 - .ascii "program" - .word 6 - .ascii "import" - .word 5 - .ascii "const" - .word 3 - .ascii "var" - .word 2 - .ascii "if" - .word 4 - .ascii "then" - .word 5 - .ascii "elsif" - .word 4 - .ascii "else" - .word 5 - .ascii "while" - .word 2 - .ascii "do" - .word 4 - .ascii "proc" - .word 5 - .ascii "begin" - .word 3 - .ascii "end" - .word 4 - .ascii "type" - .word 6 - .ascii "record" - .word 5 - .ascii "union" - .word 4 - .ascii "true" - .word 5 - .ascii "false" - .word 3 - .ascii "nil" - .word 3 - .ascii "xor" - .word 2 - .ascii "or" - .word 6 - .ascii "return" - .word 4 - .ascii "cast" - .word 4 - .ascii "goto" - .word 4 - .ascii "case" - .word 2 - .ascii "of" - -.type byte_keywords, @object -byte_keywords: .ascii "&.,:;()[]^=+-*@" -.equ BYTE_KEYWORDS_SIZE, . - byte_keywords - -.section .data - -# The transition table describes transitions from one state to another, given -# a symbol (character class). -# -# The table has m rows and n columns, where m is the amount of states and n is -# the amount of classes. So given the current state and a classified character -# the table can be used to look up the next state. -# -# Each cell is a word long. -# - The least significant byte of the word is a row number (beginning with 0). -# It specifies the target state. "ff" means that this is an end state and no -# transition is possible. -# - The next byte is the action that should be performed when transitioning. -# For the meaning of actions see labels in the lex_next function, which -# handles each action. -# -.type transitions, @object -transitions: - # Invalid Digit Alpha Space : = ( ) - # * _ Single Hex 0 x NUL . - # - " or ' > < - .word 0x00ff, 0x0103, 0x0102, 0x0300, 0x0101, 0x06ff, 0x0106, 0x06ff - .word 0x06ff, 0x0102, 0x06ff, 0x0102, 0x010c, 0x0102, 0x00ff, 0x06ff - .word 0x0105, 0x0110, 0x0104, 0x0107 # 0x00 Start - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x07ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff # 0x01 Colon - - .word 0x05ff, 0x0102, 0x0102, 0x05ff, 0x05ff, 0x05ff, 0x05ff, 0x05ff - .word 0x05ff, 0x0102, 0x05ff, 0x0102, 0x0102, 0x0102, 0x05ff, 0x05ff - .word 0x05ff, 0x05ff, 0x05ff, 0x05ff # 0x02 Identifier - - .word 0x08ff, 0x0103, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff - .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x0103, 0x00ff, 0x08ff, 0x08ff - .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x03 Decimal - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x04ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x04ff, 0x02ff # 0x04 Greater - - .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff - .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff - .word 0x06ff, 0x06ff, 0x04ff, 0x06ff # 0x05 Minus - - .word 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff - .word 0x0109, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff, 0x06ff - .word 0x06ff, 0x06ff, 0x06ff, 0x06ff # 0x06 Left paren - - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff, 0x02ff - .word 0x02ff, 0x02ff, 0x02ff, 0x04ff # 0x07 Less - - .word 0x08ff, 0x0108, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff - .word 0x08ff, 0x00ff, 0x08ff, 0x0108, 0x0108, 0x00ff, 0x08ff, 0x08ff - .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x08 Hexadecimal after 0x. - - .word 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109 - .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 - .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x09 Comment - - .word 0x00ff, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x04ff - .word 0x010a, 0x0109, 0x0109, 0x0109, 0x0109, 0x0109, 0x00ff, 0x0109 - .word 0x0109, 0x0109, 0x0109, 0x0109 # 0x0a Closing comment - - .word 0x00ff, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 - .word 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x010b, 0x0110 - .word 0x010b, 0x04ff, 0x010b, 0x010b # 0x0b String - - .word 0x08ff, 0x00ff, 0x00ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff, 0x08ff - .word 0x08ff, 0x00ff, 0x08ff, 0x00ff, 0x00ff, 0x010d, 0x08ff, 0x08ff - .word 0x08ff, 0x08ff, 0x08ff, 0x08ff # 0x0c Leading zero - - .word 0x00ff, 0x0108, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff - .word 0x00ff, 0x00ff, 0x00ff, 0x0108, 0x0108, 0x00ff, 0x00ff, 0x00ff - .word 0x00ff, 0x00ff, 0x00ff, 0x00ff # 0x0d Starting hexadecimal - -.section .text - -# Returns the class from the classification table for the given character. -# -# Parameters: -# a0 - Character. -# -# Sets a0 to the class number. -.type classify, @function -classify: - la t0, classification - add t0, t0, a0 # Character class pointer. - lbu a0, (t0) # Character class. - ret - -# Given the current state and a character class, calculates the next state. - -# Parameters: -# a0 - Current state. -# a1 - Character class. -# -# Sets a0 to the next state. -.type lookup_state, @function -lookup_state: - li t0, CLASS_COUNT - mul a0, a0, t0 # Transition row. - add a0, a0, a1 # Transition column. - - li t0, 4 - mul a0, a0, t0 # Multiply by the word size. - - la t0, transitions - add t0, t0, a0 - lw a0, (t0) # Next state. - - ret - -# Chains classify and lookup_state. -# -# Parameters: -# a0 - Current state. -# a1 - Character. -# -# Sets a0 to the next state based on the given character. -.type _next_state, @function -_next_state: - # Prologue. - addi sp, sp, -16 - sw ra, 12(sp) - sw s0, 8(sp) - addi s0, sp, 16 - - sw a0, 4(sp) - mv a0, a1 - call classify - - mv a1, a0 - lw a0, 4(sp) - call lookup_state - - # Epilogue. - lw ra, 12(sp) - lw s0, 8(sp) - addi sp, sp, 16 - ret - -# Takes an identifier and checks whether it's a keyword. -# -# Parameters: -# a0 - Token length. -# a1 - Token pointer. -# -# Sets a0 to the appropriate token type. -.type classify_identifier, @function -classify_identifier: - # Prologue. - addi sp, sp, -16 - sw ra, 12(sp) - sw s0, 8(sp) - addi s0, sp, 16 - - mv a2, a0 - mv a3, a1 - li a0, KEYWORDS_COUNT - la a1, keywords - call _strings_index - - bnez a0, .Lclassify_identifier_end - li a0, TOKEN_IDENTIFIER - -.Lclassify_identifier_end: - # Epilogue. - lw ra, 12(sp) - lw s0, 8(sp) - addi sp, sp, 16 - ret - -# Takes a symbol and determines its type. -# -# Parameters: -# a0 - Token character. -# -# Sets a0 to the appropriate token type. -.type classify_single, @function -classify_single: - # Prologue. - addi sp, sp, -16 - sw ra, 12(sp) - sw s0, 8(sp) - addi s0, sp, 16 - - mv a1, a0 - li a2, BYTE_KEYWORDS_SIZE - la a0, byte_keywords - call _memchr - - la a1, byte_keywords - sub a0, a0, a1 - addi a0, a0, TOKEN_IDENTIFIER + 1 - - # Epilogue. - lw ra, 12(sp) - lw s0, 8(sp) - addi sp, sp, 16 - ret - -# Classified a symbol containing multiple characters (probably 2). -# -# Parameters: -# a0 - Token length. -# a1 - Token pointer. -# -# Sets a0 to the appropriate token type. -.type classify_composite, @function -classify_composite: - lbu t0, 0(a1) - li t1, ':' - beq t0, t1, .Lclassify_composite_assign - - j .Lclassify_composite_end - -.Lclassify_composite_assign: - li a0, TOKEN_ASSIGN - j .Lclassify_composite_end - -.Lclassify_composite_end: - ret - -# Initializes the classification table. -# -# Paramaters: -# a0 - Source text pointer. -# a1 - A pointer for output value, the token kind. 4 Bytes. -# -# Sets a0 to the position of the next token. -.type lex_next, @function -lex_next: - # Prologue. - addi sp, sp, -32 - sw ra, 28(sp) - sw s0, 24(sp) - addi s0, sp, 32 - - sw s1, 20(sp) # Preserve s1 used for current source text position. - mv s1, a0 - sw a0, 12(sp) # Keeps a pointer to the beginning of a token. - # 4(sp) and 8(sp) are reserved for the kind and length of the token if needed. - - sw s2, 16(sp) # Preserve s2 containing the current state. - li s2, 0x00 # Initial, start state. - - sw a1, 0(sp) - sw zero, (a1) # Initialize. - -.Llex_next_loop: - mv a0, s2 - lbu a1, (s1) - call _next_state - - li t0, 0xff - and s2, a0, t0 # Next state. - - li t0, 0xff00 - and t1, a0, t0 # Transition action. - srli t1, t1, 8 - - # Perform the provided action. - li t0, 0x01 # Accumulate action. - beq t1, t0, .Llex_next_accumulate - - li t0, 0x02 # Print action. - beq t1, t0, .Llex_next_print - - li t0, 0x03 # Skip action. - beq t1, t0, .Llex_next_skip - - li t0, 0x04 # Delimited string action. - beq t1, t0, .Llex_next_comment - - li t0, 0x05 # Finalize identifier. - beq t1, t0, .Llex_next_identifier - - li t0, 0x06 # Single character symbol action. - beq t1, t0, .Llex_next_single - - li t0, 0x07 # An action for symbols containing multiple characters. - beq t1, t0, .Llex_next_composite - - li t0, 0x08 # Integer action. - beq t1, t0, .Llex_next_integer - - j .Llex_next_reject - -.Llex_next_reject: - addi s1, s1, 1 - - j .Llex_next_end - -.Llex_next_accumulate: - addi s1, s1, 1 - - j .Llex_next_loop - -.Llex_next_skip: - addi s1, s1, 1 - lw t0, 12(sp) - addi t0, t0, 1 - sw t0, 12(sp) - - j .Llex_next_loop - -.Llex_next_print: - /* DEBUG - addi a0, a0, 21 - sw a0, 0(sp) - addi a0, sp, 0 - li a1, 1 - call _write_error */ - - j .Llex_next_end - -.Llex_next_comment: - addi s1, s1, 1 - - j .Llex_next_end - -.Llex_next_identifier: - # An identifier can be a textual keyword. - # Check the kind of the token and write it into the output parameter. - lw a1, 12(sp) - sub a0, s1, a1 - sw a0, 8(sp) - call classify_identifier - sw a0, 4(sp) - lw a0, 0(sp) - addi a1, sp, 4 - li a2, 12 - call _memcpy - - j .Llex_next_end - -.Llex_next_single: - lw a0, 12(sp) - addi s1, a0, 1 - lbu a0, (a0) - call classify_single - lw a1, 0(sp) - sw a0, (a1) - - j .Llex_next_end - -.Llex_next_composite: - addi s1, s1, 1 - lw a1, 12(sp) - sub a0, s1, a1 - call classify_composite - lw a1, 0(sp) - sw a0, (a1) - - j .Llex_next_end - -.Llex_next_integer: - lw t0, 0(sp) - li t1, TOKEN_INTEGER - sw t1, 0(t0) - lw t1, 12(sp) - sw t1, 8(t0) - sub t1, s1, t1 - sw t1, 4(t0) - - j .Llex_next_end - -.Llex_next_end: - mv a0, s1 # Return the advanced text pointer. - - # Restore saved registers. - lw s1, 20(sp) - lw s2, 16(sp) - - # Epilogue. - lw ra, 28(sp) - lw s0, 24(sp) - addi sp, sp, 32 - ret |
