From 2519b3f9e999fc0f97c61c8f2a26d6c3e4f5f92c Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Mon, 29 Sep 2025 16:55:38 +0200 Subject: [PATCH] Switch completely to the table based lexer --- Rakefile | 13 +- boot/stage14.elna | 1298 ++++++++++++-------- boot/stage15.elna | 2970 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 3775 insertions(+), 506 deletions(-) create mode 100644 boot/stage15.elna diff --git a/Rakefile b/Rakefile index a772900..9d866ea 100644 --- a/Rakefile +++ b/Rakefile @@ -41,10 +41,17 @@ end desc 'Convert previous stage language into the current stage language' task :convert do - File.open('boot/stage10.elna', 'w') do |current_stage| - File.readlines('boot/stage9.elna').each do |line| - current_stage << line + File.open('boot/stage14.elna', 'w') do |current_stage| + previous_line = nil + + File.readlines('boot/stage13.elna').each do |line| + if !previous_line.nil? && previous_line.start_with?('begin') && line.strip.start_with?('return') + else + current_stage << previous_line unless previous_line.nil? + end + previous_line = line end + current_stage << previous_line end end diff --git a/boot/stage14.elna b/boot/stage14.elna index e3091dc..6e5e186 100644 --- a/boot/stage14.elna +++ b/boot/stage14.elna @@ -6,36 +6,42 @@ (* Stage 14 compiler. *) +(* - Binary minus. *) +(* - Space independent parsing. *) +(* - Label names in goto statements aren't required to begin with a dot. *) +(* - Dereferencing pointers pointing to word long data. *) + const symbol_builtin_name_int := "Int"; symbol_builtin_name_word := "Word"; symbol_builtin_name_pointer := "Pointer"; symbol_builtin_name_char := "Char"; - symbol_builtin_name_bool := "Bool"; + symbol_builtin_name_array := "Array"; (* Every type info starts with a word describing what type it is. PRIMITIVE_TYPE = 1 + ENUMERATION_TYPE = 2 Primitive types have only type size. 
*) symbol_builtin_type_int := S(1, 4); symbol_builtin_type_word := S(1, 4); symbol_builtin_type_pointer := S(1, 4); symbol_builtin_type_char := S(1, 1); - symbol_builtin_type_bool := S(1, 1); + symbol_builtin_type_array := S(1, 4); (* Info objects start with a word describing its type. - INFO_TYPE = 1 - INFO_PARAMETER = 2 - INFO_TEMPORARY = 3 + TYPE_INFO = 1 + PARAMETER_INFO = 2 + TEMPORARY_INFO = 3 Type info has the type it belongs to. *) symbol_type_info_int := S(1, @symbol_builtin_type_int); symbol_type_info_word := S(1, @symbol_builtin_type_word); symbol_type_info_pointer := S(1, @symbol_builtin_type_pointer); symbol_type_info_char := S(1, @symbol_builtin_type_char); - symbol_type_info_bool := S(1, @symbol_builtin_type_bool); + symbol_type_info_array := S(1, @symbol_builtin_type_array); var source_code: Array; @@ -43,12 +49,15 @@ var symbol_table_global: Array; symbol_table_local: Array; classification: Array; + + (* To reserve memory just add the value of needed bytes to the memory_free_pointer_variable. *) memory: Array; compiler_strings_position: Pointer := @compiler_strings; compiler_strings_length: Word := 0; label_counter: Word := 0; - source_code_position: Pointer := @source_code; + + (* Points to a segment of free memory. *) memory_free_pointer: Word := @memory; (** @@ -374,29 +383,18 @@ begin return destination end; -(** - * Prints the current token. - * - * Parameters: - * length - Token length. - * - * Returns a0 unchanged. 
- *) -proc _write_token(length: Word); -begin - _write_s(source_code_position, length); - return length -end; - proc _compile_integer_literal(); var integer_token: Word; + integer_length: Word; token_kind: Word; begin _write_z("\tli t0, \0"); - integer_token := _lexer_read_token(@token_kind); - _write_token(integer_token); + integer_token := _lexer_global_get_start(); + integer_length := _lexer_global_get_end() + -integer_token; + + _write_s(integer_token, integer_length); _lexer_skip_token(); _write_c('\n') @@ -405,18 +403,16 @@ end; proc _compile_character_literal(); var character: Word; + token_kind: Word; + character_length: Word; begin - _write_z("\tli t0, '\0"); - source_code_position := source_code_position + 1; + character := _lexer_global_get_start(); + character_length := _lexer_global_get_end() + -character; - character := _load_byte(source_code_position); - if character = '\\' then - _write_c('\\'); - source_code_position := source_code_position + 1 - end; - _write_s(source_code_position, 1); - _write_s("'\n", 2); - source_code_position := source_code_position + 2 + _write_z("\tli t0, \0"); + _write_s(character, character_length); + _write_c('\n'); + _lexer_skip_token() end; proc _compile_variable_expression(); @@ -425,41 +421,49 @@ begin _write_z("\tlw t0, (t0)\n\0") end; +(** + * Compiled take address expression, starting with an "@" sign. + *) proc _compile_address_expression(); begin - (* Skip the "@" sign. *) - source_code_position := source_code_position + 1; + _lexer_skip_token(); _compile_designator() end; +(** + * Compile unary negation, "-" sign. + *) proc _compile_negate_expression(); begin - (* Skip the "-" sign. *) - source_code_position := source_code_position + 1; + _lexer_skip_token(); _compile_term(); - _write_z("\tneg t0, t0\n\0") end; +(* Compile unary negation, "~" sign. *) proc _compile_not_expression(); +var + token_kind: Word; begin - (* Skip the "~" sign. 
*) - source_code_position := source_code_position + 1; + _lexer_read_token(@token_kind); + _lexer_skip_token(); _compile_term(); - _write_z("\tnot t0, t0\n\0") -end; +end; proc _compile_string_literal(); var + token_kind: Word; + token_start: Word; length: Word; offset: Word; begin - length := _string_length(source_code_position); - offset := _add_string(source_code_position); + _lexer_read_token(@token_kind); + token_start := _lexer_global_get_start(); + length := _string_length(token_start); + offset := _add_string(token_start); - source_code_position := source_code_position + length; - source_code_position := source_code_position + 2; + _lexer_skip_token(); _write_z("\tla t0, strings\n\0"); _write_z("\tli t1, \0"); @@ -472,33 +476,40 @@ end; proc _compile_term(); var current_character: Word; + token_kind: Word; begin - current_character := _load_byte(source_code_position); + _lexer_read_token(@token_kind); - if current_character = '\'' then + if token_kind = _lexer_token_kind_character() then _compile_character_literal() - elsif current_character = '@' then - _compile_address_expression() - elsif current_character = '-' then - _compile_negate_expression() - elsif current_character = '~' then - _compile_not_expression() - elsif current_character = '"' then + elsif token_kind = _lexer_token_kind_string() then _compile_string_literal() - elsif current_character = '_' then - _compile_call(); - _write_z("\nmv t0, a0\n\0") - elsif _is_digit(current_character) = 1 then + elsif token_kind = _lexer_token_kind_integer() then _compile_integer_literal() - elsif _is_lower(current_character) = 1 then - _compile_variable_expression() + elsif token_kind = _lexer_token_kind_at() then + _compile_address_expression() + elsif token_kind = _lexer_token_kind_minus() then + _compile_negate_expression() + elsif token_kind = _lexer_token_kind_not() then + _compile_not_expression() + elsif token_kind = _lexer_token_kind_identifier() then + current_character := _lexer_global_get_start(); 
+ current_character := _load_byte(current_character); + + (* This is a call if the statement starts with an underscore. *) + if current_character = '_' then + _compile_call(); + _write_z("\tmv t0, a0\n\0") + else + _compile_variable_expression() + end end end; proc _compile_binary_rhs(); begin - (* Skip the whitespace after the binary operator. *) - source_code_position := source_code_position + 1; + (* Save the value of the left expression on the stack. *) + _write_z("\tsw t0, 64(sp)\n\0"); _compile_term(); (* Load the left expression from the stack; *) @@ -507,108 +518,96 @@ end; proc _compile_expression(); var - current_character: Word; + token_kind: Word; begin _compile_term(); - current_character := _load_byte(source_code_position); - if current_character <> ' ' then - goto .compile_expression_end - end; - (* It is a binary expression. *) + _lexer_read_token(@token_kind); - (* Save the value of the left expression on the stack. *) - _write_z("sw t0, 64(sp)\n\0"); - - (* Skip surrounding whitespace in front of the operator. *) - source_code_position := source_code_position + 1; - current_character := _load_byte(source_code_position); - - if current_character = '+' then - source_code_position := source_code_position + 1; + if token_kind = _lexer_token_kind_plus() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. *) - _write_z("add t0, t0, t1\n\0") - elsif current_character = '*' then - source_code_position := source_code_position + 1; + _write_z("\tadd t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_minus() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tsub t0, t1, t0\n\0"); + elsif token_kind = _lexer_token_kind_multiplication() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. 
*) _write_z("\tmul t0, t0, t1\n\0") - elsif current_character = '&' then - source_code_position := source_code_position + 1; + elsif token_kind = _lexer_token_kind_and() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. *) _write_z("\tand t0, t0, t1\n\0") - elsif current_character = 'o' then - source_code_position := source_code_position + 2; + elsif token_kind = _lexer_token_kind_or() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. *) - _write_z("or t0, t0, t1\n\0") - elsif current_character = 'x' then - source_code_position := source_code_position + 3; + _write_z("\tor t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_xor() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. *) - _write_z("xor t0, t0, t1\n\0") - elsif current_character = '=' then - source_code_position := source_code_position + 1; + _write_z("\txor t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_equals() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. *) - _write_z("xor t0, t0, t1\nseqz t0, t0\n\0") - elsif current_character = '%' then - source_code_position := source_code_position + 1; + _write_z("\txor t0, t0, t1\n\tseqz t0, t0\n\0") + elsif token_kind = _lexer_token_kind_remainder() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. *) - _write_z("rem t0, t1, t0\n\0") - elsif current_character = '/' then - source_code_position := source_code_position + 1; + _write_z("\trem t0, t1, t0\n\0") + elsif token_kind = _lexer_token_kind_division() then + _lexer_skip_token(); _compile_binary_rhs(); (* Execute the operation. 
*) - _write_z("div t0, t1, t0\n\0") - elsif current_character = '<' then - source_code_position := source_code_position + 1; - current_character := _load_byte(source_code_position); + _write_z("\tdiv t0, t1, t0\n\0") + elsif token_kind = _lexer_token_kind_less_than() then + _lexer_skip_token(); + _compile_binary_rhs(); - if current_character = '>' then - source_code_position := source_code_position + 1; - _compile_binary_rhs(); + (* Execute the operation. *) + _write_z("\tslt t0, t1, t0\n\0") + elsif token_kind = _lexer_token_kind_greater_than() then + _lexer_skip_token(); + _compile_binary_rhs(); - (* Execute the operation. *) - _write_z("\txor t0, t0, t1\nsnez t0, t0\n\0") - elsif current_character = '=' then - source_code_position := source_code_position + 1; - _compile_binary_rhs(); + (* Execute the operation. *) + _write_z("\tslt t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_less_equal() then + _lexer_skip_token(); + _compile_binary_rhs(); - (* Execute the operation. *) - _write_z("\tslt t0, t0, t1\nxori t0, t0, 1\n\0") - else - _compile_binary_rhs(); + (* Execute the operation. *) + _write_z("\tslt t0, t0, t1\n\txori t0, t0, 1\n\0") + elsif token_kind = _lexer_token_kind_not_equal() then + _lexer_skip_token(); + _compile_binary_rhs(); - (* Execute the operation. *) - _write_z("slt t0, t1, t0\n\0") - end - elsif current_character = '>' then - source_code_position := source_code_position + 1; - current_character := _load_byte(source_code_position); - if current_character = '=' then - source_code_position := source_code_position + 1; - _compile_binary_rhs(); + (* Execute the operation. *) + _write_z("\txor t0, t0, t1\n\tsnez t0, t0\n\0") + elsif token_kind = _lexer_token_kind_greater_equal() then + _lexer_skip_token(); + _compile_binary_rhs(); - (* Execute the operation. *) - _write_z("\tslt t0, t1, t0\nxori t0, t0, 1\n\0") - else - _compile_binary_rhs(); - - (* Execute the operation. 
*) - _write_z("\tslt t0, t0, t1\n\0") - end + (* Execute the operation. *) + _write_z("\tslt t0, t1, t0\n\txori t0, t0, 1\n\0") end; .compile_expression_end; @@ -622,18 +621,18 @@ var stack_offset: Word; token_kind: Word; begin - name_length := _lexer_read_token(@token_kind); - name := _lexer_global_start(); - name := _load_word(name); - name_length := _lexer_global_end(); - name_length := _load_word(name_length) + -name; + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name; argument_count := 0; (* Skip the identifier and left paren. *) _lexer_skip_token(); - source_code_position := source_code_position + 1; + _lexer_read_token(@token_kind); + _lexer_skip_token(); - if _load_byte(source_code_position) = ')' then + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_right_paren() then goto .compile_call_finalize end; .compile_call_loop; @@ -651,10 +650,12 @@ begin (* Add one to the argument counter. *) argument_count := argument_count + 1; - if _load_byte(source_code_position) <> ',' then + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_comma() then goto .compile_call_finalize end; - source_code_position := source_code_position + 2; + _lexer_skip_token(); goto .compile_call_loop; .compile_call_finalize; @@ -680,25 +681,32 @@ begin .compile_call_end; _write_z("\tcall \0"); _write_s(name, name_length); + _write_c('\n'); (* Skip the right paren. 
*) - source_code_position := source_code_position + 1 + _lexer_read_token(@token_kind); + _lexer_skip_token() end; proc _compile_goto(); var next_token: Word; + next_length: Word; token_kind: Word; begin - _lexer_read_token(@token_kind); _lexer_skip_token(); + _lexer_read_token(@token_kind); - source_code_position := source_code_position + 2; + if token_kind = _lexer_token_kind_dot() then + _lexer_skip_token(); + _lexer_read_token(@token_kind) + end; + next_token := _lexer_global_get_start(); + next_length := _lexer_global_get_end() + -next_token; - next_token := _lexer_read_token(@token_kind); _write_z("\tj .\0"); - _write_token(next_token); + _write_s(next_token, next_length); _lexer_skip_token() end; @@ -717,11 +725,14 @@ proc _compile_global_designator(); var name: Word; token_kind: Word; + token_length: Word; begin _write_z("\tla t0, \0"); - name := _lexer_read_token(@token_kind); - _write_token(name); + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + token_length := _lexer_global_get_end() + -name; + _write_s(name, token_length); _lexer_skip_token(); _write_c('\n') @@ -734,21 +745,26 @@ var token_kind: Word; name: Word; begin - name_token := _lexer_read_token(@token_kind); - name := _lexer_global_start(); - name := _load_word(name); - name_token := _lexer_global_end(); - name_token := _load_word(name_token) + -name; + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_token := _lexer_global_get_end() + -name; lookup_result := _symbol_table_lookup(@symbol_table_local, name, name_token); if lookup_result <> 0 then _compile_local_designator(lookup_result) else _compile_global_designator() + end; + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_hat() then + _lexer_skip_token(); + _write_z("\tlw t0, (t0)\n\0") end end; proc _compile_assignment(); +var + token_kind: Word; begin _compile_designator(); @@ -756,12 +772,13 @@ begin _write_z("\tsw t0, 60(sp)\n\0"); (* Skip the assignment sign (:=) with 
surrounding whitespaces. *) - source_code_position := source_code_position + 4; + _lexer_read_token(@token_kind); + _lexer_skip_token(); (* Compile the assignment. *) _compile_expression(); - _write_z("\tlw t1, 60(sp)\nsw t0, (t1)\n\0") + _write_z("\tlw t1, 60(sp)\n\tsw t0, (t1)\n\0") end; proc _compile_return_statement(); @@ -771,9 +788,8 @@ begin (* Skip "return" keyword and whitespace after it. *) _lexer_read_token(@token_kind); _lexer_skip_token(); - source_code_position := source_code_position + 1; - _compile_expression(); + _compile_expression(); _write_z("\tmv a0, t0\n\0") end; @@ -794,8 +810,6 @@ var condition_label: Word; token_kind: Word; begin - source_code_position := source_code_position + 1; - (* Compile condition. *) _compile_expression(); (* Skip " then" with newline. *) @@ -810,7 +824,7 @@ begin _write_label(condition_label); _write_c('\n'); - _compile_procedure_body(); + _compile_statement_list(); _write_z("\tj \0"); _write_label(after_end_label); @@ -839,7 +853,7 @@ begin _lexer_read_token(@token_kind); if token_kind = _lexer_token_kind_else() then _lexer_skip_token(); - _compile_procedure_body() + _compile_statement_list() elsif token_kind = _lexer_token_kind_elsif() then _lexer_skip_token(); _compile_condition(after_end_label); @@ -859,11 +873,10 @@ var name: Word; begin (* Skip the dot. 
*) - _lexer_read_token(@token_kind); _lexer_skip_token(); - label_token := _lexer_read_token(@token_kind); - name := _lexer_global_start(); - name := _load_word(name); + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + label_token := _lexer_global_get_end() + -name; _write_c('.'); _write_s(name, label_token); _write_z(":\n\0"); @@ -886,8 +899,7 @@ begin elsif token_kind = _lexer_token_kind_dot() then _compile_label_declaration() elsif token_kind = _lexer_token_kind_identifier() then - current_byte := _lexer_global_start(); - current_byte := _load_word(current_byte); + current_byte := _lexer_global_get_start(); current_byte := _load_byte(current_byte); (* This is a call if the statement starts with an underscore. *) @@ -900,7 +912,7 @@ begin _write_c('\n') end; -proc _compile_procedure_body(); +proc _compile_statement_list(); var token_kind: Word; begin @@ -910,7 +922,7 @@ begin if token_kind = _lexer_token_kind_semicolon() then _lexer_skip_token(); - _compile_procedure_body() + _compile_statement_list() end; _skip_empty_lines() end; @@ -925,33 +937,139 @@ end; proc _write_register(register_character: Word, register_number: Word); begin _write_c(register_character); - register_number := register_number + '0'; - _write_c(register_number) + _write_c(register_number + '0') end; -proc _skip_spaces(); -var - current_byte: Word; - lhs: Word; - rhs: Word; -begin - current_byte := _load_byte(source_code_position); - lhs := current_byte = '\t'; - rhs := current_byte = ' '; +proc _type_get_kind(this: Word); + return _load_word(this) +end; - if lhs or rhs then - source_code_position := source_code_position + 1; - _skip_spaces() - end +proc _type_set_kind(this: Word, value: Word); +begin + _store_word(value, this) +end; + +proc _type_get_size(this: Word); + return _load_word(this + 4) +end; + +proc _type_set_size(this: Word, value: Word); +begin + _store_word(value, this + 4) +end; + +proc _enumeration_type_get_members(this: Word); + return _load_word(this 
+ 8) +end; + +proc _enumeration_type_set_members(this: Word, value: Word); +begin + _store_word(value, this + 8) +end; + +proc _enumeration_type_get_length(this: Word); + return _load_word(this + 12) +end; + +proc _enumeration_type_set_length(this: Word, value: Word); +begin + _store_word(value, this + 12) +end; + +(** + * Reads and creates enumeration type representation. + * + * record + * type_kind: Word; + * size: Word; + * members: StringArray; + * length: Word + * end; + * + * Returns enumeration type description. + *) +proc _read_type_enumeration(); +var + token_kind: Word; + enumeration_name: Word; + name_length: Word; + memory_start: Word; + member_count: Word; + result: Word; +begin + _lexer_skip_token(); + memory_start := memory_free_pointer; + member_count := 0; + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_right_paren() then + goto .read_type_enumeration_end + end; + .read_type_enumeration_loop; + member_count := member_count + 1; + + enumeration_name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -enumeration_name; + + _store_word(enumeration_name, memory_free_pointer); + memory_free_pointer := memory_free_pointer + 4; + + _store_word(name_length, memory_free_pointer); + memory_free_pointer := memory_free_pointer + 4; + + (* Skip the identifier. *) + _lexer_skip_token(); + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_comma() then + _lexer_skip_token(); + _lexer_read_token(@token_kind); + goto .read_type_enumeration_loop + end; + + .read_type_enumeration_end; + _lexer_skip_token(); + + (* The resulting structure is 16 bytes long. *) + result := memory_free_pointer; + memory_free_pointer := memory_free_pointer + 16; + + (* ENUMERATION_TYPE is 2. 
*) + _type_set_kind(result, 2); + _type_set_size(result, 4); + _enumeration_type_set_members(result, memory_start); + _enumeration_type_set_length(result, member_count); + + return result end; proc _read_type_expression(); var - type_name: Word; token_kind: Word; + type_name: Word; + name_length: Word; + result: Word; begin - type_name := _lexer_read_token(@token_kind); - _lexer_skip_token() + result := 0; + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_identifier() then + (* Named type. *) + type_name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -type_name; + result := _symbol_table_lookup(@symbol_table_global, type_name, name_length); + result := _type_info_get_type(result); + + _lexer_skip_token() + elsif token_kind = _lexer_token_kind_left_paren() then + result := _read_type_enumeration() + end; + + return result +end; + +proc _type_info_get_type(this: Word); + return _load_word(this + 4) end; (** @@ -980,10 +1098,10 @@ begin return result end; -proc _parameter_info_get_offset(info: Word); +proc _parameter_info_get_offset(this: Word); begin - info := info + 4; - return _load_word(info) + this := this + 4; + return _load_word(this) end; (** @@ -1011,10 +1129,10 @@ begin return result end; -proc _temporary_info_get_offset(info: Word); +proc _temporary_info_get_offset(this: Word); begin - info := info + 4; - return _load_word(info) + this := this + 4; + return _load_word(this) end; (** @@ -1029,12 +1147,14 @@ var token_kind: Word; begin (* Read the parameter name. *) - name_position := source_code_position; - name_length := _lexer_read_token(@token_kind); + _lexer_read_token(@token_kind); + name_position := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name_position; _lexer_skip_token(); (* Skip colon and space in front of the type expression. 
*) - source_code_position := source_code_position + 2; + _lexer_read_token(@token_kind); + _lexer_skip_token(); _read_type_expression(); @@ -1054,23 +1174,28 @@ end; proc _read_procedure_parameters(); var parameter_counter: Word; + token_kind: Word; begin (* Skip open paren. *) - source_code_position := source_code_position + 1; + _lexer_read_token(@token_kind); + _lexer_skip_token(); parameter_counter := 0; .compile_procedure_prologue_skip; - if _load_byte(source_code_position) <> ')' then + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_right_paren() then _read_procedure_parameter(parameter_counter); parameter_counter := parameter_counter + 1; + _lexer_read_token(@token_kind); - if _load_byte(source_code_position) = ',' then - source_code_position := source_code_position + 2; + if token_kind = _lexer_token_kind_comma() then + _lexer_skip_token(); goto .compile_procedure_prologue_skip end end; (* Skip close paren. *) - source_code_position := source_code_position + 1 + _lexer_skip_token() end; (** @@ -1084,13 +1209,14 @@ var name_position: Word; token_kind: Word; begin - _skip_spaces(); - name_position := source_code_position; + _lexer_read_token(@token_kind); + name_position := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name_position; + _lexer_skip_token(); (* Read and skip variable name, colon and the space *) - name_length := _lexer_read_token(@token_kind); - _lexer_skip_token(name_length); - source_code_position := source_code_position + 2; + _lexer_read_token(@token_kind); + _lexer_skip_token(); _read_type_expression(); @@ -1098,49 +1224,58 @@ begin _symbol_table_enter(@symbol_table_local, name_position, name_length, info); (* Skip semicolon and newline after the variable declaration *) - source_code_position := source_code_position + 2 + _lexer_read_token(@token_kind); + _lexer_skip_token() end; proc _read_procedure_temporaries(); var temporary_counter: Word; + token_kind: Word; begin - if 
_memcmp(source_code_position, "var", 3) <> 0 then - goto .read_local_variables_end - end; - source_code_position := source_code_position + 4; - temporary_counter := 0; + _lexer_read_token(@token_kind); - .read_local_variables_loop; - if _memcmp(source_code_position, "begin", 5) <> 0 then - _read_procedure_temporary(temporary_counter); + if token_kind = _lexer_token_kind_var() then + _lexer_skip_token(); + temporary_counter := 0; - temporary_counter := temporary_counter + 1; - goto .read_local_variables_loop - end; - .read_local_variables_end + .read_local_variables_loop; + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_identifier() then + _read_procedure_temporary(temporary_counter); + + temporary_counter := temporary_counter + 1; + goto .read_local_variables_loop + end + end end; proc _compile_procedure(); var + name_pointer: Word; name_length: Word; token_kind: Word; begin (* Skip "proc ". *) - source_code_position := source_code_position + 5; + _lexer_read_token(@token_kind); + _lexer_skip_token(); + (* Clear local symbol table. *) _store_word(0, @symbol_table_local); - name_length := _lexer_read_token(@token_kind); + _lexer_read_token(@token_kind); + name_pointer := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name_pointer; (* Write .type _procedure_name, @function. *) _write_z(".type \0"); - _write_token(name_length); + _write_s(name_pointer, name_length); _write_z(", @function\n\0"); (* Write procedure label, _procedure_name: *) - _write_token(name_length); + _write_s(name_pointer, name_length); _write_z(":\n\0"); (* Skip procedure name. *) @@ -1149,16 +1284,16 @@ begin _read_procedure_parameters(); (* Skip semicolon and newline. *) - source_code_position := source_code_position + 2; + _lexer_read_token(@token_kind); + _lexer_skip_token(); _read_procedure_temporaries(); (* Skip semicolon, "begin" and newline. 
*) _lexer_read_token(@token_kind); if token_kind = _lexer_token_kind_begin() then _lexer_skip_token(); - _compile_procedure_body() - end; - if token_kind = _lexer_token_kind_return() then + _compile_statement_list() + elsif token_kind = _lexer_token_kind_return() then _compile_return_statement() end; @@ -1166,94 +1301,79 @@ begin _write_z("\tlw ra, 124(sp)\n\tlw s0, 120(sp)\n\taddi sp, sp, 128\n\tret\n\0"); (* Skip the "end" keyword, semicolon and newline. *) - source_code_position := source_code_position + 5 -end; - -(** - * Prints and skips a line. - *) -proc _skip_comment(); -var - token_kind: Word; -begin _lexer_read_token(@token_kind); _lexer_skip_token(); - source_code_position := source_code_position + 1 + _lexer_read_token(@token_kind); + _lexer_skip_token() end; (** - * Skip newlines and comments. + * Skips comments. *) proc _skip_empty_lines(); var - current_position: Word; - current_byte: Word; + token_kind: Word; begin .skip_empty_lines_rerun; - current_position := source_code_position; - .skip_empty_lines_loop; - current_byte := _load_byte(current_position); + _lexer_read_token(@token_kind); - if current_byte = '\n' then - source_code_position := current_position + 1; - _skip_empty_lines() - elsif current_byte = '\t' then - current_position := current_position + 1; - goto .skip_empty_lines_loop - elsif current_byte = '(' then - current_byte := _load_byte(current_position + 1); - - if current_byte = '*' then - source_code_position := current_position; - _skip_comment(); - goto .skip_empty_lines_rerun - end; + if token_kind = _lexer_token_kind_comment() then + _lexer_skip_token(); + goto .skip_empty_lines_rerun end end; +(** + * Compile global variable initializer. 
+ *) proc _compile_global_initializer(); var current_byte: Word; length: Word; token_kind: Word; + token_start: Word; begin - current_byte := _load_byte(source_code_position); + _lexer_read_token(@token_kind); + token_start := _lexer_global_get_start(); + current_byte := _load_byte(token_start); - if current_byte = '"' then + if token_kind = _lexer_token_kind_string() then _write_z("\n\t.word strings + \0"); - length := _string_length(source_code_position); + length := _string_length(token_start); - _add_string(source_code_position); + _add_string(token_start); _write_i(); (* Skip the quoted string. *) - source_code_position := source_code_position + length; - source_code_position := source_code_position + 2; + _lexer_skip_token(); goto .compile_global_initializer_end elsif current_byte = 'S' then (* Skip "S(". *) - source_code_position := source_code_position + 2; + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _lexer_read_token(@token_kind); - if _load_byte(source_code_position) = ')' then + if token_kind = _lexer_token_kind_right_paren() then goto .compile_global_initializer_closing end; goto .compile_global_initializer_loop - elsif current_byte = '@' then + elsif token_kind = _lexer_token_kind_at() then (* Skip @. 
*) - source_code_position := source_code_position + 1; + _lexer_skip_token(); _write_z("\n\t.word \0"); - current_byte := _lexer_read_token(@token_kind); - _write_token(current_byte); + _lexer_read_token(@token_kind); + token_start := _lexer_global_get_start(); + _write_s(token_start, _lexer_global_get_end() + -token_start); _lexer_skip_token(); goto .compile_global_initializer_end - elsif _is_digit(current_byte) = 1 then + elsif token_kind = _lexer_token_kind_integer() then _write_z("\n\t.word \0"); - current_byte := _lexer_read_token(@token_kind); - _write_token(current_byte); - source_code_position := source_code_position + 1; + _write_s(token_start, _lexer_global_get_end() + -token_start); + _lexer_skip_token(); goto .compile_global_initializer_end end; @@ -1261,43 +1381,85 @@ begin .compile_global_initializer_loop; _compile_global_initializer(); - if _load_byte(source_code_position) <> ')' then + _lexer_read_token(@token_kind); + if token_kind <> _lexer_token_kind_right_paren() then (* Skip comma and whitespace after it. *) - source_code_position := source_code_position + 2; + _lexer_skip_token(); goto .compile_global_initializer_loop end; .compile_global_initializer_closing; (* Skip ")" *) - source_code_position := source_code_position + 1; + _lexer_skip_token(); .compile_global_initializer_end end; proc _compile_constant_declaration(); var + name: Word; name_length: Word; token_kind: Word; begin - name_length := _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name; _write_z(".type \0"); - _write_token(name_length); + _write_s(name, name_length); _write_z(", @object\n\0"); - _write_token(name_length); + _write_s(name, name_length); _write_c(':'); (* Skip the constant name with assignment sign and surrounding whitespaces. 
*) _lexer_skip_token(); - source_code_position := source_code_position + 4; + _lexer_read_token(@token_kind); + _lexer_skip_token(); _compile_global_initializer(); + (* Skip semicolon and newline. *) - source_code_position := source_code_position + 2; + _lexer_read_token(@token_kind); + _lexer_skip_token(); _write_c('\n') end; +proc _compile_type_declaration(); +var + token_kind: Word; +begin + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _read_type_expression(); + _lexer_read_token(@token_kind); + _lexer_skip_token() +end; + +proc _compile_type_part(); +var + token_kind: Word; +begin + _skip_empty_lines(); + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_type() then + goto .compile_type_part_end + end; + _lexer_skip_token(); + + .compile_type_part_loop; + _skip_empty_lines(); + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_identifier() then + _compile_type_declaration(); + goto .compile_type_part_loop + end; + + .compile_type_part_end +end; + proc _compile_const_part(); var token_kind: Word; @@ -1317,8 +1479,8 @@ begin (* If the character at the line beginning is not indentation, it is probably the next code section. 
*) - if _load_byte(source_code_position) = '\t' then - source_code_position := source_code_position + 1; + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_identifier() then _compile_constant_declaration(); goto .compile_const_part_loop end; @@ -1328,16 +1490,19 @@ end; proc _compile_variable_declaration(); var + name: Word; name_length: Word; token_kind: Word; begin - name_length := _lexer_read_token(@token_kind); + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name; _write_z(".type \0"); - _write_token(name_length); + _write_s(name, name_length); _write_z(", @object\n\0"); - _write_token(name_length); + _write_s(name, name_length); _write_c(':'); (* Skip the variable name and colon with space before the type. *) @@ -1346,13 +1511,16 @@ begin _lexer_skip_token(); _read_type_expression(); - if _load_byte(source_code_position) <> ' ' then + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_assignment() then (* Else we assume this is a zeroed 81920 bytes big array. *) _write_z(" .zero 81920\0") else (* Skip the assignment sign with surrounding whitespaces. *) - source_code_position := source_code_position + 4; - _compile_global_initializer() + _lexer_skip_token(); + _compile_global_initializer(); + _lexer_read_token(@token_kind) end; (* Skip semicolon and newline. *) @@ -1390,7 +1558,10 @@ end; * Process the source code and print the generated code. *) proc _compile_module(); +var + token_kind: Word; begin + _compile_type_part(); _compile_const_part(); _skip_empty_lines(); _compile_var_part(); @@ -1404,15 +1575,12 @@ begin .compile_module_loop; _skip_empty_lines(); + _lexer_read_token(@token_kind); - if _load_byte(source_code_position) <> 0 then - (* 5 is "proc " length. Space is needed to distinguish from "procedure". 
*) - if _memcmp(source_code_position, "proc ", 5) = 0 then - _compile_procedure(); - goto .compile_module_loop - end - end; - .compile_module_end + if token_kind = _lexer_token_kind_proc() then + _compile_procedure(); + goto .compile_module_loop + end end; proc _compile(); @@ -1551,7 +1719,7 @@ begin _symbol_table_enter(@symbol_table_global, symbol_builtin_name_word, 4, @symbol_type_info_word); _symbol_table_enter(@symbol_table_global, symbol_builtin_name_pointer, 7, @symbol_type_info_pointer); _symbol_table_enter(@symbol_table_global, symbol_builtin_name_char, 4, @symbol_type_info_char); - _symbol_table_enter(@symbol_table_global, symbol_builtin_name_bool, 4, @symbol_type_info_bool) + _symbol_table_enter(@symbol_table_global, symbol_builtin_name_array, 5, @symbol_type_info_array) end; @@ -1600,7 +1768,7 @@ proc _lexer_class_asterisk(); return 9 end; -proc _lexer_class_underscore(); +proc _lexer_class_backslash(); return 10 end; @@ -1668,54 +1836,58 @@ proc _lexer_state_decimal(); return 4 end; -proc _lexer_state_greater(); +proc _lexer_state_leading_zero(); return 5 end; -proc _lexer_state_minus(); +proc _lexer_state_greater(); return 6 end; -proc _lexer_state_left_paren(); +proc _lexer_state_minus(); return 7 end; -proc _lexer_state_less(); +proc _lexer_state_left_paren(); return 8 end; -proc _lexer_state_dot(); +proc _lexer_state_less(); return 9 end; -proc _lexer_state_comment(); +proc _lexer_state_dot(); return 10 end; -proc _lexer_state_closing_comment(); +proc _lexer_state_comment(); return 11 end; -proc _lexer_state_character(); +proc _lexer_state_closing_comment(); return 12 end; -proc _lexer_state_string(); +proc _lexer_state_character(); return 13 end; -proc _lexer_state_leading_zero(); +proc _lexer_state_character_escape(); return 14 end; -proc _lexer_state_decimal_suffix(); +proc _lexer_state_string(); return 15 end; -proc _lexer_state_end(); +proc _lexer_state_string_escape(); return 16 end; +proc _lexer_state_end(); + return 17 +end; + proc 
_lexer_action_none(); return 1 end; @@ -1793,140 +1965,140 @@ proc _lexer_classifications(); var code: Word; begin - _assign_at(@classification, 1, 15); - _assign_at(@classification, 2, 1); - _assign_at(@classification, 3, 1); - _assign_at(@classification, 4, 1); - _assign_at(@classification, 5, 1); - _assign_at(@classification, 6, 1); - _assign_at(@classification, 7, 1); - _assign_at(@classification, 8, 1); - _assign_at(@classification, 9, 1); - _assign_at(@classification, 10, 4); - _assign_at(@classification, 11, 4); - _assign_at(@classification, 12, 1); - _assign_at(@classification, 13, 1); - _assign_at(@classification, 14, 4); - _assign_at(@classification, 15, 1); - _assign_at(@classification, 16, 1); - _assign_at(@classification, 17, 1); - _assign_at(@classification, 18, 1); - _assign_at(@classification, 19, 1); - _assign_at(@classification, 20, 1); - _assign_at(@classification, 21, 1); - _assign_at(@classification, 22, 1); - _assign_at(@classification, 23, 1); - _assign_at(@classification, 24, 1); - _assign_at(@classification, 25, 1); - _assign_at(@classification, 26, 1); - _assign_at(@classification, 27, 1); - _assign_at(@classification, 28, 1); - _assign_at(@classification, 29, 1); - _assign_at(@classification, 30, 1); - _assign_at(@classification, 31, 1); - _assign_at(@classification, 32, 1); - _assign_at(@classification, 33, 4); - _assign_at(@classification, 34, 11); - _assign_at(@classification, 35, 19); - _assign_at(@classification, 36, 22); - _assign_at(@classification, 37, 22); - _assign_at(@classification, 38, 11); - _assign_at(@classification, 39, 11); - _assign_at(@classification, 40, 18); - _assign_at(@classification, 41, 7); - _assign_at(@classification, 42, 8); - _assign_at(@classification, 43, 9); - _assign_at(@classification, 44, 11); - _assign_at(@classification, 45, 11); - _assign_at(@classification, 46, 17); - _assign_at(@classification, 47, 16); - _assign_at(@classification, 48, 11); - _assign_at(@classification, 49, 13); - 
_assign_at(@classification, 50, 2); - _assign_at(@classification, 51, 2); - _assign_at(@classification, 52, 2); - _assign_at(@classification, 53, 2); - _assign_at(@classification, 54, 2); - _assign_at(@classification, 55, 2); - _assign_at(@classification, 56, 2); - _assign_at(@classification, 57, 2); - _assign_at(@classification, 58, 2); - _assign_at(@classification, 59, 5); - _assign_at(@classification, 60, 11); - _assign_at(@classification, 61, 21); - _assign_at(@classification, 62, 6); - _assign_at(@classification, 63, 20); - _assign_at(@classification, 64, 22); - _assign_at(@classification, 65, 11); - _assign_at(@classification, 66, 3); - _assign_at(@classification, 67, 3); - _assign_at(@classification, 68, 3); - _assign_at(@classification, 69, 3); - _assign_at(@classification, 70, 3); - _assign_at(@classification, 71, 3); - _assign_at(@classification, 72, 3); - _assign_at(@classification, 73, 3); - _assign_at(@classification, 74, 3); - _assign_at(@classification, 75, 3); - _assign_at(@classification, 76, 3); - _assign_at(@classification, 77, 3); - _assign_at(@classification, 78, 3); - _assign_at(@classification, 79, 3); - _assign_at(@classification, 80, 3); - _assign_at(@classification, 81, 3); - _assign_at(@classification, 82, 3); - _assign_at(@classification, 83, 3); - _assign_at(@classification, 84, 3); - _assign_at(@classification, 85, 3); - _assign_at(@classification, 86, 3); - _assign_at(@classification, 87, 3); - _assign_at(@classification, 88, 3); - _assign_at(@classification, 89, 3); - _assign_at(@classification, 90, 3); - _assign_at(@classification, 91, 3); - _assign_at(@classification, 92, 11); - _assign_at(@classification, 93, 22); - _assign_at(@classification, 94, 11); - _assign_at(@classification, 95, 11); - _assign_at(@classification, 96, 10); - _assign_at(@classification, 97, 22); - _assign_at(@classification, 98, 12); - _assign_at(@classification, 99, 12); - _assign_at(@classification, 100, 12); - _assign_at(@classification, 101, 12); - 
_assign_at(@classification, 102, 12); - _assign_at(@classification, 103, 12); - _assign_at(@classification, 104, 3); - _assign_at(@classification, 105, 3); - _assign_at(@classification, 106, 3); - _assign_at(@classification, 107, 3); - _assign_at(@classification, 108, 3); - _assign_at(@classification, 109, 3); - _assign_at(@classification, 110, 3); - _assign_at(@classification, 111, 3); - _assign_at(@classification, 112, 3); - _assign_at(@classification, 113, 3); - _assign_at(@classification, 114, 3); - _assign_at(@classification, 115, 3); - _assign_at(@classification, 116, 3); - _assign_at(@classification, 117, 3); - _assign_at(@classification, 118, 3); - _assign_at(@classification, 119, 3); - _assign_at(@classification, 120, 3); - _assign_at(@classification, 121, 14); - _assign_at(@classification, 122, 3); - _assign_at(@classification, 123, 3); - _assign_at(@classification, 124, 22); - _assign_at(@classification, 125, 11); - _assign_at(@classification, 126, 22); - _assign_at(@classification, 127, 11); - _assign_at(@classification, 128, 1); + _assign_at(@classification, 1, _lexer_class_eof()); + _assign_at(@classification, 2, _lexer_class_invalid()); + _assign_at(@classification, 3, _lexer_class_invalid()); + _assign_at(@classification, 4, _lexer_class_invalid()); + _assign_at(@classification, 5, _lexer_class_invalid()); + _assign_at(@classification, 6, _lexer_class_invalid()); + _assign_at(@classification, 7, _lexer_class_invalid()); + _assign_at(@classification, 8, _lexer_class_invalid()); + _assign_at(@classification, 9, _lexer_class_invalid()); + _assign_at(@classification, 10, _lexer_class_space()); + _assign_at(@classification, 11, _lexer_class_space()); + _assign_at(@classification, 12, _lexer_class_invalid()); + _assign_at(@classification, 13, _lexer_class_invalid()); + _assign_at(@classification, 14, _lexer_class_space()); + _assign_at(@classification, 15, _lexer_class_invalid()); + _assign_at(@classification, 16, _lexer_class_invalid()); + 
_assign_at(@classification, 17, _lexer_class_invalid()); + _assign_at(@classification, 18, _lexer_class_invalid()); + _assign_at(@classification, 19, _lexer_class_invalid()); + _assign_at(@classification, 20, _lexer_class_invalid()); + _assign_at(@classification, 21, _lexer_class_invalid()); + _assign_at(@classification, 22, _lexer_class_invalid()); + _assign_at(@classification, 23, _lexer_class_invalid()); + _assign_at(@classification, 24, _lexer_class_invalid()); + _assign_at(@classification, 25, _lexer_class_invalid()); + _assign_at(@classification, 26, _lexer_class_invalid()); + _assign_at(@classification, 27, _lexer_class_invalid()); + _assign_at(@classification, 28, _lexer_class_invalid()); + _assign_at(@classification, 29, _lexer_class_invalid()); + _assign_at(@classification, 30, _lexer_class_invalid()); + _assign_at(@classification, 31, _lexer_class_invalid()); + _assign_at(@classification, 32, _lexer_class_invalid()); + _assign_at(@classification, 33, _lexer_class_space()); + _assign_at(@classification, 34, _lexer_class_single()); + _assign_at(@classification, 35, _lexer_class_double_quote()); + _assign_at(@classification, 36, _lexer_class_other()); + _assign_at(@classification, 37, _lexer_class_other()); + _assign_at(@classification, 38, _lexer_class_single()); + _assign_at(@classification, 39, _lexer_class_single()); + _assign_at(@classification, 40, _lexer_class_single_quote()); + _assign_at(@classification, 41, _lexer_class_left_paren()); + _assign_at(@classification, 42, _lexer_class_right_paren()); + _assign_at(@classification, 43, _lexer_class_asterisk()); + _assign_at(@classification, 44, _lexer_class_single()); + _assign_at(@classification, 45, _lexer_class_single()); + _assign_at(@classification, 46, _lexer_class_minus()); + _assign_at(@classification, 47, _lexer_class_dot()); + _assign_at(@classification, 48, _lexer_class_single()); + _assign_at(@classification, 49, _lexer_class_zero()); + _assign_at(@classification, 50, _lexer_class_digit()); 
+ _assign_at(@classification, 51, _lexer_class_digit()); + _assign_at(@classification, 52, _lexer_class_digit()); + _assign_at(@classification, 53, _lexer_class_digit()); + _assign_at(@classification, 54, _lexer_class_digit()); + _assign_at(@classification, 55, _lexer_class_digit()); + _assign_at(@classification, 56, _lexer_class_digit()); + _assign_at(@classification, 57, _lexer_class_digit()); + _assign_at(@classification, 58, _lexer_class_digit()); + _assign_at(@classification, 59, _lexer_class_colon()); + _assign_at(@classification, 60, _lexer_class_single()); + _assign_at(@classification, 61, _lexer_class_less()); + _assign_at(@classification, 62, _lexer_class_equals()); + _assign_at(@classification, 63, _lexer_class_greater()); + _assign_at(@classification, 64, _lexer_class_other()); + _assign_at(@classification, 65, _lexer_class_single()); + _assign_at(@classification, 66, _lexer_class_alpha()); + _assign_at(@classification, 67, _lexer_class_alpha()); + _assign_at(@classification, 68, _lexer_class_alpha()); + _assign_at(@classification, 69, _lexer_class_alpha()); + _assign_at(@classification, 70, _lexer_class_alpha()); + _assign_at(@classification, 71, _lexer_class_alpha()); + _assign_at(@classification, 72, _lexer_class_alpha()); + _assign_at(@classification, 73, _lexer_class_alpha()); + _assign_at(@classification, 74, _lexer_class_alpha()); + _assign_at(@classification, 75, _lexer_class_alpha()); + _assign_at(@classification, 76, _lexer_class_alpha()); + _assign_at(@classification, 77, _lexer_class_alpha()); + _assign_at(@classification, 78, _lexer_class_alpha()); + _assign_at(@classification, 79, _lexer_class_alpha()); + _assign_at(@classification, 80, _lexer_class_alpha()); + _assign_at(@classification, 81, _lexer_class_alpha()); + _assign_at(@classification, 82, _lexer_class_alpha()); + _assign_at(@classification, 83, _lexer_class_alpha()); + _assign_at(@classification, 84, _lexer_class_alpha()); + _assign_at(@classification, 85, _lexer_class_alpha()); 
+ _assign_at(@classification, 86, _lexer_class_alpha()); + _assign_at(@classification, 87, _lexer_class_alpha()); + _assign_at(@classification, 88, _lexer_class_alpha()); + _assign_at(@classification, 89, _lexer_class_alpha()); + _assign_at(@classification, 90, _lexer_class_alpha()); + _assign_at(@classification, 91, _lexer_class_alpha()); + _assign_at(@classification, 92, _lexer_class_single()); + _assign_at(@classification, 93, _lexer_class_backslash()); + _assign_at(@classification, 94, _lexer_class_single()); + _assign_at(@classification, 95, _lexer_class_single()); + _assign_at(@classification, 96, _lexer_class_alpha()); + _assign_at(@classification, 97, _lexer_class_other()); + _assign_at(@classification, 98, _lexer_class_hex()); + _assign_at(@classification, 99, _lexer_class_hex()); + _assign_at(@classification, 100, _lexer_class_hex()); + _assign_at(@classification, 101, _lexer_class_hex()); + _assign_at(@classification, 102, _lexer_class_hex()); + _assign_at(@classification, 103, _lexer_class_hex()); + _assign_at(@classification, 104, _lexer_class_alpha()); + _assign_at(@classification, 105, _lexer_class_alpha()); + _assign_at(@classification, 106, _lexer_class_alpha()); + _assign_at(@classification, 107, _lexer_class_alpha()); + _assign_at(@classification, 108, _lexer_class_alpha()); + _assign_at(@classification, 109, _lexer_class_alpha()); + _assign_at(@classification, 110, _lexer_class_alpha()); + _assign_at(@classification, 111, _lexer_class_alpha()); + _assign_at(@classification, 112, _lexer_class_alpha()); + _assign_at(@classification, 113, _lexer_class_alpha()); + _assign_at(@classification, 114, _lexer_class_alpha()); + _assign_at(@classification, 115, _lexer_class_alpha()); + _assign_at(@classification, 116, _lexer_class_alpha()); + _assign_at(@classification, 117, _lexer_class_alpha()); + _assign_at(@classification, 118, _lexer_class_alpha()); + _assign_at(@classification, 119, _lexer_class_alpha()); + _assign_at(@classification, 120, 
_lexer_class_alpha()); + _assign_at(@classification, 121, _lexer_class_x()); + _assign_at(@classification, 122, _lexer_class_alpha()); + _assign_at(@classification, 123, _lexer_class_alpha()); + _assign_at(@classification, 124, _lexer_class_other()); + _assign_at(@classification, 125, _lexer_class_single()); + _assign_at(@classification, 126, _lexer_class_other()); + _assign_at(@classification, 127, _lexer_class_single()); + _assign_at(@classification, 128, _lexer_class_invalid()); code := 129; (* Set the remaining 129 - 256 bytes to transitionClassOther. *) .create_classification_loop; - _assign_at(@classification, code, 22); + _assign_at(@classification, code, _lexer_class_other()); code := code + 1; if code < 257 then @@ -1990,7 +2162,7 @@ begin _lexer_set_transition(current_state, _lexer_class_left_paren(), default_action, next_state); _lexer_set_transition(current_state, _lexer_class_right_paren(), default_action, next_state); _lexer_set_transition(current_state, _lexer_class_asterisk(), default_action, next_state); - _lexer_set_transition(current_state, _lexer_class_underscore(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_backslash(), default_action, next_state); _lexer_set_transition(current_state, _lexer_class_single(), default_action, next_state); _lexer_set_transition(current_state, _lexer_class_hex(), default_action, next_state); _lexer_set_transition(current_state, _lexer_class_zero(), default_action, next_state); @@ -2020,18 +2192,18 @@ begin _lexer_set_transition(_lexer_state_start(), _lexer_class_digit(), _lexer_action_accumulate(), _lexer_state_decimal()); _lexer_set_transition(_lexer_state_start(), _lexer_class_alpha(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_start(), _lexer_class_space(), _lexer_action_skip(), _lexer_state_start()); - _lexer_set_transition(_lexer_state_start(), _lexer_class_colon(), _lexer_action_accumulate(), _lexer_state_greater()); + 
_lexer_set_transition(_lexer_state_start(), _lexer_class_colon(), _lexer_action_accumulate(), _lexer_state_colon()); _lexer_set_transition(_lexer_state_start(), _lexer_class_equals(), _lexer_action_single(), _lexer_state_end()); _lexer_set_transition(_lexer_state_start(), _lexer_class_left_paren(), _lexer_action_accumulate(), _lexer_state_left_paren()); _lexer_set_transition(_lexer_state_start(), _lexer_class_right_paren(), _lexer_action_single(), _lexer_state_end()); _lexer_set_transition(_lexer_state_start(), _lexer_class_asterisk(), _lexer_action_single(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_start(), _lexer_class_underscore(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_backslash(), _lexer_action_none(), _lexer_state_end()); _lexer_set_transition(_lexer_state_start(), _lexer_class_single(), _lexer_action_single(), _lexer_state_end()); _lexer_set_transition(_lexer_state_start(), _lexer_class_hex(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_start(), _lexer_class_zero(), _lexer_action_accumulate(), _lexer_state_leading_zero()); _lexer_set_transition(_lexer_state_start(), _lexer_class_x(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_start(), _lexer_class_eof(), _lexer_action_eof(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_start(), _lexer_class_dot(), _lexer_action_accumulate(), _lexer_state_dot()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_dot(), _lexer_action_single(), _lexer_state_end()); _lexer_set_transition(_lexer_state_start(), _lexer_class_minus(), _lexer_action_accumulate(), _lexer_state_minus()); _lexer_set_transition(_lexer_state_start(), _lexer_class_single_quote(), _lexer_action_accumulate(), _lexer_state_character()); _lexer_set_transition(_lexer_state_start(), _lexer_class_double_quote(), _lexer_action_accumulate(), 
_lexer_state_string()); @@ -2047,7 +2219,6 @@ begin _lexer_default_transition(_lexer_state_identifier(), _lexer_action_key_id(), _lexer_state_end()); _lexer_set_transition(_lexer_state_identifier(), _lexer_class_digit(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_identifier(), _lexer_class_alpha(), _lexer_action_accumulate(), _lexer_state_identifier()); - _lexer_set_transition(_lexer_state_identifier(), _lexer_class_underscore(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_identifier(), _lexer_class_hex(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_identifier(), _lexer_class_zero(), _lexer_action_accumulate(), _lexer_state_identifier()); _lexer_set_transition(_lexer_state_identifier(), _lexer_class_x(), _lexer_action_accumulate(), _lexer_state_identifier()); @@ -2055,11 +2226,18 @@ begin (* Decimal state. *) _lexer_default_transition(_lexer_state_decimal(), _lexer_action_integer(), _lexer_state_end()); _lexer_set_transition(_lexer_state_decimal(), _lexer_class_digit(), _lexer_action_accumulate(), _lexer_state_decimal()); - _lexer_set_transition(_lexer_state_decimal(), _lexer_class_alpha(), _lexer_action_accumulate(), _lexer_state_decimal_suffix()); - _lexer_set_transition(_lexer_state_decimal(), _lexer_class_underscore(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_decimal(), _lexer_class_hex(), _lexer_action_accumulate(), _lexer_state_decimal_suffix()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_alpha(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_hex(), _lexer_action_none(), _lexer_state_end()); _lexer_set_transition(_lexer_state_decimal(), _lexer_class_zero(), _lexer_action_accumulate(), _lexer_state_decimal()); - _lexer_set_transition(_lexer_state_decimal(), _lexer_class_x(), _lexer_action_accumulate(), 
_lexer_state_decimal_suffix()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_x(), _lexer_action_none(), _lexer_state_end()); + + (* Leading zero. *) + _lexer_default_transition(_lexer_state_leading_zero(), _lexer_action_integer(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_digit(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_alpha(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_hex(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_zero(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_x(), _lexer_action_none(), _lexer_state_dot()); (* Greater state. *) _lexer_default_transition(_lexer_state_greater(), _lexer_action_finalize(), _lexer_state_end()); @@ -2099,29 +2277,24 @@ begin _lexer_set_transition(_lexer_state_character(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); _lexer_set_transition(_lexer_state_character(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()); _lexer_set_transition(_lexer_state_character(), _lexer_class_single_quote(), _lexer_action_delimited(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_character(), _lexer_class_backslash(), _lexer_action_accumulate(), _lexer_state_character_escape()); + + (* Escape sequence in a character. *) + _lexer_default_transition(_lexer_state_character_escape(), _lexer_action_accumulate(), _lexer_state_character()); + _lexer_set_transition(_lexer_state_character_escape(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_character_escape(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()); (* String. 
*) _lexer_default_transition(_lexer_state_string(), _lexer_action_accumulate(), _lexer_state_string()); _lexer_set_transition(_lexer_state_string(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); _lexer_set_transition(_lexer_state_string(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()); _lexer_set_transition(_lexer_state_string(), _lexer_class_double_quote(), _lexer_action_delimited(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_string(), _lexer_class_backslash(), _lexer_action_accumulate(), _lexer_state_string_escape()); - (* Leading zero. *) - _lexer_default_transition(_lexer_state_leading_zero(), _lexer_action_integer(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_digit(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_alpha(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_underscore(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_hex(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_zero(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_x(), _lexer_action_none(), _lexer_state_end()); - - (* Digit with a character suffix. 
*) - _lexer_default_transition(_lexer_state_decimal_suffix(), _lexer_action_integer(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_decimal_suffix(), _lexer_class_digit(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_decimal_suffix(), _lexer_class_alpha(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_decimal_suffix(), _lexer_class_hex(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_decimal_suffix(), _lexer_class_zero(), _lexer_action_none(), _lexer_state_end()); - _lexer_set_transition(_lexer_state_decimal_suffix(), _lexer_class_x(), _lexer_action_none(), _lexer_state_end()) + (* Escape sequence in a string. *) + _lexer_default_transition(_lexer_state_string_escape(), _lexer_action_accumulate(), _lexer_state_string()); + _lexer_set_transition(_lexer_state_string_escape(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_string_escape(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()) end; (** @@ -2136,24 +2309,54 @@ end; (** * Lexer state is saved after the transition tables. * Each transition table entry is 8 bytes long. The table has 17 rows (transition states) - * and 22 columns (character classes), so 2816 = 8 * 16 * 22. + * and 22 columns (character classes), so 2992 = 8 * 17 * 22. *) proc _lexer_global_state(); - return _lexer_get_transition_table() + 2816 + return _lexer_get_transition_table() + 2992 end; (** * Gets pointer to the token start. *) -proc _lexer_global_start(); - return _lexer_global_state() + 4 +proc _lexer_global_get_start(); +var + target: Word; +begin + target := _lexer_global_state() + 4; + return _load_word(target) +end; + +(** + * Sets pointer to the token start. + *) +proc _lexer_global_set_start(new_start: Word); +var + target: Word; +begin + target := _lexer_global_state() + 4; + _store_word(new_start, target) end; (** * Gets pointer to the token end.
*) -proc _lexer_global_end(); - return _lexer_global_start() + 4 +proc _lexer_global_get_end(); +var + target: Word; +begin + target := _lexer_global_state() + 8; + return _load_word(target) +end; + +(** + * Sets pointer to the token end. + *) +proc _lexer_global_set_end(new_start: Word); +var + target: Word; +begin + target := _lexer_global_state() + 8; + _store_word(new_start, target) end; proc _lexer_transition_get_action(transition: Word); @@ -2180,17 +2383,13 @@ end; proc _lexer_reset(); var state: Word; - current: Word; begin (* Transition start state is 1. *) state := _lexer_global_state(); _store_word(_lexer_state_start(), state); - current := _lexer_global_start(); - _store_word(source_code_position, current); - - current := _lexer_global_end(); - _store_word(source_code_position, current) + state := _lexer_global_get_start(); + _lexer_global_set_end(state) end; (** @@ -2199,7 +2398,10 @@ end; proc _lexer_initialize(); begin _lexer_classifications(); - _lexer_transitions() + _lexer_transitions(); + + _lexer_global_set_start(@source_code); + _lexer_global_set_end(@source_code) end; proc _lexer_next_transition(); @@ -2208,8 +2410,7 @@ var character_class: Word; current_state: Word; begin - current_character := _lexer_global_end(); - current_character := _load_word(current_character); + current_character := _lexer_global_get_end(); current_character := _load_byte(current_character); character_class := _get_at(@classification, current_character + 1); @@ -2452,7 +2653,7 @@ proc _lexer_token_kind_at(); return 57 end; -proc _lexer_token_kind_exclamation(); +proc _lexer_token_kind_comment(); return 58 end; @@ -2476,6 +2677,10 @@ proc _lexer_token_kind_goto(); return 63 end; +proc _lexer_token_kind_eof(); + return 64 +end; + proc _lexer_compare_keyword(lhs_pointer: Word, lhs_length: Word, rhs_pointer: Word, rhs_length: Word); var result: Word; @@ -2522,6 +2727,10 @@ begin result := _lexer_token_kind_else() elsif _lexer_compare_keyword(position_start, token_length,
"elsif", 5) = 1 then result := _lexer_token_kind_elsif() + elsif _lexer_compare_keyword(position_start, token_length, "or", 2) = 1 then + result := _lexer_token_kind_or() + elsif _lexer_compare_keyword(position_start, token_length, "xor", 3) = 1 then + result := _lexer_token_kind_xor() end; return result end; @@ -2538,6 +2747,14 @@ begin result := _lexer_token_kind_colon() elsif character = '.' then result := _lexer_token_kind_dot() + elsif character = '(' then + result := _lexer_token_kind_left_paren() + elsif character = '-' then + result := _lexer_token_kind_minus() + elsif character = '<' then + result := _lexer_token_kind_less_than() + elsif character = '>' then + result := _lexer_token_kind_greater_than() end; return result end; @@ -2552,44 +2769,126 @@ begin if character = ';' then result := _lexer_token_kind_semicolon() + elsif character = ',' then + result := _lexer_token_kind_comma() + elsif character = ')' then + result := _lexer_token_kind_right_paren() + elsif character = '@' then + result := _lexer_token_kind_at() + elsif character = '~' then + result := _lexer_token_kind_not() + elsif character = '&' then + result := _lexer_token_kind_and() + elsif character = '+' then + result := _lexer_token_kind_plus() + elsif character = '*' then + result := _lexer_token_kind_multiplication() + elsif character = '=' then + result := _lexer_token_kind_equals() + elsif character = '%' then + result := _lexer_token_kind_remainder() + elsif character = '/' then + result := _lexer_token_kind_division() + elsif character = '.'
then + result := _lexer_token_kind_dot() + elsif character = '^' then + result := _lexer_token_kind_hat() end; return result end; +proc _lexer_classify_composite(start_position: Word, one_before_last: Word); +var + first_character: Word; + last_character: Word; + result: Word; +begin + first_character := _load_byte(start_position); + last_character := _load_byte(one_before_last); + + if first_character = ':' then + result := _lexer_token_kind_assignment() + elsif first_character = '<' then + if last_character = '=' then + result := _lexer_token_kind_less_equal() + elsif last_character = '>' then + result := _lexer_token_kind_not_equal() + end + elsif first_character = '>' then + if last_character = '=' then + result := _lexer_token_kind_greater_equal() + end + end; + + return result +end; + +proc _lexer_classify_delimited(start_position: Word, end_position: Word); +var + token_length: Word; + delimiter: Word; + result: Word; +begin + token_length := end_position + -start_position; + delimiter := _load_byte(start_position); + + if delimiter = '(' then + result := _lexer_token_kind_comment() + elsif delimiter = '\'' then + result := _lexer_token_kind_character() + elsif delimiter = '"' then + result := _lexer_token_kind_string() + end; + return result +end; + +proc _lexer_classify_integer(start_position: Word, end_position: Word); +begin + return _lexer_token_kind_integer() +end; + proc _lexer_execute_action(action_to_perform: Word, kind: Word); var - pointer_start: Word; - pointer_end: Word; position_start: Word; position_end: Word; intermediate: Word; begin - pointer_start := _lexer_global_start(); - position_start := _load_word(pointer_start); - pointer_end := _lexer_global_end(); - position_end := _load_word(pointer_end); + position_start := _lexer_global_get_start(); + position_end := _lexer_global_get_end(); if action_to_perform = _lexer_action_none() then elsif action_to_perform = _lexer_action_accumulate() then - _store_word(position_end + 1, pointer_end) + 
_lexer_global_set_end(position_end + 1) elsif action_to_perform = _lexer_action_skip() then - _store_word(position_start + 1, pointer_start); - _store_word(position_end + 1, pointer_end) + _lexer_global_set_start(position_start + 1); + _lexer_global_set_end(position_end + 1) elsif action_to_perform = _lexer_action_single() then - _store_word(position_end + 1, pointer_end); + _lexer_global_set_end(position_end + 1); intermediate := _lexer_classify_single(position_start); _store_word(intermediate, kind) elsif action_to_perform = _lexer_action_eof() then + intermediate := _lexer_token_kind_eof(); + _store_word(intermediate, kind) elsif action_to_perform = _lexer_action_finalize() then intermediate := _lexer_classify_finalize(position_start); _store_word(intermediate, kind) elsif action_to_perform = _lexer_action_composite() then + _lexer_global_set_end(position_end + 1); + + intermediate := _lexer_classify_composite(position_start, position_end); + _store_word(intermediate, kind) elsif action_to_perform = _lexer_action_key_id() then intermediate := _lexer_classify_keyword(position_start, position_end); _store_word(intermediate, kind) elsif action_to_perform = _lexer_action_integer() then + intermediate := _lexer_classify_integer(position_start, position_end); + _store_word(intermediate, kind) elsif action_to_perform = _lexer_action_delimited() then + _lexer_global_set_end(position_end + 1); + + intermediate := _lexer_classify_delimited(position_start, position_end + 1); + _store_word(intermediate, kind) end; end; @@ -2620,19 +2919,12 @@ begin end; (** - * Reads the next token. - * - * Returns token length in a0. + * Reads the next token and writes its type into the address in the kind parameter. 
*) proc _lexer_read_token(kind: Word); -var - new_position: Word; begin _lexer_reset(); - _lexer_advance_token(kind); - - new_position := _lexer_global_end(); - return _load_word(new_position) + -source_code_position + _lexer_advance_token(kind) end; (** @@ -2640,10 +2932,10 @@ end; *) proc _lexer_skip_token(); var - new_position: Word; + old_end: Word; begin - new_position := _lexer_global_end(); - source_code_position := _load_word(new_position) + old_end := _lexer_global_get_end(); + _lexer_global_set_start(old_end) end; (* @@ -2652,7 +2944,7 @@ end; proc _start(); var last_read: Word; - offset: Wort; + offset: Word; begin _lexer_initialize(); _symbol_table_build(); diff --git a/boot/stage15.elna b/boot/stage15.elna new file mode 100644 index 0000000..9c8d3a3 --- /dev/null +++ b/boot/stage15.elna @@ -0,0 +1,2970 @@ +(* + * This Source Code Form is subject to the terms of the Mozilla Public License, + * v. 2.0. If a copy of the MPL was not distributed with this file, You can + * obtain one at https://mozilla.org/MPL/2.0/. + *) + +(* Stage 14 compiler. *) + +(* - Binary minus. *) +(* - Space independent parsing. *) +(* - Label names in goto statements aren't required to begin with a dot. *) +(* - Dereferencing pointers pointing to word long data. *) + +const + symbol_builtin_name_int := "Int"; + symbol_builtin_name_word := "Word"; + symbol_builtin_name_pointer := "Pointer"; + symbol_builtin_name_char := "Char"; + symbol_builtin_name_array := "Array"; + + (* Every type info starts with a word describing what type it is. + + PRIMITIVE_TYPE = 1 + ENUMERATION_TYPE = 2 + + Primitive types have only type size. *) + symbol_builtin_type_int := S(1, 4); + symbol_builtin_type_word := S(1, 4); + symbol_builtin_type_pointer := S(1, 4); + symbol_builtin_type_char := S(1, 1); + symbol_builtin_type_array := S(1, 4); + + (* Info objects start with a word describing its type. + + TYPE_INFO = 1 + PARAMETER_INFO = 2 + TEMPORARY_INFO = 3 + + Type info has the type it belongs to. 
*) + symbol_type_info_int := S(1, @symbol_builtin_type_int); + symbol_type_info_word := S(1, @symbol_builtin_type_word); + symbol_type_info_pointer := S(1, @symbol_builtin_type_pointer); + symbol_type_info_char := S(1, @symbol_builtin_type_char); + symbol_type_info_array := S(1, @symbol_builtin_type_array); + +var + source_code: Array; + compiler_strings: Array; + symbol_table_global: Array; + symbol_table_local: Array; + classification: Array; + + (* To reserve memory just add the value of needed bytes to the memory_free_pointer_variable. *) + memory: Array; + + compiler_strings_position: Pointer := @compiler_strings; + compiler_strings_length: Word := 0; + label_counter: Word := 0; + + (* Points to a segment of free memory. *) + memory_free_pointer: Word := @memory; + +(** + * Calculates and returns the string token length between quotes, including the + * escaping slash characters. + * + * Parameters: + * string - String token pointer. + * + * Returns the length in a0. + *) +proc _string_length(string: Word); +var + counter: Word; +begin + (* Reset the counter. *) + counter := 0; + + .string_length_loop; + string := string + 1; + + if _load_byte(string) <> '"' then + counter := counter + 1; + goto .string_length_loop + end; + + return counter +end; + +(** + * Adds a string to the global, read-only string storage. + * + * Parameters: + * string - String token. + * + * Returns the offset from the beginning of the storage to the new string in a0. 
+ *) +proc _add_string(string: Word); +var + contents: Word; + result: Word; + current_byte: Word; +begin + contents := string + 1; + result := compiler_strings_length; + + .add_string_loop; + if _load_byte(contents) <> '"' then + current_byte := _load_byte(contents); + _store_byte(current_byte, compiler_strings_position); + compiler_strings_position := compiler_strings_position + 1; + contents := contents + 1; + + if current_byte <> '\\' then + compiler_strings_length := compiler_strings_length + 1 + end; + goto .add_string_loop + end; + + return result +end; + +(** + * Reads standard input into a buffer. + * + * Parameters: + * buffer - Buffer pointer. + * size - Buffer size. + * + * Returns the amount of bytes written in a0. + *) +proc _read_file(buffer: Word, size: Word); + return _syscall(0, buffer, size, 0, 0, 0, 63) +end; + +(** + * Writes to the standard output. + * + * Parameters: + * buffer - Buffer. + * size - Buffer length. + *) +proc _write_s(buffer: Word, size: Word); +begin + _syscall(1, buffer, size, 0, 0, 0, 64) +end; + +(** + * Writes a number to a string buffer. + * + * Parameters: + * number - Whole number. + * output_buffer - Buffer pointer. + * + * Sets a0 to the length of the written number. 
+ *) +proc _print_i(number: Word, output_buffer: Word); +var + local_buffer: Word; + is_negative: Word; + current_character: Word; + result: Word; +begin + local_buffer := @result + 11; + + if number >= 0 then + is_negative := 0 + else + number := -number; + is_negative := 1 + end; + + .print_i_digit10; + current_character := number % 10; + _store_byte(current_character + '0', local_buffer); + + number := number / 10; + local_buffer := local_buffer + -1; + + if number <> 0 then + goto .print_i_digit10 + end; + if is_negative = 1 then + _store_byte('-', local_buffer); + local_buffer := local_buffer + -1 + end; + result := @result + 11; + result := result + -local_buffer; + _memcpy(output_buffer, local_buffer + 1, result); + + return result +end; + +(** + * Writes a number to the standard output. + * + * Parameters: + * number - Whole number. + *) +proc _write_i(number: Word); +var + local_buffer: Word; + length: Word; +begin + length := _print_i(number, @local_buffer); + _write_s(@local_buffer, length) +end; + +(** + * Writes a character from a0 into the standard output. + * + * Parameters: + * character - Character to write. + *) +proc _write_c(character: Word); +begin + _write_s(@character, 1) +end; + +(** + * Write null terminated string. + * + * Parameters: + * string - String. + *) +proc _write_z(string: Word); +var + next_byte: Word; +begin + (* Check for 0 character. *) + next_byte := _load_byte(string); + + if next_byte <> 0 then + (* Print a character. *) + _write_c(next_byte); + + (* Advance the input string by one byte. *) + _write_z(string + 1) + end +end; + +(** + * Detects if a0 is an uppercase character. Sets a0 to 1 if so, otherwise to 0. + *) +proc _is_upper(character: Word); +var + lhs: Word; + rhs: Word; +begin + lhs := character >= 'A'; + rhs := character <= 'Z'; + + return lhs & rhs + +end; + +(** + * Detects if a0 is a lowercase character. Sets a0 to 1 if so, otherwise to 0.
+ *) +proc _is_lower(character: Word); +var + lhs: Word; + rhs: Word; +begin + lhs := character >= 'a'; + rhs := character <= 'z'; + + return lhs & rhs +end; + +(** + * Detects if the passed character is a 7-bit alpha character or an underscore. + * + * Parameters: + * character - Tested character. + * + * Sets a0 to 1 if the character is an alpha character or underscore, sets it to 0 otherwise. + *) +proc _is_alpha(character: Word); +var + is_upper_result: Word; + is_lower_result: Word; + is_alpha_result: Word; + is_underscore: Word; +begin + is_upper_result := _is_upper(character); + is_lower_result := _is_lower(character); + is_underscore := character = '_'; + + is_alpha_result := is_lower_result or is_upper_result; + return is_alpha_result or is_underscore +end; + +(** + * Detects whether the passed character is a digit (a value between 0 and 9). + * + * Parameters: + * character - Examined value. + * + * Sets a0 to 1 if it is a digit, to 0 otherwise. + *) +proc _is_digit(character: Word); +var + lhs: Word; + rhs: Word; +begin + lhs := character >= '0'; + rhs := character <= '9'; + + return lhs & rhs +end; + +proc _is_alnum(character: Word); +var + lhs: Word; + rhs: Word; +begin + lhs := _is_alpha(character); + rhs := _is_digit(character); + + return lhs or rhs +end; + +(** + * Parameters: + * lhs - First pointer. + * rhs - Second pointer. + * count - The length to compare. + * + * Returns 0 if memory regions are equal. + *) +proc _memcmp(lhs: Word, rhs: Word, count: Word); +var + lhs_byte: Word; + rhs_byte: Word; + result: Word; +begin + result := 0; + + .memcmp_loop; + if count <> 0 then + lhs_byte := _load_byte(lhs); + rhs_byte := _load_byte(rhs); + result := lhs_byte + -rhs_byte; + + lhs := lhs + 1; + rhs := rhs + 1; + count := count + -1; + + if result = 0 then + goto .memcmp_loop + end + end; + + return result +end; + +(** + * Copies memory. + * + * Parameters: + * destination - Destination. + * source - Source. + * count - Size.
+ * + * Returns the address just past the last byte written (destination + count). + *) +proc _memcpy(destination: Word, source: Word, count: Word); +var + current_byte: Word; +begin + .memcpy_loop; + if count <> 0 then + current_byte := _load_byte(source); + _store_byte(current_byte, destination); + + destination := destination + 1; + source := source + 1; + count := count + -1; + goto .memcpy_loop + end; + + return destination +end; + +proc _compile_integer_literal(); +var + integer_token: Word; + integer_length: Word; + token_kind: Word; +begin + _write_z("\tli t0, \0"); + + integer_token := _lexer_global_get_start(); + integer_length := _lexer_global_get_end() + -integer_token; + + _write_s(integer_token, integer_length); + _lexer_skip_token(); + + _write_c('\n') +end; + +proc _compile_character_literal(); +var + character: Word; + token_kind: Word; + character_length: Word; +begin + character := _lexer_global_get_start(); + character_length := _lexer_global_get_end() + -character; + + _write_z("\tli t0, \0"); + _write_s(character, character_length); + _write_c('\n'); + _lexer_skip_token() +end; + +proc _compile_variable_expression(); +begin + _compile_designator(); + _write_z("\tlw t0, (t0)\n\0") +end; + +(** + * Compiles a take address expression, starting with an "@" sign. + *) +proc _compile_address_expression(); +begin + _lexer_skip_token(); + _compile_designator() +end; + +(** + * Compile unary negation, "-" sign. + *) +proc _compile_negate_expression(); +begin + _lexer_skip_token(); + _compile_term(); + _write_z("\tneg t0, t0\n\0") +end; + +(* Compile unary bitwise not, "~" sign.
*) +proc _compile_not_expression(); +var + token_kind: Word; +begin + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _compile_term(); + _write_z("\tnot t0, t0\n\0") +end; + +proc _compile_string_literal(); +var + token_kind: Word; + token_start: Word; + length: Word; + offset: Word; +begin + _lexer_read_token(@token_kind); + token_start := _lexer_global_get_start(); + length := _string_length(token_start); + offset := _add_string(token_start); + + _lexer_skip_token(); + _write_z("\tla t0, strings\n\0"); + + _write_z("\tli t1, \0"); + _write_i(offset); + _write_c('\n'); + + _write_z("\tadd t0, t0, t1\n\0") +end; + +proc _compile_term(); +var + current_character: Word; + token_kind: Word; +begin + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_character() then + _compile_character_literal() + elsif token_kind = _lexer_token_kind_string() then + _compile_string_literal() + elsif token_kind = _lexer_token_kind_integer() then + _compile_integer_literal() + elsif token_kind = _lexer_token_kind_at() then + _compile_address_expression() + elsif token_kind = _lexer_token_kind_minus() then + _compile_negate_expression() + elsif token_kind = _lexer_token_kind_not() then + _compile_not_expression() + elsif token_kind = _lexer_token_kind_identifier() then + current_character := _lexer_global_get_start(); + current_character := _load_byte(current_character); + + (* This is a call if the statement starts with an underscore. *) + if current_character = '_' then + _compile_call(); + _write_z("\tmv t0, a0\n\0") + else + _compile_variable_expression() + end + end +end; + +proc _compile_binary_rhs(); +begin + (* Save the value of the left expression on the stack. 
*) + _write_z("\tsw t0, 64(sp)\n\0"); + _compile_term(); + + (* Load the left expression from the stack; *) + _write_z("\tlw t1, 64(sp)\n\0") +end; + +proc _compile_expression(); +var + token_kind: Word; +begin + _compile_term(); + + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_plus() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tadd t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_minus() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tsub t0, t1, t0\n\0"); + elsif token_kind = _lexer_token_kind_multiplication() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tmul t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_and() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tand t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_or() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tor t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_xor() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\txor t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_equals() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\txor t0, t0, t1\n\tseqz t0, t0\n\0") + elsif token_kind = _lexer_token_kind_remainder() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\trem t0, t1, t0\n\0") + elsif token_kind = _lexer_token_kind_division() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tdiv t0, t1, t0\n\0") + elsif token_kind = _lexer_token_kind_less_than() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. 
*) + _write_z("\tslt t0, t1, t0\n\0") + elsif token_kind = _lexer_token_kind_greater_than() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tslt t0, t0, t1\n\0") + elsif token_kind = _lexer_token_kind_less_equal() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tslt t0, t0, t1\n\txori t0, t0, 1\n\0") + elsif token_kind = _lexer_token_kind_not_equal() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\txor t0, t0, t1\n\tsnez t0, t0\n\0") + elsif token_kind = _lexer_token_kind_greater_equal() then + _lexer_skip_token(); + _compile_binary_rhs(); + + (* Execute the operation. *) + _write_z("\tslt t0, t1, t0\n\txori t0, t0, 1\n\0") + end; + + .compile_expression_end; +end; + +proc _compile_call(); +var + name_length: Word; + name: Word; + argument_count: Word; + stack_offset: Word; + token_kind: Word; +begin + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name; + argument_count := 0; + + (* Skip the identifier and left paren. *) + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_right_paren() then + goto .compile_call_finalize + end; + .compile_call_loop; + _compile_expression(); + + (* Save the argument on the stack. *) + _write_z("\tsw t0, \0"); + + (* Calculate the stack offset: 116 - (4 * argument_counter) *) + stack_offset := argument_count * 4; + _write_i(116 + -stack_offset); + + _write_z("(sp)\n\0"); + + (* Add one to the argument counter. *) + argument_count := argument_count + 1; + + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_comma() then + goto .compile_call_finalize + end; + _lexer_skip_token(); + goto .compile_call_loop; + + .compile_call_finalize; + (* Load the argument from the stack. 
*) + if argument_count <> 0 then + (* Decrement the argument counter. *) + argument_count := argument_count + -1; + + _write_z("\tlw a\0"); + _write_i(argument_count); + + _write_z(", \0"); + + (* Calculate the stack offset: 116 - (4 * argument_counter) *) + stack_offset := argument_count * 4; + _write_i(116 + -stack_offset); + + _write_z("(sp)\n\0"); + + goto .compile_call_finalize + end; + + .compile_call_end; + _write_z("\tcall \0"); + _write_s(name, name_length); + _write_c('\n'); + + (* Skip the right paren. *) + _lexer_read_token(@token_kind); + _lexer_skip_token() +end; + +proc _compile_goto(); +var + next_token: Word; + next_length: Word; + token_kind: Word; +begin + _lexer_skip_token(); + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_dot() then + _lexer_skip_token(); + _lexer_read_token(@token_kind) + end; + next_token := _lexer_global_get_start(); + next_length := _lexer_global_get_end() + -next_token; + + _write_z("\tj .\0"); + + _write_s(next_token, next_length); + _lexer_skip_token() +end; + +proc _compile_local_designator(symbol: Word); +var + variable_offset: Word; +begin + _write_z("\taddi t0, sp, \0"); + variable_offset := _parameter_info_get_offset(symbol); + _write_i(variable_offset); + _write_c('\n'); + _lexer_skip_token() +end; + +proc _compile_global_designator(); +var + name: Word; + token_kind: Word; + token_length: Word; +begin + _write_z("\tla t0, \0"); + + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + token_length := _lexer_global_get_end() + -name; + _write_s(name, token_length); + _lexer_skip_token(); + + _write_c('\n') +end; + +proc _compile_designator(); +var + name_token: Word; + lookup_result: Word; + token_kind: Word; + name: Word; +begin + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_token := _lexer_global_get_end() + -name; + lookup_result := _symbol_table_lookup(@symbol_table_local, name, name_token); + + if lookup_result <> 0 then + 
_compile_local_designator(lookup_result) + else + _compile_global_designator() + end; + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_hat() then + _lexer_skip_token(); + _write_z("\tlw t0, (t0)\n\0") + end +end; + +proc _compile_assignment(); +var + token_kind: Word; +begin + _compile_designator(); + + (* Save the assignee address on the stack. *) + _write_z("\tsw t0, 60(sp)\n\0"); + + (* Skip the assignment sign (:=) with surrounding whitespaces. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + (* Compile the assignment. *) + _compile_expression(); + + _write_z("\tlw t1, 60(sp)\n\tsw t0, (t1)\n\0") +end; + +proc _compile_return_statement(); +var + token_kind: Word; +begin + (* Skip "return" keyword and whitespace after it. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + _compile_expression(); + _write_z("\tmv a0, t0\n\0") +end; + +(** + * Writes a label, .Ln, where n is a unique number. + * + * Parameters: + * counter - Label counter. + *) +proc _write_label(counter: Word); +begin + _write_z(".L\0"); + _write_i(counter) +end; + +proc _compile_condition(after_end_label: Word); +var + condition_label: Word; + token_kind: Word; +begin + (* Compile condition. *) + _compile_expression(); + (* Skip " then" with newline. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + (* condition_label is the label in front of the next elsif condition or end. *) + condition_label := label_counter; + label_counter := label_counter + 1; + + _write_z("\tbeqz t0, \0"); + _write_label(condition_label); + _write_c('\n'); + + _compile_statement_list(); + + _write_z("\tj \0"); + _write_label(after_end_label); + _write_c('\n'); + + _write_label(condition_label); + _write_z(":\n\0") +end; + +proc _compile_if(); +var + after_end_label: Word; + condition_label: Word; + token_kind: Word; +begin + (* Skip "if ". 
*) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + after_end_label := label_counter; + label_counter := label_counter + 1; + + _compile_condition(after_end_label); + .compile_if_loop; + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_else() then + _lexer_skip_token(); + _compile_statement_list() + elsif token_kind = _lexer_token_kind_elsif() then + _lexer_skip_token(); + _compile_condition(after_end_label); + + goto .compile_if_loop + end; + _lexer_skip_token(); + + _write_label(after_end_label); + _write_z(":\n\0") +end; + +proc _compile_label_declaration(); +var + label_token: Word; + token_kind: Word; + name: Word; +begin + (* Skip the dot. *) + _lexer_skip_token(); + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + label_token := _lexer_global_get_end() + -name; + _write_c('.'); + _write_s(name, label_token); + _write_z(":\n\0"); + _lexer_skip_token() +end; + +proc _compile_statement(); +var + current_byte: Word; + token_kind: Word; +begin + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_goto() then + _compile_goto() + elsif token_kind = _lexer_token_kind_if() then + _compile_if() + elsif token_kind = _lexer_token_kind_return() then + _compile_return_statement() + elsif token_kind = _lexer_token_kind_dot() then + _compile_label_declaration() + elsif token_kind = _lexer_token_kind_identifier() then + current_byte := _lexer_global_get_start(); + current_byte := _load_byte(current_byte); + + (* This is a call if the statement starts with an underscore. 
*) + if current_byte = '_' then + _compile_call() + else + _compile_assignment() + end + end; + _write_c('\n') +end; + +proc _compile_statement_list(); +var + token_kind: Word; +begin + _skip_empty_lines(); + _compile_statement(); + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_semicolon() then + _lexer_skip_token(); + _compile_statement_list() + end; + _skip_empty_lines() +end; + +(** + * Writes a register name to the standard output. + * + * Parameters: + * register_character - Register character. + * register_number - Register number. + *) +proc _write_register(register_character: Word, register_number: Word); +begin + _write_c(register_character); + _write_c(register_number + '0') +end; + +proc _type_get_kind(this: Word); + return _load_word(this) +end; + +proc _type_set_kind(this: Word, value: Word); +begin + _store_word(value, this) +end; + +proc _type_get_size(this: Word); + return _load_word(this + 4) +end; + +proc _type_set_size(this: Word, value: Word); +begin + _store_word(value, this + 4) +end; + +proc _enumeration_type_get_members(this: Word); + return _load_word(this + 8) +end; + +proc _enumeration_type_set_members(this: Word, value: Word); +begin + _store_word(value, this + 8) +end; + +proc _enumeration_type_get_length(this: Word); + return _load_word(this + 12) +end; + +proc _enumeration_type_set_length(this: Word, value: Word); +begin + _store_word(value, this + 12) +end; + +(** + * Reads and creates enumeration type representation. + * + * record + * type_kind: Word; + * size: Word; + * members: StringArray; + * length: Word + * end; + * + * Returns enumeration type description.
+ *) +proc _read_type_enumeration(); +var + token_kind: Word; + enumeration_name: Word; + name_length: Word; + memory_start: Word; + member_count: Word; + result: Word; +begin + _lexer_skip_token(); + memory_start := memory_free_pointer; + member_count := 0; + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_right_paren() then + goto .read_type_enumeration_end + end; + .read_type_enumeration_loop; + member_count := member_count + 1; + + enumeration_name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -enumeration_name; + + _store_word(enumeration_name, memory_free_pointer); + memory_free_pointer := memory_free_pointer + 4; + + _store_word(name_length, memory_free_pointer); + memory_free_pointer := memory_free_pointer + 4; + + (* Skip the identifier. *) + _lexer_skip_token(); + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_comma() then + _lexer_skip_token(); + _lexer_read_token(@token_kind); + goto .read_type_enumeration_loop + end; + + .read_type_enumeration_end; + _lexer_skip_token(); + + (* The resulting structure is 16 bytes long. *) + result := memory_free_pointer; + memory_free_pointer := memory_free_pointer + 16; + + (* ENUMERATION_TYPE is 2. *) + _type_set_kind(result, 2); + _type_set_size(result, 4); + _enumeration_type_set_members(result, memory_start); + _enumeration_type_set_length(result, member_count); + + return result +end; + +proc _read_type_expression(); +var + token_kind: Word; + type_name: Word; + name_length: Word; + result: Word; +begin + result := 0; + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_identifier() then + (* Named type. 
*) + type_name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -type_name; + _write_c('#'); + _write_s(type_name, name_length); + _write_c(' '); + result := _symbol_table_lookup(@symbol_table_global, type_name, name_length); + _write_i(result); + _write_c('\n'); + result := _type_info_get_type(result); + + _lexer_skip_token() + elsif token_kind = _lexer_token_kind_left_paren() then + result := _read_type_enumeration() + end; + + return result +end; + +proc _type_info_get_type(this: Word); + return _load_word(this + 4) +end; + +(** + * Parameters: + * parameter_index - Parameter index. + *) +proc _parameter_info_create(parameter_index: Word); +var + offset: Word; + current_word: Word; + result: Word; +begin + result := memory_free_pointer; + current_word := result; + (* 2 is PARAMETER_INFO *) + _store_word(2, current_word); + + current_word := current_word + 4; + + (* Calculate the stack offset: 88 - (4 * parameter_index) *) + offset := parameter_index * 4; + _store_word(88 + -offset, current_word); + + memory_free_pointer := current_word + 4; + + return result +end; + +proc _parameter_info_get_offset(this: Word); +begin + this := this + 4; + return _load_word(this) +end; + +(** + * Parameters: + * temporary_index - Temporary index. + *) +proc _temporary_info_create(temporary_index: Word); +var + offset: Word; + current_word: Word; + result: Word; +begin + result := memory_free_pointer; + current_word := result; + (* 3 is TEMPORARY_INFO *) + _store_word(3, current_word); + + current_word := current_word + 4; + + (* Calculate the stack offset: 4 * temporary_index. *) + _store_word(temporary_index * 4, current_word); + + memory_free_pointer := current_word + 4; + + return result +end; + +proc _temporary_info_get_offset(this: Word); +begin + this := this + 4; + return _load_word(this) +end; + +(** + * Parameters: + * parameter_index - Parameter index.
+ *) +proc _read_procedure_parameter(parameter_index: Word); +var + name_length: Word; + info: Word; + name_position: Word; + token_kind: Word; +begin + (* Read the parameter name. *) + _lexer_read_token(@token_kind); + name_position := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name_position; + _lexer_skip_token(); + + (* Skip colon and space in front of the type expression. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + _read_type_expression(); + + _write_z("\tsw a\0"); + _write_i(parameter_index); + _write_z(", \0"); + + info := _parameter_info_create(parameter_index); + _symbol_table_enter(@symbol_table_local, name_position, name_length, info); + + info := _parameter_info_get_offset(info); + _write_i(info); + + _write_z("(sp)\n\0") +end; + +proc _read_procedure_parameters(); +var + parameter_counter: Word; + token_kind: Word; +begin + (* Skip open paren. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + parameter_counter := 0; + + .compile_procedure_prologue_skip; + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_right_paren() then + _read_procedure_parameter(parameter_counter); + parameter_counter := parameter_counter + 1; + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_comma() then + _lexer_skip_token(); + goto .compile_procedure_prologue_skip + end + end; + (* Skip close paren. *) + _lexer_skip_token() +end; + +(** + * Parameters: + * variable_index - Variable index. 
+ *) +proc _read_procedure_temporary(variable_index: Word); +var + name_length: Word; + info: Word; + name_position: Word; + token_kind: Word; +begin + _lexer_read_token(@token_kind); + name_position := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name_position; + _lexer_skip_token(); + + (* The name was skipped above; now skip the colon before the type. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + _read_type_expression(); + + info := _temporary_info_create(variable_index); + _symbol_table_enter(@symbol_table_local, name_position, name_length, info); + + (* Skip the semicolon after the variable declaration. *) + _lexer_read_token(@token_kind); + _lexer_skip_token() +end; + +proc _read_procedure_temporaries(); +var + temporary_counter: Word; + token_kind: Word; +begin + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_var() then + _lexer_skip_token(); + temporary_counter := 0; + + .read_local_variables_loop; + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_identifier() then + _read_procedure_temporary(temporary_counter); + + temporary_counter := temporary_counter + 1; + goto .read_local_variables_loop + end + end +end; + +proc _compile_procedure(); +var + name_pointer: Word; + name_length: Word; + token_kind: Word; +begin + (* Skip the "proc" keyword. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + + (* Clear local symbol table. *) + _store_word(0, @symbol_table_local); + + _lexer_read_token(@token_kind); + name_pointer := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name_pointer; + + (* Write .type _procedure_name, @function. *) + _write_z(".type \0"); + + _write_s(name_pointer, name_length); + _write_z(", @function\n\0"); + + (* Write procedure label, _procedure_name: *) + _write_s(name_pointer, name_length); + _write_z(":\n\0"); + + (* Skip procedure name.
*) + _lexer_skip_token(); + _write_z("\taddi sp, sp, -128\n\tsw ra, 124(sp)\n\tsw s0, 120(sp)\n\taddi s0, sp, 128\n\0"); + _read_procedure_parameters(); + + (* Skip the semicolon after the parameter list. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _read_procedure_temporaries(); + + (* The body is either a "begin" block or a single return statement. *) + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_begin() then + _lexer_skip_token(); + _compile_statement_list() + elsif token_kind = _lexer_token_kind_return() then + _compile_return_statement() + end; + + (* Write the epilogue. *) + _write_z("\tlw ra, 124(sp)\n\tlw s0, 120(sp)\n\taddi sp, sp, 128\n\tret\n\0"); + + (* Skip the "end" keyword and the semicolon after it. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token() +end; + +(** + * Skips all consecutive comment tokens. + *) +proc _skip_empty_lines(); +var + token_kind: Word; +begin + .skip_empty_lines_rerun; + + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_comment() then + _lexer_skip_token(); + goto .skip_empty_lines_rerun + end +end; + +(** + * Compile global variable initializer. + *) +proc _compile_global_initializer(); +var + current_byte: Word; + length: Word; + token_kind: Word; + token_start: Word; +begin + _lexer_read_token(@token_kind); + token_start := _lexer_global_get_start(); + current_byte := _load_byte(token_start); + + if token_kind = _lexer_token_kind_string() then + _write_z("\n\t.word strings + \0"); + length := _string_length(token_start); + + _add_string(token_start); + _write_i(); + + (* Skip the quoted string. *) + _lexer_skip_token(); + + goto .compile_global_initializer_end + elsif current_byte = 'S' then + (* Skip "S(".
*) + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_right_paren() then + goto .compile_global_initializer_closing + end; + goto .compile_global_initializer_loop + elsif token_kind = _lexer_token_kind_at() then + (* Skip @. *) + _lexer_skip_token(); + _write_z("\n\t.word \0"); + _lexer_read_token(@token_kind); + token_start := _lexer_global_get_start(); + _write_s(token_start, _lexer_global_get_end() + -token_start); + _lexer_skip_token(); + + goto .compile_global_initializer_end + elsif token_kind = _lexer_token_kind_integer() then + _write_z("\n\t.word \0"); + _write_s(token_start, _lexer_global_get_end() + -token_start); + _lexer_skip_token(); + + goto .compile_global_initializer_end + end; + + .compile_global_initializer_loop; + _compile_global_initializer(); + + _lexer_read_token(@token_kind); + if token_kind <> _lexer_token_kind_right_paren() then + (* Skip comma and whitespace after it. *) + _lexer_skip_token(); + + goto .compile_global_initializer_loop + end; + + .compile_global_initializer_closing; + (* Skip ")" *) + _lexer_skip_token(); + + .compile_global_initializer_end +end; + +proc _compile_constant_declaration(); +var + name: Word; + name_length: Word; + token_kind: Word; +begin + name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name; + + _write_z(".type \0"); + _write_s(name, name_length); + _write_z(", @object\n\0"); + + _write_s(name, name_length); + _write_c(':'); + + (* Skip the constant name with assignment sign and surrounding whitespaces. *) + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _compile_global_initializer(); + + (* Skip semicolon and newline.
*) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _write_c('\n') +end; + +proc _compile_type_declaration(); +var + token_kind: Word; +begin + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _read_type_expression(); + _lexer_read_token(@token_kind); + _lexer_skip_token() +end; + +proc _compile_type_part(); +var + token_kind: Word; +begin + _skip_empty_lines(); + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_type() then + goto .compile_type_part_end + end; + _lexer_skip_token(); + + .compile_type_part_loop; + _skip_empty_lines(); + + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_identifier() then + _compile_type_declaration(); + goto .compile_type_part_loop + end; + + .compile_type_part_end +end; + +proc _compile_const_part(); +var + token_kind: Word; +begin + _skip_empty_lines(); + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_const() then + goto .compile_const_part_end + end; + (* Skip the "const" keyword. *) + _lexer_skip_token(); + _write_z(".section .rodata # Compiled from const section.\n\n\0"); + + .compile_const_part_loop; + _skip_empty_lines(); + + (* Every constant declaration starts with an identifier; any other + token ends the const section. *) + _lexer_read_token(@token_kind); + if token_kind = _lexer_token_kind_identifier() then + _compile_constant_declaration(); + goto .compile_const_part_loop + end; + + .compile_const_part_end +end; + +proc _compile_variable_declaration(); +var + name: Word; + name_length: Word; + token_kind: Word; +begin + _lexer_read_token(@token_kind); + name := _lexer_global_get_start(); + name_length := _lexer_global_get_end() + -name; + + _write_z(".type \0"); + _write_s(name, name_length); + _write_z(", @object\n\0"); + + _write_s(name, name_length); + _write_c(':'); + + (* Skip the variable name and colon with space before the type.
*) + _lexer_skip_token(); + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _read_type_expression(); + + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_assignment() then + (* Without an initializer we assume this is a zeroed 81920 bytes big array. *) + _write_z(" .zero 81920\0") + else + (* Skip the assignment sign. *) + _lexer_skip_token(); + _compile_global_initializer(); + _lexer_read_token(@token_kind) + end; + + (* Skip semicolon and newline. *) + _lexer_read_token(@token_kind); + _lexer_skip_token(); + _write_c('\n') +end; + +proc _compile_var_part(); +var + token_kind: Word; +begin + _lexer_read_token(@token_kind); + + if token_kind <> _lexer_token_kind_var() then + goto .compile_var_part_end + end; + (* Skip the "var" keyword. *) + _lexer_skip_token(); + _write_z(".section .data\n\0"); + + .compile_var_part_loop; + _skip_empty_lines(); + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_identifier() then + _compile_variable_declaration(); + goto .compile_var_part_loop + end; + + .compile_var_part_end +end; + +(** + * Process the source code and print the generated code.
+ *) +proc _compile_module(); +var + token_kind: Word; +begin + _compile_type_part(); + _compile_const_part(); + _skip_empty_lines(); + _compile_var_part(); + + _write_z(".section .text\n\n\0"); + _write_z(".type _syscall, @function\n_syscall:\n\tmv a7, a6\n\tecall\n\tret\n\n\0"); + _write_z(".type _load_byte, @function\n_load_byte:\n\tlb a0, (a0)\nret\n\n\0"); + _write_z(".type _load_word, @function\n_load_word:\n\tlw a0, (a0)\nret\n\n\0"); + _write_z(".type _store_byte, @function\n_store_byte:\n\tsb a0, (a1)\nret\n\n\0"); + _write_z(".type _store_word, @function\n_store_word:\n\tsw a0, (a1)\nret\n\n\0"); + + .compile_module_loop; + _skip_empty_lines(); + _lexer_read_token(@token_kind); + + if token_kind = _lexer_token_kind_proc() then + _compile_procedure(); + goto .compile_module_loop + end +end; + +proc _compile(); +var + compiler_strings_copy: Word; + compiler_strings_end: Word; + current_byte: Word; +begin + _write_z(".globl _start\n\n\0"); + _compile_module(); + + _write_z(".section .rodata\n.type strings, @object\nstrings: .ascii \0"); + _write_c('"'); + + compiler_strings_copy := @compiler_strings; + compiler_strings_end := compiler_strings_position; + + .compile_loop; + if compiler_strings_copy < compiler_strings_end then + current_byte := _load_byte(compiler_strings_copy); + compiler_strings_copy := compiler_strings_copy + 1; + _write_c(current_byte); + + goto .compile_loop + end; + _write_c('"'); + _write_c('\n') +end; + +(** + * Terminates the program with system call 93. + * + * NOTE(review): the procedure takes no parameters and passes 0 as the first + * system call argument, so the status code appears to always be 0 - confirm. + *) +proc _exit(); +begin + _syscall(0, 0, 0, 0, 0, 0, 93) +end; + +(** + * Looks for a symbol in the given symbol table. + * + * Parameters: + * symbol_table - Symbol table. + * symbol_name - Symbol name pointer. + * name_length - Symbol name length. + * + * Returns the symbol pointer or 0 in a0.
+ *) +proc _symbol_table_lookup(symbol_table: Word, symbol_name: Word, name_length: Word); +var + result: Word; + symbol_table_length: Word; + current_name: Word; + current_length: Word; +begin + result := 0; + + (* The first word in the symbol table is its length, get it. *) + symbol_table_length := _load_word(symbol_table); + + (* Go to the first symbol position. *) + symbol_table := symbol_table + 4; + + .symbol_table_lookup_loop; + if symbol_table_length = 0 then + goto .symbol_table_lookup_end + end; + + (* Symbol name pointer and length. *) + current_name := _load_word(symbol_table); + current_length := _load_word(symbol_table + 4); + + (* If lengths don't match, continue with the next symbol. *) + if name_length <> current_length then + goto .symbol_table_lookup_repeat + end; + (* If names don't match, continue with the next symbol. *) + if _memcmp(symbol_name, current_name, name_length) <> 0 then + goto .symbol_table_lookup_repeat + end; + (* Otherwise, the symbol is found. *) + result := _load_word(symbol_table + 8); + goto .symbol_table_lookup_end; + + .symbol_table_lookup_repeat; + symbol_table := symbol_table + 12; + symbol_table_length := symbol_table_length + -1; + goto .symbol_table_lookup_loop; + + .symbol_table_lookup_end; + return result +end; + +(** + * Appends a symbol to the end of the table (no duplicate check). + * + * Parameters: + * symbol_table - Symbol table. + * symbol_name - Symbol name pointer. + * name_length - Symbol name length. + * symbol - Symbol pointer. + *) +proc _symbol_table_enter(symbol_table: Word, symbol_name: Word, name_length: Word, symbol: Word); +var + table_length: Word; + symbol_pointer: Word; +begin + (* The first word in the symbol table is its length, get it. *) + table_length := _load_word(symbol_table); + + (* Calculate the offset for the new symbol.
*) + symbol_pointer := table_length * 12; + symbol_pointer := symbol_pointer + 4; + symbol_pointer := symbol_table + symbol_pointer; + + _store_word(symbol_name, symbol_pointer); + symbol_pointer := symbol_pointer + 4; + _store_word(name_length, symbol_pointer); + symbol_pointer := symbol_pointer + 4; + _store_word(symbol, symbol_pointer); + + (* Increment the symbol table length. *) + table_length := table_length + 1; + _store_word(table_length, symbol_table) +end; + +proc _symbol_table_build(); +begin + (* Set the table length to 0. *) + _store_word(0, @symbol_table_global); + + (* Enter built-in symbols. *) + _symbol_table_enter(@symbol_table_global, symbol_builtin_name_int, 3, @symbol_type_info_int); + _symbol_table_enter(@symbol_table_global, symbol_builtin_name_word, 4, @symbol_type_info_word); + _symbol_table_enter(@symbol_table_global, symbol_builtin_name_pointer, 7, @symbol_type_info_pointer); + _symbol_table_enter(@symbol_table_global, symbol_builtin_name_char, 4, @symbol_type_info_char); + _symbol_table_enter(@symbol_table_global, symbol_builtin_name_array, 5, @symbol_type_info_array) +end; + + +(** + * Classification table assigns each possible character to a group (class). All + * characters of the same group are handled equivalently.
+ * + * Transition = record + * action: TransitionAction; + * next_state: TransitionState + * end; + *) +proc _lexer_class_invalid(); + return 1 +end; + +proc _lexer_class_digit(); + return 2 +end; + +proc _lexer_class_alpha(); + return 3 +end; + +proc _lexer_class_space(); + return 4 +end; + +proc _lexer_class_colon(); + return 5 +end; + +proc _lexer_class_equals(); + return 6 +end; + +proc _lexer_class_left_paren(); + return 7 +end; + +proc _lexer_class_right_paren(); + return 8 +end; + +proc _lexer_class_asterisk(); + return 9 +end; + +proc _lexer_class_backslash(); + return 10 +end; + +proc _lexer_class_single(); + return 11 +end; + +proc _lexer_class_hex(); + return 12 +end; + +proc _lexer_class_zero(); + return 13 +end; + +proc _lexer_class_x(); + return 14 +end; + +proc _lexer_class_eof(); + return 15 +end; + +proc _lexer_class_dot(); + return 16 +end; + +proc _lexer_class_minus(); + return 17 +end; + +proc _lexer_class_single_quote(); + return 18 +end; + +proc _lexer_class_double_quote(); + return 19 +end; + +proc _lexer_class_greater(); + return 20 +end; + +proc _lexer_class_less(); + return 21 +end; + +proc _lexer_class_other(); + return 22 +end; + +proc _lexer_state_start(); + return 1 +end; + +proc _lexer_state_colon(); + return 2 +end; + +proc _lexer_state_identifier(); + return 3 +end; + +proc _lexer_state_decimal(); + return 4 +end; + +proc _lexer_state_leading_zero(); + return 5 +end; + +proc _lexer_state_greater(); + return 6 +end; + +proc _lexer_state_minus(); + return 7 +end; + +proc _lexer_state_left_paren(); + return 8 +end; + +proc _lexer_state_less(); + return 9 +end; + +proc _lexer_state_dot(); + return 10 +end; + +proc _lexer_state_comment(); + return 11 +end; + +proc _lexer_state_closing_comment(); + return 12 +end; + +proc _lexer_state_character(); + return 13 +end; + +proc _lexer_state_character_escape(); + return 14 +end; + +proc _lexer_state_string(); + return 15 +end; + +proc _lexer_state_string_escape(); + return 16 +end; + +proc
_lexer_state_end(); + return 17 +end; + +proc _lexer_action_none(); + return 1 +end; + +proc _lexer_action_accumulate(); + return 2 +end; + +proc _lexer_action_skip(); + return 3 +end; + +proc _lexer_action_single(); + return 4 +end; + +proc _lexer_action_eof(); + return 5 +end; + +proc _lexer_action_finalize(); + return 6 +end; + +proc _lexer_action_composite(); + return 7 +end; + +proc _lexer_action_key_id(); + return 8 +end; + +proc _lexer_action_integer(); + return 9 +end; + +proc _lexer_action_delimited(); + return 10 +end; + +(** + * Assigns a value at the given array index. + * + * Parameters: + * array - Array pointer. + * index - One-based index of the word in the array. + * data - Data to assign. + *) +proc _assign_at(array: Word, index: Word, data: Word); +var + target: Word; +begin + target := index + -1; + target := target * 4; + target := array + target; + + _store_word(data, target) +end; + +proc _get_at(array: Word, index: Word); +var + target: Word; +begin + target := index + -1; + target := target * 4; + target := array + target; + + return _load_word(target) +end; + +(** + * Initializes the array with character classes; entry N describes character code N - 1.
+ *) +proc _lexer_classifications(); +var + code: Word; +begin + _assign_at(@classification, 1, _lexer_class_eof()); + _assign_at(@classification, 2, _lexer_class_invalid()); + _assign_at(@classification, 3, _lexer_class_invalid()); + _assign_at(@classification, 4, _lexer_class_invalid()); + _assign_at(@classification, 5, _lexer_class_invalid()); + _assign_at(@classification, 6, _lexer_class_invalid()); + _assign_at(@classification, 7, _lexer_class_invalid()); + _assign_at(@classification, 8, _lexer_class_invalid()); + _assign_at(@classification, 9, _lexer_class_invalid()); + _assign_at(@classification, 10, _lexer_class_space()); + _assign_at(@classification, 11, _lexer_class_space()); + _assign_at(@classification, 12, _lexer_class_invalid()); + _assign_at(@classification, 13, _lexer_class_invalid()); + _assign_at(@classification, 14, _lexer_class_space()); + _assign_at(@classification, 15, _lexer_class_invalid()); + _assign_at(@classification, 16, _lexer_class_invalid()); + _assign_at(@classification, 17, _lexer_class_invalid()); + _assign_at(@classification, 18, _lexer_class_invalid()); + _assign_at(@classification, 19, _lexer_class_invalid()); + _assign_at(@classification, 20, _lexer_class_invalid()); + _assign_at(@classification, 21, _lexer_class_invalid()); + _assign_at(@classification, 22, _lexer_class_invalid()); + _assign_at(@classification, 23, _lexer_class_invalid()); + _assign_at(@classification, 24, _lexer_class_invalid()); + _assign_at(@classification, 25, _lexer_class_invalid()); + _assign_at(@classification, 26, _lexer_class_invalid()); + _assign_at(@classification, 27, _lexer_class_invalid()); + _assign_at(@classification, 28, _lexer_class_invalid()); + _assign_at(@classification, 29, _lexer_class_invalid()); + _assign_at(@classification, 30, _lexer_class_invalid()); + _assign_at(@classification, 31, _lexer_class_invalid()); + _assign_at(@classification, 32, _lexer_class_invalid()); + _assign_at(@classification, 33, _lexer_class_space()); +
_assign_at(@classification, 34, _lexer_class_single()); + _assign_at(@classification, 35, _lexer_class_double_quote()); + _assign_at(@classification, 36, _lexer_class_other()); + _assign_at(@classification, 37, _lexer_class_other()); + _assign_at(@classification, 38, _lexer_class_single()); + _assign_at(@classification, 39, _lexer_class_single()); + _assign_at(@classification, 40, _lexer_class_single_quote()); + _assign_at(@classification, 41, _lexer_class_left_paren()); + _assign_at(@classification, 42, _lexer_class_right_paren()); + _assign_at(@classification, 43, _lexer_class_asterisk()); + _assign_at(@classification, 44, _lexer_class_single()); + _assign_at(@classification, 45, _lexer_class_single()); + _assign_at(@classification, 46, _lexer_class_minus()); + _assign_at(@classification, 47, _lexer_class_dot()); + _assign_at(@classification, 48, _lexer_class_single()); + _assign_at(@classification, 49, _lexer_class_zero()); + _assign_at(@classification, 50, _lexer_class_digit()); + _assign_at(@classification, 51, _lexer_class_digit()); + _assign_at(@classification, 52, _lexer_class_digit()); + _assign_at(@classification, 53, _lexer_class_digit()); + _assign_at(@classification, 54, _lexer_class_digit()); + _assign_at(@classification, 55, _lexer_class_digit()); + _assign_at(@classification, 56, _lexer_class_digit()); + _assign_at(@classification, 57, _lexer_class_digit()); + _assign_at(@classification, 58, _lexer_class_digit()); + _assign_at(@classification, 59, _lexer_class_colon()); + _assign_at(@classification, 60, _lexer_class_single()); + _assign_at(@classification, 61, _lexer_class_less()); + _assign_at(@classification, 62, _lexer_class_equals()); + _assign_at(@classification, 63, _lexer_class_greater()); + _assign_at(@classification, 64, _lexer_class_other()); + _assign_at(@classification, 65, _lexer_class_single()); + _assign_at(@classification, 66, _lexer_class_alpha()); + _assign_at(@classification, 67, _lexer_class_alpha()); +
_assign_at(@classification, 68, _lexer_class_alpha()); + _assign_at(@classification, 69, _lexer_class_alpha()); + _assign_at(@classification, 70, _lexer_class_alpha()); + _assign_at(@classification, 71, _lexer_class_alpha()); + _assign_at(@classification, 72, _lexer_class_alpha()); + _assign_at(@classification, 73, _lexer_class_alpha()); + _assign_at(@classification, 74, _lexer_class_alpha()); + _assign_at(@classification, 75, _lexer_class_alpha()); + _assign_at(@classification, 76, _lexer_class_alpha()); + _assign_at(@classification, 77, _lexer_class_alpha()); + _assign_at(@classification, 78, _lexer_class_alpha()); + _assign_at(@classification, 79, _lexer_class_alpha()); + _assign_at(@classification, 80, _lexer_class_alpha()); + _assign_at(@classification, 81, _lexer_class_alpha()); + _assign_at(@classification, 82, _lexer_class_alpha()); + _assign_at(@classification, 83, _lexer_class_alpha()); + _assign_at(@classification, 84, _lexer_class_alpha()); + _assign_at(@classification, 85, _lexer_class_alpha()); + _assign_at(@classification, 86, _lexer_class_alpha()); + _assign_at(@classification, 87, _lexer_class_alpha()); + _assign_at(@classification, 88, _lexer_class_alpha()); + _assign_at(@classification, 89, _lexer_class_alpha()); + _assign_at(@classification, 90, _lexer_class_alpha()); + _assign_at(@classification, 91, _lexer_class_alpha()); + _assign_at(@classification, 92, _lexer_class_single()); + _assign_at(@classification, 93, _lexer_class_backslash()); + _assign_at(@classification, 94, _lexer_class_single()); + _assign_at(@classification, 95, _lexer_class_single()); + _assign_at(@classification, 96, _lexer_class_alpha()); + _assign_at(@classification, 97, _lexer_class_other()); + _assign_at(@classification, 98, _lexer_class_hex()); + _assign_at(@classification, 99, _lexer_class_hex()); + _assign_at(@classification, 100, _lexer_class_hex()); + _assign_at(@classification, 101, _lexer_class_hex()); + _assign_at(@classification, 102, _lexer_class_hex()); +
_assign_at(@classification, 103, _lexer_class_hex()); + _assign_at(@classification, 104, _lexer_class_alpha()); + _assign_at(@classification, 105, _lexer_class_alpha()); + _assign_at(@classification, 106, _lexer_class_alpha()); + _assign_at(@classification, 107, _lexer_class_alpha()); + _assign_at(@classification, 108, _lexer_class_alpha()); + _assign_at(@classification, 109, _lexer_class_alpha()); + _assign_at(@classification, 110, _lexer_class_alpha()); + _assign_at(@classification, 111, _lexer_class_alpha()); + _assign_at(@classification, 112, _lexer_class_alpha()); + _assign_at(@classification, 113, _lexer_class_alpha()); + _assign_at(@classification, 114, _lexer_class_alpha()); + _assign_at(@classification, 115, _lexer_class_alpha()); + _assign_at(@classification, 116, _lexer_class_alpha()); + _assign_at(@classification, 117, _lexer_class_alpha()); + _assign_at(@classification, 118, _lexer_class_alpha()); + _assign_at(@classification, 119, _lexer_class_alpha()); + _assign_at(@classification, 120, _lexer_class_alpha()); + _assign_at(@classification, 121, _lexer_class_x()); + _assign_at(@classification, 122, _lexer_class_alpha()); + _assign_at(@classification, 123, _lexer_class_alpha()); + _assign_at(@classification, 124, _lexer_class_other()); + _assign_at(@classification, 125, _lexer_class_single()); + _assign_at(@classification, 126, _lexer_class_other()); + _assign_at(@classification, 127, _lexer_class_single()); + _assign_at(@classification, 128, _lexer_class_invalid()); + + code := 129; + + (* Set the remaining entries 129 - 256 to _lexer_class_other.
*) + .create_classification_loop; + _assign_at(@classification, code, _lexer_class_other()); + code := code + 1; + + if code < 257 then + goto .create_classification_loop + end +end; + +proc _lexer_get_transition(current_state: Word, character_class: Word); +var + transition_table: Word; + row_position: Word; + column_position: Word; + target: Word; +begin + (* Each transition is 8 bytes long (2 words: action and next state). + There are 22 character classes, so a transition row is 8 * 22 = 176 bytes long. *) + row_position := current_state + -1; + row_position := row_position * 176; + + column_position := character_class + -1; + column_position := column_position * 8; + + target := _lexer_get_transition_table() + row_position; + + return target + column_position +end; + +(** + * Parameters: + * current_state - First index into transitions table. + * character_class - Second index into transitions table. + * action - Action to assign. + * next_state - Next state to assign. + *) +proc _lexer_set_transition(current_state: Word, character_class: Word, action: Word, next_state: Word); +var + transition: Word; +begin + transition := _lexer_get_transition(current_state, character_class); + + _lexer_transition_set_action(transition, action); + _lexer_transition_set_state(transition, next_state) +end; + +(* Sets the same action and next state for all character classes in one transition row. *) + +(** + * Parameters: + * current_state - Current state (Transition state enumeration). + * default_action - Default action (one of the _lexer_action_* values). + * next_state - Next state (Transition state enumeration).
+ *) +proc _lexer_default_transition(current_state: Word, default_action: Word, next_state: Word); +begin + _lexer_set_transition(current_state, _lexer_class_invalid(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_digit(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_alpha(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_space(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_colon(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_equals(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_left_paren(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_right_paren(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_asterisk(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_backslash(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_single(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_hex(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_zero(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_x(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_eof(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_dot(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_minus(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_single_quote(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_double_quote(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_greater(), default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_less(), 
default_action, next_state); + _lexer_set_transition(current_state, _lexer_class_other(), default_action, next_state) +end; + +(** + * The transition table describes transitions from one state to another, given + * a symbol (character class). + * + * The table has m rows and n columns, where m is the amount of states and n is + * the amount of classes. So given the current state and a classified character + * the table can be used to look up the next state. + *) +proc _lexer_transitions(); +begin + (* Start state. *) + _lexer_set_transition(_lexer_state_start(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_digit(), _lexer_action_accumulate(), _lexer_state_decimal()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_alpha(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_space(), _lexer_action_skip(), _lexer_state_start()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_colon(), _lexer_action_accumulate(), _lexer_state_colon()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_equals(), _lexer_action_single(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_left_paren(), _lexer_action_accumulate(), _lexer_state_left_paren()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_right_paren(), _lexer_action_single(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_asterisk(), _lexer_action_single(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_backslash(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_single(), _lexer_action_single(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_hex(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_start(), 
_lexer_class_zero(), _lexer_action_accumulate(), _lexer_state_leading_zero()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_x(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_eof(), _lexer_action_eof(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_dot(), _lexer_action_single(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_minus(), _lexer_action_accumulate(), _lexer_state_minus()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_single_quote(), _lexer_action_accumulate(), _lexer_state_character()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_double_quote(), _lexer_action_accumulate(), _lexer_state_string()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_greater(), _lexer_action_accumulate(), _lexer_state_greater()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_less(), _lexer_action_accumulate(), _lexer_state_less()); + _lexer_set_transition(_lexer_state_start(), _lexer_class_other(), _lexer_action_none(), _lexer_state_end()); + + (* Colon state. *) + _lexer_default_transition(_lexer_state_colon(), _lexer_action_finalize(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_colon(), _lexer_class_equals(), _lexer_action_composite(), _lexer_state_end()); + + (* Identifier state. 
*) + _lexer_default_transition(_lexer_state_identifier(), _lexer_action_key_id(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_identifier(), _lexer_class_digit(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_identifier(), _lexer_class_alpha(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_identifier(), _lexer_class_hex(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_identifier(), _lexer_class_zero(), _lexer_action_accumulate(), _lexer_state_identifier()); + _lexer_set_transition(_lexer_state_identifier(), _lexer_class_x(), _lexer_action_accumulate(), _lexer_state_identifier()); + + (* Decimal state. *) + _lexer_default_transition(_lexer_state_decimal(), _lexer_action_integer(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_digit(), _lexer_action_accumulate(), _lexer_state_decimal()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_alpha(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_hex(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_zero(), _lexer_action_accumulate(), _lexer_state_decimal()); + _lexer_set_transition(_lexer_state_decimal(), _lexer_class_x(), _lexer_action_none(), _lexer_state_end()); + + (* Leading zero. 
*) + _lexer_default_transition(_lexer_state_leading_zero(), _lexer_action_integer(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_digit(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_alpha(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_hex(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_zero(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_leading_zero(), _lexer_class_x(), _lexer_action_none(), _lexer_state_dot()); + + (* Greater state. *) + _lexer_default_transition(_lexer_state_greater(), _lexer_action_finalize(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_greater(), _lexer_class_equals(), _lexer_action_composite(), _lexer_state_end()); + + (* Minus state. *) + _lexer_default_transition(_lexer_state_minus(), _lexer_action_finalize(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_minus(), _lexer_class_greater(), _lexer_action_composite(), _lexer_state_end()); + + (* Left paren state. *) + _lexer_default_transition(_lexer_state_left_paren(), _lexer_action_finalize(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_left_paren(), _lexer_class_asterisk(), _lexer_action_accumulate(), _lexer_state_comment()); + + (* Less state. *) + _lexer_default_transition(_lexer_state_less(), _lexer_action_finalize(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_less(), _lexer_class_equals(), _lexer_action_composite(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_less(), _lexer_class_greater(), _lexer_action_composite(), _lexer_state_end()); + + (* Hexadecimal after 0x. 
*) + _lexer_default_transition(_lexer_state_dot(), _lexer_action_finalize(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_dot(), _lexer_class_dot(), _lexer_action_composite(), _lexer_state_end()); + + (* Comment. *) + _lexer_default_transition(_lexer_state_comment(), _lexer_action_accumulate(), _lexer_state_comment()); + _lexer_set_transition(_lexer_state_comment(), _lexer_class_asterisk(), _lexer_action_accumulate(), _lexer_state_closing_comment()); + _lexer_set_transition(_lexer_state_comment(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()); + + (* Closing comment. *) + _lexer_default_transition(_lexer_state_closing_comment(), _lexer_action_accumulate(), _lexer_state_comment()); + _lexer_set_transition(_lexer_state_closing_comment(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_closing_comment(), _lexer_class_right_paren(), _lexer_action_delimited(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_closing_comment(), _lexer_class_asterisk(), _lexer_action_accumulate(), _lexer_state_closing_comment()); + _lexer_set_transition(_lexer_state_closing_comment(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()); + + (* Character. *) + _lexer_default_transition(_lexer_state_character(), _lexer_action_accumulate(), _lexer_state_character()); + _lexer_set_transition(_lexer_state_character(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_character(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_character(), _lexer_class_single_quote(), _lexer_action_delimited(), _lexer_state_end()); + _lexer_set_transition(_lexer_state_character(), _lexer_class_backslash(), _lexer_action_accumulate(), _lexer_state_character_escape()); + + (* Escape sequence in a character. 
*)
+  _lexer_default_transition(_lexer_state_character_escape(), _lexer_action_accumulate(), _lexer_state_character());
+  _lexer_set_transition(_lexer_state_character_escape(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end());
+  _lexer_set_transition(_lexer_state_character_escape(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end());
+
+  (* String. *)
+  _lexer_default_transition(_lexer_state_string(), _lexer_action_accumulate(), _lexer_state_string());
+  _lexer_set_transition(_lexer_state_string(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end());
+  _lexer_set_transition(_lexer_state_string(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end());
+  _lexer_set_transition(_lexer_state_string(), _lexer_class_double_quote(), _lexer_action_delimited(), _lexer_state_end());
+  _lexer_set_transition(_lexer_state_string(), _lexer_class_backslash(), _lexer_action_accumulate(), _lexer_state_string_escape());
+
+  (* Escape sequence in a string. *)
+  _lexer_default_transition(_lexer_state_string_escape(), _lexer_action_accumulate(), _lexer_state_string());
+  _lexer_set_transition(_lexer_state_string_escape(), _lexer_class_invalid(), _lexer_action_none(), _lexer_state_end());
+  _lexer_set_transition(_lexer_state_string_escape(), _lexer_class_eof(), _lexer_action_none(), _lexer_state_end())
+end;
+
+(**
+ * Transition table is saved after the character classification table.
+ * Each character entry is 1 word long and there are 256 characters.
+ * 1024 = 256 * 4
+ *
+ * Returns the address 1024 bytes past the start of the classification array.
+ *)
+proc _lexer_get_transition_table();
+  return @classification + 1024
+end;
+
+(**
+ * Lexer state is saved after the transition tables.
+ * Each transition table entry is 8 bytes long: one word for the action
+ * followed by one word for the next state.
+ * The offset skips 17 rows (transition states) of 22 columns (character classes),
+ * so 2992 = 8 * 17 * 22.
+ *
+ * NOTE(review): this comment used to say the table has 16 rows while the
+ * arithmetic uses 17; confirm the real number of lexer states.
+ *
+ * Returns the address 2992 bytes past the start of the transition table.
+ *)
+proc _lexer_global_state();
+  return _lexer_get_transition_table() + 2992
+end;
+
+(**
+ * Gets pointer to the token start.
 + *)
+proc _lexer_global_get_start();
+var
+  target: Word;
+begin
+  (* The token start is kept in the word at offset 4 of the lexer state. *)
+  target := _lexer_global_state() + 4;
+  return _load_word(target)
+end;
+
+(**
+ * Sets pointer to the token start.
+ *
+ * Parameters:
+ * new_start - Value to store as the token start.
+ *)
+proc _lexer_global_set_start(new_start: Word);
+var
+  target: Word;
+begin
+  target := _lexer_global_state() + 4;
+  _store_word(new_start, target)
+end;
+
+(**
+ * Gets pointer to the token end.
+ *)
+proc _lexer_global_get_end();
+var
+  target: Word;
+begin
+  (* The token end is kept in the word at offset 8 of the lexer state. *)
+  target := _lexer_global_state() + 8;
+  return _load_word(target)
+end;
+
+(**
+ * Sets pointer to the token end.
+ *
+ * Parameters:
+ * new_start - Value to store as the token end. Despite its name it is
+ *             written to the same word _lexer_global_get_end reads.
+ *)
+proc _lexer_global_set_end(new_start: Word);
+var
+  target: Word;
+begin
+  target := _lexer_global_state() + 8;
+  _store_word(new_start, target)
+end;
+
+(**
+ * Reads the action stored in the first word of a transition entry.
+ *
+ * Parameters:
+ * transition - Address of the transition entry.
+ *)
+proc _lexer_transition_get_action(transition: Word);
+  return _load_word(transition)
+end;
+
+(**
+ * Writes the action into the first word of a transition entry.
+ *
+ * Parameters:
+ * transition - Address of the transition entry.
+ * action - Action to store.
+ *)
+proc _lexer_transition_set_action(transition: Word, action: Word);
+begin
+  _store_word(action, transition)
+end;
+
+(**
+ * Reads the next state stored in the second word of a transition entry.
+ *
+ * Parameters:
+ * transition - Address of the transition entry.
+ *)
+proc _lexer_transition_get_state(transition: Word);
+  return _load_word(transition + 4)
+end;
+
+(**
+ * Writes the next state into the second word of a transition entry.
+ *
+ * Parameters:
+ * transition - Address of the transition entry.
+ * state - State to store.
+ *)
+proc _lexer_transition_set_state(transition: Word, state: Word);
+begin
+  _store_word(state, transition + 4)
+end;
+
+(**
+ * Resets the lexer state for reading the next token.
+ *
+ * The current state (first word of the lexer state) becomes the start state
+ * and the token end is moved back to the token start.
+ *)
+proc _lexer_reset();
+var
+  state: Word;
+begin
+  (* Transition start state is 1. *)
+  state := _lexer_global_state();
+  _store_word(_lexer_state_start(), state);
+
+  (* The local is reused to carry the token start pointer. *)
+  state := _lexer_global_get_start();
+  _lexer_global_set_end(state)
+end;
+
+(**
+ * One time lexer initialization.
+ *) +proc _lexer_initialize(); +begin + _lexer_classifications(); + _lexer_transitions(); + + _lexer_global_set_start(@source_code); + _lexer_global_set_end(@source_code) +end; + +proc _lexer_next_transition(); +var + current_character: Word; + character_class: Word; + current_state: Word; +begin + current_character := _lexer_global_get_end(); + current_character := _load_byte(current_character); + + character_class := _get_at(@classification, current_character + 1); + + current_state := _lexer_global_state(); + current_state := _load_word(current_state); + + return _lexer_get_transition(current_state, character_class) +end; + +proc _lexer_token_kind_identifier(); + return 1 +end; + +proc _lexer_token_kind_const(); + return 2 +end; + +proc _lexer_token_kind_var(); + return 3 +end; + +proc _lexer_token_kind_proc(); + return 4 +end; + +proc _lexer_token_kind_type(); + return 5 +end; + +proc _lexer_token_kind_begin(); + return 6 +end; + +proc _lexer_token_kind_end(); + return 7 +end; + +proc _lexer_token_kind_if(); + return 8 +end; + +proc _lexer_token_kind_then(); + return 9 +end; + +proc _lexer_token_kind_else(); + return 10 +end; + +proc _lexer_token_kind_elsif(); + return 11 +end; + +proc _lexer_token_kind_while(); + return 12 +end; + +proc _lexer_token_kind_do(); + return 13 +end; + +proc _lexer_token_kind_extern(); + return 14 +end; + +proc _lexer_token_kind_record(); + return 15 +end; + +proc _lexer_token_kind_union(); + return 16 +end; + +proc _lexer_token_kind_true(); + return 17 +end; + +proc _lexer_token_kind_false(); + return 18 +end; + +proc _lexer_token_kind_nil(); + return 19 +end; + +proc _lexer_token_kind_and(); + return 20 +end; + +proc _lexer_token_kind_or(); + return 21 +end; + +proc _lexer_token_kind_xor(); + return 22 +end; + +proc _lexer_token_kind_pipe(); + return 23 +end; + +proc _lexer_token_kind_not(); + return 24 +end; + +proc _lexer_token_kind_return(); + return 24 +end; + +proc _lexer_token_kind_module(); + return 25 +end; + +proc 
_lexer_token_kind_program();
+  return 26
+end;
+
+(*
+ * Each _lexer_token_kind_* procedure below returns the numeric code of one
+ * kind of token. Where a classifier in this file produces the kind, the
+ * comment names the classifier and the characters involved.
+ *)
+
+proc _lexer_token_kind_import();
+  return 27
+end;
+
+proc _lexer_token_kind_cast();
+  return 28
+end;
+
+proc _lexer_token_kind_defer();
+  return 29
+end;
+
+proc _lexer_token_kind_case();
+  return 30
+end;
+
+proc _lexer_token_kind_of();
+  return 31
+end;
+
+proc _lexer_token_kind_trait();
+  return 32
+end;
+
+(* "(" in _lexer_classify_finalize. *)
+proc _lexer_token_kind_left_paren();
+  return 33
+end;
+
+(* ")" in _lexer_classify_single. *)
+proc _lexer_token_kind_right_paren();
+  return 34
+end;
+
+proc _lexer_token_kind_left_square();
+  return 35
+end;
+
+proc _lexer_token_kind_right_square();
+  return 36
+end;
+
+proc _lexer_token_kind_shift_left();
+  return 37
+end;
+
+proc _lexer_token_kind_shift_right();
+  return 38
+end;
+
+(* ">" followed by "=" in _lexer_classify_composite. *)
+proc _lexer_token_kind_greater_equal();
+  return 39
+end;
+
+(* "<" followed by "=" in _lexer_classify_composite. *)
+proc _lexer_token_kind_less_equal();
+  return 40
+end;
+
+(* ">" in _lexer_classify_finalize. *)
+proc _lexer_token_kind_greater_than();
+  return 41
+end;
+
+(* "<" in _lexer_classify_finalize. *)
+proc _lexer_token_kind_less_than();
+  return 42
+end;
+
+(* "<" followed by ">" in _lexer_classify_composite. *)
+proc _lexer_token_kind_not_equal();
+  return 43
+end;
+
+(* "=" in _lexer_classify_single. *)
+proc _lexer_token_kind_equals();
+  return 44
+end;
+
+(* ";" in _lexer_classify_single. *)
+proc _lexer_token_kind_semicolon();
+  return 45
+end;
+
+(* "." in _lexer_classify_finalize and _lexer_classify_single. *)
+proc _lexer_token_kind_dot();
+  return 46
+end;
+
+(* "," in _lexer_classify_single. *)
+proc _lexer_token_kind_comma();
+  return 47
+end;
+
+(* "+" in _lexer_classify_single. *)
+proc _lexer_token_kind_plus();
+  return 48
+end;
+
+proc _lexer_token_kind_arrow();
+  return 49
+end;
+
+(* "-" in _lexer_classify_finalize. *)
+proc _lexer_token_kind_minus();
+  return 50
+end;
+
+(* "*" in _lexer_classify_single. *)
+proc _lexer_token_kind_multiplication();
+  return 51
+end;
+
+(* "/" in _lexer_classify_single. *)
+proc _lexer_token_kind_division();
+  return 52
+end;
+
+(* "%" in _lexer_classify_single. *)
+proc _lexer_token_kind_remainder();
+  return 53
+end;
+
+(* A composite token starting with ":" in _lexer_classify_composite. *)
+proc _lexer_token_kind_assignment();
+  return 54
+end;
+
+(* ":" in _lexer_classify_finalize. *)
+proc _lexer_token_kind_colon();
+  return 55
+end;
+
+(* "^" in _lexer_classify_single. *)
+proc _lexer_token_kind_hat();
+  return 56
+end;
+
+(* "@" in _lexer_classify_single. *)
+proc _lexer_token_kind_at();
+  return 57
+end;
+
+(* Delimited token starting with "(" in _lexer_classify_delimited. *)
+proc _lexer_token_kind_comment();
+  return 58
+end;
+
+(* Delimited token starting with a double quote in _lexer_classify_delimited. *)
+proc _lexer_token_kind_string();
+  return 59
+end;
+
+(* Delimited token starting with a single quote in _lexer_classify_delimited. *)
+proc _lexer_token_kind_character();
+  return 60
+end;
+
+(* Returned by _lexer_classify_integer. *)
+proc _lexer_token_kind_integer();
+  return 61
+end;
+
+proc
_lexer_token_kind_word();
+  return 62
+end;
+
+proc _lexer_token_kind_goto();
+  return 63
+end;
+
+(* Stored by _lexer_execute_action for the eof action. *)
+proc _lexer_token_kind_eof();
+  return 64
+end;
+
+(**
+ * Compares two byte sequences given as pointer and length pairs.
+ *
+ * Parameters:
+ * lhs_pointer - Start of the first sequence.
+ * lhs_length - Length of the first sequence.
+ * rhs_pointer - Start of the second sequence.
+ * rhs_length - Length of the second sequence.
+ *
+ * Returns 0 if the lengths differ, otherwise the value of the comparison
+ * "_memcmp(...) = 0" over lhs_length bytes. Callers test the result against 1.
+ *)
+proc _lexer_compare_keyword(lhs_pointer: Word, lhs_length: Word, rhs_pointer: Word, rhs_length: Word);
+var
+  result: Word;
+begin
+  result := 0;
+
+  if lhs_length = rhs_length then
+    result := _memcmp(lhs_pointer, rhs_pointer, lhs_length) = 0
+  end;
+  return result
+end;
+
+(**
+ * Decides whether the text between two positions is a keyword.
+ *
+ * Parameters:
+ * position_start - Address of the first character of the token.
+ * position_end - Address one past the last character of the token.
+ *
+ * Returns the token kind of the matching keyword or
+ * _lexer_token_kind_identifier() if no keyword matches.
+ *)
+proc _lexer_classify_keyword(position_start: Word, position_end: Word);
+var
+  result: Word;
+  token_length: Word;
+begin
+  result := _lexer_token_kind_identifier();
+  (* Subtraction is written as addition of the negated value. *)
+  token_length := position_end + -position_start;
+
+  (* The last argument must be the exact length of the keyword literal. *)
+  if _lexer_compare_keyword(position_start, token_length, "const", 5) = 1 then
+    result := _lexer_token_kind_const()
+  elsif _lexer_compare_keyword(position_start, token_length, "var", 3) = 1 then
+    result := _lexer_token_kind_var()
+  elsif _lexer_compare_keyword(position_start, token_length, "proc", 4) = 1 then
+    result := _lexer_token_kind_proc()
+  elsif _lexer_compare_keyword(position_start, token_length, "type", 4) = 1 then
+    result := _lexer_token_kind_type()
+  elsif _lexer_compare_keyword(position_start, token_length, "begin", 5) = 1 then
+    result := _lexer_token_kind_begin()
+  elsif _lexer_compare_keyword(position_start, token_length, "end", 3) = 1 then
+    result := _lexer_token_kind_end()
+  elsif _lexer_compare_keyword(position_start, token_length, "return", 6) = 1 then
+    result := _lexer_token_kind_return()
+  elsif _lexer_compare_keyword(position_start, token_length, "goto", 4) = 1 then
+    result := _lexer_token_kind_goto()
+  elsif _lexer_compare_keyword(position_start, token_length, "if", 2) = 1 then
+    result := _lexer_token_kind_if()
+  elsif _lexer_compare_keyword(position_start, token_length, "while", 5) = 1 then
+    result := _lexer_token_kind_while()
+  elsif _lexer_compare_keyword(position_start, token_length, "then", 4) = 1 then
+    result := _lexer_token_kind_then()
+  elsif _lexer_compare_keyword(position_start, token_length, "else", 4) = 1 then
+    result := _lexer_token_kind_else()
+  elsif _lexer_compare_keyword(position_start, token_length, "elsif", 5) = 1 then
+    result := _lexer_token_kind_elsif()
+  elsif _lexer_compare_keyword(position_start, token_length, "or", 2) = 1 then
+    result := _lexer_token_kind_or()
+  (* "xor" is three characters long. With the former length of 2 the keyword *)
+  (* was never recognised and the two letter identifier "xo" was. *)
+  elsif _lexer_compare_keyword(position_start, token_length, "xor", 3) = 1 then
+    result := _lexer_token_kind_xor()
+  end;
+  return result
+end;
+
+(**
+ * Classifies a token by its first character when the finalize action runs.
+ *
+ * Parameters:
+ * start_position - Address of the first character of the token.
+ *
+ * Returns the token kind or 0 if the character is not handled here.
+ *)
+proc _lexer_classify_finalize(start_position: Word);
+var
+  character: Word;
+  result: Word;
+begin
+  result := 0;
+  character := _load_byte(start_position);
+
+  if character = ':' then
+    result := _lexer_token_kind_colon()
+  elsif character = '.' then
+    result := _lexer_token_kind_dot()
+  elsif character = '(' then
+    result := _lexer_token_kind_left_paren()
+  elsif character = '-' then
+    result := _lexer_token_kind_minus()
+  elsif character = '<' then
+    result := _lexer_token_kind_less_than()
+  elsif character = '>' then
+    result := _lexer_token_kind_greater_than()
+  end;
+  return result
+end;
+
+(**
+ * Classifies a token consisting of a single character.
+ *
+ * Parameters:
+ * start_position - Address of the character.
+ *
+ * Returns the token kind or 0 if the character is not handled here.
+ *)
+proc _lexer_classify_single(start_position: Word);
+var
+  character: Word;
+  result: Word;
+begin
+  result := 0;
+  character := _load_byte(start_position);
+
+  if character = ';' then
+    result := _lexer_token_kind_semicolon()
+  elsif character = ',' then
+    result := _lexer_token_kind_comma()
+  elsif character = ')' then
+    result := _lexer_token_kind_right_paren()
+  elsif character = '@' then
+    result := _lexer_token_kind_at()
+  elsif character = '~' then
+    result := _lexer_token_kind_not()
+  elsif character = '&' then
+    result := _lexer_token_kind_and()
+  elsif character = '+' then
+    result := _lexer_token_kind_plus()
+  elsif character = '*' then
+    result := _lexer_token_kind_multiplication()
+  elsif character = '=' then
+    result := _lexer_token_kind_equals()
+  elsif character = '%' then
+    result := _lexer_token_kind_remainder()
+  elsif character = '/' then
+    result := _lexer_token_kind_division()
+  elsif character = '.'
then
+    result := _lexer_token_kind_dot()
+  elsif character = '^' then
+    result := _lexer_token_kind_hat()
+  end;
+  return result
+end;
+
+(**
+ * Classifies a token built from two characters.
+ *
+ * Parameters:
+ * start_position - Address of the first character of the token.
+ * one_before_last - Address of the last character of the token.
+ *
+ * Returns the token kind or 0 if the pair is not handled here.
+ *)
+proc _lexer_classify_composite(start_position: Word, one_before_last: Word);
+var
+  first_character: Word;
+  last_character: Word;
+  result: Word;
+begin
+  (* Start from 0 like the other classifiers so that an unhandled pair *)
+  (* does not return an uninitialized value. *)
+  result := 0;
+  first_character := _load_byte(start_position);
+  last_character := _load_byte(one_before_last);
+
+  if first_character = ':' then
+    result := _lexer_token_kind_assignment()
+  elsif first_character = '<' then
+    if last_character = '=' then
+      result := _lexer_token_kind_less_equal()
+    elsif last_character = '>' then
+      result := _lexer_token_kind_not_equal()
+    end
+  elsif first_character = '>' then
+    if last_character = '=' then
+      result := _lexer_token_kind_greater_equal()
+    end
+  elsif first_character = '-' then
+    (* The minus state runs the composite action on ">", giving "->". *)
+    if last_character = '>' then
+      result := _lexer_token_kind_arrow()
+    end
+  end;
+
+  return result
+end;
+
+(**
+ * Classifies a token enclosed in delimiters by its opening character.
+ *
+ * Parameters:
+ * start_position - Address of the opening delimiter.
+ * end_position - Address one past the closing delimiter (not used for
+ *                the classification).
+ *
+ * Returns the token kind or 0 if the delimiter is not handled here.
+ *)
+proc _lexer_classify_delimited(start_position: Word, end_position: Word);
+var
+  delimiter: Word;
+  result: Word;
+begin
+  (* Start from 0 like the other classifiers. *)
+  result := 0;
+  delimiter := _load_byte(start_position);
+
+  if delimiter = '(' then
+    result := _lexer_token_kind_comment()
+  elsif delimiter = '\'' then
+    result := _lexer_token_kind_character()
+  elsif delimiter = '"' then
+    result := _lexer_token_kind_string()
+  end;
+  return result
+end;
+
+(**
+ * Classifies an integer token. Both parameters are currently unused.
+ *
+ * Returns _lexer_token_kind_integer().
+ *)
+proc _lexer_classify_integer(start_position: Word, end_position: Word);
+begin
+  return _lexer_token_kind_integer()
+end;
+
+(**
+ * Performs the action attached to a transition.
+ *
+ * Parameters:
+ * action_to_perform - One of the _lexer_action_* values.
+ * kind - Address where the token kind is written by the actions
+ *        that complete a token.
+ *)
+proc _lexer_execute_action(action_to_perform: Word, kind: Word);
+var
+  position_start: Word;
+  position_end: Word;
+  intermediate: Word;
+begin
+  (* Both positions are read before the action moves any pointer. *)
+  position_start := _lexer_global_get_start();
+  position_end := _lexer_global_get_end();
+
+  if action_to_perform = _lexer_action_none() then
+  elsif action_to_perform = _lexer_action_accumulate() then
+    (* Extend the token by the current character. *)
+    _lexer_global_set_end(position_end + 1)
+  elsif action_to_perform = _lexer_action_skip() then
+    (* Move both pointers past the current character. *)
+    _lexer_global_set_start(position_start + 1);
+    _lexer_global_set_end(position_end + 1)
+  elsif action_to_perform = _lexer_action_single() then
+    _lexer_global_set_end(position_end + 1);
+
+    intermediate := _lexer_classify_single(position_start);
+    _store_word(intermediate, kind)
+  elsif action_to_perform = _lexer_action_eof() then
+    intermediate := _lexer_token_kind_eof();
+    _store_word(intermediate, kind)
+  elsif action_to_perform = _lexer_action_finalize() then
+    (* The current character is not consumed. *)
+    intermediate := _lexer_classify_finalize(position_start);
+    _store_word(intermediate, kind)
+  elsif action_to_perform = _lexer_action_composite() then
+    _lexer_global_set_end(position_end + 1);
+
+    (* position_end still holds the address of the character just consumed. *)
+    intermediate := _lexer_classify_composite(position_start, position_end);
+    _store_word(intermediate, kind)
+  elsif action_to_perform = _lexer_action_key_id() then
+    intermediate := _lexer_classify_keyword(position_start, position_end);
+    _store_word(intermediate, kind)
+  elsif action_to_perform = _lexer_action_integer() then
+    intermediate := _lexer_classify_integer(position_start, position_end);
+    _store_word(intermediate, kind)
+  elsif action_to_perform = _lexer_action_delimited() then
+    _lexer_global_set_end(position_end + 1);
+
+    intermediate := _lexer_classify_delimited(position_start, position_end + 1);
+    _store_word(intermediate, kind)
+  end;
+end;
+
+(**
+ * Takes one step of the lexer: looks up the next transition, stores its
+ * target state as the current state and performs its action.
+ *
+ * Parameters:
+ * kind - Address passed on to _lexer_execute_action.
+ *
+ * Returns the new state.
+ *)
+proc _lexer_execute_transition(kind: Word);
+var
+  next_transition: Word;
+  next_state: Word;
+  global_state: Word;
+  action_to_perform: Word;
+begin
+  next_transition := _lexer_next_transition();
+  next_state := _lexer_transition_get_state(next_transition);
+  action_to_perform := _lexer_transition_get_action(next_transition);
+
+  global_state := _lexer_global_state();
+
+  _store_word(next_state, global_state);
+  _lexer_execute_action(action_to_perform, kind);
+
+  return next_state
+end;
+
+(**
+ * Executes transitions, recursively, until the end state is reached.
+ *
+ * Parameters:
+ * kind - Address passed on to _lexer_execute_transition.
+ *)
+proc _lexer_advance_token(kind: Word);
+begin
+  if _lexer_execute_transition(kind) <> _lexer_state_end() then
+    _lexer_advance_token(kind)
+  end
+end;
+
+(**
+ * Reads the next token and writes its type into the address in the kind parameter.
 + *)
+proc _lexer_read_token(kind: Word);
+begin
+  (* The reset moves the token end back to the token start, so reading *)
+  (* again without _lexer_skip_token starts from the same position. *)
+  _lexer_reset();
+  _lexer_advance_token(kind)
+end;
+
+(**
+ * Advances the token stream past the last read token.
+ *
+ * The token start is set to the current token end.
+ *)
+proc _lexer_skip_token();
+var
+  old_end: Word;
+begin
+  old_end := _lexer_global_get_end();
+  _lexer_global_set_start(old_end)
+end;
+
+(*
+ * Entry point.
+ *
+ * Initializes the lexer and the symbol table, reads the whole input into
+ * source_code, compiles it and exits with status 0.
+ *)
+proc _start();
+var
+  last_read: Word;
+  offset: Word;
+begin
+  _lexer_initialize();
+  _symbol_table_build();
+
+  (* Read the source from the standard input. *)
+  offset := @source_code;
+
+  .start_read;
+  (* Second argument is buffer size. When modifying it, update the source_code definition as well. *)
+  (* NOTE(review): the same size is passed on every iteration although offset *)
+  (* advances; confirm that source_code has room for repeated reads. *)
+  last_read := _read_file(offset, 81920);
+  if last_read > 0 then
+    offset := offset + last_read;
+    goto .start_read
+  end;
+  _compile();
+
+  _exit(0)
+end;