(* This Source Code Form is subject to the terms of the Mozilla Public License,
   v. 2.0. If a copy of the MPL was not distributed with this file, You can
   obtain one at https://mozilla.org/MPL/2.0/. *)
program;

import cstdio, cctype, common, command_line_interface, lexer;

type
  SourceFile* = record
    buffer: [1024]Char;
    handle: ^FILE;
    size: Word;
    index: Word
  end;
  StringBuffer* = record
    data: Pointer;
    size: Word;
    capacity: Word
  end;
  SourceCode = record
    position: TextLocation;
    input: Pointer;
    empty: proc(Pointer) -> Bool;
    advance: proc(Pointer);
    head: proc(Pointer) -> Char
  end;
  Token* = record
    kind: LexerKind;
    value: union
      int_value: Int;
      string: String;
      boolean_value: Bool;
      char_value: Char
    end
  end;
  Tokenizer* = record
    length: Word;
    data: ^Token
  end;

(* Standard procedures. *)
proc reallocarray(ptr: Pointer, n: Word, size: Word) -> Pointer;
  return realloc(ptr, n * size)
end;

proc substring(string: String, start: Word, count: Word) -> String;
  return String(string.ptr + start, count)
end;

proc open_substring(string: String, start: Word) -> String;
  return substring(string, start, string.length - start)
end;

proc string_dup(origin: String) -> String;
var copy: ^Char;
begin
  copy := cast(malloc(origin.length): ^Char);
  strncpy(copy, origin.ptr, origin.length);

  return String(copy, origin.length)
end;

proc string_buffer_new() -> StringBuffer;
var result: StringBuffer;
begin
  result.capacity := 64u;
  result.data := malloc(result.capacity);
  result.size := 0u;

  return result
end;

proc string_buffer_push(buffer: ^StringBuffer, char: Char);
begin
  if buffer^.size >= buffer^.capacity then
    buffer^.capacity := buffer^.capacity + 1024u;
    buffer^.data := realloc(buffer^.data, buffer^.capacity)
  end;
  cast(buffer^.data + buffer^.size: ^Char)^ := cast(char: Char);
  buffer^.size := buffer^.size + 1u
end;

proc string_buffer_pop(buffer: ^StringBuffer, count: Word);
begin
  buffer^.size := buffer^.size - count
end;

proc string_buffer_clear(buffer: ^StringBuffer) -> String;
var result: String;
begin
  result := String(cast(buffer^.data: ^Char), buffer^.size);
  buffer^.size := 0u;

  return result
end;
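
(* Illustrative sketch (editorial addition, not part of the original program):
   how the buffer helpers above are meant to be combined when collecting a
   token. Scanning the identifier "if" would amount to:

     buffer := string_buffer_new();
     string_buffer_push(@buffer, 'i');
     string_buffer_push(@buffer, 'f');
     content := string_buffer_clear(@buffer);

   string_buffer_clear returns a String view into the buffer's own storage and
   only resets the size, so the result has to be copied with string_dup if it
   must outlive the next push into the same buffer. *)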

(* Source code stream procedures. *)
proc read_source(filename: ^Char) -> ^SourceFile;
var
  result: ^SourceFile;
  file_handle: ^FILE;
begin
  result := nil;
  file_handle := fopen(filename, "rb\0".ptr);

  if file_handle <> nil then
    result := cast(malloc(#size(SourceFile)): ^SourceFile);
    result^.handle := file_handle;
    result^.size := 0u;
    result^.index := 1u
  end;

  return result
end;

proc source_file_empty(source_input: Pointer) -> Bool;
var source_file: ^SourceFile;
begin
  source_file := cast(source_input: ^SourceFile);

  if source_file^.index > source_file^.size then
    source_file^.size := fread(cast(@source_file^.buffer: Pointer), 1u, 1024u, source_file^.handle);
    source_file^.index := 1u
  end;

  return source_file^.size = 0u
end;

proc source_file_head(source_input: Pointer) -> Char;
var source_file: ^SourceFile;
begin
  source_file := cast(source_input: ^SourceFile);

  return source_file^.buffer[source_file^.index]
end;

proc source_file_advance(source_input: Pointer);
var source_file: ^SourceFile;
begin
  source_file := cast(source_input: ^SourceFile);
  source_file^.index := source_file^.index + 1u
end;

proc source_code_empty(source_code: ^SourceCode) -> Bool;
  return source_code^.empty(source_code^.input)
end;

proc source_code_head(source_code: SourceCode) -> Char;
  return source_code.head(source_code.input)
end;

proc source_code_advance(source_code: ^SourceCode);
begin
  source_code^.advance(source_code^.input);
  source_code^.position.column := source_code^.position.column + 1u
end;

proc source_code_break(source_code: ^SourceCode);
begin
  source_code^.position.line := source_code^.position.line + 1u;
  source_code^.position.column := 0u
end;

proc source_code_expect(source_code: ^SourceCode, expected: Char) -> Bool;
  return ~source_code_empty(source_code) & source_code_head(source_code^) = expected
end;

(* Token procedures. *)
proc lexer_escape(escape: Char, result: ^Char) -> Bool;
var successful: Bool;
begin
  if escape = 'n' then result^ := '\n'; successful := true
  elsif escape = 'a' then result^ := '\a'; successful := true
  elsif escape = 'b' then result^ := '\b'; successful := true
  elsif escape = 't' then result^ := '\t'; successful := true
  elsif escape = 'f' then result^ := '\f'; successful := true
  elsif escape = 'r' then result^ := '\r'; successful := true
  elsif escape = 'v' then result^ := '\v'; successful := true
  elsif escape = '\\' then result^ := '\\'; successful := true
  elsif escape = '\'' then result^ := '\''; successful := true
  elsif escape = '"' then result^ := '"'; successful := true
  elsif escape = '?' then result^ := '\?'; successful := true
  elsif escape = '0' then result^ := '\0'; successful := true
  else successful := false
  end;

  return successful
end;

(* Skip spaces. *)
proc lexer_spaces(source_code: ^SourceCode);
var current: Char;
begin
  while ~source_code_empty(source_code) & isspace(cast(source_code_head(source_code^): Int)) <> 0 do
    current := source_code_head(source_code^);

    if current = '\n' then
      source_code_break(source_code)
    end;
    source_code_advance(source_code)
  end
end;
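
(* Editorial note (not part of the original program): lexer_spaces is the only
   place that calls source_code_break, so line and column tracking assumes
   every newline is consumed as inter-token whitespace; newlines inside
   comments and string literals are advanced over without updating the
   position. *)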

(* Check whether the character is allowed in an identifier. *)
proc lexer_is_ident(char: Char) -> Bool;
  return isalnum(cast(char: Int)) <> 0 or char = '_'
end;

proc lexer_identifier(source_code: ^SourceCode, token_content: ^StringBuffer);
begin
  while ~source_code_empty(source_code) & lexer_is_ident(source_code_head(source_code^)) do
    string_buffer_push(token_content, source_code_head(source_code^));
    source_code_advance(source_code)
  end
end;

proc lexer_comment(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var trailing: Word;
begin
  trailing := 0u;

  while ~source_code_empty(source_code) & trailing < 2u do
    if source_code_head(source_code^) = '*' then
      string_buffer_push(token_content, '*');
      trailing := 1u
    elsif source_code_head(source_code^) = ')' & trailing = 1u then
      string_buffer_pop(token_content, 1u);
      trailing := 2u
    else
      string_buffer_push(token_content, source_code_head(source_code^));
      trailing := 0u
    end;
    source_code_advance(source_code)
  end;

  return trailing = 2u
end;

proc lexer_character(source_code: ^SourceCode, token_content: ^Char) -> Bool;
var successful: Bool;
begin
  successful := ~source_code_empty(source_code);

  if successful then
    if source_code_head(source_code^) = '\\' then
      source_code_advance(source_code);
      successful := ~source_code_empty(source_code) & lexer_escape(source_code_head(source_code^), token_content)
    else
      token_content^ := source_code_head(source_code^);
      successful := true
    end
  end;
  if successful then
    source_code_advance(source_code)
  end;

  return successful
end;

proc lexer_string(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
  is_valid: Bool;
  next_char: Char;
begin
  is_valid := true;

  while is_valid & ~source_code_empty(source_code) & source_code_head(source_code^) <> '"' do
    is_valid := lexer_character(source_code, @next_char);

    if is_valid then
      string_buffer_push(token_content, next_char)
    end
  end;
  if is_valid & source_code_expect(source_code, '"') then
    source_code_advance(source_code)
  else
    is_valid := false
  end;

  return is_valid
end;

proc lexer_number(source_code: ^SourceCode, token_content: ^Int);
begin
  token_content^ := 0;

  while ~source_code_empty(source_code) & isdigit(cast(source_code_head(source_code^): Int)) <> 0 do
    token_content^ := token_content^ * 10 + (cast(source_code_head(source_code^): Int) - cast('0': Int));
    source_code_advance(source_code)
  end
end;
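
(* Illustrative sketch (editorial addition, not part of the original program):
   lexer_number only accumulates decimal digits, so for the input "10u" it
   stops with the value 10 while 'u' is still the stream head; lexer_next
   below then consumes the 'u' and marks the token as a word rather than an
   integer. *)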

(* Categorize an identifier. *)
proc lexer_categorize(token_content: String) -> Token;
var current_token: Token;
begin
  if token_content = "if" then current_token.kind := LexerKind._if
  elsif token_content = "then" then current_token.kind := LexerKind._then
  elsif token_content = "else" then current_token.kind := LexerKind._else
  elsif token_content = "elsif" then current_token.kind := LexerKind._elsif
  elsif token_content = "while" then current_token.kind := LexerKind._while
  elsif token_content = "do" then current_token.kind := LexerKind._do
  elsif token_content = "proc" then current_token.kind := LexerKind._proc
  elsif token_content = "begin" then current_token.kind := LexerKind._begin
  elsif token_content = "end" then current_token.kind := LexerKind._end
  elsif token_content = "extern" then current_token.kind := LexerKind._extern
  elsif token_content = "const" then current_token.kind := LexerKind._const
  elsif token_content = "var" then current_token.kind := LexerKind._var
  elsif token_content = "case" then current_token.kind := LexerKind._case
  elsif token_content = "of" then current_token.kind := LexerKind._of
  elsif token_content = "type" then current_token.kind := LexerKind._type
  elsif token_content = "record" then current_token.kind := LexerKind._record
  elsif token_content = "union" then current_token.kind := LexerKind._union
  elsif token_content = "true" then
    current_token.kind := LexerKind.boolean;
    current_token.value.boolean_value := true
  elsif token_content = "false" then
    current_token.kind := LexerKind.boolean;
    current_token.value.boolean_value := false
  elsif token_content = "nil" then current_token.kind := LexerKind.null
  elsif token_content = "or" then current_token.kind := LexerKind._or
  elsif token_content = "return" then current_token.kind := LexerKind._return
  elsif token_content = "cast" then current_token.kind := LexerKind._cast
  elsif token_content = "defer" then current_token.kind := LexerKind._defer
  elsif token_content = "program" then current_token.kind := LexerKind._program
  elsif token_content = "module" then current_token.kind := LexerKind._module
  elsif token_content = "import" then current_token.kind := LexerKind._import
  else
    current_token.kind := LexerKind.identifier;
    current_token.value.string := string_dup(token_content)
  end;

  return current_token
end;

proc lexer_add_token(lexer: ^Tokenizer, token: Token);
var new_length: Word;
begin
  new_length := lexer^.length + 1u;
  lexer^.data := cast(reallocarray(cast(lexer^.data: Pointer), new_length, #size(Token)): ^Token);
  (lexer^.data + lexer^.length)^ := token;
  lexer^.length := new_length
end;
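
(* Illustrative sketch (editorial addition, not part of the original program):
   lexer_categorize turns a scanned word into either a keyword token or an
   identifier, e.g.

     lexer_categorize("while")  yields kind = LexerKind._while
     lexer_categorize("whale")  yields kind = LexerKind.identifier and
                                value.string set to a string_dup copy

   The keyword branches set only the kind; only the identifier branch (and the
   boolean literals) carry a payload in the value union. *)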

(* Read the next token from the input. *)
proc lexer_next(source_code: SourceCode, token_buffer: ^StringBuffer) -> Token;
var
  current_token: Token;
  first_char: Char;
begin
  current_token.kind := LexerKind.unknown;
  first_char := source_code_head(source_code);

  if isalpha(cast(first_char: Int)) <> 0 or first_char = '_' then
    lexer_identifier(@source_code, token_buffer);
    current_token := lexer_categorize(string_buffer_clear(token_buffer))
  elsif first_char = '#' then
    source_code_advance(@source_code);
    lexer_identifier(@source_code, token_buffer);

    current_token.kind := LexerKind.trait;
    current_token.value.string := string_dup(string_buffer_clear(token_buffer))
  elsif isdigit(cast(first_char: Int)) <> 0 then
    lexer_number(@source_code, @current_token.value.int_value);

    if source_code_expect(@source_code, 'u') then
      current_token.kind := LexerKind.word;
      source_code_advance(@source_code)
    else
      current_token.kind := LexerKind.integer
    end
  elsif first_char = '(' then
    source_code_advance(@source_code);

    if source_code_empty(@source_code) then
      current_token.kind := LexerKind.left_paren
    elsif source_code_head(source_code) = '*' then
      source_code_advance(@source_code);

      if lexer_comment(@source_code, token_buffer) then
        current_token.value.string := string_dup(string_buffer_clear(token_buffer));
        current_token.kind := LexerKind.comment
      else
        current_token.kind := LexerKind.unknown
      end
    else
      current_token.kind := LexerKind.left_paren
    end
  elsif first_char = ')' then
    current_token.kind := LexerKind.right_paren;
    source_code_advance(@source_code)
  elsif first_char = '\'' then
    source_code_advance(@source_code);

    if lexer_character(@source_code, @current_token.value.char_value) & source_code_expect(@source_code, '\'') then
      current_token.kind := LexerKind.character;
      source_code_advance(@source_code)
    else
      current_token.kind := LexerKind.unknown
    end
  elsif first_char = '"' then
    source_code_advance(@source_code);

    if lexer_string(@source_code, token_buffer) then
      current_token.kind := LexerKind.string;
      current_token.value.string := string_dup(string_buffer_clear(token_buffer))
    else
      current_token.kind := LexerKind.unknown
    end
  elsif first_char = '[' then
    current_token.kind := LexerKind.left_square;
    source_code_advance(@source_code)
  elsif first_char = ']' then
    current_token.kind := LexerKind.right_square;
    source_code_advance(@source_code)
  elsif first_char = '>' then
    source_code_advance(@source_code);

    if source_code_empty(@source_code) then
      current_token.kind := LexerKind.greater_than
    elsif source_code_head(source_code) = '=' then
      current_token.kind := LexerKind.greater_equal;
      source_code_advance(@source_code)
    elsif source_code_head(source_code) = '>' then
      current_token.kind := LexerKind.shift_right;
      source_code_advance(@source_code)
    else
      current_token.kind := LexerKind.greater_than
    end
  elsif first_char = '<' then
    source_code_advance(@source_code);

    if source_code_empty(@source_code) then
      current_token.kind := LexerKind.less_than
    elsif source_code_head(source_code) = '=' then
      current_token.kind := LexerKind.less_equal;
      source_code_advance(@source_code)
    elsif source_code_head(source_code) = '<' then
      current_token.kind := LexerKind.shift_left;
      source_code_advance(@source_code)
    elsif source_code_head(source_code) = '>' then
      current_token.kind := LexerKind.not_equal;
      source_code_advance(@source_code)
    else
      current_token.kind := LexerKind.less_than
    end
  elsif first_char = '=' then
    current_token.kind := LexerKind.equal;
    source_code_advance(@source_code)
  elsif first_char = ';' then
    current_token.kind := LexerKind.semicolon;
    source_code_advance(@source_code)
  elsif first_char = '.' then
    current_token.kind := LexerKind.dot;
    source_code_advance(@source_code)
  elsif first_char = ',' then
    current_token.kind := LexerKind.comma;
    source_code_advance(@source_code)
  elsif first_char = '+' then
    current_token.kind := LexerKind.plus;
    source_code_advance(@source_code)
  elsif first_char = '-' then
    source_code_advance(@source_code);

    if source_code_empty(@source_code) then
      current_token.kind := LexerKind.minus
    elsif source_code_head(source_code) = '>' then
      current_token.kind := LexerKind.arrow;
      source_code_advance(@source_code)
    else
      current_token.kind := LexerKind.minus
    end
  elsif first_char = '*' then
    current_token.kind := LexerKind.multiplication;
    source_code_advance(@source_code)
  elsif first_char = '/' then
    current_token.kind := LexerKind.division;
    source_code_advance(@source_code)
  elsif first_char = '%' then
    current_token.kind := LexerKind.remainder;
    source_code_advance(@source_code)
  elsif first_char = ':' then
    source_code_advance(@source_code);

    if source_code_empty(@source_code) then
      current_token.kind := LexerKind.colon
    elsif source_code_head(source_code) = '=' then
      current_token.kind := LexerKind.assignment;
      source_code_advance(@source_code)
    else
      current_token.kind := LexerKind.colon
    end
  elsif first_char = '^' then
    current_token.kind := LexerKind.hat;
    source_code_advance(@source_code)
  elsif first_char = '@' then
    current_token.kind := LexerKind.at;
    source_code_advance(@source_code)
  elsif first_char = '!' then
    current_token.kind := LexerKind.exclamation;
    source_code_advance(@source_code)
  elsif first_char = '&' then
    current_token.kind := LexerKind.and;
    source_code_advance(@source_code)
  elsif first_char = '~' then
    current_token.kind := LexerKind.not;
    source_code_advance(@source_code)
  elsif first_char = '|' then
    current_token.kind := LexerKind.pipe;
    source_code_advance(@source_code)
  else
    current_token.kind := LexerKind.unknown;
    source_code_advance(@source_code)
  end;

  return current_token
end;

(* Split the source text into tokens. *)
proc lexer_text(source_code: SourceCode) -> Tokenizer;
var
  current_token: Token;
  token_buffer: StringBuffer;
  lexer: Tokenizer;
begin
  lexer := Tokenizer(0u, nil);
  token_buffer := string_buffer_new();
  lexer_spaces(@source_code);

  while ~source_code_empty(@source_code) do
    current_token := lexer_next(source_code, @token_buffer);

    if current_token.kind <> LexerKind.unknown then
      lexer_add_token(@lexer, current_token);
      lexer_spaces(@source_code)
    else
      write_s("Lexical analysis error on \"");
      write_c(source_code_head(source_code));
      write_s("\".\n")
    end
  end;

  return lexer
end;
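
(* Illustrative sketch (editorial addition, not part of the original program):
   for the source text

     while i < 10u do

   lexer_text is expected to produce the token kinds
   _while, identifier ("i"), less_than, word (10), _do,
   with the separating spaces skipped by lexer_spaces. *)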

(* Parser. *)
proc parse(tokens: ^Token, tokens_size: Word);
var
  current_token: ^Token;
  i: Word;
begin
  i := 0u;

  while i < tokens_size do
    current_token := tokens + i;

    case current_token^.kind of
      LexerKind._if: write_s("IF")
    | LexerKind._then: write_s("THEN")
    | LexerKind._else: write_s("ELSE")
    | LexerKind._elsif: write_s("ELSIF")
    | LexerKind._while: write_s("WHILE")
    | LexerKind._do: write_s("DO")
    | LexerKind._proc: write_s("PROC")
    | LexerKind._begin: write_s("BEGIN")
    | LexerKind._end: write_s("END")
    | LexerKind._extern: write_s("EXTERN")
    | LexerKind._const: write_s("CONST")
    | LexerKind._var: write_s("VAR")
    | LexerKind._case: write_s("CASE")
    | LexerKind._of: write_s("OF")
    | LexerKind._type: write_s("TYPE")
    | LexerKind._record: write_s("RECORD")
    | LexerKind._union: write_s("UNION")
    | LexerKind.pipe: write_s("|")
    | LexerKind.to: write_s("TO")
    | LexerKind.boolean: write_s("BOOLEAN<"); write_b(current_token^.value.boolean_value); write_c('>')
    | LexerKind.null: write_s("NIL")
    | LexerKind.and: write_s("&")
    | LexerKind._or: write_s("OR")
    | LexerKind.not: write_s("~")
    | LexerKind._return: write_s("RETURN")
    | LexerKind._cast: write_s("CAST")
    | LexerKind.shift_left: write_s("<<")
    | LexerKind.shift_right: write_s(">>")
    | LexerKind.identifier: write_c('<'); write_s(current_token^.value.string); write_c('>')
    | LexerKind.trait: write_c('#'); write_s(current_token^.value.string)
    | LexerKind.left_paren: write_s("(")
    | LexerKind.right_paren: write_s(")")
    | LexerKind.left_square: write_s("[")
    | LexerKind.right_square: write_s("]")
    | LexerKind.greater_equal: write_s(">=")
    | LexerKind.less_equal: write_s("<=")
    | LexerKind.greater_than: write_s(">")
    | LexerKind.less_than: write_s("<")
    | LexerKind.equal: write_s("=")
    | LexerKind.not_equal: write_s("<>")
    | LexerKind.semicolon: write_c(';')
    | LexerKind.dot: write_c('.')
    | LexerKind.comma: write_c(',')
    | LexerKind.plus: write_c('+')
    | LexerKind.minus: write_c('-')
    | LexerKind.multiplication: write_c('*')
    | LexerKind.division: write_c('/')
    | LexerKind.remainder: write_c('%')
    | LexerKind.assignment: write_s(":=")
    | LexerKind.colon: write_c(':')
    | LexerKind.hat: write_c('^')
    | LexerKind.at: write_c('@')
    | LexerKind.comment: write_s("(* COMMENT *)")
    | LexerKind.integer: write_c('<'); write_i(current_token^.value.int_value); write_c('>')
    | LexerKind.word: write_c('<'); write_i(current_token^.value.int_value); write_s("u>")
    | LexerKind.character: write_c('<'); write_i(cast(current_token^.value.char_value: Int)); write_s("c>")
    | LexerKind.string: write_s("\"...\"")
    | LexerKind._defer: write_s("DEFER")
    | LexerKind.exclamation: write_c('!')
    | LexerKind.arrow: write_s("->")
    | LexerKind._program: write_s("PROGRAM")
    | LexerKind._module: write_s("MODULE")
    | LexerKind._import: write_s("IMPORT")
    else
      write_s("UNKNOWN<");
      write_i(cast(current_token^.kind: Int));
      write_c('>')
    end;
    write_c(' ');

    i := i + 1u
  end;
  write_c('\n')
end;
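
(* Illustrative sketch (editorial addition, not part of the original program):
   parse only prints a one-line summary of the token stream. For the tokens
   produced from "while i < 10u do" it would write

     WHILE <i> < <10u> DO

   followed by a newline; it does not build a syntax tree yet. *)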

(* Compilation entry. *)
proc compile_in_stages(command_line: ^CommandLine, source_code: SourceCode) -> Int;
var
  return_code: Int;
  lexer: Tokenizer;
begin
  return_code := 0;

  if command_line^.lex or command_line^.parse then
    lexer := lexer_text(source_code)
  end;
  if command_line^.parse then
    parse(lexer.data, lexer.length)
  end;

  return return_code
end;

proc process(argc: Int, argv: ^^Char) -> Int;
var
  source_code: SourceCode;
  command_line: ^CommandLine;
  return_code: Int;
  source_file: ^SourceFile;
begin
  return_code := 0;
  command_line := parse_command_line(argc, argv);

  if command_line = nil then
    return_code := 2
  end;
  if return_code = 0 then
    source_file := read_source(command_line^.input);

    if source_file = nil then
      perror(command_line^.input);
      return_code := 3
    end
  end;
  if return_code = 0 then
    defer
      fclose(source_file^.handle)
    end;

    source_code.position := TextLocation(1u, 1u);
    source_code.input := cast(source_file: Pointer);
    source_code.empty := source_file_empty;
    source_code.head := source_file_head;
    source_code.advance := source_file_advance;

    return_code := compile_in_stages(command_line, source_code)
  end;

  return return_code
end;

return process(count, parameters)
end.