diff options
Diffstat (limited to 'source/main.elna')
| -rw-r--r-- | source/main.elna | 841 |
1 files changed, 841 insertions, 0 deletions
diff --git a/source/main.elna b/source/main.elna new file mode 100644 index 0000000..dae045b --- /dev/null +++ b/source/main.elna @@ -0,0 +1,841 @@ +(* This Source Code Form is subject to the terms of the Mozilla Public License, + v. 2.0. If a copy of the MPL was not distributed with this file, You can + obtain one at https://mozilla.org/MPL/2.0/. *) +program; + +import cstdio, cctype, common, command_line_interface, lexer; + +type + SourceFile* = record + buffer: [1024]Char; + handle: ^FILE; + size: Word; + index: Word + end; + StringBuffer* = record + data: Pointer; + size: Word; + capacity: Word + end; + SourceCode = record + position: TextLocation; + + input: Pointer; + empty: proc(Pointer) -> Bool; + advance: proc(Pointer); + head: proc(Pointer) -> Char + end; + Token* = record + kind: LexerKind; + value: union + int_value: Int; + string: String; + boolean_value: Bool; + char_value: Char + end + end; + Tokenizer* = record + length: Word; + data: ^Token + end; + +(* + Standard procedures. +*) +proc reallocarray(ptr: Pointer, n: Word, size: Word) -> Pointer; + return realloc(ptr, n * size) +end; + +proc substring(string: String, start: Word, count: Word) -> String; + return String(string.ptr + start, count) +end; + +proc open_substring(string: String, start: Word) -> String; + return substring(string, start, string.length - start) +end; + +proc string_dup(origin: String) -> String; +var + copy: ^Char; +begin + copy := cast(malloc(origin.length): ^Char); + strncpy(copy, origin.ptr, origin.length); + + return String(copy, origin.length) +end; + +proc string_buffer_new() -> StringBuffer; +var + result: StringBuffer; +begin + result.capacity := 64u; + result.data := malloc(result.capacity); + result.size := 0u; + + return result +end; + +proc string_buffer_push(buffer: ^StringBuffer, char: Char); +begin + if buffer^.size >= buffer^.capacity then + buffer^.capacity := buffer^.capacity + 1024u; + buffer^.data := realloc(buffer^.data, buffer^.capacity) + end; + cast(buffer^.data + buffer^.size: ^Char)^ := cast(char: Char); + buffer^.size := buffer^.size + 1u +end; + +proc string_buffer_pop(buffer: ^StringBuffer, count: Word); +begin + buffer^.size := buffer^.size - count +end; + +proc string_buffer_clear(buffer: ^StringBuffer) -> String; +var + result: String; +begin + result := String(cast(buffer^.data: ^Char), buffer^.size); + buffer^.size := 0u; + return result +end; + +(* + Source code stream procedures. +*) + +proc read_source(filename: ^Char) -> ^SourceFile; +var + result: ^SourceFile; + file_handle: ^FILE; +begin + file_handle := fopen(filename, "rb\0".ptr); + + if file_handle <> nil then + result := cast(malloc(#size(SourceFile)): ^SourceFile); + result^.handle := file_handle; + result^.size := 0u; + result^.index := 1u + end; + return result +end; + +proc source_file_empty(source_input: Pointer) -> Bool; +var + source_file: ^SourceFile; +begin + source_file := cast(source_input: ^SourceFile); + + if source_file^.index > source_file^.size then + source_file^.size := fread(cast(@source_file^.buffer: Pointer), 1u, 1024u, source_file^.handle); + source_file^.index := 1u + end; + + return source_file^.size = 0u +end; + +proc source_file_head(source_input: Pointer) -> Char; +var + source_file: ^SourceFile; +begin + source_file := cast(source_input: ^SourceFile); + + return source_file^.buffer[source_file^.index] +end; + +proc source_file_advance(source_input: Pointer); +var + source_file: ^SourceFile; +begin + source_file := cast(source_input: ^SourceFile); + + source_file^.index := source_file^.index + 1u +end; + +proc source_code_empty(source_code: ^SourceCode) -> Bool; + return source_code^.empty(source_code^.input) +end; + +proc source_code_head(source_code: SourceCode) -> Char; + return source_code.head(source_code.input) +end; + +proc source_code_advance(source_code: ^SourceCode); +begin + source_code^.advance(source_code^.input); + source_code^.position.column := source_code^.position.column +end; + +proc source_code_break(source_code: ^SourceCode); +begin + source_code^.position.line := source_code^.position.line + 1u; + source_code^.position.column := 0u +end; + +proc source_code_expect(source_code: ^SourceCode, expected: Char) -> Bool; + return ~source_code_empty(source_code) & source_code_head(source_code^) = expected +end; + +(* + Token procedures. +*) + +proc lexer_escape(escape: Char, result: ^Char) -> Bool; +var + successful: Bool; +begin + case escape of + 'n': + result^ := '\n'; + successful := true + | 'a': + result^ := '\a'; + successful := true + | 'b': + result^ := '\b'; + successful := true + | 't': + result^ := '\t'; + successful := true + | 'f': + result^ := '\f'; + successful := true + | 'r': + result^ := '\r'; + successful := true + | 'v': + result^ := '\v'; + successful := true + | '\\': + result^ := '\\'; + successful := true + | '\'': + result^ := '\''; + successful := true + | '"': + result^ := '"'; + successful := true + | '?': + result^ := '\?'; + successful := true + | '0': + result^ := '\0'; + successful := true + else + successful := false + end; + return successful +end; + +(* Skip spaces. *) +proc lexer_spaces(source_code: ^SourceCode); +var + current: Char; +begin + while ~source_code_empty(source_code) & isspace(cast(source_code_head(source_code^): Int)) <> 0 do + current := source_code_head(source_code^); + + if current = '\n' then + source_code_break(source_code) + end; + source_code_advance(source_code) + end +end; + +(* Checker whether the character is allowed in an identificator. *) +proc lexer_is_ident(char: Char) -> Bool; + return isalnum(cast(char: Int)) <> 0 or char = '_' +end; + +proc lexer_identifier(source_code: ^SourceCode, token_content: ^StringBuffer); +var + content_length: Word; +begin + while ~source_code_empty(source_code) & lexer_is_ident(source_code_head(source_code^)) do + string_buffer_push(token_content, source_code_head(source_code^)); + source_code_advance(source_code) + end +end; + +proc lexer_comment(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool; +var + trailing: Word; +begin + trailing := 0u; + + while ~source_code_empty(source_code) & trailing < 2u do + if source_code_head(source_code^) = '*' then + string_buffer_push(token_content, '*'); + trailing := 1u + elsif source_code_head(source_code^) = ')' & trailing = 1u then + string_buffer_pop(token_content, 1u); + trailing := 2u + else + string_buffer_push(token_content, source_code_head(source_code^)); + trailing := 0u + end; + source_code_advance(source_code) + end; + + return trailing = 2u +end; + +proc lexer_character(source_code: ^SourceCode, token_content: ^Char) -> Bool; +var + successful: Bool; +begin + successful := ~source_code_empty(source_code); + + if successful then + if source_code_head(source_code^) = '\\' then + source_code_advance(source_code); + + successful := ~source_code_empty(source_code) & lexer_escape(source_code_head(source_code^), token_content) + else + token_content^ := source_code_head(source_code^); + successful := true + end + end; + if successful then + source_code_advance(source_code) + end; + return successful +end; + +proc lexer_string(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool; +var + token_end, constructed_string: ^Char; + token_length: Word; + is_valid: Bool := true; + next_char: Char; +begin + while is_valid & ~source_code_empty(source_code) & source_code_head(source_code^) <> '"' do + is_valid := lexer_character(source_code, @next_char); + + if is_valid then + string_buffer_push(token_content, next_char) + end + end; + + if is_valid & source_code_expect(source_code, '"') then + source_code_advance(source_code) + else + is_valid := false + end; + return is_valid +end; + +proc lexer_number(source_code: ^SourceCode, token_content: ^Int); +begin + token_content^ := 0; + + while ~source_code_empty(source_code) & isdigit(cast(source_code_head(source_code^): Int)) <> 0 do + token_content^ := token_content^ * 10 + (cast(source_code_head(source_code^): Int) - cast('0': Int)); + + source_code_advance(source_code) + end +end; + +(* Categorize an identifier. *) +proc lexer_categorize(token_content: String) -> Token; +var + current_token: Token; +begin + if token_content = "if" then + current_token.kind := LexerKind._if + elsif token_content = "then" then + current_token.kind := LexerKind._then + elsif token_content = "else" then + current_token.kind := LexerKind._else + elsif token_content = "elsif" then + current_token.kind := LexerKind._elsif + elsif token_content = "while" then + current_token.kind := LexerKind._while + elsif token_content = "do" then + current_token.kind := LexerKind._do + elsif token_content = "proc" then + current_token.kind := LexerKind._proc + elsif token_content = "begin" then + current_token.kind := LexerKind._begin + elsif token_content = "end" then + current_token.kind := LexerKind._end + elsif token_content = "extern" then + current_token.kind := LexerKind._extern + elsif token_content = "const" then + current_token.kind := LexerKind._const + elsif token_content = "var" then + current_token.kind := LexerKind._var + elsif token_content = "case" then + current_token.kind := LexerKind._case + elsif token_content = "of" then + current_token.kind := LexerKind._of + elsif token_content = "type" then + current_token.kind := LexerKind._type + elsif token_content = "record" then + current_token.kind := LexerKind._record + elsif token_content = "union" then + current_token.kind := LexerKind._union + elsif token_content = "true" then + current_token.kind := LexerKind.boolean; + current_token.value.boolean_value := true + elsif token_content = "false" then + current_token.kind := LexerKind.boolean; + current_token.value.boolean_value := false + elsif token_content = "nil" then + current_token.kind := LexerKind.null + elsif token_content = "or" then + current_token.kind := LexerKind._or + elsif token_content = "return" then + current_token.kind := LexerKind._return + elsif token_content = "cast" then + current_token.kind := LexerKind._cast + elsif token_content = "defer" then + current_token.kind := LexerKind._defer + elsif token_content = "program" then + current_token.kind := LexerKind._program + elsif token_content = "module" then + current_token.kind := LexerKind._module + elsif token_content = "import" then + current_token.kind := LexerKind._import + else + current_token.kind := LexerKind.identifier; + current_token.value.string := string_dup(token_content) + end; + + return current_token +end; + +proc lexer_add_token(lexer: ^Tokenizer, token: Token); +var + new_length: Word; +begin + new_length := lexer^.length + 1u; + lexer^.data := cast(reallocarray(cast(lexer^.data: Pointer), new_length, #size(Token)): ^Token); + (lexer^.data + lexer^.length)^ := token; + lexer^.length := new_length +end; + +(* Read the next token from the input. *) +proc lexer_next(source_code: SourceCode, token_buffer: ^StringBuffer) -> Token; +var + current_token: Token; + first_char: Char; +begin + current_token.kind := LexerKind.unknown; + + first_char := source_code_head(source_code); + + if isalpha(cast(first_char: Int)) <> 0 or first_char = '_' then + lexer_identifier(@source_code, token_buffer); + current_token := lexer_categorize(string_buffer_clear(token_buffer)) + elsif first_char = '#' then + source_code_advance(@source_code); + lexer_identifier(@source_code, token_buffer); + + current_token.kind := LexerKind.trait; + current_token.value.string := string_dup(string_buffer_clear(token_buffer)) + elsif isdigit(cast(first_char: Int)) <> 0 then + lexer_number(@source_code, @current_token.value.int_value); + + if source_code_expect(@source_code, 'u') then + current_token.kind := LexerKind.word; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.integer + end + elsif first_char = '(' then + source_code_advance(@source_code); + + if source_code_empty(@source_code) then + current_token.kind := LexerKind.left_paren + elsif source_code_head(source_code) = '*' then + source_code_advance(@source_code); + + if lexer_comment(@source_code, token_buffer) then + current_token.value.string := string_dup(string_buffer_clear(token_buffer)); + current_token.kind := LexerKind.comment + else + current_token.kind := LexerKind.unknown + end + else + current_token.kind := LexerKind.left_paren + end + elsif first_char = ')' then + current_token.kind := LexerKind.right_paren; + source_code_advance(@source_code) + elsif first_char = '\'' then + source_code_advance(@source_code); + + if lexer_character(@source_code, @current_token.value.char_value) & source_code_expect(@source_code, '\'') then + current_token.kind := LexerKind.character; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.unknown + end + elsif first_char = '"' then + source_code_advance(@source_code); + + if lexer_string(@source_code, token_buffer) then + current_token.kind := LexerKind.string; + current_token.value.string := string_dup(string_buffer_clear(token_buffer)) + else + current_token.kind := LexerKind.unknown + end + elsif first_char = '[' then + current_token.kind := LexerKind.left_square; + source_code_advance(@source_code) + elsif first_char = ']' then + current_token.kind := LexerKind.right_square; + source_code_advance(@source_code) + elsif first_char = '>' then + source_code_advance(@source_code); + + if source_code_empty(@source_code) then + current_token.kind := LexerKind.greater_than + elsif source_code_head(source_code) = '=' then + current_token.kind := LexerKind.greater_equal; + source_code_advance(@source_code) + elsif source_code_head(source_code) = '>' then + current_token.kind := LexerKind.shift_right; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.greater_than + end + elsif first_char = '<' then + source_code_advance(@source_code); + + if source_code_empty(@source_code) then + current_token.kind := LexerKind.less_than + elsif source_code_head(source_code) = '=' then + current_token.kind := LexerKind.less_equal; + source_code_advance(@source_code) + elsif source_code_head(source_code) = '<' then + current_token.kind := LexerKind.shift_left; + source_code_advance(@source_code) + elsif source_code_head(source_code) = '>' then + current_token.kind := LexerKind.not_equal; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.less_than + end + elsif first_char = '=' then + current_token.kind := LexerKind.equal; + source_code_advance(@source_code) + elsif first_char = ';' then + current_token.kind := LexerKind.semicolon; + source_code_advance(@source_code) + elsif first_char = '.' then + current_token.kind := LexerKind.dot; + source_code_advance(@source_code) + elsif first_char = ',' then + current_token.kind := LexerKind.comma; + source_code_advance(@source_code) + elsif first_char = '+' then + current_token.kind := LexerKind.plus; + source_code_advance(@source_code) + elsif first_char = '-' then + source_code_advance(@source_code); + + if source_code_empty(@source_code) then + current_token.kind := LexerKind.minus + elsif source_code_head(source_code) = '>' then + current_token.kind := LexerKind.arrow; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.minus + end + elsif first_char = '*' then + current_token.kind := LexerKind.multiplication; + source_code_advance(@source_code) + elsif first_char = '/' then + current_token.kind := LexerKind.division; + source_code_advance(@source_code) + elsif first_char = '%' then + current_token.kind := LexerKind.remainder; + source_code_advance(@source_code) + elsif first_char = ':' then + source_code_advance(@source_code); + + if source_code_empty(@source_code) then + current_token.kind := LexerKind.colon + elsif source_code_head(source_code) = '=' then + current_token.kind := LexerKind.assignment; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.colon + end + elsif first_char = '^' then + current_token.kind := LexerKind.hat; + source_code_advance(@source_code) + elsif first_char = '@' then + current_token.kind := LexerKind.at; + source_code_advance(@source_code) + elsif first_char = '!' then + current_token.kind := LexerKind.exclamation; + source_code_advance(@source_code) + elsif first_char = '&' then + current_token.kind := LexerKind.and; + source_code_advance(@source_code) + elsif first_char = '~' then + current_token.kind := LexerKind.not; + source_code_advance(@source_code) + elsif first_char = '|' then + current_token.kind := LexerKind.pipe; + source_code_advance(@source_code) + else + current_token.kind := LexerKind.unknown; + source_code_advance(@source_code) + end; + + return current_token +end; + +(* Split the source text into tokens. *) +proc lexer_text(source_code: SourceCode) -> Tokenizer; +var + current_token: Token; + token_buffer: StringBuffer; + lexer: Tokenizer; +begin + lexer := Tokenizer(0u, nil); + token_buffer := string_buffer_new(); + + lexer_spaces(@source_code); + + while ~source_code_empty(@source_code) do + current_token := lexer_next(source_code, @token_buffer); + + if current_token.kind <> LexerKind.unknown then + lexer_add_token(@lexer, current_token); + lexer_spaces(@source_code) + else + write_s("Lexical analysis error on \""); + write_c(source_code_head(source_code)); + write_s("\".\n") + end + end; + + return lexer +end; + +(* + Parser. +*) + +proc parse(tokens: ^Token, tokens_size: Word); +var + current_token: ^Token; + i: Word := 0u; +begin + while i < tokens_size do + current_token := tokens + i; + + case current_token^.kind of + LexerKind._if: + write_s("IF") + | LexerKind._then: + write_s("THEN") + | LexerKind._else: + write_s("ELSE") + | LexerKind._elsif: + write_s("ELSIF") + | LexerKind._while: + write_s("WHILE") + | LexerKind._do: + write_s("DO") + | LexerKind._proc: + write_s("PROC") + | LexerKind._begin: + write_s("BEGIN") + | LexerKind._end: + write_s("END") + | LexerKind._extern: + write_s("EXTERN") + | LexerKind._const: + write_s("CONST") + | LexerKind._var: + write_s("VAR") + | LexerKind._case: + write_s("CASE") + | LexerKind._of: + write_s("OF") + | LexerKind._type: + write_s("TYPE") + | LexerKind._record: + write_s("RECORD") + | LexerKind._union: + write_s("UNION") + | LexerKind.pipe: + write_s("|") + | LexerKind.to: + write_s("TO") + | LexerKind.boolean: + write_s("BOOLEAN<"); + write_b(current_token^.value.boolean_value); + write_c('>') + | LexerKind.null: + write_s("NIL") + | LexerKind.and: + write_s("&") + | LexerKind._or: + write_s("OR") + | LexerKind.not: + write_s("~") + | LexerKind._return: + write_s("RETURN") + | LexerKind._cast: + write_s("CAST") + | LexerKind.shift_left: + write_s("<<") + | LexerKind.shift_right: + write_s(">>") + | LexerKind.identifier: + write_c('<'); + write_s(current_token^.value.string); + write_c('>') + | LexerKind.trait: + write_c('#'); + write_s(current_token^.value.string) + | LexerKind.left_paren: + write_s("(") + | LexerKind.right_paren: + write_s(")") + | LexerKind.left_square: + write_s("[") + | LexerKind.right_square: + write_s("]") + | LexerKind.greater_equal: + write_s(">=") + | LexerKind.less_equal: + write_s("<=") + | LexerKind.greater_than: + write_s(">") + | LexerKind.less_than: + write_s("<") + | LexerKind.equal: + write_s("=") + | LexerKind.not_equal: + write_s("<>") + | LexerKind.semicolon: + write_c(';') + | LexerKind.dot: + write_c('.') + | LexerKind.comma: + write_c(',') + | LexerKind.plus: + write_c('+') + | LexerKind.minus: + write_c('-') + | LexerKind.multiplication: + write_c('*') + | LexerKind.division: + write_c('/') + | LexerKind.remainder: + write_c('%') + | LexerKind.assignment: + write_s(":=") + | LexerKind.colon: + write_c(':') + | LexerKind.hat: + write_c('^') + | LexerKind.at: + write_c('@') + | LexerKind.comment: + write_s("(* COMMENT *)") + | LexerKind.integer: + write_c('<'); + write_i(current_token^.value.int_value); + write_c('>') + | LexerKind.word: + write_c('<'); + write_i(current_token^.value.int_value); + write_s("u>") + | LexerKind.character: + write_c('<'); + write_i(cast(current_token^.value.char_value: Int)); + write_s("c>") + | LexerKind.string: + write_s("\"...\"") + | LexerKind._defer: + write_s("DEFER") + | LexerKind.exclamation: + write_c('!') + | LexerKind.arrow: + write_s("->") + | LexerKind._program: + write_s("PROGRAM") + | LexerKind._module: + write_s("MODULE") + | LexerKind._import: + write_s("IMPORT") + else + write_s("UNKNOWN<"); + write_i(cast(current_token^.kind: Int)); + write_c('>') + end; + write_c(' '); + + i := i + 1u + end; + write_c('\n') +end; + +(* + Compilation entry. +*) + +proc compile_in_stages(command_line: ^CommandLine, source_code: SourceCode) -> Int; +var + return_code: Int := 0; + lexer: Tokenizer; +begin + if command_line^.lex or command_line^.parse then + lexer := lexer_text(source_code) + end; + if command_line^.parse then + parse(lexer.data, lexer.length) + end; + + return return_code +end; + +proc process(argc: Int, argv: ^^Char) -> Int; +var + tokens: ^Token; + tokens_size: Word; + source_code: SourceCode; + command_line: ^CommandLine; + return_code: Int := 0; + source_file: ^SourceFile; +begin + command_line := parse_command_line(argc, argv); + if command_line = nil then + return_code := 2 + end; + + if return_code = 0 then + source_file := read_source(command_line^.input); + + if source_file = nil then + perror(command_line^.input); + return_code := 3 + end + end; + + if return_code = 0 then + defer + fclose(source_file^.handle) + end; + + source_code.position := TextLocation(1u, 1u); + source_code.input := cast(source_file: Pointer); + source_code.empty := source_file_empty; + source_code.head := source_file_head; + source_code.advance := source_file_advance; + + return_code := compile_in_stages(command_line, source_code) + end; + return return_code +end; + + return process(count, parameters) +end. |
