(* elna/source/main.elna *)
(* This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can
obtain one at https://mozilla.org/MPL/2.0/. *)
program;
import cstdio, cctype, common, command_line_interface, lexer;
type
SourceFile* = record
buffer: [1024]Char;
handle: ^FILE;
size: Word;
index: Word
end;
StringBuffer* = record
data: Pointer;
size: Word;
capacity: Word
end;
SourceCode = record
position: TextLocation;
input: Pointer;
empty: proc(Pointer) -> Bool;
advance: proc(Pointer);
head: proc(Pointer) -> Char
end;
Token* = record
kind: LexerKind;
value: union
int_value: Int;
string: String;
boolean_value: Bool;
char_value: Char
end
end;
Tokenizer* = record
length: Word;
data: ^Token
end;
(*
Standard procedures.
*)
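(* realloc wrapper that computes the allocation size from an element count and an element size. Unlike the C library reallocarray it does not check for multiplication overflow. *)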
proc reallocarray(ptr: Pointer, n: Word, size: Word) -> Pointer;
return realloc(ptr, n * size)
end;
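(* Return a view of count characters of string starting at start. No copy is made. *)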
proc substring(string: String, start: Word, count: Word) -> String;
return String(string.ptr + start, count)
end;
proc open_substring(string: String, start: Word) -> String;
return substring(string, start, string.length - start)
end;
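(* Copy a string into freshly allocated memory. *)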
proc string_dup(origin: String) -> String;
var
copy: ^Char;
begin
copy := cast(malloc(origin.length): ^Char);
strncpy(copy, origin.ptr, origin.length);
return String(copy, origin.length)
end;
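(* Create an empty string buffer with an initial capacity of 64 bytes. *)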
proc string_buffer_new() -> StringBuffer;
var
result: StringBuffer;
begin
result.capacity := 64u;
result.data := malloc(result.capacity);
result.size := 0u;
return result
end;
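(* Append a single character to the buffer, growing it in 1024 byte steps when the capacity is exhausted. *)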
proc string_buffer_push(buffer: ^StringBuffer, char: Char);
begin
if buffer^.size >= buffer^.capacity then
buffer^.capacity := buffer^.capacity + 1024u;
buffer^.data := realloc(buffer^.data, buffer^.capacity)
end;
cast(buffer^.data + buffer^.size: ^Char)^ := char;
buffer^.size := buffer^.size + 1u
end;
proc string_buffer_pop(buffer: ^StringBuffer, count: Word);
begin
buffer^.size := buffer^.size - count
end;
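(* Hand out the accumulated contents as a String and reset the buffer. The result still points into the buffer storage, so it has to be duplicated before the buffer is reused. *)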
proc string_buffer_clear(buffer: ^StringBuffer) -> String;
var
result: String;
begin
result := String(cast(buffer^.data: ^Char), buffer^.size);
buffer^.size := 0u;
return result
end;
(*
Source code stream procedures.
*)
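(* Open the given file for reading and wrap it into a SourceFile. Returns nil if the file cannot be opened. *)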
proc read_source(filename: ^Char) -> ^SourceFile;
var
result: ^SourceFile := nil;
file_handle: ^FILE;
begin
file_handle := fopen(filename, "rb\0".ptr);
if file_handle <> nil then
result := cast(malloc(#size(SourceFile)): ^SourceFile);
result^.handle := file_handle;
result^.size := 0u;
result^.index := 1u
end;
return result
end;
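(* Refill the buffer from the file once it has been consumed and report whether the input is exhausted. *)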
proc source_file_empty(source_input: Pointer) -> Bool;
var
source_file: ^SourceFile;
begin
source_file := cast(source_input: ^SourceFile);
if source_file^.index > source_file^.size then
source_file^.size := fread(cast(@source_file^.buffer: Pointer), 1u, 1024u, source_file^.handle);
source_file^.index := 1u
end;
return source_file^.size = 0u
end;
proc source_file_head(source_input: Pointer) -> Char;
var
source_file: ^SourceFile;
begin
source_file := cast(source_input: ^SourceFile);
return source_file^.buffer[source_file^.index]
end;
proc source_file_advance(source_input: Pointer);
var
source_file: ^SourceFile;
begin
source_file := cast(source_input: ^SourceFile);
source_file^.index := source_file^.index + 1u
end;
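(* The source_code_* procedures work on the generic SourceCode stream and dispatch through its procedure pointers, so the lexer does not depend on the concrete input. *)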
proc source_code_empty(source_code: ^SourceCode) -> Bool;
return source_code^.empty(source_code^.input)
end;
proc source_code_head(source_code: SourceCode) -> Char;
return source_code.head(source_code.input)
end;
proc source_code_advance(source_code: ^SourceCode);
begin
source_code^.advance(source_code^.input);
source_code^.position.column := source_code^.position.column + 1u
end;
proc source_code_break(source_code: ^SourceCode);
begin
source_code^.position.line := source_code^.position.line + 1u;
source_code^.position.column := 0u
end;
proc source_code_expect(source_code: ^SourceCode, expected: Char) -> Bool;
return ~source_code_empty(source_code) & source_code_head(source_code^) = expected
end;
(*
Token procedures.
*)
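(* Translate the character following a backslash into the character the escape sequence stands for. Returns false for an unknown escape. *)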
proc lexer_escape(escape: Char, result: ^Char) -> Bool;
var
successful: Bool;
begin
case escape of
'n':
result^ := '\n';
successful := true
| 'a':
result^ := '\a';
successful := true
| 'b':
result^ := '\b';
successful := true
| 't':
result^ := '\t';
successful := true
| 'f':
result^ := '\f';
successful := true
| 'r':
result^ := '\r';
successful := true
| 'v':
result^ := '\v';
successful := true
| '\\':
result^ := '\\';
successful := true
| '\'':
result^ := '\'';
successful := true
| '"':
result^ := '"';
successful := true
| '?':
result^ := '\?';
successful := true
| '0':
result^ := '\0';
successful := true
else
successful := false
end;
return successful
end;
(* Skip spaces. *)
proc lexer_spaces(source_code: ^SourceCode);
var
current: Char;
begin
while ~source_code_empty(source_code) & isspace(cast(source_code_head(source_code^): Int)) <> 0 do
current := source_code_head(source_code^);
if current = '\n' then
source_code_break(source_code)
end;
source_code_advance(source_code)
end
end;
(* Check whether a character is allowed in an identifier. *)
proc lexer_is_ident(char: Char) -> Bool;
return isalnum(cast(char: Int)) <> 0 or char = '_'
end;
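(* Consume an identifier and append its characters to token_content. *)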
proc lexer_identifier(source_code: ^SourceCode, token_content: ^StringBuffer);
begin
while ~source_code_empty(source_code) & lexer_is_ident(source_code_head(source_code^)) do
string_buffer_push(token_content, source_code_head(source_code^));
source_code_advance(source_code)
end
end;
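(* Consume a comment body up to and including the closing star and parenthesis; the closing marker itself is not stored. Returns false if the input ends before the comment is closed. *)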
proc lexer_comment(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
trailing: Word;
begin
trailing := 0u;
while ~source_code_empty(source_code) & trailing < 2u do
if source_code_head(source_code^) = '*' then
string_buffer_push(token_content, '*');
trailing := 1u
elsif source_code_head(source_code^) = ')' & trailing = 1u then
string_buffer_pop(token_content, 1u);
trailing := 2u
else
string_buffer_push(token_content, source_code_head(source_code^));
trailing := 0u
end;
source_code_advance(source_code)
end;
return trailing = 2u
end;
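(* Read a single character, resolving backslash escapes, and store it through token_content. Returns whether the character was valid. *)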
proc lexer_character(source_code: ^SourceCode, token_content: ^Char) -> Bool;
var
successful: Bool;
begin
successful := ~source_code_empty(source_code);
if successful then
if source_code_head(source_code^) = '\\' then
source_code_advance(source_code);
successful := ~source_code_empty(source_code) & lexer_escape(source_code_head(source_code^), token_content)
else
token_content^ := source_code_head(source_code^);
successful := true
end
end;
if successful then
source_code_advance(source_code)
end;
return successful
end;
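(* Read a double quoted string literal after its opening quote, accumulating the characters in token_content. Returns false on an invalid escape or an unterminated string. *)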
proc lexer_string(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
is_valid: Bool := true;
next_char: Char;
begin
while is_valid & ~source_code_empty(source_code) & source_code_head(source_code^) <> '"' do
is_valid := lexer_character(source_code, @next_char);
if is_valid then
string_buffer_push(token_content, next_char)
end
end;
if is_valid & source_code_expect(source_code, '"') then
source_code_advance(source_code)
else
is_valid := false
end;
return is_valid
end;
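(* Read a decimal integer literal. *)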
proc lexer_number(source_code: ^SourceCode, token_content: ^Int);
begin
token_content^ := 0;
while ~source_code_empty(source_code) & isdigit(cast(source_code_head(source_code^): Int)) <> 0 do
token_content^ := token_content^ * 10 + (cast(source_code_head(source_code^): Int) - cast('0': Int));
source_code_advance(source_code)
end
end;
(* Categorize an identifier. *)
proc lexer_categorize(token_content: String) -> Token;
var
current_token: Token;
begin
if token_content = "if" then
current_token.kind := LexerKind._if
elsif token_content = "then" then
current_token.kind := LexerKind._then
elsif token_content = "else" then
current_token.kind := LexerKind._else
elsif token_content = "elsif" then
current_token.kind := LexerKind._elsif
elsif token_content = "while" then
current_token.kind := LexerKind._while
elsif token_content = "do" then
current_token.kind := LexerKind._do
elsif token_content = "proc" then
current_token.kind := LexerKind._proc
elsif token_content = "begin" then
current_token.kind := LexerKind._begin
elsif token_content = "end" then
current_token.kind := LexerKind._end
elsif token_content = "extern" then
current_token.kind := LexerKind._extern
elsif token_content = "const" then
current_token.kind := LexerKind._const
elsif token_content = "var" then
current_token.kind := LexerKind._var
elsif token_content = "case" then
current_token.kind := LexerKind._case
elsif token_content = "of" then
current_token.kind := LexerKind._of
elsif token_content = "type" then
current_token.kind := LexerKind._type
elsif token_content = "record" then
current_token.kind := LexerKind._record
elsif token_content = "union" then
current_token.kind := LexerKind._union
elsif token_content = "true" then
current_token.kind := LexerKind.boolean;
current_token.value.boolean_value := true
elsif token_content = "false" then
current_token.kind := LexerKind.boolean;
current_token.value.boolean_value := false
elsif token_content = "nil" then
current_token.kind := LexerKind.null
elsif token_content = "or" then
current_token.kind := LexerKind._or
elsif token_content = "return" then
current_token.kind := LexerKind._return
elsif token_content = "cast" then
current_token.kind := LexerKind._cast
elsif token_content = "defer" then
current_token.kind := LexerKind._defer
elsif token_content = "program" then
current_token.kind := LexerKind._program
elsif token_content = "module" then
current_token.kind := LexerKind._module
elsif token_content = "import" then
current_token.kind := LexerKind._import
else
current_token.kind := LexerKind.identifier;
current_token.value.string := string_dup(token_content)
end;
return current_token
end;
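(* Append a token to the tokenizer, growing its array by one element. *)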
proc lexer_add_token(lexer: ^Tokenizer, token: Token);
var
new_length: Word;
begin
new_length := lexer^.length + 1u;
lexer^.data := cast(reallocarray(cast(lexer^.data: Pointer), new_length, #size(Token)): ^Token);
(lexer^.data + lexer^.length)^ := token;
lexer^.length := new_length
end;
(* Read the next token from the input. *)
proc lexer_next(source_code: SourceCode, token_buffer: ^StringBuffer) -> Token;
var
current_token: Token;
first_char: Char;
begin
current_token.kind := LexerKind.unknown;
first_char := source_code_head(source_code);
if isalpha(cast(first_char: Int)) <> 0 or first_char = '_' then
lexer_identifier(@source_code, token_buffer);
current_token := lexer_categorize(string_buffer_clear(token_buffer))
elsif first_char = '#' then
source_code_advance(@source_code);
lexer_identifier(@source_code, token_buffer);
current_token.kind := LexerKind.trait;
current_token.value.string := string_dup(string_buffer_clear(token_buffer))
elsif isdigit(cast(first_char: Int)) <> 0 then
lexer_number(@source_code, @current_token.value.int_value);
if source_code_expect(@source_code, 'u') then
current_token.kind := LexerKind.word;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.integer
end
elsif first_char = '(' then
source_code_advance(@source_code);
if source_code_empty(@source_code) then
current_token.kind := LexerKind.left_paren
elsif source_code_head(source_code) = '*' then
source_code_advance(@source_code);
if lexer_comment(@source_code, token_buffer) then
current_token.value.string := string_dup(string_buffer_clear(token_buffer));
current_token.kind := LexerKind.comment
else
current_token.kind := LexerKind.unknown
end
else
current_token.kind := LexerKind.left_paren
end
elsif first_char = ')' then
current_token.kind := LexerKind.right_paren;
source_code_advance(@source_code)
elsif first_char = '\'' then
source_code_advance(@source_code);
if lexer_character(@source_code, @current_token.value.char_value) & source_code_expect(@source_code, '\'') then
current_token.kind := LexerKind.character;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.unknown
end
elsif first_char = '"' then
source_code_advance(@source_code);
if lexer_string(@source_code, token_buffer) then
current_token.kind := LexerKind.string;
current_token.value.string := string_dup(string_buffer_clear(token_buffer))
else
current_token.kind := LexerKind.unknown
end
elsif first_char = '[' then
current_token.kind := LexerKind.left_square;
source_code_advance(@source_code)
elsif first_char = ']' then
current_token.kind := LexerKind.right_square;
source_code_advance(@source_code)
elsif first_char = '>' then
source_code_advance(@source_code);
if source_code_empty(@source_code) then
current_token.kind := LexerKind.greater_than
elsif source_code_head(source_code) = '=' then
current_token.kind := LexerKind.greater_equal;
source_code_advance(@source_code)
elsif source_code_head(source_code) = '>' then
current_token.kind := LexerKind.shift_right;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.greater_than
end
elsif first_char = '<' then
source_code_advance(@source_code);
if source_code_empty(@source_code) then
current_token.kind := LexerKind.less_than
elsif source_code_head(source_code) = '=' then
current_token.kind := LexerKind.less_equal;
source_code_advance(@source_code)
elsif source_code_head(source_code) = '<' then
current_token.kind := LexerKind.shift_left;
source_code_advance(@source_code)
elsif source_code_head(source_code) = '>' then
current_token.kind := LexerKind.not_equal;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.less_than
end
elsif first_char = '=' then
current_token.kind := LexerKind.equal;
source_code_advance(@source_code)
elsif first_char = ';' then
current_token.kind := LexerKind.semicolon;
source_code_advance(@source_code)
elsif first_char = '.' then
current_token.kind := LexerKind.dot;
source_code_advance(@source_code)
elsif first_char = ',' then
current_token.kind := LexerKind.comma;
source_code_advance(@source_code)
elsif first_char = '+' then
current_token.kind := LexerKind.plus;
source_code_advance(@source_code)
elsif first_char = '-' then
source_code_advance(@source_code);
if source_code_empty(@source_code) then
current_token.kind := LexerKind.minus
elsif source_code_head(source_code) = '>' then
current_token.kind := LexerKind.arrow;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.minus
end
elsif first_char = '*' then
current_token.kind := LexerKind.multiplication;
source_code_advance(@source_code)
elsif first_char = '/' then
current_token.kind := LexerKind.division;
source_code_advance(@source_code)
elsif first_char = '%' then
current_token.kind := LexerKind.remainder;
source_code_advance(@source_code)
elsif first_char = ':' then
source_code_advance(@source_code);
if source_code_empty(@source_code) then
current_token.kind := LexerKind.colon
elsif source_code_head(source_code) = '=' then
current_token.kind := LexerKind.assignment;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.colon
end
elsif first_char = '^' then
current_token.kind := LexerKind.hat;
source_code_advance(@source_code)
elsif first_char = '@' then
current_token.kind := LexerKind.at;
source_code_advance(@source_code)
elsif first_char = '!' then
current_token.kind := LexerKind.exclamation;
source_code_advance(@source_code)
elsif first_char = '&' then
current_token.kind := LexerKind.and;
source_code_advance(@source_code)
elsif first_char = '~' then
current_token.kind := LexerKind.not;
source_code_advance(@source_code)
elsif first_char = '|' then
current_token.kind := LexerKind.pipe;
source_code_advance(@source_code)
else
current_token.kind := LexerKind.unknown;
source_code_advance(@source_code)
end;
return current_token
end;
(* Split the source text into tokens. *)
proc lexer_text(source_code: SourceCode) -> Tokenizer;
var
current_token: Token;
token_buffer: StringBuffer;
lexer: Tokenizer;
begin
lexer := Tokenizer(0u, nil);
token_buffer := string_buffer_new();
lexer_spaces(@source_code);
while ~source_code_empty(@source_code) do
current_token := lexer_next(source_code, @token_buffer);
if current_token.kind <> LexerKind.unknown then
lexer_add_token(@lexer, current_token);
lexer_spaces(@source_code)
else
write_s("Lexical analysis error on \"");
write_c(source_code_head(source_code));
write_s("\".\n")
end
end;
return lexer
end;
(*
Parser.
*)
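(* parse currently only prints a readable representation of each token, which is useful for checking the lexer output. *)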
proc parse(tokens: ^Token, tokens_size: Word);
var
current_token: ^Token;
i: Word := 0u;
begin
while i < tokens_size do
current_token := tokens + i;
case current_token^.kind of
LexerKind._if:
write_s("IF")
| LexerKind._then:
write_s("THEN")
| LexerKind._else:
write_s("ELSE")
| LexerKind._elsif:
write_s("ELSIF")
| LexerKind._while:
write_s("WHILE")
| LexerKind._do:
write_s("DO")
| LexerKind._proc:
write_s("PROC")
| LexerKind._begin:
write_s("BEGIN")
| LexerKind._end:
write_s("END")
| LexerKind._extern:
write_s("EXTERN")
| LexerKind._const:
write_s("CONST")
| LexerKind._var:
write_s("VAR")
| LexerKind._case:
write_s("CASE")
| LexerKind._of:
write_s("OF")
| LexerKind._type:
write_s("TYPE")
| LexerKind._record:
write_s("RECORD")
| LexerKind._union:
write_s("UNION")
| LexerKind.pipe:
write_s("|")
| LexerKind.to:
write_s("TO")
| LexerKind.boolean:
write_s("BOOLEAN<");
write_b(current_token^.value.boolean_value);
write_c('>')
| LexerKind.null:
write_s("NIL")
| LexerKind.and:
write_s("&")
| LexerKind._or:
write_s("OR")
| LexerKind.not:
write_s("~")
| LexerKind._return:
write_s("RETURN")
| LexerKind._cast:
write_s("CAST")
| LexerKind.shift_left:
write_s("<<")
| LexerKind.shift_right:
write_s(">>")
| LexerKind.identifier:
write_c('<');
write_s(current_token^.value.string);
write_c('>')
| LexerKind.trait:
write_c('#');
write_s(current_token^.value.string)
| LexerKind.left_paren:
write_s("(")
| LexerKind.right_paren:
write_s(")")
| LexerKind.left_square:
write_s("[")
| LexerKind.right_square:
write_s("]")
| LexerKind.greater_equal:
write_s(">=")
| LexerKind.less_equal:
write_s("<=")
| LexerKind.greater_than:
write_s(">")
| LexerKind.less_than:
write_s("<")
| LexerKind.equal:
write_s("=")
| LexerKind.not_equal:
write_s("<>")
| LexerKind.semicolon:
write_c(';')
| LexerKind.dot:
write_c('.')
| LexerKind.comma:
write_c(',')
| LexerKind.plus:
write_c('+')
| LexerKind.minus:
write_c('-')
| LexerKind.multiplication:
write_c('*')
| LexerKind.division:
write_c('/')
| LexerKind.remainder:
write_c('%')
| LexerKind.assignment:
write_s(":=")
| LexerKind.colon:
write_c(':')
| LexerKind.hat:
write_c('^')
| LexerKind.at:
write_c('@')
| LexerKind.comment:
write_s("(* COMMENT *)")
| LexerKind.integer:
write_c('<');
write_i(current_token^.value.int_value);
write_c('>')
| LexerKind.word:
write_c('<');
write_i(current_token^.value.int_value);
write_s("u>")
| LexerKind.character:
write_c('<');
write_i(cast(current_token^.value.char_value: Int));
write_s("c>")
| LexerKind.string:
write_s("\"...\"")
| LexerKind._defer:
write_s("DEFER")
| LexerKind.exclamation:
write_c('!')
| LexerKind.arrow:
write_s("->")
| LexerKind._program:
write_s("PROGRAM")
| LexerKind._module:
write_s("MODULE")
| LexerKind._import:
write_s("IMPORT")
else
write_s("UNKNOWN<");
write_i(cast(current_token^.kind: Int));
write_c('>')
end;
write_c(' ');
i := i + 1u
end;
write_c('\n')
end;
(*
Compilation entry.
*)
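(* Run the stages requested on the command line: lexical analysis and, if asked for, the token dump in parse. *)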
proc compile_in_stages(command_line: ^CommandLine, source_code: SourceCode) -> Int;
var
return_code: Int := 0;
lexer: Tokenizer;
begin
if command_line^.lex or command_line^.parse then
lexer := lexer_text(source_code)
end;
if command_line^.parse then
parse(lexer.data, lexer.length)
end;
return return_code
end;
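(* Program driver: parse the command line, read the source file and run the requested compilation stages. Returns a non zero code on failure. *)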
proc process(argc: Int, argv: ^^Char) -> Int;
var
source_code: SourceCode;
command_line: ^CommandLine;
return_code: Int := 0;
source_file: ^SourceFile;
begin
command_line := parse_command_line(argc, argv);
if command_line = nil then
return_code := 2
end;
if return_code = 0 then
source_file := read_source(command_line^.input);
if source_file = nil then
perror(command_line^.input);
return_code := 3
end
end;
if return_code = 0 then
defer
fclose(source_file^.handle)
end;
source_code.position := TextLocation(1u, 1u);
source_code.input := cast(source_file: Pointer);
source_code.empty := source_file_empty;
source_code.head := source_file_head;
source_code.advance := source_file_advance;
return_code := compile_in_stages(command_line, source_code)
end;
return return_code
end;
return process(count, parameters)
end.