(*
  Declarations shared by the lexer: seek constants, token categories,
  source positions, the buffered input file, a growable character buffer,
  the token record and the parsed command line.
*)
const
  (* Named after the C whence values taken by fseek; not used in the visible code. *)
  SEEK_SET* = 0;
  SEEK_CUR* = 1;
  SEEK_END* = 2;

type
  (* Token categories. The order matters: print_tokens prints the ordinal of unhandled kinds. *)
  TokenKind* = (
    unknown,
    identifier,
    _if,
    _then,
    _else,
    _elsif,
    _while,
    _do,
    _proc,
    _begin,
    _end,
    _extern,
    _const,
    _var,
    array,
    _of,
    _type,
    _record,
    _union,
    pointer,
    to,
    boolean,
    _nil,
    and,
    _or,
    not,
    _return,
    _cast,
    shift_left,
    shift_right,
    left_paren,
    right_paren,
    left_square,
    right_square,
    greater_equal,
    less_equal,
    greater_than,
    less_than,
    not_equal,
    equal,
    semicolon,
    dot,
    comma,
    plus,
    minus,
    multiplication,
    division,
    remainder,
    assignment,
    colon,
    hat,
    at,
    comment,
    integer,
    word,
    character,
    string,
    _defer,
    exclamation,
    arrow
  );
  (* Line and column inside the source text; make_position starts both at 1. *)
  Position* = record
    line: Word;
    column: Word
  end;
  (* A pair of positions, first and last. *)
  Location* = record
    first: Position;
    last: Position
  end;
  (*
    Input file read in chunks of up to 1024 characters. size is the count
    returned by the last fread and index is the position of the current
    character inside buffer, starting at 1.
  *)
  SourceFile* = record
    buffer: [1024]Char;
    handle: ^FILE;
    size: Word;
    index: Word
  end;
  (* Empty record standing in for the stream type used by the fopen family. *)
  FILE* = record
  end;
  (* Growable byte buffer: size bytes are in use out of capacity allocated. *)
  StringBuffer* = record
    data: ^Byte;
    size: Word;
    capacity: Word
  end;
  (*
    Character source driven through three callbacks that all receive the
    untyped input pointer: empty, advance and head.
  *)
  SourceCode = record
    position: Position;
    input: ^Byte;
    empty: proc(^Byte) -> Bool;
    advance: proc(^Byte);
    head: proc(^Byte) -> Char
  end;
  (* A lexed token: its kind, a payload whose active member depends on the kind, and a location. *)
  Token* = record
    kind: TokenKind;
    value: union
      int_value: Int;
      string: String;
      boolean_value: Bool;
      char_value: Char
    end;
    location: Location
  end;
  (* Result of parse_command_line: the input path and the two recognised flags. *)
  CommandLine* = record
    input: ^Char;
    tokenize: Bool;
    syntax_tree: Bool
  end;

(* External procedures.
*)
proc fopen(pathname: ^Char, mode: ^Char) -> ^FILE; extern
proc fclose(stream: ^FILE) -> Int; extern
proc fseek(stream: ^FILE, off: Int, whence: Int) -> Int; extern
proc rewind(stream: ^FILE); extern
proc ftell(stream: ^FILE) -> Int; extern
proc fread(ptr: ^Byte, size: Word, nmemb: Word, stream: ^FILE) -> Word; extern

proc write(fd: Int, buf: ^Byte, Word: Int) -> Int; extern

proc malloc(size: Word) -> ^Byte; extern
proc free(ptr: ^Byte); extern
proc calloc(nmemb: Word, size: Word) -> ^Byte; extern
proc realloc(ptr: ^Byte, size: Word) -> ^Byte; extern

proc memset(ptr: ^Char, c: Int, n: Int) -> ^Char; extern

proc strcmp(s1: ^Char, s2: ^Char) -> Int; extern
proc strncmp(s1: ^Char, s2: ^Char, n: Word) -> Int; extern
proc strncpy(dst: ^Char, src: ^Char, dsize: Word) -> ^Char; extern
proc strcpy(dst: ^Char, src: ^Char) -> ^Char; extern
proc strlen(ptr: ^Char) -> Word; extern

proc perror(s: ^Char); extern
proc exit(code: Int) -> !; extern

(*
  Standard procedures.

  reallocarray resizes ptr so that it can hold n elements of size bytes
  each by delegating to realloc. The product n times size is not checked
  for overflow.
*)
proc reallocarray(ptr: ^Byte, n: Word, size: Word) -> ^Byte;
begin
  return realloc(ptr, n * size)
end

(*
  Writes the characters of value to standard output.
  Descriptor 1 is used; the previous code wrote to descriptor 0, which is
  standard input and only works by accident on an interactive terminal.
*)
proc write_s(value: String);
begin
  write(1, cast(value.ptr: ^Byte), cast(value.length: Int))
end

(*
  Writes a zero terminated character sequence to standard output.
  The length is obtained with strlen.
*)
proc write_z(value: ^Char);
begin
  write(1, cast(value: ^Byte), cast(strlen(value): Int))
end

(* Writes the text true or false depending on value. *)
proc write_b(value: Bool);
begin
  if value then
    write_s("true")
  else
    write_s("false")
  end
end

(* Writes a single character to standard output. *)
proc write_c(value: Char);
begin
  write(1, cast(@value: ^Byte), 1)
end

(*
  Writes value in decimal notation.

  Digits are produced from the least significant one and stored from the
  end of a ten character buffer, indices 10 down to 1, then written in
  order. Ten digits are enough for a 32 bit integer, which is assumed here.

  Negative values are written with a leading minus sign. The digit is
  taken as the magnitude of the remainder, which assumes that the remainder
  has the sign of the dividend and that division truncates toward zero, as
  in C. The previous code added negative remainders to the character 0 and
  printed garbage for negative input.
*)
proc write_i(value: Int);
var
  digit: Int;
  n: Word;
  buffer: [10]Char;
begin
  n := 10u;

  if value = 0 then
    write_c('0')
  elsif value < 0 then
    write_c('-')
  end;
  while value <> 0 do
    digit := value % 10;
    if digit < 0 then
      digit := 0 - digit
    end;
    value := value / 10;

    buffer[n] := cast(cast('0': Int) + digit: Char);
    n := n - 1u
  end;
  while n < 10u do
    n := n + 1u;
    write_c(buffer[n])
  end
end

(* Writes an unsigned value by casting it to Int and delegating to write_i. *)
proc write_u(value: Word);
begin
  write_i(cast(value: Int))
end

(* Tells whether c is one of the decimal digits 0 to 9. *)
proc is_digit(c: Char) -> Bool;
var
  code: Int;
begin
  code := cast(c: Int);
  return code >= cast('0': Int) & code <= cast('9': Int)
end

(*
  Tells whether c is an ASCII letter, upper or lower case.
  The two letter ranges are tested separately. The previous single range
  from A to z also accepted the six characters between Z and a, namely the
  square brackets, backslash, hat, underscore and backtick, so brackets and
  hats were lexed as identifiers.
*)
proc is_alpha(c: Char) -> Bool;
var
  code: Int;
begin
  code := cast(c: Int);
  return (code >= cast('A': Int) & code <= cast('Z': Int))
    or (code >= cast('a': Int) & code <= cast('z': Int))
end

(* Tells whether c is a letter or a decimal digit. *)
proc is_alnum(c: Char) -> Bool;
begin
  return is_digit(c) or is_alpha(c)
end

(* Tells whether c is a blank, a line feed or a tab. *)
proc is_space(c: Char) -> Bool;
begin
  return c = ' ' or c = '\n' or c = '\t'
end

(*
  Returns a view of count characters of string beginning at offset start.
  No characters are copied and the bounds are not checked.
*)
proc substring(string: String, start: Word, count: Word) -> String;
begin
  return String(string.ptr + start, count)
end

(* Returns a view of string from offset start up to its end. *)
proc open_substring(string: String, start: Word) -> String;
begin
  return substring(string, start, string.length - start)
end

(*
  Returns a copy of origin in memory obtained from malloc.
  Exactly origin.length characters are copied; no terminating zero is added
  and the result of malloc is not checked.
*)
proc string_dup(origin: String) -> String;
var
  copy: ^Char;
begin
  copy := cast(malloc(origin.length): ^Char);
  strncpy(copy, origin.ptr, origin.length);

  return String(copy, origin.length)
end

(* Creates an empty buffer with an initial capacity of 64 bytes. *)
proc string_buffer_new() -> StringBuffer;
var
  result: StringBuffer;
begin
  result.capacity := 64u;
  result.data := malloc(result.capacity);
  result.size := 0u;

  return result
end

(*
  Appends char to buffer. When the buffer is full its capacity grows by
  1024 bytes through realloc, whose result is not checked.
*)
proc string_buffer_push(buffer: ^StringBuffer, char: Char);
begin
  if buffer^.size >= buffer^.capacity then
    buffer^.capacity := buffer^.capacity + 1024u;
    buffer^.data := realloc(buffer^.data, buffer^.capacity)
  end;
  (buffer^.data + buffer^.size)^ := cast(char: Byte);
  buffer^.size := buffer^.size + 1u
end

(*
  Drops the last count bytes of buffer. The caller has to make sure that
  count does not exceed the current size.
*)
proc string_buffer_pop(buffer: ^StringBuffer, count: Word);
begin
  buffer^.size := buffer^.size - count
end

(*
  Returns the current content of buffer as a string and resets the size
  to zero. The returned string still points into the storage of the buffer,
  so it has to be copied before the buffer is written again.
*)
proc string_buffer_clear(buffer: ^StringBuffer) -> String;
var
  result: String;
begin
  result := String(cast(buffer^.data: ^Char), buffer^.size);
  buffer^.size := 0u;
  return result
end

(* End of standard procedures.
*)
proc make_position() -> Position;
begin
  return Position(1u, 1u)
end

(*
  Opens filename for binary reading and wraps the stream in a freshly
  allocated SourceFile whose buffer is marked as exhausted, size 0 and
  index 1, so that the first emptiness check fills it.

  Returns nil when the file cannot be opened. The result is now set to nil
  up front; previously it was left uninitialised in that case although the
  caller compares it with nil.
*)
proc read_source(filename: ^Char) -> ^SourceFile;
var
  result: ^SourceFile;
  file_handle: ^FILE;
begin
  result := nil;
  file_handle := fopen(filename, "rb\0".ptr);

  if file_handle <> nil then
    result := cast(malloc(#size(SourceFile)): ^SourceFile);

    result^.handle := file_handle;
    result^.size := 0u;
    result^.index := 1u
  end;
  return result
end

(*
  Translates the character following a backslash into the character it
  denotes and stores it through result. Returns false, leaving the target
  untouched, when escape is not a known escape character.
*)
proc escape_char(escape: Char, result: ^Char) -> Bool;
var
  successful: Bool;
begin
  successful := true;

  if escape = 'n' then
    result^ := '\n'
  elsif escape = 'a' then
    result^ := '\a'
  elsif escape = 'b' then
    result^ := '\b'
  elsif escape = 't' then
    result^ := '\t'
  elsif escape = 'f' then
    result^ := '\f'
  elsif escape = 'r' then
    result^ := '\r'
  elsif escape = 'v' then
    result^ := '\v'
  elsif escape = '\\' then
    result^ := '\\'
  elsif escape = '\'' then
    result^ := '\''
  elsif escape = '"' then
    result^ := '"'
  elsif escape = '?' then
    result^ := '\?'
  elsif escape = '0' then
    result^ := '\0'
  else
    successful := false
  end;
  return successful
end

(*
  Emptiness callback for a SourceFile. When the index has moved past the
  buffered characters the buffer is refilled with up to 1024 bytes and the
  index restarts at 1. Returns true when the last read delivered nothing.
*)
proc source_file_empty(source_input: ^Byte) -> Bool;
var
  source_file: ^SourceFile;
begin
  source_file := cast(source_input: ^SourceFile);

  if source_file^.index > source_file^.size then
    source_file^.size := fread(cast(@source_file^.buffer: ^Byte), 1u, 1024u, source_file^.handle);
    source_file^.index := 1u
  end;
  return source_file^.size = 0u
end

(*
  Head callback for a SourceFile: returns the buffered character at the
  current index. No bounds are checked; the callers in this file run the
  emptiness check first.
*)
proc source_file_head(source_input: ^Byte) -> Char;
var
  source_file: ^SourceFile;
begin
  source_file := cast(source_input: ^SourceFile);

  return source_file^.buffer[source_file^.index]
end

(* Advance callback for a SourceFile: moves the index to the next character. *)
proc source_file_advance(source_input: ^Byte);
var
  source_file: ^SourceFile;
begin
  source_file := cast(source_input: ^SourceFile);

  source_file^.index := source_file^.index + 1u
end

(* Tells whether the source has no more characters, using its empty callback. *)
proc source_code_empty(source_code: ^SourceCode) -> Bool;
begin
  return source_code^.empty(source_code^.input)
end

(* Returns the current character of the source, using its head callback. *)
proc source_code_head(source_code: SourceCode) -> Char;
begin
  return source_code.head(source_code.input)
end

(*
  Consumes the current character and moves the column forward by one.
  The column used to be assigned to itself and therefore never changed.
  source_code_break sets the column to 0, so advancing past the line feed
  afterwards yields column 1 for the first character of the next line.
*)
proc source_code_advance(source_code: ^SourceCode);
begin
  source_code^.advance(source_code^.input);
  source_code^.position.column := source_code^.position.column + 1u
end

(* Registers a line break: increments the line and resets the column to 0. *)
proc source_code_break(source_code: ^SourceCode);
begin
  source_code^.position.line := source_code^.position.line + 1u;
  source_code^.position.column := 0u
end

(*
  Tells whether the source is not exhausted and its current character
  equals expected. Nothing is consumed.
*)
proc source_code_expect(source_code: ^SourceCode, expected: Char) -> Bool;
begin
  return ~source_code_empty(source_code) & source_code_head(source_code^) = expected
end

(* Consumes blanks, tabs and line feeds, counting every line feed as a line break. *)
proc skip_spaces(source_code: ^SourceCode);
begin
  while ~source_code_empty(source_code) & is_space(source_code_head(source_code^)) do
    if source_code_head(source_code^) = '\n' then
      source_code_break(source_code)
    end;
    source_code_advance(source_code)
  end
end

(* Tells whether char may appear inside an identifier: letter, digit or underscore. *)
proc is_ident(char: Char) -> Bool;
begin
  return is_alnum(char) or char = '_'
end

(*
  Appends the longest run of identifier characters at the head of the
  source to token_content and consumes it. An unused local was removed.
*)
proc lex_identifier(source_code: ^SourceCode, token_content: ^StringBuffer);
begin
  while ~source_code_empty(source_code) & is_ident(source_code_head(source_code^)) do
    string_buffer_push(token_content, source_code_head(source_code^));
    source_code_advance(source_code)
  end
end

(*
  Reads a comment body; the opening delimiter has to be consumed by the
  caller. Characters are collected in token_content until a star directly
  followed by a right parenthesis is found. That closing star is removed
  from the collected text again. Returns false when the input ends before
  the comment is closed.

  trailing is 0 in ordinary text, 1 right after a star and 2 once the
  closing delimiter has been consumed.
*)
proc lex_comment(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
  trailing: Word;
begin
  trailing := 0u;

  while ~source_code_empty(source_code) & trailing < 2u do
    if source_code_head(source_code^) = '*' then
      string_buffer_push(token_content, '*');
      trailing := 1u
    elsif source_code_head(source_code^) = ')' & trailing = 1u then
      string_buffer_pop(token_content, 1u);
      trailing := 2u
    else
      string_buffer_push(token_content, source_code_head(source_code^));
      trailing := 0u
    end;
    source_code_advance(source_code)
  end;
  return trailing = 2u
end

(*
  Reads one character, resolving a backslash escape if present, and stores
  it through token_content. Returns false on end of input or on an unknown
  escape; in the latter case the backslash is consumed but the offending
  character is left in the source.
*)
proc lex_character(source_code: ^SourceCode, token_content: ^Char) -> Bool;
var
  successful: Bool;
begin
  successful := ~source_code_empty(source_code);

  if successful then
    if source_code_head(source_code^) = '\\' then
      source_code_advance(source_code);

      successful := ~source_code_empty(source_code) & escape_char(source_code_head(source_code^), token_content)
    else
      token_content^ := source_code_head(source_code^);
      successful := true
    end
  end;
  if successful then
    source_code_advance(source_code)
  end;
  return successful
end

(*
  Reads a string body up to and including the closing double quote; the
  opening quote has to be consumed by the caller. The decoded characters
  are appended to token_content. Returns false on an invalid escape or when
  the input ends before the closing quote. Unused locals were removed.
*)
proc lex_string(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
  is_valid: Bool;
  next_char: Char;
begin
  is_valid := true;

  while is_valid & ~source_code_empty(source_code) & source_code_head(source_code^) <> '"' do
    is_valid := lex_character(source_code, @next_char);

    if is_valid then
      string_buffer_push(token_content, next_char)
    end
  end;
  if is_valid & source_code_expect(source_code, '"') then
    source_code_advance(source_code)
  else
    is_valid := false
  end;
  return is_valid
end

(*
  Reads a run of decimal digits and stores its value through token_content.
  The value is 0 when no digit is present; overflow is not detected.
*)
proc lex_number(source_code: ^SourceCode, token_content: ^Int);
begin
  token_content^ := 0;

  while ~source_code_empty(source_code) & is_digit(source_code_head(source_code^)) do
    token_content^ := token_content^ * 10 + (cast(source_code_head(source_code^): Int) - cast('0': Int));

    source_code_advance(source_code)
  end
end

(*
  Prints tokens_size tokens starting at tokens on one line, each followed
  by a blank, and ends the line. Keywords are printed in upper case,
  operators as their spelling, and literals and identifiers between angle
  brackets. A kind without its own branch is printed as its ordinal.
*)
proc print_tokens(tokens: ^Token, tokens_size: Word);
var
  current_token: ^Token;
  i: Word;
begin
  i := 0u;

  while i < tokens_size do
    current_token := tokens + i;

    if current_token^.kind = TokenKind._if then
      write_s("IF")
    elsif current_token^.kind = TokenKind._then then
      write_s("THEN")
    elsif current_token^.kind = TokenKind._else then
      write_s("ELSE")
    elsif current_token^.kind = TokenKind._elsif then
      write_s("ELSIF")
    elsif current_token^.kind = TokenKind._while then
      write_s("WHILE")
    elsif current_token^.kind = TokenKind._do then
      write_s("DO")
    elsif current_token^.kind = TokenKind._proc then
      write_s("PROC")
    elsif current_token^.kind = TokenKind._begin then
      write_s("BEGIN")
    elsif current_token^.kind = TokenKind._end then
      write_s("END")
    elsif current_token^.kind = TokenKind._extern then
      write_s("EXTERN")
    elsif current_token^.kind = TokenKind._const then
      write_s("CONST")
    elsif current_token^.kind = TokenKind._var then
      write_s("VAR")
    elsif current_token^.kind = TokenKind.array then
      write_s("ARRAY")
    elsif current_token^.kind = TokenKind._of then
      write_s("OF")
    elsif current_token^.kind = TokenKind._type then
      write_s("TYPE")
    elsif current_token^.kind = TokenKind._record then
      write_s("RECORD")
    elsif current_token^.kind = TokenKind._union then
      write_s("UNION")
    elsif current_token^.kind = TokenKind.pointer then
      write_s("POINTER")
    elsif current_token^.kind = TokenKind.to then
      write_s("TO")
    elsif current_token^.kind = TokenKind.boolean then
      write_s("BOOLEAN<");
      write_b(current_token^.value.boolean_value);
      write_c('>')
    elsif current_token^.kind = TokenKind._nil then
      write_s("NIL")
    elsif current_token^.kind = TokenKind.and then
      write_s("AND")
    elsif current_token^.kind = TokenKind._or then
      write_s("OR")
    elsif current_token^.kind = TokenKind.not then
      write_s("NOT")
    elsif current_token^.kind = TokenKind._return then
      write_s("RETURN")
    elsif current_token^.kind = TokenKind._cast then
      write_s("CAST")
    elsif current_token^.kind = TokenKind.shift_left then
      write_s("<<")
    elsif current_token^.kind = TokenKind.shift_right then
      write_s(">>")
    elsif current_token^.kind = TokenKind.identifier then
      write_c('<');
      write_s(current_token^.value.string);
      write_c('>')
    elsif current_token^.kind = TokenKind.left_paren then
      write_s("(")
    elsif current_token^.kind = TokenKind.right_paren then
      write_s(")")
    elsif current_token^.kind = TokenKind.left_square then
      write_s("[")
    elsif current_token^.kind = TokenKind.right_square then
      write_s("]")
    elsif current_token^.kind = TokenKind.greater_equal then
      write_s(">=")
    elsif current_token^.kind = TokenKind.less_equal then
      write_s("<=")
    elsif current_token^.kind = TokenKind.greater_than then
      write_s(">")
    elsif current_token^.kind = TokenKind.less_than then
      write_s("<")
    elsif current_token^.kind = TokenKind.equal then
      write_s("=")
    elsif current_token^.kind = TokenKind.not_equal then
      write_s("<>")
    elsif current_token^.kind = TokenKind.semicolon then
      write_c(';')
    elsif current_token^.kind = TokenKind.dot then
      write_c('.')
    elsif current_token^.kind = TokenKind.comma then
      write_c(',')
    elsif current_token^.kind = TokenKind.plus then
      write_c('+')
    elsif current_token^.kind = TokenKind.minus then
      write_c('-')
    elsif current_token^.kind = TokenKind.multiplication then
      write_c('*')
    elsif current_token^.kind = TokenKind.division then
      write_c('/')
    elsif current_token^.kind = TokenKind.remainder then
      write_c('%')
    elsif current_token^.kind = TokenKind.assignment then
      write_s(":=")
    elsif current_token^.kind = TokenKind.colon then
      write_c(':')
    elsif current_token^.kind = TokenKind.hat then
      write_c('^')
    elsif current_token^.kind = TokenKind.at then
      write_c('@')
    elsif current_token^.kind = TokenKind.comment then
      write_s("(* COMMENT *)")
    elsif current_token^.kind = TokenKind.integer then
      write_c('<');
      write_i(current_token^.value.int_value);
      write_c('>')
    elsif current_token^.kind = TokenKind.word then
      write_c('<');
      write_i(current_token^.value.int_value);
      write_s("u>")
    elsif current_token^.kind = TokenKind.character then
      write_c('<');
      write_i(cast(current_token^.value.char_value: Int));
      write_s("c>")
    elsif current_token^.kind = TokenKind.string then
      write_s("\"...\"")
    elsif current_token^.kind = TokenKind._defer then
      write_s("DEFER")
    elsif current_token^.kind = TokenKind.exclamation then
      write_c('!')
    elsif current_token^.kind = TokenKind.arrow then
      write_s("->")
    else
      write_s("UNKNOWN<");
      write_i(cast(current_token^.kind: Int));
      write_c('>')
    end;
    write_c(' ');

    i := i + 1u
  end;
  write_c('\n')
end

(*
  Builds a token from the text of an identifier. Reserved words map to
  their own kinds, true and false become boolean tokens carrying the value,
  and anything else becomes an identifier token owning a copy of the text.
  Only the kind, and the payload where one applies, are set.
*)
proc categorize_identifier(token_content: String) -> Token;
var
  current_token: Token;
begin
  if "if" = token_content then
    current_token.kind := TokenKind._if
  elsif "then" = token_content then
    current_token.kind := TokenKind._then
  elsif "else" = token_content then
    current_token.kind := TokenKind._else
  elsif "elsif" = token_content then
    current_token.kind := TokenKind._elsif
  elsif "while" = token_content then
    current_token.kind := TokenKind._while
  elsif "do" = token_content then
    current_token.kind := TokenKind._do
  elsif "proc" = token_content then
    current_token.kind := TokenKind._proc
  elsif "begin" = token_content then
    current_token.kind := TokenKind._begin
  elsif "end" = token_content then
    current_token.kind := TokenKind._end
  elsif "extern" = token_content then
    current_token.kind := TokenKind._extern
  elsif "const" = token_content then
    current_token.kind := TokenKind._const
  elsif "var" = token_content then
    current_token.kind := TokenKind._var
  elsif "array" = token_content then
    current_token.kind := TokenKind.array
  elsif "of" = token_content then
    current_token.kind := TokenKind._of
  elsif "type" = token_content then
    current_token.kind := TokenKind._type
  elsif "record" = token_content then
    current_token.kind := TokenKind._record
  elsif "union" = token_content then
    current_token.kind := TokenKind._union
  elsif "pointer" = token_content then
    current_token.kind := TokenKind.pointer
  elsif "to" = token_content then
    current_token.kind := TokenKind.to
  elsif "true" = token_content then
    current_token.kind := TokenKind.boolean;
    current_token.value.boolean_value := true
  elsif "false" = token_content then
    current_token.kind := TokenKind.boolean;
    current_token.value.boolean_value := false
  elsif "nil" = token_content then
    current_token.kind := TokenKind._nil
  elsif "and" = token_content then
    current_token.kind := TokenKind.and
  elsif "or" = token_content then
    current_token.kind := TokenKind._or
  elsif "not" = token_content then
    current_token.kind := TokenKind.not
  elsif "return" = token_content then
    current_token.kind := TokenKind._return
  elsif "cast" = token_content then
    current_token.kind := TokenKind._cast
  elsif "defer" = token_content then
    current_token.kind := TokenKind._defer
  else
    current_token.kind := TokenKind.identifier;
    current_token.value.string := string_dup(token_content)
  end;
  return current_token
end

(*
  Splits the whole source into tokens. Returns a heap array that grows by
  one slot per iteration and stores the number of valid tokens through
  tokens_size. The result is nil for an input without any token.

  Text that cannot be lexed is reported on the output and produces no
  token; its slot is reused by the next token. Two things changed in the
  error path:
  - whitespace is now skipped after a failed token as well, so blanks
    following the error are no longer reported as errors of their own and
    line feeds there are counted;
  - the shared character buffer is emptied after a failure, so the partial
    text of a broken string or comment cannot leak into the next identifier.

  Every branch consumes at least one character, which guarantees that the
  loop terminates.
*)
proc tokenize(source_code: SourceCode, tokens_size: ^Word) -> ^Token;
var
  tokens, current_token: ^Token;
  first_char: Char;
  token_buffer: StringBuffer;
begin
  tokens_size^ := 0u;
  tokens := nil;
  token_buffer := string_buffer_new();

  skip_spaces(@source_code);

  while ~source_code_empty(@source_code) do
    tokens := cast(reallocarray(cast(tokens: ^Byte), tokens_size^ + 1u, #size(Token)): ^Token);
    current_token := tokens + tokens_size^;
    first_char := source_code_head(source_code);

    if is_alpha(first_char) or first_char = '_' then
      lex_identifier(@source_code, @token_buffer);
      current_token^ := categorize_identifier(string_buffer_clear(@token_buffer))
    elsif is_digit(first_char) then
      lex_number(@source_code, @current_token^.value.int_value);

      if source_code_expect(@source_code, 'u') then
        current_token^.kind := TokenKind.word;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.integer
      end
    elsif first_char = '(' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.left_paren
      elsif source_code_head(source_code) = '*' then
        source_code_advance(@source_code);

        if lex_comment(@source_code, @token_buffer) then
          current_token^.value.string := string_dup(string_buffer_clear(@token_buffer));
          current_token^.kind := TokenKind.comment
        else
          current_token^.kind := TokenKind.unknown
        end
      else
        current_token^.kind := TokenKind.left_paren
      end
    elsif first_char = ')' then
      current_token^.kind := TokenKind.right_paren;
      source_code_advance(@source_code)
    elsif first_char = '\'' then
      source_code_advance(@source_code);

      if lex_character(@source_code, @current_token^.value.char_value) & source_code_expect(@source_code, '\'') then
        current_token^.kind := TokenKind.character;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.unknown
      end
    elsif first_char = '"' then
      source_code_advance(@source_code);

      if lex_string(@source_code, @token_buffer) then
        current_token^.kind := TokenKind.string;
        current_token^.value.string := string_dup(string_buffer_clear(@token_buffer))
      else
        current_token^.kind := TokenKind.unknown
      end
    elsif first_char = '[' then
      current_token^.kind := TokenKind.left_square;
      source_code_advance(@source_code)
    elsif first_char = ']' then
      current_token^.kind := TokenKind.right_square;
      source_code_advance(@source_code)
    elsif first_char = '>' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.greater_than
      elsif source_code_head(source_code) = '=' then
        current_token^.kind := TokenKind.greater_equal;
        source_code_advance(@source_code)
      elsif source_code_head(source_code) = '>' then
        current_token^.kind := TokenKind.shift_right;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.greater_than
      end
    elsif first_char = '<' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.less_than
      elsif source_code_head(source_code) = '=' then
        current_token^.kind := TokenKind.less_equal;
        source_code_advance(@source_code)
      elsif source_code_head(source_code) = '<' then
        current_token^.kind := TokenKind.shift_left;
        source_code_advance(@source_code)
      elsif source_code_head(source_code) = '>' then
        current_token^.kind := TokenKind.not_equal;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.less_than
      end
    elsif first_char = '=' then
      current_token^.kind := TokenKind.equal;
      source_code_advance(@source_code)
    elsif first_char = ';' then
      current_token^.kind := TokenKind.semicolon;
      source_code_advance(@source_code)
    elsif first_char = '.' then
      current_token^.kind := TokenKind.dot;
      source_code_advance(@source_code)
    elsif first_char = ',' then
      current_token^.kind := TokenKind.comma;
      source_code_advance(@source_code)
    elsif first_char = '+' then
      current_token^.kind := TokenKind.plus;
      source_code_advance(@source_code)
    elsif first_char = '-' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.minus
      elsif source_code_head(source_code) = '>' then
        current_token^.kind := TokenKind.arrow;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.minus
      end
    elsif first_char = '*' then
      current_token^.kind := TokenKind.multiplication;
      source_code_advance(@source_code)
    elsif first_char = '/' then
      current_token^.kind := TokenKind.division;
      source_code_advance(@source_code)
    elsif first_char = '%' then
      current_token^.kind := TokenKind.remainder;
      source_code_advance(@source_code)
    elsif first_char = ':' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.colon
      elsif source_code_head(source_code) = '=' then
        current_token^.kind := TokenKind.assignment;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.colon
      end
    elsif first_char = '^' then
      current_token^.kind := TokenKind.hat;
      source_code_advance(@source_code)
    elsif first_char = '@' then
      current_token^.kind := TokenKind.at;
      source_code_advance(@source_code)
    elsif first_char = '!' then
      current_token^.kind := TokenKind.exclamation;
      source_code_advance(@source_code)
    else
      current_token^.kind := TokenKind.unknown;
      source_code_advance(@source_code)
    end;

    if current_token^.kind <> TokenKind.unknown then
      tokens_size^ := tokens_size^ + 1u
    else
      token_buffer.size := 0u;

      write_s("Lexical analysis error on \"");
      write_c(first_char);
      write_s("\".\n")
    end;
    skip_spaces(@source_code)
  end;

  return tokens
end

(*
  Interprets the program arguments, skipping the program name at index 0.
  Recognises the flags --tokenize and --syntax-tree; any other argument that
  does not start with a dash is taken as the input file, the last one
  winning. Returns nil after printing a message when an unknown option is
  met or when no input file was given. The record is now released on these
  error paths instead of being leaked.
*)
proc parse_command_line*(argc: Int, argv: ^^Char) -> ^CommandLine;
var
  parameter: ^^Char;
  i: Int;
  result: ^CommandLine;
begin
  i := 1;
  result := cast(malloc(#size(CommandLine)): ^CommandLine);
  result^.tokenize := false;
  result^.syntax_tree := false;
  result^.input := nil;

  while i < argc do
    parameter := argv + i;

    if strcmp(parameter^, "--tokenize\0".ptr) = 0 then
      result^.tokenize := true
    elsif strcmp(parameter^, "--syntax-tree\0".ptr) = 0 then
      result^.syntax_tree := true
    elsif parameter^^ <> '-' then
      result^.input := parameter^
    else
      write_s("Fatal error: Unknown command line options:");

      write_c(' ');
      write_z(parameter^);
      write_s(".\n");

      free(cast(result: ^Byte));
      return nil
    end;

    i := i + 1
  end;
  if result^.input = nil then
    write_s("Fatal error: no input files.\n");

    free(cast(result: ^Byte));
    return nil
  end;

  return result
end

(*
  Runs the program: parses the arguments, opens the input file, tokenizes
  it and prints the tokens when --tokenize was given. Returns the exit
  code: 0 on success, 2 for a command line error and 3 when the input file
  cannot be opened, in which case perror reports the reason.
*)
proc process(argc: Int, argv: ^^Char) -> Int;
var
  tokens: ^Token;
  tokens_size: Word;
  source_code: SourceCode;
  command_line: ^CommandLine;
  return_code: Int;
begin
  return_code := 0;

  command_line := parse_command_line(argc, argv);
  if command_line = nil then
    return_code := 2
  end;

  if return_code = 0 then
    source_code.position := make_position();

    source_code.input := cast(read_source(command_line^.input): ^Byte);
    source_code.empty := source_file_empty;
    source_code.head := source_file_head;
    source_code.advance := source_file_advance;

    if source_code.input = nil then
      perror(command_line^.input);
      return_code := 3
    end
  end;

  if return_code = 0 then
    tokens := tokenize(source_code, @tokens_size);

    fclose(cast(source_code.input: ^SourceFile)^.handle);

    if command_line^.tokenize then
      print_tokens(tokens, tokens_size)
    end
  end;
  return return_code
end

begin
  exit(process(count, parameters))
end.