(* Seek origins. Nothing in the visible source refers to them; presumably they
   are the whence values accepted by fseek - TODO confirm. *)
const
  SEEK_SET* := 0
  SEEK_CUR* := 1
  SEEK_END* := 2

type
  (* Kinds of tokens the lexer can produce. The order matters: print_tokens
     prints the numeric value of kinds it has no name for. *)
  TokenKind* = (
    unknown,
    identifier,
    _if, _then, _else, _elsif,
    _while, _do,
    _proc, _begin, _end, _extern,
    _const, _var,
    array, _of, _type, _record, _union,
    pointer, to,
    boolean, _nil,
    and, _or, not,
    _return, _cast,
    shift_left, shift_right,
    left_paren, right_paren,
    left_square, right_square,
    greater_equal, less_equal, greater_than, less_than,
    not_equal, equal,
    semicolon, dot, comma,
    plus, minus, multiplication, division, remainder,
    assignment, colon, hat, at,
    comment,
    integer, word, character, string,
    _defer,
    exclamation, arrow
  )

  (* A line and column pair inside the source text. *)
  Position* = record
    line: Word;
    column: Word
  end

  (* A span of source text delimited by two positions. *)
  Location* = record
    first: Position;
    last: Position
  end

  (* State of a chunked file reader: the current chunk, the open handle, the
     number of characters held in the chunk and the index of the next
     character to hand out. *)
  SourceFile* = record
    buffer: [1024]Char;
    handle: ^FILE;
    size: Word;
    index: Word
  end

  (* Empty placeholder record; it is only ever used through ^FILE pointers
     passed to the external stream procedures. *)
  FILE* = record end

  (* Growable byte buffer: storage, number of bytes used, bytes allocated. *)
  StringBuffer* = record
    data: ^Byte;
    size: Word;
    capacity: Word
  end

  (* Character source used by the lexer. The three procedure fields receive
     the opaque input pointer and respectively test for end of input, step to
     the next character and return the current character. *)
  SourceCode = record
    position: Position;
    input: ^Byte;
    empty: proc(^Byte) -> Bool;
    advance: proc(^Byte);
    head: proc(^Byte) -> Char
  end

  (* A lexed token. Which member of value is meaningful depends on kind. *)
  Token* = record
    kind: TokenKind;
    value: union
      int_value: Int;
      string: String;
      boolean_value: Bool;
      char_value: Char
    end;
    location: Location
  end

  (* Parsed command line: the input path and the two recognised switches. *)
  CommandLine* = record
    input: ^Char;
    tokenize: Bool;
    syntax_tree: Bool
  end

(* External procedures.
*)
proc fopen(pathname: ^Char, mode: ^Char) -> ^FILE; extern
proc fclose(stream: ^FILE) -> Int; extern
proc fseek(stream: ^FILE, off: Int, whence: Int) -> Int; extern
proc rewind(stream: ^FILE); extern
proc ftell(stream: ^FILE) -> Int; extern
proc fread(ptr: ^Byte, size: Word, nmemb: Word, stream: ^FILE) -> Word; extern

(* The last parameter used to be named after the Word type; it is the number
   of bytes to write. Its declared type is left as Int because every caller
   casts the length to Int. *)
proc write(fd: Int, buf: ^Byte, nbyte: Int) -> Int; extern

proc malloc(size: Word) -> ^Byte; extern
proc free(ptr: ^Byte); extern
proc calloc(nmemb: Word, size: Word) -> ^Byte; extern
proc realloc(ptr: ^Byte, size: Word) -> ^Byte; extern

proc memset(ptr: ^Char, c: Int, n: Int) -> ^Char; extern

proc strcmp(s1: ^Char, s2: ^Char) -> Int; extern
proc strncmp(s1: ^Char, s2: ^Char, n: Word) -> Int; extern
proc strncpy(dst: ^Char, src: ^Char, dsize: Word) -> ^Char; extern
proc strcpy(dst: ^Char, src: ^Char) -> ^Char; extern
proc strlen(ptr: ^Char) -> Word; extern

proc perror(s: ^Char); extern
proc exit(code: Int) -> !; extern

(* Standard procedures. *)

(* Resizes ptr so that it can hold n elements of size bytes each.
   The product n * size is not checked for overflow. *)
proc reallocarray(ptr: ^Byte, n: Word, size: Word) -> ^Byte;
begin
  return realloc(ptr, n * size)
end

(* Writes a length-delimited string to standard output.
   File descriptor 1 is standard output; descriptor 0, used previously, is
   standard input and only appeared to work on an interactive terminal. *)
proc write_s(value: String);
begin
  write(1, cast(value.ptr: ^Byte), cast(value.length: Int))
end

(* Writes a zero-terminated string to standard output. *)
proc write_z(value: ^Char);
begin
  write(1, cast(value: ^Byte), cast(strlen(value): Int))
end

(* Writes "true" or "false". *)
proc write_b(value: Bool);
begin
  if value then
    write_s("true")
  else
    write_s("false")
  end
end

(* Writes a single character to standard output. *)
proc write_c(value: Char);
begin
  write(1, cast(@value: ^Byte), 1)
end

(* Writes a signed integer in decimal.
   Digits are produced from the least significant one, stored from the end of
   the buffer backwards and then written out front to back. The buffer holds
   ten digits and is indexed 10 down to 1 exactly as before - this assumes Int
   has at most ten decimal digits and 1-based array indexing, TODO confirm. *)
proc write_i(value: Int);
var
  digit: Int
  n: Word
  buffer: [10]Char
begin
  n := 10u;

  if value = 0 then
    write_c('0')
  elsif value < 0 then
    (* Negative values used to produce characters below '0' and no sign. *)
    write_c('-')
  end;
  while value <> 0 do
    digit := value % 10;
    (* The digit is negated rather than the value so that the smallest Int is
       handled too. Assumes the remainder takes the sign of the dividend and
       division truncates toward zero - TODO confirm. *)
    if digit < 0 then
      digit := 0 - digit
    end;
    value := value / 10;

    buffer[n] := cast(cast('0': Int) + digit: Char);
    n := n - 1u
  end;
  while n < 10u do
    n := n + 1u;
    write_c(buffer[n])
  end
end

(* Writes an unsigned value by converting it to Int first; values that do not
   fit into Int are therefore not printed correctly. *)
proc write_u(value: Word);
begin
  write_i(cast(value: Int))
end

(* Tells whether c is a decimal digit. *)
proc is_digit(c: Char) -> Bool;
begin
  return cast(c: Int) >= cast('0': Int) & cast(c: Int) <= cast('9': Int)
end

(* Tells whether c is an ASCII letter.
   The two ranges are tested separately: a single 'A' to 'z' range also covers
   the square brackets, the backslash, the hat, the underscore and the
   backtick, which made the lexer treat brackets and hats as identifiers. *)
proc is_alpha(c: Char) -> Bool;
var
  code: Int
begin
  code := cast(c: Int);

  return (code >= cast('A': Int) & code <= cast('Z': Int))
    or (code >= cast('a': Int) & code <= cast('z': Int))
end

(* Tells whether c is a letter or a digit. *)
proc is_alnum(c: Char) -> Bool;
begin
  return is_digit(c) or is_alpha(c)
end

(* Tells whether c is a space, a newline or a tab. *)
proc is_space(c: Char) -> Bool;
begin
  return c = ' ' or c = '\n' or c = '\t'
end

(* Returns a view of count characters of string beginning at offset start.
   No copy is made and the bounds are not checked. *)
proc substring(string: String, start: Word, count: Word) -> String;
begin
  return String(string.ptr + start, count)
end

(* Returns a view of string from offset start up to its end. *)
proc open_substring(string: String, start: Word) -> String;
begin
  return substring(string, start, string.length - start)
end

(* Returns a newly allocated copy of origin. The result of malloc is not
   checked. *)
proc string_dup(origin: String) -> String;
var
  copy: ^Char
begin
  copy := cast(malloc(origin.length): ^Char);
  strncpy(copy, origin.ptr, origin.length);

  return String(copy, origin.length)
end

(* Creates an empty buffer with room for 64 bytes. *)
proc string_buffer_new() -> StringBuffer;
var
  buffer: StringBuffer
begin
  buffer.size := 0u;
  buffer.capacity := 64u;
  buffer.data := malloc(buffer.capacity);

  return buffer
end

(* Appends char to the buffer, enlarging the storage by 1024 bytes when it is
   full. *)
proc string_buffer_push(buffer: ^StringBuffer, char: Char);
begin
  if buffer^.size >= buffer^.capacity then
    buffer^.capacity := buffer^.capacity + 1024u;
    buffer^.data := realloc(buffer^.data, buffer^.capacity)
  end;
  (buffer^.data + buffer^.size)^ := cast(char: Byte);
  buffer^.size := buffer^.size + 1u
end

(* Drops the last count bytes. Asking for more than the buffer holds empties
   it instead of letting the unsigned size wrap around. *)
proc string_buffer_pop(buffer: ^StringBuffer, count: Word);
begin
  if count > buffer^.size then
    buffer^.size := 0u
  else
    buffer^.size := buffer^.size - count
  end
end

(* Returns the accumulated content and marks the buffer as empty.
   The returned string points into the buffer storage, so it is overwritten by
   the next push; copy it if it has to survive. *)
proc string_buffer_clear(buffer: ^StringBuffer) -> String;
var
  content: String
begin
  content := String(cast(buffer^.data: ^Char), buffer^.size);
  buffer^.size := 0u;

  return content
end

(* End of standard procedures.
*)

(* Returns the position of the first character: line 1, column 1. *)
proc make_position() -> Position;
begin
  return Position(1u, 1u)
end

(* Opens filename for reading and allocates the reader state for it.
   Returns nil when the file cannot be opened. The result is initialised up
   front; it used to be left unset on failure, so the nil check done by the
   caller examined an indeterminate pointer. *)
proc read_source(filename: ^Char) -> ^SourceFile;
var
  result: ^SourceFile
  file_handle: ^FILE
begin
  result := nil;
  file_handle := fopen(filename, "rb\0".ptr);

  if file_handle <> nil then
    result := cast(malloc(#size(SourceFile)): ^SourceFile);
    result^.handle := file_handle;
    (* An empty chunk with the index past it forces a read on first use. *)
    result^.size := 0u;
    result^.index := 1u
  end;
  return result
end

(* Translates the character following a backslash into the character it
   denotes and stores it in result. Returns false, leaving result untouched,
   when escape is not a known escape character. *)
proc escape_char(escape: Char, result: ^Char) -> Bool;
var
  successful: Bool
begin
  successful := true;

  if escape = 'n' then
    result^ := '\n'
  elsif escape = 'a' then
    result^ := '\a'
  elsif escape = 'b' then
    result^ := '\b'
  elsif escape = 't' then
    result^ := '\t'
  elsif escape = 'f' then
    result^ := '\f'
  elsif escape = 'r' then
    result^ := '\r'
  elsif escape = 'v' then
    result^ := '\v'
  elsif escape = '\\' then
    result^ := '\\'
  elsif escape = '\'' then
    result^ := '\''
  elsif escape = '"' then
    result^ := '"'
  elsif escape = '?' then
    result^ := '\?'
  elsif escape = '0' then
    result^ := '\0'
  else
    successful := false
  end;
  return successful
end

(* Tells whether the file is exhausted. When the current chunk has been used
   up the next one is read first; an empty chunk means end of input. *)
proc source_file_empty(source_input: ^Byte) -> Bool;
var
  reader: ^SourceFile
begin
  reader := cast(source_input: ^SourceFile);

  if reader^.index > reader^.size then
    reader^.size := fread(cast(@reader^.buffer: ^Byte), 1u, 1024u, reader^.handle);
    reader^.index := 1u
  end;
  return reader^.size = 0u
end

(* Returns the current character of the chunk without consuming it. *)
proc source_file_head(source_input: ^Byte) -> Char;
var
  reader: ^SourceFile
begin
  reader := cast(source_input: ^SourceFile);

  return reader^.buffer[reader^.index]
end

(* Moves to the next character of the chunk. *)
proc source_file_advance(source_input: ^Byte);
var
  reader: ^SourceFile
begin
  reader := cast(source_input: ^SourceFile);

  reader^.index := reader^.index + 1u
end

(* Tells whether the source has no more characters. *)
proc source_code_empty(source_code: ^SourceCode) -> Bool;
begin
  return source_code^.empty(source_code^.input)
end

(* Returns the current character of the source. *)
proc source_code_head(source_code: SourceCode) -> Char;
begin
  return source_code.head(source_code.input)
end

(* Consumes the current character and moves the column forward by one.
   The column used to be assigned to itself and therefore never changed. *)
proc source_code_advance(source_code: ^SourceCode);
begin
  source_code^.advance(source_code^.input);
  source_code^.position.column := source_code^.position.column + 1u
end

(* Records a line break: next line, column 0. The caller still has to consume
   the newline character itself. *)
proc source_code_break(source_code: ^SourceCode);
begin
  source_code^.position.line := source_code^.position.line + 1u;
  source_code^.position.column := 0u
end

(* Tells whether the source is not exhausted and its current character is
   expected. Nothing is consumed. *)
proc source_code_expect(source_code: ^SourceCode, expected: Char) -> Bool;
begin
  return ~source_code_empty(source_code) & source_code_head(source_code^) = expected
end

(* Consumes spaces, tabs and newlines, keeping the line count up to date. *)
proc skip_spaces(source_code: ^SourceCode);
var
  current: Char
begin
  while ~source_code_empty(source_code), loop do
    current := source_code_head(source_code^);

    if ~is_space(current) then
      break loop
    elsif current = '\n' then
      source_code_break(source_code)
    end;
    source_code_advance(source_code)
  end
end

(* Tells whether char may appear inside an identifier. *)
proc is_ident(char: Char) -> Bool;
begin
  return is_alnum(char) or char = '_'
end

(* Appends the longest run of identifier characters to token_content. *)
proc lex_identifier(source_code: ^SourceCode, token_content: ^StringBuffer);
begin
  while ~source_code_empty(source_code) & is_ident(source_code_head(source_code^)) do
    string_buffer_push(token_content, source_code_head(source_code^));
    source_code_advance(source_code)
  end
end

(* Reads the body of a comment whose opening delimiter has been consumed.
   The text is collected in token_content without the closing delimiter.
   Returns true when the closing star and parenthesis were found and false
   when the input ended first. trailing is 1 right after a star and 2 once the
   comment has been closed. *)
proc lex_comment(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
  trailing: Word
begin
  trailing := 0u;

  while ~source_code_empty(source_code) & trailing < 2u do
    if source_code_head(source_code^) = '*' then
      string_buffer_push(token_content, '*');
      trailing := 1u
    elsif source_code_head(source_code^) = ')' & trailing = 1u then
      (* The star pushed just before belongs to the delimiter. *)
      string_buffer_pop(token_content, 1u);
      trailing := 2u
    else
      string_buffer_push(token_content, source_code_head(source_code^));
      trailing := 0u
    end;
    source_code_advance(source_code)
  end;
  return trailing = 2u
end

(* Reads one character or one backslash escape and stores it in token_content.
   Returns false on end of input or an unknown escape; in the latter case the
   backslash has been consumed but the character after it has not. *)
proc lex_character(source_code: ^SourceCode, token_content: ^Char) -> Bool;
var
  successful: Bool
begin
  successful := ~source_code_empty(source_code);

  if successful then
    if source_code_head(source_code^) = '\\' then
      source_code_advance(source_code);
      successful := ~source_code_empty(source_code)
        & escape_char(source_code_head(source_code^), token_content)
    else
      token_content^ := source_code_head(source_code^);
      successful := true
    end
  end;
  if successful then
    source_code_advance(source_code)
  end;
  return successful
end

(* Reads a string literal whose opening quote has been consumed, up to and
   including the closing quote. The characters are appended to token_content.
   Returns false on a bad escape or a missing closing quote; token_content
   then holds whatever was read up to that point. *)
proc lex_string(source_code: ^SourceCode, token_content: ^StringBuffer) -> Bool;
var
  is_valid: Bool
  next_char: Char
begin
  is_valid := true;

  while is_valid & ~source_code_empty(source_code) & source_code_head(source_code^) <> '"' do
    is_valid := lex_character(source_code, @next_char);

    if is_valid then
      string_buffer_push(token_content, next_char)
    end
  end;
  if is_valid & source_code_expect(source_code, '"') then
    source_code_advance(source_code)
  else
    is_valid := false
  end;
  return is_valid
end

(* Reads a run of decimal digits into token_content. Overflow is not
   detected. *)
proc lex_number(source_code: ^SourceCode, token_content: ^Int);
begin
  token_content^ := 0;

  while ~source_code_empty(source_code) & is_digit(source_code_head(source_code^)) do
    token_content^ := token_content^ * 10
      + (cast(source_code_head(source_code^): Int) - cast('0': Int));
    source_code_advance(source_code)
  end
end

(* Writes a one-line, space separated dump of tokens_size tokens followed by a
   newline. Kinds without a name of their own are printed with their numeric
   value. *)
proc print_tokens(tokens: ^Token, tokens_size: Word);
var
  current_token: ^Token
  i: Word
begin
  i := 0u;

  while i < tokens_size do
    current_token := tokens + i;

    case current_token^.kind of
      TokenKind._if: write_s("IF")
      | TokenKind._then: write_s("THEN")
      | TokenKind._else: write_s("ELSE")
      | TokenKind._elsif: write_s("ELSIF")
      | TokenKind._while: write_s("WHILE")
      | TokenKind._do: write_s("DO")
      | TokenKind._proc: write_s("PROC")
      | TokenKind._begin: write_s("BEGIN")
      | TokenKind._end: write_s("END")
      | TokenKind._extern: write_s("EXTERN")
      | TokenKind._const: write_s("CONST")
      | TokenKind._var: write_s("VAR")
      | TokenKind.array: write_s("ARRAY")
      | TokenKind._of: write_s("OF")
      | TokenKind._type: write_s("TYPE")
      | TokenKind._record: write_s("RECORD")
      | TokenKind._union: write_s("UNION")
      | TokenKind.pointer: write_s("POINTER")
      | TokenKind.to: write_s("TO")
      | TokenKind.boolean:
        write_s("BOOLEAN<");
        write_b(current_token^.value.boolean_value);
        write_c('>')
      | TokenKind._nil: write_s("NIL")
      | TokenKind.and: write_s("AND")
      | TokenKind._or: write_s("OR")
      | TokenKind.not: write_s("NOT")
      | TokenKind._return: write_s("RETURN")
      | TokenKind._cast: write_s("CAST")
      | TokenKind.shift_left: write_s("<<")
      | TokenKind.shift_right: write_s(">>")
      | TokenKind.identifier:
        write_c('<');
        write_s(current_token^.value.string);
        write_c('>')
      | TokenKind.left_paren: write_s("(")
      | TokenKind.right_paren: write_s(")")
      | TokenKind.left_square: write_s("[")
      | TokenKind.right_square: write_s("]")
      | TokenKind.greater_equal: write_s(">=")
      | TokenKind.less_equal: write_s("<=")
      | TokenKind.greater_than: write_s(">")
      | TokenKind.less_than: write_s("<")
      | TokenKind.equal: write_s("=")
      | TokenKind.not_equal: write_s("<>")
      | TokenKind.semicolon: write_c(';')
      | TokenKind.dot: write_c('.')
      | TokenKind.comma: write_c(',')
      | TokenKind.plus: write_c('+')
      | TokenKind.minus: write_c('-')
      | TokenKind.multiplication: write_c('*')
      | TokenKind.division: write_c('/')
      | TokenKind.remainder: write_c('%')
      | TokenKind.assignment: write_s(":=")
      | TokenKind.colon: write_c(':')
      | TokenKind.hat: write_c('^')
      | TokenKind.at: write_c('@')
      | TokenKind.comment: write_s("(* COMMENT *)")
      | TokenKind.integer:
        write_c('<');
        write_i(current_token^.value.int_value);
        write_c('>')
      | TokenKind.word:
        write_c('<');
        write_i(current_token^.value.int_value);
        write_s("u>")
      | TokenKind.character:
        write_c('<');
        write_i(cast(current_token^.value.char_value: Int));
        write_s("c>")
      | TokenKind.string: write_s("\"...\"")
      | TokenKind._defer: write_s("DEFER")
      | TokenKind.exclamation: write_c('!')
      | TokenKind.arrow: write_s("->")
    else
      write_s("UNKNOWN<");
      write_i(cast(current_token^.kind: Int));
      write_c('>')
    end;
    write_c(' ');

    i := i + 1u
  end;
  write_c('\n')
end

(* Turns a lexed word into a token: a keyword kind when the word is reserved,
   a boolean token for true and false, otherwise an identifier carrying its
   own copy of the text. Only boolean and identifier tokens get a value. *)
proc categorize_identifier(token_content: String) -> Token;
var
  current_token: Token
begin
  if "if" = token_content then
    current_token.kind := TokenKind._if
  elsif "then" = token_content then
    current_token.kind := TokenKind._then
  elsif "else" = token_content then
    current_token.kind := TokenKind._else
  elsif "elsif" = token_content then
    current_token.kind := TokenKind._elsif
  elsif "while" = token_content then
    current_token.kind := TokenKind._while
  elsif "do" = token_content then
    current_token.kind := TokenKind._do
  elsif "proc" = token_content then
    current_token.kind := TokenKind._proc
  elsif "begin" = token_content then
    current_token.kind := TokenKind._begin
  elsif "end" = token_content then
    current_token.kind := TokenKind._end
  elsif "extern" = token_content then
    current_token.kind := TokenKind._extern
  elsif "const" = token_content then
    current_token.kind := TokenKind._const
  elsif "var" = token_content then
    current_token.kind := TokenKind._var
  elsif "array" = token_content then
    current_token.kind := TokenKind.array
  elsif "of" = token_content then
    current_token.kind := TokenKind._of
  elsif "type" = token_content then
    current_token.kind := TokenKind._type
  elsif "record" = token_content then
    current_token.kind := TokenKind._record
  elsif "union" = token_content then
    current_token.kind := TokenKind._union
  elsif "pointer" = token_content then
    current_token.kind := TokenKind.pointer
  elsif "to" = token_content then
    current_token.kind := TokenKind.to
  elsif "true" = token_content then
    current_token.kind := TokenKind.boolean;
    current_token.value.boolean_value := true
  elsif "false" = token_content then
    current_token.kind := TokenKind.boolean;
    current_token.value.boolean_value := false
  elsif "nil" = token_content then
    current_token.kind := TokenKind._nil
  elsif "and" = token_content then
    current_token.kind := TokenKind.and
  elsif "or" = token_content then
    current_token.kind := TokenKind._or
  elsif "not" = token_content then
    current_token.kind := TokenKind.not
  elsif "return" = token_content then
    current_token.kind := TokenKind._return
  elsif "cast" = token_content then
    current_token.kind := TokenKind._cast
  elsif "defer" = token_content then
    current_token.kind := TokenKind._defer
  else
    current_token.kind := TokenKind.identifier;
    current_token.value.string := string_dup(token_content)
  end;
  return current_token
end

(* Splits the whole source into tokens.
   Returns a heap array of tokens and stores its length in tokens_size; the
   result is nil when no token slot was ever allocated. Input that cannot be
   lexed is reported and left out of the array. The location field of the
   tokens is not filled in. *)
proc tokenize(source_code: SourceCode, tokens_size: ^Word) -> ^Token;
var
  tokens, current_token: ^Token
  first_char: Char
  token_buffer: StringBuffer
begin
  tokens_size^ := 0u;
  tokens := nil;
  token_buffer := string_buffer_new();

  skip_spaces(@source_code);

  while ~source_code_empty(@source_code) do
    (* Make room for one more token. The slot is reused by the next iteration
       when the current input turns out to be invalid. *)
    tokens := cast(reallocarray(cast(tokens: ^Byte), tokens_size^ + 1u, #size(Token)): ^Token);
    current_token := tokens + tokens_size^;
    first_char := source_code_head(source_code);

    if is_alpha(first_char) or first_char = '_' then
      lex_identifier(@source_code, @token_buffer);
      current_token^ := categorize_identifier(string_buffer_clear(@token_buffer))
    elsif is_digit(first_char) then
      lex_number(@source_code, @current_token^.value.int_value);

      (* A trailing u marks an unsigned literal. *)
      if source_code_expect(@source_code, 'u') then
        current_token^.kind := TokenKind.word;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.integer
      end
    elsif first_char = '(' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.left_paren
      elsif source_code_head(source_code) = '*' then
        (* A parenthesis followed by a star opens a comment. *)
        source_code_advance(@source_code);

        if lex_comment(@source_code, @token_buffer) then
          current_token^.value.string := string_dup(string_buffer_clear(@token_buffer));
          current_token^.kind := TokenKind.comment
        else
          current_token^.kind := TokenKind.unknown
        end
      else
        current_token^.kind := TokenKind.left_paren
      end
    elsif first_char = ')' then
      current_token^.kind := TokenKind.right_paren;
      source_code_advance(@source_code)
    elsif first_char = '\'' then
      source_code_advance(@source_code);

      if lex_character(@source_code, @current_token^.value.char_value)
          & source_code_expect(@source_code, '\'') then
        current_token^.kind := TokenKind.character;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.unknown
      end
    elsif first_char = '"' then
      source_code_advance(@source_code);

      if lex_string(@source_code, @token_buffer) then
        current_token^.kind := TokenKind.string;
        current_token^.value.string := string_dup(string_buffer_clear(@token_buffer))
      else
        current_token^.kind := TokenKind.unknown
      end
    elsif first_char = '[' then
      current_token^.kind := TokenKind.left_square;
      source_code_advance(@source_code)
    elsif first_char = ']' then
      current_token^.kind := TokenKind.right_square;
      source_code_advance(@source_code)
    elsif first_char = '>' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.greater_than
      elsif source_code_head(source_code) = '=' then
        current_token^.kind := TokenKind.greater_equal;
        source_code_advance(@source_code)
      elsif source_code_head(source_code) = '>' then
        current_token^.kind := TokenKind.shift_right;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.greater_than
      end
    elsif first_char = '<' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.less_than
      elsif source_code_head(source_code) = '=' then
        current_token^.kind := TokenKind.less_equal;
        source_code_advance(@source_code)
      elsif source_code_head(source_code) = '<' then
        current_token^.kind := TokenKind.shift_left;
        source_code_advance(@source_code)
      elsif source_code_head(source_code) = '>' then
        current_token^.kind := TokenKind.not_equal;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.less_than
      end
    elsif first_char = '=' then
      current_token^.kind := TokenKind.equal;
      source_code_advance(@source_code)
    elsif first_char = ';' then
      current_token^.kind := TokenKind.semicolon;
      source_code_advance(@source_code)
    elsif first_char = '.' then
      current_token^.kind := TokenKind.dot;
      source_code_advance(@source_code)
    elsif first_char = ',' then
      current_token^.kind := TokenKind.comma;
      source_code_advance(@source_code)
    elsif first_char = '+' then
      current_token^.kind := TokenKind.plus;
      source_code_advance(@source_code)
    elsif first_char = '-' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.minus
      elsif source_code_head(source_code) = '>' then
        current_token^.kind := TokenKind.arrow;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.minus
      end
    elsif first_char = '*' then
      current_token^.kind := TokenKind.multiplication;
      source_code_advance(@source_code)
    elsif first_char = '/' then
      current_token^.kind := TokenKind.division;
      source_code_advance(@source_code)
    elsif first_char = '%' then
      current_token^.kind := TokenKind.remainder;
      source_code_advance(@source_code)
    elsif first_char = ':' then
      source_code_advance(@source_code);

      if source_code_empty(@source_code) then
        current_token^.kind := TokenKind.colon
      elsif source_code_head(source_code) = '=' then
        current_token^.kind := TokenKind.assignment;
        source_code_advance(@source_code)
      else
        current_token^.kind := TokenKind.colon
      end
    elsif first_char = '^' then
      current_token^.kind := TokenKind.hat;
      source_code_advance(@source_code)
    elsif first_char = '@' then
      current_token^.kind := TokenKind.at;
      source_code_advance(@source_code)
    elsif first_char = '!' then
      current_token^.kind := TokenKind.exclamation;
      source_code_advance(@source_code)
    else
      current_token^.kind := TokenKind.unknown;
      source_code_advance(@source_code)
    end;

    if current_token^.kind <> TokenKind.unknown then
      tokens_size^ := tokens_size^ + 1u
    else
      (* A failed comment or string leaves its partial text in the buffer;
         discard it so that it does not end up in front of the next token. *)
      token_buffer.size := 0u;

      write_s("Lexical analysis error on \"");
      write_c(first_char);
      write_s("\".\n")
    end;
    (* Whitespace is skipped after invalid input as well. Otherwise a blank
       following a bad token is reported as a second error and newlines in
       that stretch are not counted. *)
    skip_spaces(@source_code)
  end;
  (* Every stored string is a copy made by string_dup, so the scratch buffer
     is no longer referenced and can be released. *)
  free(token_buffer.data);

  return tokens
end

(* Interprets the program arguments, skipping the first one.
   Recognises --tokenize and --syntax-tree; any other argument that does not
   start with a dash is taken as the input file, the last one winning.
   Returns nil after printing a message when an unknown option is given or no
   input file is named. The partially filled result is released on those
   paths; it used to be leaked. *)
proc parse_command_line*(argc: Int, argv: ^^Char) -> ^CommandLine;
var
  parameter: ^^Char
  i: Int
  result: ^CommandLine
begin
  i := 1;
  result := cast(malloc(#size(CommandLine)): ^CommandLine);
  result^.tokenize := false;
  result^.syntax_tree := false;
  result^.input := nil;

  while i < argc do
    parameter := argv + i;

    if strcmp(parameter^, "--tokenize\0".ptr) = 0 then
      result^.tokenize := true
    elsif strcmp(parameter^, "--syntax-tree\0".ptr) = 0 then
      result^.syntax_tree := true
    elsif parameter^^ <> '-' then
      result^.input := parameter^
    else
      write_s("Fatal error: Unknown command line options:");

      write_c(' ');
      write_z(parameter^);
      write_s(".\n");

      free(cast(result: ^Byte));
      return nil
    end;

    i := i + 1
  end;
  if result^.input = nil then
    write_s("Fatal error: no input files.\n");

    free(cast(result: ^Byte));
    return nil
  end;

  return result
end

(* Runs the program: parses the arguments, opens the input, tokenizes it and
   prints the tokens when --tokenize was given.
   Returns 0 on success, 2 for a bad command line and 3 when the input cannot
   be opened. Allocated memory is not released; the only caller passes the
   result straight to exit. *)
proc process(argc: Int, argv: ^^Char) -> Int;
var
  tokens: ^Token
  tokens_size: Word
  source_code: SourceCode
  command_line: ^CommandLine
  return_code: Int
begin
  return_code := 0;

  command_line := parse_command_line(argc, argv);
  if command_line = nil then
    return_code := 2
  end;

  if return_code = 0 then
    source_code.position := make_position();

    source_code.input := cast(read_source(command_line^.input): ^Byte);
    source_code.empty := source_file_empty;
    source_code.head := source_file_head;
    source_code.advance := source_file_advance;

    if source_code.input = nil then
      perror(command_line^.input);
      return_code := 3
    end
  end;

  if return_code = 0 then
    tokens := tokenize(source_code, @tokens_size);

    fclose(cast(source_code.input: ^SourceFile)^.handle);

    if command_line^.tokenize then
      print_tokens(tokens, tokens_size)
    end
  end;
  return return_code
end

(* Program entry. count and parameters are not declared in this file;
   presumably the runtime provides them as the argument count and the
   argument vector - TODO confirm. *)
begin
  exit(process(count, parameters))
end.