(* Constant and type declarations for the lexer. *)
const
  (* fseek() whence values, mirroring the C library. *)
  SEEK_SET* = 0
  SEEK_CUR* = 1
  SEEK_END* = 2

  (* Token kind codes produced by tokenize(); 0 is reserved for "invalid". *)
  TOKEN_IDENTIFIER* = 1
  TOKEN_IF* = 2
  TOKEN_THEN* = 3
  TOKEN_ELSE* = 4
  TOKEN_ELSIF* = 5
  TOKEN_WHILE* = 6
  TOKEN_DO* = 7
  TOKEN_PROC* = 8
  TOKEN_BEGIN* = 9
  TOKEN_END* = 10
  TOKEN_EXTERN* = 11
  TOKEN_CONST* = 12
  TOKEN_VAR* = 13
  TOKEN_ARRAY* = 14
  TOKEN_OF* = 15
  TOKEN_TYPE* = 16
  TOKEN_RECORD* = 17
  TOKEN_UNION* = 18
  TOKEN_POINTER* = 19
  TOKEN_TO* = 20
  TOKEN_BOOLEAN* = 21
  TOKEN_NIL* = 22
  TOKEN_AND* = 23
  TOKEN_OR* = 24
  TOKEN_NOT* = 25
  TOKEN_RETURN* = 26
  TOKEN_CAST* = 27
  TOKEN_SHIFT_LEFT* = 28
  TOKEN_SHIFT_RIGHT* = 29
  TOKEN_LEFT_PAREN* = 30
  TOKEN_RIGHT_PAREN* = 31
  TOKEN_LEFT_SQUARE* = 32
  TOKEN_RIGHT_SQUARE* = 33
  TOKEN_GREATER_EQUAL* = 34
  TOKEN_LESS_EQUAL* = 35
  TOKEN_GREATER_THAN* = 36
  TOKEN_LESS_THAN* = 37
  TOKEN_NOT_EQUAL* = 38
  TOKEN_EQUAL* = 39
  TOKEN_SEMICOLON* = 40
  TOKEN_DOT* = 41
  TOKEN_COMMA* = 42
  TOKEN_PLUS* = 43
  TOKEN_MINUS* = 44
  TOKEN_MULTIPLICATION* = 45
  TOKEN_DIVISION* = 46
  TOKEN_REMAINDER* = 47
  TOKEN_ASSIGNMENT* = 48
  TOKEN_COLON* = 49
  TOKEN_HAT* = 50
  TOKEN_AT* = 51
  TOKEN_COMMENT* = 52
  TOKEN_INTEGER* = 53
  TOKEN_WORD* = 54
  TOKEN_CHARACTER* = 55
  TOKEN_STRING* = 56
  TOKEN_DEFER* = 57
  TOKEN_EXCLAMATION* = 58
  (* Fix: every other token constant is exported; TOKEN_ARROW was the
     lone exception, so it is exported here as well for consistency. *)
  TOKEN_ARROW* = 59

type
  (* 1-based line/column cursor into the source text. *)
  Position* = record
    line: Word
    column: Word
  end
  (* Span of a token in the source: first and last position. *)
  Location* = record
    first: Position
    last: Position
  end
  (* Remaining unlexed text together with the current cursor position. *)
  SourceCode = record
    position: Position
    text: String
  end
  (* Payload of a token; which member is valid depends on Token.kind. *)
  TokenValue* = union
    int_value: Int
    string_value: ^Char
    string: String
    boolean_value: Bool
    char_value: Char
  end
  (* A single lexed token: kind code, payload and source location. *)
  Token* = record
    kind: Int
    value: TokenValue
    location: Location
  end
  (* Opaque handle for the C library's FILE stream. *)
  FILE* = record
  end
  (* Parsed command line options. *)
  CommandLine* = record
    input: ^Char
    tokenize: Bool
    syntax_tree: Bool
  end

(* External procedures.
*)
(* Bindings to the C standard library and POSIX. *)
proc fopen(pathname: ^Char, mode: ^Char) -> ^FILE; extern
proc fclose(stream: ^FILE) -> Int; extern
proc fseek(stream: ^FILE, off: Int, whence: Int) -> Int; extern
proc rewind(stream: ^FILE); extern
proc ftell(stream: ^FILE) -> Int; extern
proc fread(ptr: ^Byte, size: Word, nmemb: Word, stream: ^FILE) -> Word; extern
(* Fix: the byte count parameter was previously named "Word", shadowing
   the type name; renamed to "count" (the POSIX name). *)
proc write(fd: Int, buf: ^Byte, count: Int) -> Int; extern
proc malloc(size: Word) -> ^Byte; extern
proc free(ptr: ^Byte); extern
proc calloc(nmemb: Word, size: Word) -> ^Byte; extern
proc realloc(ptr: ^Byte, size: Word) -> ^Byte; extern
proc memset(ptr: ^Char, c: Int, n: Int) -> ^Char; extern
proc strcmp(s1: ^Char, s2: ^Char) -> Int; extern
proc strncmp(s1: ^Char, s2: ^Char, n: Word) -> Int; extern
proc strncpy(dst: ^Char, src: ^Char, dsize: Word) -> ^Char; extern
proc strcpy(dst: ^Char, src: ^Char) -> ^Char; extern
proc strlen(ptr: ^Char) -> Word; extern
proc strtol(nptr: ^Char, endptr: ^^Char, base: Int) -> Int; extern
proc perror(s: ^Char); extern
proc exit(code: Int) -> !; extern
(* Standard procedures.
*)
(* Grow a heap block to hold n elements of the given size. *)
proc reallocarray(ptr: ^Byte, n: Word, size: Word) -> ^Byte;
begin
  return realloc(ptr, n * size)
end

(* Write a length-prefixed string to standard output.
   Fix: output previously went to file descriptor 0 (stdin); the
   correct descriptor for standard output is 1. *)
proc write_s(value: String);
begin
  write(1, cast(value.ptr: ^Byte), cast(value.length: Int))
end

(* Write a NUL-terminated C string to standard output (fd 1, see write_s). *)
proc write_z(value: ^Char);
begin
  write(1, cast(value: ^Byte), cast(strlen(value): Int))
end

(* Write "true" or "false". *)
proc write_b(value: Bool);
begin
  if value then
    write_s("true")
  else
    write_s("false")
  end
end

(* Write a single character to standard output (fd 1, see write_s). *)
proc write_c(value: Char);
begin
  write(1, cast(@value: ^Byte), 1)
end

(* Write a signed integer in decimal.
   Fixes: negative values previously produced garbage because the
   digit extraction assumed a non-negative operand; the buffer is
   also widened to 20 characters so a full word-sized value fits. *)
proc write_i(value: Int);
var
  digit: Int
  n: Word
  buffer: [20]Char
begin
  n := 20u
  if value = 0 then
    write_c('0')
  end
  if value < 0 then
    write_c('-')
    value := 0 - value
  end
  (* Collect digits from least to most significant, filling the buffer
     from the back. *)
  while value <> 0 do
    digit := value % 10
    value := value / 10
    buffer[n] := cast(cast('0': Int) + digit: Char)
    n := n - 1u
  end
  (* Emit the collected digits in most-significant-first order. *)
  while n < 20u do
    n := n + 1u
    write_c(buffer[n])
  end
end

(* Write an unsigned word in decimal. *)
proc write_u(value: Word);
begin
  write_i(cast(value: Int))
end

(* True for ASCII '0'..'9'. *)
proc is_digit(c: Char) -> Bool;
begin
  return cast(c: Int) >= cast('0': Int) and cast(c: Int) <= cast('9': Int)
end

(* True for ASCII letters only.
   Fix: the previous single range 'A'..'z' wrongly accepted the
   punctuation between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
   Underscores in identifiers are handled separately by the callers. *)
proc is_alpha(c: Char) -> Bool;
begin
  return (cast(c: Int) >= cast('A': Int) and cast(c: Int) <= cast('Z': Int))
    or (cast(c: Int) >= cast('a': Int) and cast(c: Int) <= cast('z': Int))
end

(* True for ASCII letters and digits. *)
proc is_alnum(c: Char) -> Bool;
begin
  return is_digit(c) or is_alpha(c)
end

(* True for the whitespace characters the lexer skips. *)
proc is_space(c: Char) -> Bool;
begin
  return c = ' ' or c = '\n' or c = '\t'
end

(* View of count characters of string starting at offset start.
   No copy is made; the result aliases the original storage. *)
proc substring(string: String, start: Word, count: Word) -> String;
begin
  return String(string.ptr + start, count)
end

(* View of string from offset start to its end. *)
proc open_substring(string: String, start: Word) -> String;
begin
  return substring(string, start, string.length - start)
end

(* Heap-allocated copy of a string.  The copy is not NUL-terminated;
   its length travels in the String descriptor. *)
proc string_dup(origin: String) -> String;
var
  copy: ^Char
begin
  copy := cast(malloc(origin.length): ^Char)
  strncpy(copy, origin.ptr, origin.length)
  return String(copy, origin.length)
end
(* End of standard procedures.
*)
(* Initial cursor position: line 1, column 1. *)
proc make_position() -> Position;
begin
  return Position(1u, 1u)
end

(* Read the whole file named filename into a freshly allocated buffer
   and store it in result^.  Returns false on any I/O failure. *)
proc read_source(filename: ^Char, result: ^String) -> Bool;
var
  input_file: ^FILE
  source_size: Int
  input: ^Byte
begin
  input_file := fopen(filename, "rb\0".ptr)
  if input_file = nil then
    return false
  end
  defer
    fclose(input_file)
  end
  if fseek(input_file, 0, SEEK_END) <> 0 then
    return false
  end
  source_size := ftell(input_file)
  if source_size < 0 then
    return false
  end
  rewind(input_file)
  (* Fix: an empty file is valid input.  fread(..., size, 1u, ...)
     reads zero full blocks when size is 0 and would wrongly report
     failure, so handle the empty case explicitly. *)
  if source_size = 0 then
    result^ := String(cast(malloc(1u): ^Char), 0u)
    return true
  end
  input := malloc(cast(source_size: Word))
  if fread(input, cast(source_size: Word), 1u, input_file) <> 1u then
    return false
  end
  result^ := String(cast(input: ^Char), cast(source_size: Word))
  return true
end

(* Translate the character following a backslash into the character it
   denotes.  Stores the translation in result^ and returns whether the
   escape is recognised. *)
proc escape_char(escape: Char, result: ^Char) -> Bool;
var
  successful: Bool
begin
  if escape = 'n' then
    result^ := '\n'
    successful := true
  elsif escape = 'a' then
    result^ := '\a'
    successful := true
  elsif escape = 'b' then
    result^ := '\b'
    successful := true
  elsif escape = 't' then
    result^ := '\t'
    successful := true
  elsif escape = 'f' then
    result^ := '\f'
    successful := true
  elsif escape = 'r' then
    result^ := '\r'
    successful := true
  elsif escape = 'v' then
    result^ := '\v'
    successful := true
  elsif escape = '\\' then
    result^ := '\\'
    successful := true
  elsif escape = '\'' then
    result^ := '\''
    successful := true
  elsif escape = '"' then
    result^ := '"'
    successful := true
  elsif escape = '?' then
    result^ := '\?'
    successful := true
  elsif escape = '0' then
    result^ := '\0'
    successful := true
  else
    successful := false
  end
  return successful
end

(* Consume length characters of source text, advancing the column.
   Must not be used across a newline; skip_spaces handles those. *)
proc advance_source(source_code: SourceCode, length: Word) -> SourceCode;
begin
  source_code.text := open_substring(source_code.text, length)
  source_code.position.column := source_code.position.column + length
  return source_code
end

(* Skip whitespace, updating line/column bookkeeping as it goes. *)
proc skip_spaces(source_code: SourceCode) -> SourceCode;
begin
  while source_code.text.length > 0u and is_space(source_code.text[1u]) do
    if source_code.text[1u] = '\n' then
      source_code.position.line := source_code.position.line + 1u
      source_code.position.column := 1u
    else
      source_code.position.column := source_code.position.column + 1u
    end
    source_code.text := open_substring(source_code.text, 1u)
  end
  return source_code
end

(* Consume an identifier (letters, digits, underscores) and store a view
   of its text in token_content^.  Assumes the first character has
   already been validated by the caller. *)
proc lex_identifier(source_code: ^SourceCode, token_content: ^String);
var
  content_length: Word
begin
  content_length := 0u
  token_content^ := source_code^.text
  while is_alnum(source_code^.text[1u]) or source_code^.text[1u] = '_' do
    content_length := content_length + 1u
    source_code^ := advance_source(source_code^, 1u)
  end
  token_content^ := substring(token_content^, 0u, content_length)
end

(* Consume a comment body up to and including the closing "*)".
   The opening "(*" has already been consumed by the caller.  trailing
   is a small state machine: 0 = plain text, 1 = saw '*', 2 = saw the
   closing ')' after a '*'.  Returns whether the comment was terminated.
   NOTE(review): token_content^ is set to the remaining text but its
   length is not trimmed here — presumably the caller only dups what it
   needs; verify against callers. *)
proc lex_comment(source_code: ^SourceCode, token_content: ^String) -> Bool;
var
  content_length: Word
  trailing: Word
begin
  content_length := 0u
  token_content^ := source_code^.text
  trailing := 0u
  while source_code^.text.length > 0u and trailing < 2u do
    if source_code^.text[1u] = '*' then
      content_length := content_length + trailing
      trailing := 1u
    elsif source_code^.text[1u] = ')' and trailing = 1u then
      trailing := 2u
    else
      content_length := content_length + trailing + 1u
      trailing := 0u
    end
    source_code^ := advance_source(source_code^, 1u)
  end
  return trailing = 2u
end

(* Lex the body of a character literal starting just after the opening
   quote.  Stores the character in current_token and returns a pointer
   to the first unconsumed input character (the closing quote when the
   literal is well-formed). *)
proc lex_character(input: ^Char, current_token: ^Token) -> ^Char;
begin
  if input^ = '\\' then
    input := input + 1
    if escape_char(input^, @current_token^.value.char_value) then
      input := input + 1
    end
  elsif input^ <> '\0' then
    current_token^.value.char_value := input^
    input := input + 1
  end
  return input
end

(* Lex the body of a string literal starting just after the opening
   quote.  On success allocates a NUL-terminated copy with escapes
   resolved, stores it in current_token and returns a pointer to the
   closing quote; on failure returns input unchanged. *)
proc lex_string(input: ^Char, current_token: ^Token) -> ^Char;
var
  token_end, constructed_string: ^Char
  token_length: Word
  is_valid: Bool
begin
  (* Find the closing, unescaped quote.  Reading (token_end - 1) on the
     first iteration inspects the opening quote, which is never '\\'. *)
  token_end := input
  while token_end^ <> '\0' and not ((token_end - 1)^ <> '\\' and token_end^ = '"') do
    token_end := token_end + 1
  end
  if token_end^ <> '\"' then
    return input
  end
  token_length := cast(token_end - input: Word)
  (* Fix: allocate one extra byte so the result is always NUL-terminated.
     Previously a string without escape sequences filled the buffer
     exactly and had no terminator. *)
  current_token^.value.string_value := cast(calloc(token_length + 1u, 1u): ^Char)
  is_valid := true
  constructed_string := current_token^.value.string_value
  while input < token_end and is_valid do
    if input^ = '\\' then
      input := input + 1
      if escape_char(input^, constructed_string) then
        input := input + 1
      else
        is_valid := false
      end
    elsif input^ <> '\0' then
      constructed_string^ := input^
      input := input + 1
    end
    constructed_string := constructed_string + 1
  end
  return token_end
end

(* Debug dump: print a readable form of each token, space-separated,
   followed by a newline. *)
proc print_tokens(tokens: ^Token, tokens_size: Word);
var
  current_token: ^Token
  i: Word
begin
  i := 0u
  while i < tokens_size do
    current_token := tokens + i
    if current_token^.kind = TOKEN_IF then write_s("IF")
    elsif current_token^.kind = TOKEN_THEN then write_s("THEN")
    elsif current_token^.kind = TOKEN_ELSE then write_s("ELSE")
    elsif current_token^.kind = TOKEN_ELSIF then write_s("ELSIF")
    elsif current_token^.kind = TOKEN_WHILE then write_s("WHILE")
    elsif current_token^.kind = TOKEN_DO then write_s("DO")
    elsif current_token^.kind = TOKEN_PROC then write_s("PROC")
    elsif current_token^.kind = TOKEN_BEGIN then write_s("BEGIN")
    elsif current_token^.kind = TOKEN_END then write_s("END")
    elsif current_token^.kind = TOKEN_EXTERN then write_s("EXTERN")
    elsif current_token^.kind = TOKEN_CONST then write_s("CONST")
    elsif current_token^.kind = TOKEN_VAR then write_s("VAR")
    elsif current_token^.kind = TOKEN_ARRAY then write_s("ARRAY")
    elsif current_token^.kind = TOKEN_OF then write_s("OF")
    elsif current_token^.kind = TOKEN_TYPE then write_s("TYPE")
    elsif current_token^.kind = TOKEN_RECORD then write_s("RECORD")
    elsif current_token^.kind = TOKEN_UNION then write_s("UNION")
    elsif current_token^.kind = TOKEN_POINTER then write_s("POINTER")
    elsif current_token^.kind = TOKEN_TO then write_s("TO")
    elsif current_token^.kind = TOKEN_BOOLEAN then
      write_s("BOOLEAN<")
      write_b(current_token^.value.boolean_value)
      write_c('>')
    elsif current_token^.kind = TOKEN_NIL then write_s("NIL")
    elsif current_token^.kind = TOKEN_AND then write_s("AND")
    elsif current_token^.kind = TOKEN_OR then write_s("OR")
    elsif current_token^.kind = TOKEN_NOT then write_s("NOT")
    elsif current_token^.kind = TOKEN_RETURN then write_s("RETURN")
    elsif current_token^.kind = TOKEN_CAST then write_s("CAST")
    elsif current_token^.kind = TOKEN_SHIFT_LEFT then write_s("<<")
    elsif current_token^.kind = TOKEN_SHIFT_RIGHT then write_s(">>")
    elsif current_token^.kind = TOKEN_IDENTIFIER then
      write_c('<')
      write_s(current_token^.value.string)
      write_c('>')
    elsif current_token^.kind = TOKEN_LEFT_PAREN then write_s("(")
    elsif current_token^.kind = TOKEN_RIGHT_PAREN then write_s(")")
    elsif current_token^.kind = TOKEN_LEFT_SQUARE then write_s("[")
    elsif current_token^.kind = TOKEN_RIGHT_SQUARE then write_s("]")
    elsif current_token^.kind = TOKEN_GREATER_EQUAL then write_s(">=")
    elsif current_token^.kind = TOKEN_LESS_EQUAL then write_s("<=")
    elsif current_token^.kind = TOKEN_GREATER_THAN then write_s(">")
    elsif current_token^.kind = TOKEN_LESS_THAN then write_s("<")
    elsif current_token^.kind = TOKEN_EQUAL then write_s("=")
    elsif current_token^.kind = TOKEN_NOT_EQUAL then write_s("<>")
    elsif current_token^.kind = TOKEN_SEMICOLON then write_c(';')
    elsif current_token^.kind = TOKEN_DOT then write_c('.')
    elsif current_token^.kind = TOKEN_COMMA then write_c(',')
    elsif current_token^.kind = TOKEN_PLUS then write_c('+')
    elsif current_token^.kind = TOKEN_MINUS then write_c('-')
    elsif current_token^.kind = TOKEN_MULTIPLICATION then write_c('*')
    elsif current_token^.kind = TOKEN_DIVISION then write_c('/')
    elsif current_token^.kind = TOKEN_REMAINDER then write_c('%')
    elsif current_token^.kind = TOKEN_ASSIGNMENT then write_s(":=")
    elsif current_token^.kind = TOKEN_COLON then write_c(':')
    elsif current_token^.kind = TOKEN_HAT then write_c('^')
    elsif current_token^.kind = TOKEN_AT then write_c('@')
    elsif current_token^.kind = TOKEN_COMMENT then write_s("(* COMMENT *)")
    elsif current_token^.kind = TOKEN_INTEGER then
      write_c('<')
      write_i(current_token^.value.int_value)
      write_c('>')
    elsif current_token^.kind = TOKEN_WORD then
      write_c('<')
      write_i(current_token^.value.int_value)
      write_s("u>")
    elsif current_token^.kind = TOKEN_CHARACTER then
      write_c('<')
      write_i(cast(current_token^.value.char_value: Int))
      write_s("c>")
    elsif current_token^.kind = TOKEN_STRING then write_s("\"...\"")
    elsif current_token^.kind = TOKEN_DEFER then write_s("DEFER")
    elsif current_token^.kind = TOKEN_EXCLAMATION then write_c('!')
    elsif current_token^.kind = TOKEN_ARROW then write_s("->")
    else
      write_s("UNKNOWN<")
      write_i(current_token^.kind)
      write_c('>')
    end
    write_c(' ')
    i := i + 1u
  end
  write_c('\n')
end

(* Classify an identifier's text: either one of the reserved words or a
   plain identifier, in which case the text is copied to the heap. *)
proc categorize_identifier(token_content: String) -> Token;
var
  current_token: Token
begin
  if "if" = token_content then current_token.kind := TOKEN_IF
  elsif "then" = token_content then current_token.kind := TOKEN_THEN
  elsif "else" = token_content then current_token.kind := TOKEN_ELSE
  elsif "elsif" = token_content then current_token.kind := TOKEN_ELSIF
  elsif "while" = token_content then current_token.kind := TOKEN_WHILE
  elsif "do" = token_content then current_token.kind := TOKEN_DO
  elsif "proc" = token_content then current_token.kind := TOKEN_PROC
  elsif "begin" = token_content then current_token.kind := TOKEN_BEGIN
  elsif "end" = token_content then current_token.kind := TOKEN_END
  elsif "extern" = token_content then current_token.kind := TOKEN_EXTERN
  elsif "const" = token_content then current_token.kind := TOKEN_CONST
  elsif "var" = token_content then current_token.kind := TOKEN_VAR
  elsif "array" = token_content then current_token.kind := TOKEN_ARRAY
  elsif "of" = token_content then current_token.kind := TOKEN_OF
  elsif "type" = token_content then current_token.kind := TOKEN_TYPE
  elsif "record" = token_content then current_token.kind := TOKEN_RECORD
  elsif "union" = token_content then current_token.kind := TOKEN_UNION
  elsif "pointer" = token_content then current_token.kind := TOKEN_POINTER
  elsif "to" = token_content then current_token.kind := TOKEN_TO
  elsif "true" = token_content then
    current_token.kind := TOKEN_BOOLEAN
    current_token.value.boolean_value := true
  elsif "false" = token_content then
    current_token.kind := TOKEN_BOOLEAN
    current_token.value.boolean_value := false
  elsif "nil" = token_content then current_token.kind := TOKEN_NIL
  elsif "and" = token_content then current_token.kind := TOKEN_AND
  elsif "or" = token_content then current_token.kind := TOKEN_OR
  elsif "not" = token_content then current_token.kind := TOKEN_NOT
  elsif "return" = token_content then current_token.kind := TOKEN_RETURN
  elsif "cast" = token_content then current_token.kind := TOKEN_CAST
  elsif "defer" = token_content then current_token.kind := TOKEN_DEFER
  else
    current_token.kind := TOKEN_IDENTIFIER
    current_token.value.string := string_dup(token_content)
  end
  return current_token
end

(* Split the whole source text into a heap-allocated token array.
   The number of tokens is stored in tokens_size^.  Unrecognised input
   produces a diagnostic and is skipped (no token is emitted). *)
proc tokenize(source_code: SourceCode, tokens_size: ^Word) -> ^Token;
var
  token_end: ^Char
  tokens, current_token: ^Token
  token_length: Word
  first_char: Char
  token_content: String
begin
  tokens_size^ := 0u
  tokens := nil
  source_code := skip_spaces(source_code)
  while source_code.text.length <> 0u do
    (* Grow the array by one slot; the new slot is uninitialised, so
       every branch below must assign current_token^.kind. *)
    tokens := cast(reallocarray(cast(tokens: ^Byte), tokens_size^ + 1u, Token.size): ^Token)
    current_token := tokens + tokens_size^
    first_char := source_code.text[1u]
    if is_alpha(first_char) or first_char = '_' then
      lex_identifier(@source_code, @token_content)
      current_token^ := categorize_identifier(token_content)
    elsif is_digit(first_char) then
      token_end := nil
      current_token^.value.int_value := strtol(source_code.text.ptr, @token_end, 10)
      token_length := cast(token_end - source_code.text.ptr: Word)
      (* A trailing 'u' marks an unsigned word literal. *)
      if token_end^ = 'u' then
        current_token^.kind := TOKEN_WORD
        source_code := advance_source(source_code, token_length + 1u)
      else
        current_token^.kind := TOKEN_INTEGER
        source_code := advance_source(source_code, token_length)
      end
    elsif first_char = '(' then
      source_code := advance_source(source_code, 1u)
      if source_code.text.length = 0u then
        current_token^.kind := TOKEN_LEFT_PAREN
      elsif source_code.text[1u] = '*' then
        source_code := advance_source(source_code, 1u)
        if lex_comment(@source_code, @token_content) then
          current_token^.value.string := string_dup(token_content)
          current_token^.kind := TOKEN_COMMENT
        else
          current_token^.kind := 0
        end
      else
        current_token^.kind := TOKEN_LEFT_PAREN
      end
    elsif first_char = ')' then
      current_token^.kind := TOKEN_RIGHT_PAREN
      source_code := advance_source(source_code, 1u)
    elsif first_char = '\'' then
      token_end := lex_character(source_code.text.ptr + 1, current_token)
      token_length := cast(token_end - source_code.text.ptr: Word)
      if token_end^ = '\'' then
        current_token^.kind := TOKEN_CHARACTER
        source_code := advance_source(source_code, token_length + 1u)
      else
        (* Fix: mark the token invalid explicitly; the slot comes from
           reallocarray and previously held garbage here. *)
        current_token^.kind := 0
        source_code := advance_source(source_code, 1u)
      end
    elsif first_char = '"' then
      token_end := lex_string(source_code.text.ptr + 1, current_token)
      if token_end^ = '"' then
        current_token^.kind := TOKEN_STRING
        token_length := cast(token_end - source_code.text.ptr: Word)
        source_code := advance_source(source_code, token_length + 1u)
      else
        (* Fix: an unterminated string used to leave the token kind
           uninitialised and never advance the source, which could loop
           forever on the same quote character. *)
        current_token^.kind := 0
        source_code := advance_source(source_code, 1u)
      end
    elsif first_char = '[' then
      current_token^.kind := TOKEN_LEFT_SQUARE
      source_code := advance_source(source_code, 1u)
    elsif first_char = ']' then
      current_token^.kind := TOKEN_RIGHT_SQUARE
      source_code := advance_source(source_code, 1u)
    elsif first_char = '>' then
      source_code := advance_source(source_code, 1u)
      if source_code.text.length = 0u then
        current_token^.kind := TOKEN_GREATER_THAN
      elsif source_code.text[1u] = '=' then
        current_token^.kind := TOKEN_GREATER_EQUAL
        source_code := advance_source(source_code, 1u)
      elsif source_code.text[1u] = '>' then
        current_token^.kind := TOKEN_SHIFT_RIGHT
        source_code := advance_source(source_code, 1u)
      else
        current_token^.kind := TOKEN_GREATER_THAN
      end
    elsif first_char = '<' then
      source_code := advance_source(source_code, 1u)
      if source_code.text.length = 0u then
        current_token^.kind := TOKEN_LESS_THAN
      elsif source_code.text[1u] = '=' then
        current_token^.kind := TOKEN_LESS_EQUAL
        source_code := advance_source(source_code, 1u)
      elsif source_code.text[1u] = '<' then
        current_token^.kind := TOKEN_SHIFT_LEFT
        source_code := advance_source(source_code, 1u)
      elsif source_code.text[1u] = '>' then
        current_token^.kind := TOKEN_NOT_EQUAL
        source_code := advance_source(source_code, 1u)
      else
        current_token^.kind := TOKEN_LESS_THAN
      end
    elsif first_char = '=' then
      current_token^.kind := TOKEN_EQUAL
      source_code := advance_source(source_code, 1u)
    elsif first_char = ';' then
      current_token^.kind := TOKEN_SEMICOLON
      source_code := advance_source(source_code, 1u)
    elsif first_char = '.' then
      current_token^.kind := TOKEN_DOT
      source_code := advance_source(source_code, 1u)
    elsif first_char = ',' then
      current_token^.kind := TOKEN_COMMA
      source_code := advance_source(source_code, 1u)
    elsif first_char = '+' then
      current_token^.kind := TOKEN_PLUS
      source_code := advance_source(source_code, 1u)
    elsif first_char = '-' then
      source_code := advance_source(source_code, 1u)
      if source_code.text.length = 0u then
        current_token^.kind := TOKEN_MINUS
      elsif source_code.text[1u] = '>' then
        current_token^.kind := TOKEN_ARROW
        source_code := advance_source(source_code, 1u)
      else
        current_token^.kind := TOKEN_MINUS
      end
    elsif first_char = '*' then
      current_token^.kind := TOKEN_MULTIPLICATION
      source_code := advance_source(source_code, 1u)
    elsif first_char = '/' then
      current_token^.kind := TOKEN_DIVISION
      source_code := advance_source(source_code, 1u)
    elsif first_char = '%' then
      current_token^.kind := TOKEN_REMAINDER
      source_code := advance_source(source_code, 1u)
    elsif first_char = ':' then
      source_code := advance_source(source_code, 1u)
      if source_code.text.length = 0u then
        current_token^.kind := TOKEN_COLON
      elsif source_code.text[1u] = '=' then
        current_token^.kind := TOKEN_ASSIGNMENT
        source_code := advance_source(source_code, 1u)
      else
        current_token^.kind := TOKEN_COLON
      end
    elsif first_char = '^' then
      current_token^.kind := TOKEN_HAT
      source_code := advance_source(source_code, 1u)
    elsif first_char = '@' then
      current_token^.kind := TOKEN_AT
      source_code := advance_source(source_code, 1u)
    elsif first_char = '!' then
      current_token^.kind := TOKEN_EXCLAMATION
      source_code := advance_source(source_code, 1u)
    else
      current_token^.kind := 0
      source_code := advance_source(source_code, 1u)
    end
    (* kind = 0 signals a lexing failure in the branch above; the slot
       is reused on the next iteration instead of being kept. *)
    if current_token^.kind <> 0 then
      tokens_size^ := tokens_size^ + 1u
      source_code := skip_spaces(source_code)
    else
      write_s("Lexical analysis error on \"")
      write_c(first_char)
      write_s("\".\n")
    end
  end
  return tokens
end

(* Parse argv into a heap-allocated CommandLine.  Returns nil (after
   printing a diagnostic) on unknown options or a missing input file.
   A non-option argument is taken as the input file name; if several
   are given, the last one wins. *)
proc parse_command_line*(argc: Int, argv: ^^Char) -> ^CommandLine;
var
  parameter: ^^Char
  i: Int
  result: ^CommandLine
begin
  i := 1
  result := cast(malloc(CommandLine.size): ^CommandLine)
  result^.tokenize := false
  result^.syntax_tree := false
  result^.input := nil
  while i < argc do
    parameter := argv + i
    if strcmp(parameter^, "--tokenize\0".ptr) = 0 then
      result^.tokenize := true
    elsif strcmp(parameter^, "--syntax-tree\0".ptr) = 0 then
      result^.syntax_tree := true
    elsif parameter^^ <> '-' then
      result^.input := parameter^
    else
      write_s("Fatal error: Unknown command line options:")
      write_c(' ')
      write_z(parameter^)
      write_s(".\n")
      return nil
    end
    i := i + 1
  end
  if result^.input = nil then
    write_s("Fatal error: no input files.\n")
    return nil
  end
  return result
end

(* Driver: parse the command line, read the input file, tokenize it and
   optionally dump the tokens.  Returns the process exit code. *)
proc process(argc: Int, argv: ^^Char) -> Int;
var
  tokens: ^Token
  tokens_size: Word
  source_code: SourceCode
  command_line: ^CommandLine
begin
  command_line := parse_command_line(argc, argv)
  if command_line = nil then
    return 2
  end
  source_code.position := make_position()
  if not read_source(command_line^.input, @source_code.text) then
    perror(command_line^.input)
    return 3
  end
  tokens := tokenize(source_code, @tokens_size)
  if command_line^.tokenize then
    print_tokens(tokens, tokens_size)
  end
  return 0
end

(* Program entry point. *)
begin
  exit(process(cast(count: Int), cast(parameters: ^^Char)))
end.