type Position = record line: Word; column: Word end, Location = record first: Position; last: Position end, TokenValue = union int_value: Int; string_value: pointer to Char; boolean_value: Bool; char_value: Char end, Token = record kind: Int; value: TokenValue; location: Location end, FILE = record dummy: Int end; const SEEK_SET = 0, SEEK_CUR = 1, SEEK_END = 2, TOKEN_IDENTIFIER = 1, TOKEN_IF = 2, TOKEN_THEN = 3, TOKEN_ELSE = 4, TOKEN_ELSIF = 5, TOKEN_WHILE = 6, TOKEN_DO = 7, TOKEN_PROC = 8, TOKEN_BEGIN = 9, TOKEN_END = 10, TOKEN_EXTERN = 11, TOKEN_CONST = 12, TOKEN_VAR = 13, TOKEN_ARRAY = 14, TOKEN_OF = 15, TOKEN_TYPE = 16, TOKEN_RECORD = 17, TOKEN_UNION = 18, TOKEN_POINTER = 19, TOKEN_TO = 20, TOKEN_BOOLEAN = 21, TOKEN_NIL = 22, TOKEN_AND = 23, TOKEN_OR = 24, TOKEN_NOT = 25, TOKEN_RETURN = 26, TOKEN_CAST = 27, TOKEN_AS = 28, TOKEN_SIZEOF = 29, TOKEN_LEFT_PAREN = 30, TOKEN_RIGHT_PAREN = 31, TOKEN_LEFT_SQUARE = 32, TOKEN_RIGHT_SQUARE = 33, TOKEN_GREATER_EQUAL = 34, TOKEN_LESS_EQUAL = 35, TOKEN_GREATER_THAN = 36, TOKEN_LESS_THAN = 37, TOKEN_NOT_EQUAL = 38, TOKEN_EQUAL = 39, TOKEN_SEMICOLON = 40, TOKEN_DOT = 41, TOKEN_COMMA = 42, TOKEN_PLUS = 43, TOKEN_MINUS = 44, TOKEN_MULTIPLICATION = 45, TOKEN_DIVISION = 46, TOKEN_REMAINDER = 47, TOKEN_ASSIGNMENT = 48, TOKEN_COLON = 49, TOKEN_HAT = 50, TOKEN_AT = 51, TOKEN_COMMENT = 52, TOKEN_INTEGER = 53, TOKEN_WORD = 54, TOKEN_CHARACTER = 55, TOKEN_STRING = 56; (* External procedures. *) proc fopen(pathname: String, mode: String): pointer to FILE; extern; proc fclose(stream: pointer to FILE): Int; extern; proc fseek(stream: pointer to FILE, off: Int, whence: Int): Int; extern; proc ftell(stream: pointer to FILE): Int; extern; proc fread(ptr: pointer to Byte, size: Word, nmemb: Word, stream: pointer to FILE): Word; extern; proc write(fd: Int, buf: pointer to Byte, Word: Int): Int; extern; proc malloc(size: Word): pointer to Byte; extern; proc free(ptr: pointer to Byte); extern; proc calloc(nmemb: Word, size: Word): pointer to Byte; extern; proc realloc(ptr: pointer to Byte, size: Word): pointer to Byte; extern; proc reallocarray(ptr: pointer to Byte, n: Word, size: Word): pointer to Byte; extern; proc memset(ptr: pointer to Char, c: Int, n: Int): pointer to Char; extern; proc strncmp(s1: pointer to Char, s2: pointer to Char, n: Word): Int; extern; proc strncpy(dst: pointer to Char, src: pointer to Char, dsize: Word): pointer to Char; extern; proc strlen(ptr: pointer to Char): Word; extern; proc strtol(nptr: pointer to Char, endptr: pointer to pointer to Char, base: Int): Int; extern; proc exit(code: Int); extern; (* Standard procedures. *) proc write_s(value: String); begin write(0, value, strlen(value)) end; proc write_b(value: Bool); begin if value then write_s("true") else write_s("false") end end; proc write_c(value: Char); begin write(0, @value, 1) end; proc write_i(value: Int); var digit: Int, n: Int, buffer: array 10 of Char; begin n := 9; buffer[9] := '0'; if value = 0 then write_c('0') end; while value <> 0 do digit := value % 10; value := value / 10; buffer[n] := cast(cast('0' as Int) + digit as Char); n := n - 1 end; while n < 10 do n := n + 1; write_c(buffer[n]) end end; proc write_u(value: Word); begin write_i(value) end; proc is_digit(c: Char): Bool; begin return cast(c as Int) >= cast('0' as Int) and cast(c as Int) <= cast('9' as Int) end; proc is_alpha(c: Char): Bool; begin return cast(c as Int) >= cast('A' as Int) and cast(c as Int) <= cast('z' as Int) end; proc is_alnum(c: Char): Bool; begin return is_digit(c) or is_alpha(c) end; proc is_space(c: Char): Bool; begin return c = ' ' or c = '\n' or c = '\t' end; (* End of standard procedures. *) proc test_record(); var r: Token; begin write_s("\nTest record:\n"); r.kind := 4; r.value.int_value := 8; write_i(r.value.int_value) end; proc test_primitive(); begin write_s("\nTest primitives:\n"); write_u(25u); write_c('\n'); write_i(8); write_c('\n'); write_b(true); write_c('\n') end; proc read_source(filename: String): pointer to Char; var input_file: pointer to FILE, source_size: Int, input: pointer to Byte; begin input_file := fopen(filename, "rb"); fseek(input_file, 0, SEEK_END); source_size := ftell(input_file); fseek(input_file, 0, SEEK_SET); input := calloc(source_size + 1, 1); fread(input, source_size, 1, input_file); fclose(input_file); return input end; proc escape_char(escape: Char, result: pointer to Char): Bool; begin if escape = 'n' then result^ := '\n'; return true elsif escape = 'a' then result^ := '\a'; return true elsif escape = 'b' then result^ := '\b'; return true elsif escape = 't' then result^ := '\t'; return true elsif escape = 'f' then result^ := '\f'; return true elsif escape = 'r' then result^ := '\r'; return true elsif escape = 'v' then result^ := '\v'; return true elsif escape = '\\' then result^ := '\\'; return true elsif escape = '\'' then result^ := '\''; return true elsif escape = '"' then result^ := '"'; return true elsif escape = '?' then result^ := '\?'; return true elsif escape = '0' then result^ := '\0'; return true else return false end end; proc skip_spaces(input: pointer to Char): pointer to Char; begin while is_space(input^) do input := input + 1 end; return input end; proc lex_identifier(input: pointer to Char): pointer to Char; begin while is_alnum(input^) or input^ = '_' do input := input + 1 end; return input end; proc lex_comment(input: pointer to Char): pointer to Char; var current: pointer to Char, next: pointer to Char; begin while input^ <> '\0' do next := input + 1; if input^ = '*' and next^ = ')' then return next + 1 end; input := input + 1 end; return nil end; proc lex_character(input: pointer to Char, current_token: pointer to Token): pointer to Char; begin if input^ = '\\' then input := input + 1; if escape_char(input^, @current_token^.value.char_value) then input := input + 1 end elsif input^ <> '\0' then current_token^.value.char_value := input^; input := input + 1 end; return input end; proc print_tokens(tokens: pointer to Token, tokens_size: Word); var current_token: pointer to Token, i: Word; begin i := 0u; while i < tokens_size do current_token := tokens + i; if current_token^.kind = TOKEN_IF then write_s("IF") elsif current_token^.kind = TOKEN_THEN then write_s("THEN") elsif current_token^.kind = TOKEN_ELSE then write_s("ELSE") elsif current_token^.kind = TOKEN_ELSIF then write_s("ELSIF") elsif current_token^.kind = TOKEN_WHILE then write_s("WHILE") elsif current_token^.kind = TOKEN_DO then write_s("DO") elsif current_token^.kind = TOKEN_PROC then write_s("PROC") elsif current_token^.kind = TOKEN_BEGIN then write_s("BEGIN") elsif current_token^.kind = TOKEN_END then write_s("END") elsif current_token^.kind = TOKEN_EXTERN then write_s("EXTERN") elsif current_token^.kind = TOKEN_CONST then write_s("CONST") elsif current_token^.kind = TOKEN_VAR then write_s("VAR") elsif current_token^.kind = TOKEN_ARRAY then write_s("ARRAY") elsif current_token^.kind = TOKEN_OF then write_s("OF") elsif current_token^.kind = TOKEN_TYPE then write_s("TYPE") elsif current_token^.kind = TOKEN_RECORD then write_s("RECORD") elsif current_token^.kind = TOKEN_UNION then write_s("UNION") elsif current_token^.kind = TOKEN_POINTER then write_s("POINTER") elsif current_token^.kind = TOKEN_TO then write_s("TO") elsif current_token^.kind = TOKEN_BOOLEAN then write_s("BOOLEAN<"); write_b(current_token^.value.boolean_value); write_c('>') elsif current_token^.kind = TOKEN_NIL then write_s("NIL") elsif current_token^.kind = TOKEN_AND then write_s("AND") elsif current_token^.kind = TOKEN_OR then write_s("OR") elsif current_token^.kind = TOKEN_NOT then write_s("NOT") elsif current_token^.kind = TOKEN_RETURN then write_s("RETURN") elsif current_token^.kind = TOKEN_CAST then write_s("CAST") elsif current_token^.kind = TOKEN_AS then write_s("AS") elsif current_token^.kind = TOKEN_SIZEOF then write_s("SIZEOF") elsif current_token^.kind = TOKEN_IDENTIFIER then write_c('<'); write_s(current_token^.value.string_value); write_c('>') elsif current_token^.kind = TOKEN_LEFT_PAREN then write_s("(") elsif current_token^.kind = TOKEN_RIGHT_PAREN then write_s(")") elsif current_token^.kind = TOKEN_LEFT_SQUARE then write_s("[") elsif current_token^.kind = TOKEN_RIGHT_SQUARE then write_s("]") elsif current_token^.kind = TOKEN_GREATER_EQUAL then write_s(">=") elsif current_token^.kind = TOKEN_LESS_EQUAL then write_s("<=") elsif current_token^.kind = TOKEN_GREATER_THAN then write_s(">") elsif current_token^.kind = TOKEN_LESS_THAN then write_s("<") elsif current_token^.kind = TOKEN_EQUAL then write_s("=") elsif current_token^.kind = TOKEN_NOT_EQUAL then write_s("<>") elsif current_token^.kind = TOKEN_SEMICOLON then write_c(';') elsif current_token^.kind = TOKEN_DOT then write_c('.') elsif current_token^.kind = TOKEN_COMMA then write_c(',') elsif current_token^.kind = TOKEN_PLUS then write_c('+') elsif current_token^.kind = TOKEN_MINUS then write_c('-') elsif current_token^.kind = TOKEN_MULTIPLICATION then write_c('*') elsif current_token^.kind = TOKEN_DIVISION then write_c('/') elsif current_token^.kind = TOKEN_REMAINDER then write_c('%') elsif current_token^.kind = TOKEN_ASSIGNMENT then write_s(":=") elsif current_token^.kind = TOKEN_COLON then write_c(':') elsif current_token^.kind = TOKEN_HAT then write_c('^') elsif current_token^.kind = TOKEN_AT then write_c('@') elsif current_token^.kind = TOKEN_COMMENT then write_s("COMMENT") elsif current_token^.kind = TOKEN_INTEGER then write_c('<'); write_i(current_token^.value.int_value); write_c('>') elsif current_token^.kind = TOKEN_WORD then write_c('<'); write_i(current_token^.value.int_value); write_s("u>") elsif current_token^.kind = TOKEN_CHARACTER then write_c('<'); write_i(current_token^.value.char_value); write_s("c>") elsif current_token^.kind = TOKEN_STRING then write_s("\"...\"") else write_s("UNKNOWN<"); write_i(current_token^.kind); write_c('>') end; write_c(' '); i := i + sizeof(Token) end end; proc categorize_identifier(input_pointer: pointer to Char, token_length: Int): Token; var current_token: Token; begin if strncmp("if", input_pointer, token_length) = 0 then current_token.kind := TOKEN_IF elsif strncmp("then", input_pointer, token_length) = 0 then current_token.kind := TOKEN_THEN elsif strncmp("else", input_pointer, token_length) = 0 then current_token.kind := TOKEN_ELSE elsif strncmp("elsif", input_pointer, token_length) = 0 then current_token.kind := TOKEN_ELSIF elsif strncmp("while", input_pointer, token_length) = 0 then current_token.kind := TOKEN_WHILE elsif strncmp("do", input_pointer, token_length) = 0 then current_token.kind := TOKEN_DO elsif strncmp("proc", input_pointer, token_length) = 0 then current_token.kind := TOKEN_PROC elsif strncmp("begin", input_pointer, token_length) = 0 then current_token.kind := TOKEN_BEGIN elsif strncmp("end", input_pointer, token_length) = 0 then current_token.kind := TOKEN_END elsif strncmp("extern", input_pointer, token_length) = 0 then current_token.kind := TOKEN_EXTERN elsif strncmp("const", input_pointer, token_length) = 0 then current_token.kind := TOKEN_CONST elsif strncmp("var", input_pointer, token_length) = 0 then current_token.kind := TOKEN_VAR elsif strncmp("array", input_pointer, token_length) = 0 then current_token.kind := TOKEN_ARRAY elsif strncmp("of", input_pointer, token_length) = 0 then current_token.kind := TOKEN_OF elsif strncmp("type", input_pointer, token_length) = 0 then current_token.kind := TOKEN_TYPE elsif strncmp("record", input_pointer, token_length) = 0 then current_token.kind := TOKEN_RECORD elsif strncmp("union", input_pointer, token_length) = 0 then current_token.kind := TOKEN_UNION elsif strncmp("pointer", input_pointer, token_length) = 0 then current_token.kind := TOKEN_POINTER elsif strncmp("to", input_pointer, token_length) = 0 then current_token.kind := TOKEN_TO elsif strncmp("true", input_pointer, token_length) = 0 then current_token.kind := TOKEN_BOOLEAN; current_token.value.boolean_value := true elsif strncmp("false", input_pointer, token_length) = 0 then current_token.kind := TOKEN_BOOLEAN; current_token.value.boolean_value := false elsif strncmp("nil", input_pointer, token_length) = 0 then current_token.kind := TOKEN_NIL elsif strncmp("and", input_pointer, token_length) = 0 then current_token.kind := TOKEN_AND elsif strncmp("or", input_pointer, token_length) = 0 then current_token.kind := TOKEN_OR elsif strncmp("not", input_pointer, token_length) = 0 then current_token.kind := TOKEN_NOT elsif strncmp("return", input_pointer, token_length) = 0 then current_token.kind := TOKEN_RETURN elsif strncmp("cast", input_pointer, token_length) = 0 then current_token.kind := TOKEN_CAST elsif strncmp("as", input_pointer, token_length) = 0 then current_token.kind := TOKEN_AS elsif strncmp("sizeof", input_pointer, token_length) = 0 then current_token.kind := TOKEN_SIZEOF else current_token.kind := TOKEN_IDENTIFIER; current_token.value.string_value := cast(calloc(token_length + 1, 1) as pointer to Char); strncpy(current_token.value.string_value, input_pointer, token_length) end; return current_token end; proc compile(); var input: pointer to Char, input_pointer: pointer to Char, token_end: pointer to Char, token_length: Int, tokens: pointer to Token, current_token: pointer to Token, tokens_size: Word; begin tokens_size := 0u; tokens := cast(nil as pointer to Token); input := read_source("example.elna"); input_pointer := skip_spaces(input); while input_pointer^ <> '\0' do tokens := cast(realloc(tokens, tokens_size + sizeof(Token)) as pointer to Token); current_token := tokens + tokens_size; if is_alpha(input_pointer^) or input_pointer^ = '_' then token_end := lex_identifier(input_pointer + 1); token_length := cast(token_end as Int) - cast(input_pointer as Int); current_token^ := categorize_identifier(input_pointer, token_length); input_pointer := token_end elsif is_digit(input_pointer^) then token_end := cast(nil as pointer to Char); current_token^.value.int_value := strtol(input_pointer, @token_end, 10); if token_end^ = 'u' then current_token^.kind := TOKEN_WORD; input_pointer := token_end + 1 else current_token^.kind := TOKEN_INTEGER; input_pointer := token_end end elsif input_pointer^ = '(' then input_pointer := input_pointer + 1; if input_pointer^ = '*' then token_end := lex_comment(input_pointer + 1); if token_end <> cast(nil as pointer to Char) then token_length := cast(token_end as Int) - cast(input_pointer as Int); current_token^.value.string_value := cast(calloc(token_length + 1, 1) as pointer to Char); strncpy(current_token^.value.string_value, input_pointer, token_length); current_token^.kind := TOKEN_COMMENT; input_pointer := token_end else current_token^.kind := 0 end else current_token^.kind := TOKEN_LEFT_PAREN end elsif input_pointer^ = ')' then current_token^.kind := TOKEN_RIGHT_PAREN; input_pointer := input_pointer + 1 elsif input_pointer^ = '\'' then token_end := lex_character(input_pointer + 1, current_token); if token_end^ = '\'' then current_token^.kind := TOKEN_CHARACTER; input_pointer := token_end + 1 else input_pointer := input_pointer + 1 end elsif input_pointer^ = '[' then current_token^.kind := TOKEN_LEFT_SQUARE; input_pointer := input_pointer + 1 elsif input_pointer^ = ']' then current_token^.kind := TOKEN_RIGHT_SQUARE; input_pointer := input_pointer + 1 elsif input_pointer^ = '>' then input_pointer := input_pointer + 1; if input_pointer^ = '=' then current_token^.kind := TOKEN_GREATER_EQUAL; input_pointer := input_pointer + 1 else current_token^.kind := TOKEN_GREATER_THAN end elsif input_pointer^ = '<' then input_pointer := input_pointer + 1; if input_pointer^ = '=' then current_token^.kind := TOKEN_LESS_EQUAL; input_pointer := input_pointer + 1 elsif input_pointer^ = '>' then current_token^.kind := TOKEN_NOT_EQUAL; input_pointer := input_pointer + 1 else current_token^.kind := TOKEN_LESS_THAN end elsif input_pointer^ = '=' then current_token^.kind := TOKEN_EQUAL; input_pointer := input_pointer + 1 elsif input_pointer^ = ';' then current_token^.kind := TOKEN_SEMICOLON; input_pointer := input_pointer + 1 elsif input_pointer^ = '.' then current_token^.kind := TOKEN_DOT; input_pointer := input_pointer + 1 elsif input_pointer^ = ',' then current_token^.kind := TOKEN_COMMA; input_pointer := input_pointer + 1 elsif input_pointer^ = '+' then current_token^.kind := TOKEN_PLUS; input_pointer := input_pointer + 1 elsif input_pointer^ = '-' then current_token^.kind := TOKEN_MINUS; input_pointer := input_pointer + 1 elsif input_pointer^ = '*' then current_token^.kind := TOKEN_MULTIPLICATION; input_pointer := input_pointer + 1 elsif input_pointer^ = '/' then current_token^.kind := TOKEN_DIVISION; input_pointer := input_pointer + 1 elsif input_pointer^ = '%' then current_token^.kind := TOKEN_REMAINDER; input_pointer := input_pointer + 1 elsif input_pointer^ = ':' then input_pointer := input_pointer + 1; if input_pointer^ = '=' then current_token^.kind := TOKEN_ASSIGNMENT; input_pointer := input_pointer + 1 else current_token^.kind := TOKEN_COLON end elsif input_pointer^ = '^' then current_token^.kind := TOKEN_HAT; input_pointer := input_pointer + 1 elsif input_pointer^ = '@' then current_token^.kind := TOKEN_AT; input_pointer := input_pointer + 1 else current_token^.kind := 0; input_pointer := input_pointer + 1 end; if current_token^.kind <> 0 then tokens_size := tokens_size + sizeof(Token) end end; print_tokens(tokens, tokens_size); free(input) end; begin compile(); test_record(); test_primitive(); exit(0) end.