summaryrefslogtreecommitdiff
path: root/source/lexer.elna
diff options
context:
space:
mode:
Diffstat (limited to 'source/lexer.elna')
-rw-r--r--source/lexer.elna952
1 files changed, 952 insertions, 0 deletions
diff --git a/source/lexer.elna b/source/lexer.elna
new file mode 100644
index 0000000..d5f529b
--- /dev/null
+++ b/source/lexer.elna
@@ -0,0 +1,952 @@
+(* This Source Code Form is subject to the terms of the Mozilla Public License,
+ v. 2.0. If a copy of the MPL was not distributed with this file, You can
+ obtain one at https://mozilla.org/MPL/2.0/. *)
+module;
+
+import cstdio, cstring, cctype, cstdlib, common;
+
+const
+ (* NOTE(review): presumably the size of one chunk of input read into the lexer
+    buffer; the constant is not referenced in the visible part of this file.
+    The value is not a power of two (65536 would be) - confirm it is intended. *)
+ CHUNK_SIZE := 85536u;
+
+type
+ (*
+ * Classification table assigns each possible character to a group (class). All
+ * characters of the same group are handled equivalently.
+ *
+ * The classes listed here are the columns of the transition table: the ordinal
+ * of a member plus one is used as the column index, so the members must not be
+ * reordered without updating the table size.
+ *)
+ TransitionClass = (
+ invalid,
+ digit, (* 1-9; 0 has its own class "zero". *)
+ alpha,
+ space,
+ colon,
+ equals,
+ left_paren,
+ right_paren,
+ asterisk,
+ underscore,
+ single, (* Characters that form a one-character token on their own. *)
+ hex, (* a-f *)
+ zero,
+ x,
+ eof,
+ dot,
+ minus,
+ single_quote,
+ double_quote,
+ greater,
+ less,
+ other
+ );
+ (* States of the lexer automaton. The ordinal of a member plus one is used as
+    the row index of the transition table. "finish" is the next state of every
+    transition that completes (or gives up on) a token. *)
+ TransitionState = (
+ start,
+ colon,
+ identifier,
+ decimal,
+ greater,
+ minus,
+ left_paren,
+ less,
+ dot,
+ comment,
+ closing_comment,
+ character,
+ string,
+ leading_zero,
+ decimal_suffix,
+ finish
+ );
+ (* A token produced by the lexer: its kind, an optional payload and the source
+    range it was read from. *)
+ LexerToken = record
+ kind: LexerKind;
+ (* Payload; which member is meaningful depends on the kind. *)
+ value: union
+ booleanKind: Bool; (* Set together with LexerKind.boolean. *)
+ identifierKind: Identifier; (* Element 1 holds the length, the text starts at element 2. *)
+ integerKind: Int; (* Set together with LexerKind.integer. *)
+ stringKind: String (* Text of character and string literals, delimiters included. *)
+ end;
+ start_location: TextLocation;
+ end_location: TextLocation
+ end;
+ (* Procedure executed when a transition is taken. It receives the lexer and the
+    token being built. *)
+ TransitionAction = proc(^Lexer, ^LexerToken);
+ (* One cell of the transition table: what to do and where to go next. The
+    action may be nil. *)
+ Transition = record
+ action: TransitionAction;
+ next_state: TransitionState
+ end;
+ (* One row of the transition table: a transition for each of the 22
+    character classes. *)
+ TransitionClasses = [22]Transition;
+
+ (* A position in the lexer input: a pointer to a character together with the
+    text location (line and column) it corresponds to. *)
+ BufferPosition* = record
+ iterator: ^Char;
+ location: TextLocation
+ end;
+ (* Lexer state. *)
+ Lexer* = record
+ (* NOTE(review): input, buffer, size and length are not used in the visible
+    part of this file; presumably the source file, the buffer holding its
+    contents, the buffer capacity and the number of characters read - confirm. *)
+ input: ^FILE;
+ buffer: ^Char;
+ size: Word;
+ length: Word;
+ (* Position of the first character of the token being read. *)
+ start: BufferPosition;
+ (* Position of the character currently examined. *)
+ current: BufferPosition
+ end;
+ (* Kinds of tokens the lexer can report. Members with a leading underscore are
+    named after words that are reserved in the language. "unknown" is also the
+    kind assigned when the end of the input is reached. *)
+ LexerKind* = (
+ unknown,
+ identifier,
+ _if,
+ _then,
+ _else,
+ _elsif,
+ _while,
+ _do,
+ _proc,
+ _begin,
+ _end,
+ _extern,
+ _const,
+ _var,
+ _case,
+ _of,
+ _type,
+ _record,
+ _union,
+ pipe,
+ to,
+ boolean,
+ null,
+ and,
+ _or,
+ _xor,
+ not,
+ _return,
+ _cast,
+ shift_left,
+ shift_right,
+ left_paren,
+ right_paren,
+ left_square,
+ right_square,
+ greater_equal,
+ less_equal,
+ greater_than,
+ less_than,
+ not_equal,
+ equal,
+ semicolon,
+ dot,
+ comma,
+ plus,
+ minus,
+ multiplication,
+ division,
+ remainder,
+ assignment,
+ colon,
+ hat,
+ at,
+ comment,
+ integer,
+ word,
+ character,
+ string,
+ _defer,
+ exclamation,
+ arrow,
+ trait,
+ _program,
+ _module,
+ _import
+ );
+
+var
+  (* Character class of every possible byte value. Indices are one-based: the
+     entry at index code + 1 describes the character with that code. The table
+     needs 256 entries because initialize_classification also fills the entries
+     129..256 (character codes 128..255). *)
+  classification: [256]TransitionClass;
+  (* Transition table: one row for each of the 16 states, one column for each
+     character class. *)
+  transitions: [16]TransitionClasses;
+
+(* Assigns the given class to every entry of the classification table with an
+ * index in the inclusive range first..last. *)
+proc classify_range(first: Word, last: Word, group: TransitionClass);
+var
+  i: Word;
+begin
+  i := first;
+  while i <= last do
+    classification[i] := group;
+    i := i + 1u
+  end
+end;
+
+(* Fills the classification table. Indices are one-based, so the entry at index
+ * code + 1 receives the class of the character with that code.
+ * NOTE(review): the last call writes the indices 129..256, which requires the
+ * table to have 256 entries. *)
+proc initialize_classification();
+begin
+  (* Control characters. *)
+  classification[1] := TransitionClass.eof; (* NUL *)
+  classify_range(2u, 9u, TransitionClass.invalid); (* SOH .. BS *)
+  classify_range(10u, 11u, TransitionClass.space); (* HT, LF *)
+  classify_range(12u, 13u, TransitionClass.invalid); (* VT, FF *)
+  classification[14] := TransitionClass.space; (* CR *)
+  classify_range(15u, 32u, TransitionClass.invalid); (* SO .. US *)
+
+  (* Space and punctuation before the digits. *)
+  classification[33] := TransitionClass.space; (* Space *)
+  classification[34] := TransitionClass.single; (* ! *)
+  classification[35] := TransitionClass.double_quote; (* " *)
+  classify_range(36u, 37u, TransitionClass.other); (* #, $ *)
+  classify_range(38u, 39u, TransitionClass.single); (* %, & *)
+  classification[40] := TransitionClass.single_quote; (* ' *)
+  classification[41] := TransitionClass.left_paren; (* ( *)
+  classification[42] := TransitionClass.right_paren; (* ) *)
+  classification[43] := TransitionClass.asterisk; (* * *)
+  classify_range(44u, 45u, TransitionClass.single); (* +, comma *)
+  classification[46] := TransitionClass.minus; (* - *)
+  classification[47] := TransitionClass.dot; (* . *)
+  classification[48] := TransitionClass.single; (* / *)
+
+  (* Digits. *)
+  classification[49] := TransitionClass.zero; (* 0 *)
+  classify_range(50u, 58u, TransitionClass.digit); (* 1 .. 9 *)
+
+  (* Punctuation between the digits and the upper case letters. *)
+  classification[59] := TransitionClass.colon; (* : *)
+  classification[60] := TransitionClass.single; (* ; *)
+  classification[61] := TransitionClass.less; (* < *)
+  classification[62] := TransitionClass.equals; (* = *)
+  classification[63] := TransitionClass.greater; (* > *)
+  classification[64] := TransitionClass.other; (* ? *)
+  classification[65] := TransitionClass.single; (* @ *)
+
+  (* Upper case letters. *)
+  classify_range(66u, 91u, TransitionClass.alpha); (* A .. Z *)
+
+  classification[92] := TransitionClass.single; (* [ *)
+  classification[93] := TransitionClass.other; (* \ *)
+  classify_range(94u, 95u, TransitionClass.single); (* ], ^ *)
+  classification[96] := TransitionClass.underscore; (* _ *)
+  classification[97] := TransitionClass.other; (* ` *)
+
+  (* Lower case letters; a-f and x have classes of their own. *)
+  classify_range(98u, 103u, TransitionClass.hex); (* a .. f *)
+  classify_range(104u, 120u, TransitionClass.alpha); (* g .. w *)
+  classification[121] := TransitionClass.x; (* x *)
+  classify_range(122u, 123u, TransitionClass.alpha); (* y, z *)
+
+  classification[124] := TransitionClass.other; (* { *)
+  classification[125] := TransitionClass.single; (* | *)
+  classification[126] := TransitionClass.other; (* } *)
+  classification[127] := TransitionClass.single; (* ~ *)
+  classification[128] := TransitionClass.invalid; (* DEL *)
+
+  (* Everything above the 7-bit range. *)
+  classify_range(129u, 256u, TransitionClass.other)
+end;
+
+(* Tells whether the token text in [token_start.iterator, token_end) spells the
+ * given keyword. Each keyword character matches either itself or its lower case
+ * form, and the token has to be exactly as long as the keyword. *)
+proc compare_keyword(keyword: String, token_start: BufferPosition, token_end: ^Char) -> Bool;
+var
+  matched: Bool;
+  position: Word;
+  cursor: ^Char;
+  expected: Char;
+begin
+  matched := true;
+  position := 0u;
+  cursor := token_start.iterator;
+
+  (* Walk the keyword and the token side by side until one of them ends or a
+     character differs. *)
+  while matched & (position < keyword.length) & (cursor <> token_end) do
+    expected := keyword[position];
+    matched := (expected = cursor^) or (cast(tolower(cast(expected: Int)): Char) = cursor^);
+
+    cursor := cursor + 1;
+    position := position + 1u
+  end;
+  (* Both the keyword and the token must have been consumed completely. *)
+  return matched & (position = keyword.length) & (cursor = token_end)
+end;
+
+(* Reached the end of file. The token kind is set to LexerKind.unknown; the
+ * lexer position is left untouched. *)
+proc transition_action_eof(lexer: ^Lexer, token: ^LexerToken);
+begin
+ token^.kind := LexerKind.unknown
+end;
+
+(* Moves the position one character forward. Only the iterator is advanced; the
+ * text location stored in the position is not changed here. *)
+proc increment(position: ^BufferPosition);
+begin
+ position^.iterator := position^.iterator + 1
+end;
+
+(* Add the character to the token currently read and advance to the next
+ * character. The token itself is not modified, only the current position of
+ * the lexer moves. *)
+proc transition_action_accumulate(lexer: ^Lexer, token: ^LexerToken);
+begin
+ increment(@lexer^.current)
+end;
+
+(* Completes a token that ended right before the current character. The kind is
+ * derived from the first character of the token; the current character is not
+ * consumed. If the first character is none of the ones listed, the kind is left
+ * as it is. *)
+proc transition_action_finalize(lexer: ^Lexer, token: ^LexerToken);
+var
+  first: Char;
+begin
+  first := lexer^.start.iterator^;
+
+  if first = ':' then
+    token^.kind := LexerKind.colon
+  elsif first = '>' then
+    token^.kind := LexerKind.greater_than
+  elsif first = '<' then
+    token^.kind := LexerKind.less_than
+  elsif first = '(' then
+    token^.kind := LexerKind.left_paren
+  elsif first = '-' then
+    token^.kind := LexerKind.minus
+  elsif first = '.' then
+    token^.kind := LexerKind.dot
+  end
+end;
+
+(* Completes a token made of two characters: the first character of the token
+ * and the current one. The current character is consumed afterwards. Pairs not
+ * listed here leave the token kind as it is. *)
+proc transition_action_composite(lexer: ^Lexer, token: ^LexerToken);
+var
+  first: Char;
+  second: Char;
+begin
+  first := lexer^.start.iterator^;
+  second := lexer^.current.iterator^;
+
+  if (first = '<') & (second = '>') then
+    token^.kind := LexerKind.not_equal
+  elsif (first = '<') & (second = '=') then
+    token^.kind := LexerKind.less_equal
+  elsif (first = '>') & (second = '=') then
+    token^.kind := LexerKind.greater_equal
+  elsif (first = ':') & (second = '=') then
+    token^.kind := LexerKind.assignment
+  elsif (first = '-') & (second = '>') then
+    token^.kind := LexerKind.arrow
+  end;
+  increment(@lexer^.current)
+end;
+
+(* Skip a whitespace character. The start position is moved past the character
+ * and the current position is reset to the new start, so the next token begins
+ * after the skipped character. If the skipped character is a line feed, the
+ * line counter is incremented and the column is reset to 1. *)
+proc transition_action_skip(lexer: ^Lexer, token: ^LexerToken);
+begin
+  (* The character being skipped has to be examined before the position moves
+     past it. Testing after the increment looks at the following character
+     instead, so a line feed directly after a token would not be counted. *)
+  if lexer^.start.iterator^ = '\n' then
+    lexer^.start.location.line := lexer^.start.location.line + 1u;
+    lexer^.start.location.column := 1u
+  end;
+  increment(@lexer^.start);
+
+  lexer^.current := lexer^.start
+end;
+
+(* Delimited token action: completes a comment, a string literal or a character
+ * literal when its closing delimiter is the current character, then consumes
+ * that delimiter.
+ *
+ * The kind is chosen by the first character of the token:
+ * - '(' is a comment,
+ * - '"' is a string literal,
+ * - '\'' is a character literal.
+ * This matches the start state, which enters the string state on a double
+ * quote and the character state on a single quote.
+ *
+ * For literals the whole text, both delimiters included, is copied into a newly
+ * allocated buffer stored in the token value.
+ * NOTE(review): the result of malloc is not checked. *)
+proc transition_action_delimited(lexer: ^Lexer, token: ^LexerToken);
+var
+  text_length: Word;
+  delimiter: Char;
+begin
+  delimiter := lexer^.start.iterator^;
+
+  if delimiter = '(' then
+    token^.kind := LexerKind.comment
+  end;
+  if (delimiter = '"') or (delimiter = '\'') then
+    (* The current character is the closing delimiter, hence the + 1. *)
+    text_length := cast(lexer^.current.iterator - lexer^.start.iterator + 1: Word);
+
+    token^.value.stringKind := String(cast(malloc(text_length): ^Char), text_length);
+    memcpy(cast(token^.value.stringKind.ptr: Pointer), cast(lexer^.start.iterator: Pointer), text_length);
+
+    if delimiter = '"' then
+      token^.kind := LexerKind.string
+    else
+      token^.kind := LexerKind.character
+    end
+  end;
+  increment(@lexer^.current)
+end;
+
+(* Finalize keyword or identifier. The token text is stored in the token value
+ * with the length in the first element and the characters after it. The kind is
+ * LexerKind.identifier unless the text spells one of the keywords below; for
+ * TRUE and FALSE the boolean value replaces the stored text. *)
+proc transition_action_key_id(lexer: ^Lexer, token: ^LexerToken);
+var
+  token_start: BufferPosition;
+  token_end: ^Char;
+begin
+  token_start := lexer^.start;
+  token_end := lexer^.current.iterator;
+
+  token^.kind := LexerKind.identifier;
+  token^.value.identifierKind[1] := cast(token_end - token_start.iterator: Char);
+  memcpy(cast(@token^.value.identifierKind[2]: Pointer), cast(token_start.iterator: Pointer), cast(token^.value.identifierKind[1]: Word));
+
+  (* No two keywords can match the same text, so the first match decides. *)
+  if compare_keyword("program", token_start, token_end) then
+    token^.kind := LexerKind._program
+  elsif compare_keyword("import", token_start, token_end) then
+    token^.kind := LexerKind._import
+  elsif compare_keyword("const", token_start, token_end) then
+    token^.kind := LexerKind._const
+  elsif compare_keyword("var", token_start, token_end) then
+    token^.kind := LexerKind._var
+  elsif compare_keyword("if", token_start, token_end) then
+    token^.kind := LexerKind._if
+  elsif compare_keyword("then", token_start, token_end) then
+    token^.kind := LexerKind._then
+  elsif compare_keyword("elsif", token_start, token_end) then
+    token^.kind := LexerKind._elsif
+  elsif compare_keyword("else", token_start, token_end) then
+    token^.kind := LexerKind._else
+  elsif compare_keyword("while", token_start, token_end) then
+    token^.kind := LexerKind._while
+  elsif compare_keyword("do", token_start, token_end) then
+    token^.kind := LexerKind._do
+  elsif compare_keyword("proc", token_start, token_end) then
+    token^.kind := LexerKind._proc
+  elsif compare_keyword("begin", token_start, token_end) then
+    token^.kind := LexerKind._begin
+  elsif compare_keyword("end", token_start, token_end) then
+    token^.kind := LexerKind._end
+  elsif compare_keyword("type", token_start, token_end) then
+    token^.kind := LexerKind._type
+  elsif compare_keyword("record", token_start, token_end) then
+    token^.kind := LexerKind._record
+  elsif compare_keyword("union", token_start, token_end) then
+    token^.kind := LexerKind._union
+  elsif compare_keyword("NIL", token_start, token_end) then
+    token^.kind := LexerKind.null
+  elsif compare_keyword("or", token_start, token_end) then
+    token^.kind := LexerKind._or
+  elsif compare_keyword("return", token_start, token_end) then
+    token^.kind := LexerKind._return
+  elsif compare_keyword("defer", token_start, token_end) then
+    token^.kind := LexerKind._defer
+  elsif compare_keyword("TO", token_start, token_end) then
+    token^.kind := LexerKind.to
+  elsif compare_keyword("CASE", token_start, token_end) then
+    token^.kind := LexerKind._case
+  elsif compare_keyword("OF", token_start, token_end) then
+    token^.kind := LexerKind._of
+  elsif compare_keyword("module", token_start, token_end) then
+    token^.kind := LexerKind._module
+  elsif compare_keyword("xor", token_start, token_end) then
+    token^.kind := LexerKind._xor
+  elsif compare_keyword("TRUE", token_start, token_end) then
+    token^.kind := LexerKind.boolean;
+    token^.value.booleanKind := true
+  elsif compare_keyword("FALSE", token_start, token_end) then
+    token^.kind := LexerKind.boolean;
+    token^.value.booleanKind := false
+  end
+end;
+
+(* Action for tokens containing only one character. The character cannot be
+ * followed by other characters forming a composite token. The kind is derived
+ * from the current character, which is consumed afterwards.
+ *
+ * The exclamation mark is handled as well: it is classified as a
+ * single-character token and has a token kind of its own, so it must not leave
+ * the token kind unassigned. *)
+proc transition_action_single(lexer: ^Lexer, token: ^LexerToken);
+var
+  symbol: Char;
+begin
+  symbol := lexer^.current.iterator^;
+
+  if symbol = '&' then
+    token^.kind := LexerKind.and
+  elsif symbol = ';' then
+    token^.kind := LexerKind.semicolon
+  elsif symbol = ',' then
+    token^.kind := LexerKind.comma
+  elsif symbol = '~' then
+    token^.kind := LexerKind.not
+  elsif symbol = ')' then
+    token^.kind := LexerKind.right_paren
+  elsif symbol = '[' then
+    token^.kind := LexerKind.left_square
+  elsif symbol = ']' then
+    token^.kind := LexerKind.right_square
+  elsif symbol = '^' then
+    token^.kind := LexerKind.hat
+  elsif symbol = '=' then
+    token^.kind := LexerKind.equal
+  elsif symbol = '+' then
+    token^.kind := LexerKind.plus
+  elsif symbol = '*' then
+    token^.kind := LexerKind.multiplication
+  elsif symbol = '/' then
+    token^.kind := LexerKind.division
+  elsif symbol = '%' then
+    token^.kind := LexerKind.remainder
+  elsif symbol = '@' then
+    token^.kind := LexerKind.at
+  elsif symbol = '|' then
+    token^.kind := LexerKind.pipe
+  elsif symbol = '!' then
+    token^.kind := LexerKind.exclamation
+  end;
+  increment(@lexer^.current)
+end;
+
+(* Handle an integer literal. The digits between the start and the current
+ * position are converted with atoi and stored in the token value; the current
+ * character is not consumed.
+ *
+ * The identifier member of the token value serves as scratch space for the
+ * NUL-terminated copy of the digits that atoi needs. The digits are copied to
+ * the beginning of that space and converted from the beginning, so no digit is
+ * skipped and no digit is mistaken for a length.
+ * NOTE(review): a literal that does not fit into Identifier overflows the
+ * scratch space; there is no length check here. *)
+proc transition_action_integer(lexer: ^Lexer, token: ^LexerToken);
+var
+  integer_length: Word;
+begin
+  token^.kind := LexerKind.integer;
+  integer_length := cast(lexer^.current.iterator - lexer^.start.iterator: Word);
+
+  memset(cast(token^.value.identifierKind.ptr: Pointer), 0, #size(Identifier));
+  memcpy(cast(@token^.value.identifierKind[1]: Pointer), cast(lexer^.start.iterator: Pointer), integer_length);
+  (* Indices are one-based, the terminator goes right after the last digit. *)
+  token^.value.identifierKind[integer_length + 1u] := '\0';
+
+  (* atoi has finished reading the scratch space before the result overwrites
+     the union. *)
+  token^.value.integerKind := atoi(@token^.value.identifierKind[1])
+end;
+
+(* Fills the whole row of the transition table that belongs to current_state
+ * with the same transition, made of default_action and next_state. Returns the
+ * one-based row index so the caller can override single cells afterwards. *)
+proc set_default_transition(current_state: TransitionState, default_action: TransitionAction, next_state: TransitionState) -> Int;
+var
+  fallback: Transition;
+  row_index: Int;
+  class_index: Int;
+  last_class_index: Int;
+begin
+  fallback.action := default_action;
+  fallback.next_state := next_state;
+  row_index := cast(current_state: Int) + 1;
+
+  (* TransitionClass.invalid is the first and TransitionClass.other the last
+     class, so this range covers every column of the row. *)
+  class_index := cast(TransitionClass.invalid: Int) + 1;
+  last_class_index := cast(TransitionClass.other: Int) + 1;
+
+  while class_index <= last_class_index do
+    transitions[row_index][class_index] := fallback;
+    class_index := class_index + 1
+  end;
+  return row_index
+end;
+
+(*
+ * The transition table describes transitions from one state to another, given
+ * a symbol (character class).
+ *
+ * The table has m rows and n columns, where m is the amount of states and n is
+ * the amount of classes. So given the current state and a classified character
+ * the table can be used to look up the next state.
+ *
+ * Each cell is a Transition record.
+ * - next_state specifies the target state. TransitionState.finish means that
+ *   this is an end state and no transition is possible.
+ * - action is the procedure that should be performed when transitioning; it
+ *   is nil when there is nothing to perform. For the meaning of the actions
+ *   see the transition_action_* procedures.
+ *)
+proc initialize_transitions();
+var
+ state_index: Int;
+begin
+ (* Start state. *)
+ state_index := cast(TransitionState.start: Int) + 1;
+
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.decimal;
+
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.space: Int) + 1].action := transition_action_skip;
+ transitions[state_index][cast(TransitionClass.space: Int) + 1].next_state := TransitionState.start;
+
+ transitions[state_index][cast(TransitionClass.colon: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.colon: Int) + 1].next_state := TransitionState.colon;
+
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_single;
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.left_paren: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.left_paren: Int) + 1].next_state := TransitionState.left_paren;
+
+ transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].action := transition_action_single;
+ transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_single;
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.single: Int) + 1].action := transition_action_single;
+ transitions[state_index][cast(TransitionClass.single: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.leading_zero;
+
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := transition_action_eof;
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.dot: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.dot: Int) + 1].next_state := TransitionState.dot;
+
+ transitions[state_index][cast(TransitionClass.minus: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.minus: Int) + 1].next_state := TransitionState.minus;
+
+ transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].next_state := TransitionState.character;
+
+ transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].next_state := TransitionState.string;
+
+ transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.greater;
+
+ transitions[state_index][cast(TransitionClass.less: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.less: Int) + 1].next_state := TransitionState.less;
+
+ transitions[state_index][cast(TransitionClass.other: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.other: Int) + 1].next_state := TransitionState.finish;
+
+ (* Colon state. *)
+ state_index := set_default_transition(TransitionState.colon, transition_action_finalize, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite;
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
+
+ (* Identifier state. *)
+ state_index := set_default_transition(TransitionState.identifier, transition_action_key_id, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.identifier;
+
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.identifier;
+
+ (* Decimal state. *)
+ state_index := set_default_transition(TransitionState.decimal, transition_action_integer, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.decimal;
+
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.decimal_suffix;
+
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.decimal_suffix;
+
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.decimal;
+
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.decimal_suffix;
+
+ (* Greater state. *)
+ state_index := set_default_transition(TransitionState.greater, transition_action_finalize, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite;
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
+
+ (* Minus state. *)
+ state_index := set_default_transition(TransitionState.minus, transition_action_finalize, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_composite;
+ transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.finish;
+
+ (* Left paren state. *)
+ state_index := set_default_transition(TransitionState.left_paren, transition_action_finalize, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.comment;
+
+ (* Less state. *)
+ state_index := set_default_transition(TransitionState.less, transition_action_finalize, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite;
+ transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_composite;
+ transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.finish;
+
+ (* Dot state. *)
+ state_index := set_default_transition(TransitionState.dot, transition_action_finalize, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.dot: Int) + 1].action := transition_action_composite;
+ transitions[state_index][cast(TransitionClass.dot: Int) + 1].next_state := TransitionState.finish;
+
+ (* Comment. *)
+ state_index := set_default_transition(TransitionState.comment, transition_action_accumulate, TransitionState.comment);
+
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.closing_comment;
+
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
+
+ (* Closing comment. *)
+ state_index := set_default_transition(TransitionState.closing_comment, transition_action_accumulate, TransitionState.comment);
+
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].action := transition_action_delimited;
+ transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate;
+ transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.closing_comment;
+
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
+
+ (* Character. *)
+ state_index := set_default_transition(TransitionState.character, transition_action_accumulate, TransitionState.character);
+
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].action := transition_action_delimited;
+ transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].next_state := TransitionState.finish;
+
+ (* String. *)
+ state_index := set_default_transition(TransitionState.string, transition_action_accumulate, TransitionState.string);
+
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].action := transition_action_delimited;
+ transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].next_state := TransitionState.finish;
+
+ (* Leading zero. *)
+ state_index := set_default_transition(TransitionState.leading_zero, transition_action_integer, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.finish;
+
+ (* Digit with a character suffix. *)
+ state_index := set_default_transition(TransitionState.decimal_suffix, transition_action_integer, TransitionState.finish);
+
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.finish;
+
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].action := nil;
+ transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.finish
+end;
+
+(*
+ * Prepares a lexer for reading from the given input stream.
+ *
+ * A buffer of CHUNK_SIZE bytes is allocated, filled with zeros and its size
+ * is recorded. The buffered length is set to zero; lexer_lex reads the input
+ * when it sees a zero length.
+ *
+ * NOTE(review): the malloc result is used without a nil check.
+ *)
+proc lexer_make*(lexer: ^Lexer, input: ^FILE);
+begin
+ (* Allocate and clear the buffer before anything else refers to it. *)
+ lexer^.buffer := cast(malloc(CHUNK_SIZE): ^Char);
+ memset(cast(lexer^.buffer: Pointer), 0, CHUNK_SIZE);
+ lexer^.size := CHUNK_SIZE;
+
+ (* Nothing has been read from the input yet. *)
+ lexer^.input := input;
+ lexer^.length := 0u
+end;
+
+(*
+ * Scans one token beginning at lexer^.start and returns it.
+ *
+ * The current position is rewound to the start position, then the state
+ * machine is driven through the transitions table until it reaches the
+ * finish state. The returned token carries the start location and the
+ * location of the current position after scanning stopped.
+ *)
+proc lexer_current*(lexer: ^Lexer) -> LexerToken;
+var
+ token: LexerToken;
+ machine_state: TransitionState;
+ char_class: TransitionClass;
+ step: Transition;
+ char_index: Word;
+ state_index: Word;
+ class_index: Word;
+begin
+ lexer^.current := lexer^.start;
+ machine_state := TransitionState.start;
+
+ while machine_state <> TransitionState.finish do
+ (* Table indices are the ordinal value plus one. *)
+ char_index := cast(lexer^.current.iterator^: Word) + 1u;
+ char_class := classification[char_index];
+
+ state_index := cast(machine_state: Word) + 1u;
+ class_index := cast(char_class: Word) + 1u;
+ step := transitions[state_index][class_index];
+
+ (* A transition without an action only switches the state. *)
+ if step.action <> nil then
+ step.action(lexer, @token)
+ end;
+ machine_state := step.next_state
+ end;
+ token.start_location := lexer^.start.location;
+ token.end_location := lexer^.current.location;
+
+ return token
+end;
+
+(*
+ * Reads and returns the next token.
+ *
+ * While nothing has been buffered (length is zero) the input is read into
+ * the buffer and the current position is reset to the beginning of the
+ * buffer at line 1, column 1. The token is then scanned by lexer_current
+ * starting from the current position, which becomes the new start position.
+ *)
+proc lexer_lex*(lexer: ^Lexer) -> LexerToken;
+var
+ result: LexerToken;
+begin
+ if lexer^.length = 0u then
+ (* The element size is one byte so that fread reports the number of bytes
+ read. Requesting a single CHUNK_SIZE-sized element yields 0 for every
+ input shorter than the chunk, which kept length at zero and rewound the
+ lexer to the first token on each call. *)
+ lexer^.length := fread(cast(lexer^.buffer: Pointer), 1u, CHUNK_SIZE, lexer^.input);
+ lexer^.current.location.column := 1u;
+ lexer^.current.location.line := 1u;
+ lexer^.current.iterator := lexer^.buffer
+ end;
+ lexer^.start := lexer^.current;
+
+ result := lexer_current(lexer);
+ return result
+end;
+
+(*
+ * Releases the buffer owned by the lexer.
+ *
+ * The buffer pointer and the recorded size are cleared afterwards so that a
+ * repeated call passes nil to free instead of an already released pointer.
+ * The input stream is left untouched.
+ *)
+proc lexer_destroy*(lexer: ^Lexer);
+begin
+ free(cast(lexer^.buffer: Pointer));
+ lexer^.buffer := nil;
+ lexer^.size := 0u
+end;
+
+(*
+ * Runs initialize_classification followed by initialize_transitions.
+ * NOTE(review): presumably these fill the classification and transitions
+ * tables that lexer_current indexes; confirm before changing the call order.
+ *)
+proc lexer_initialize();
+begin
+ initialize_classification();
+ initialize_transitions()
+end;
+
+end.