diff options
Diffstat (limited to 'source/lexer.elna')
| -rw-r--r-- | source/lexer.elna | 952 |
1 files changed, 952 insertions, 0 deletions
diff --git a/source/lexer.elna b/source/lexer.elna new file mode 100644 index 0000000..d5f529b --- /dev/null +++ b/source/lexer.elna @@ -0,0 +1,952 @@ +(* This Source Code Form is subject to the terms of the Mozilla Public License, + v. 2.0. If a copy of the MPL was not distributed with this file, You can + obtain one at https://mozilla.org/MPL/2.0/. *) +module; + +import cstdio, cstring, cctype, cstdlib, common; + +const + CHUNK_SIZE := 85536u; + +type + (* + * Classification table assigns each possible character to a group (class). All + * characters of the same group are handled equivalently. + * + * Classification: + *) + TransitionClass = ( + invalid, + digit, + alpha, + space, + colon, + equals, + left_paren, + right_paren, + asterisk, + underscore, + single, + hex, + zero, + x, + eof, + dot, + minus, + single_quote, + double_quote, + greater, + less, + other + ); + TransitionState = ( + start, + colon, + identifier, + decimal, + greater, + minus, + left_paren, + less, + dot, + comment, + closing_comment, + character, + string, + leading_zero, + decimal_suffix, + finish + ); + LexerToken = record + kind: LexerKind; + value: union + booleanKind: Bool; + identifierKind: Identifier; + integerKind: Int; + stringKind: String + end; + start_location: TextLocation; + end_location: TextLocation + end; + TransitionAction = proc(^Lexer, ^LexerToken); + Transition = record + action: TransitionAction; + next_state: TransitionState + end; + TransitionClasses = [22]Transition; + + BufferPosition* = record + iterator: ^Char; + location: TextLocation + end; + Lexer* = record + input: ^FILE; + buffer: ^Char; + size: Word; + length: Word; + start: BufferPosition; + current: BufferPosition + end; + LexerKind* = ( + unknown, + identifier, + _if, + _then, + _else, + _elsif, + _while, + _do, + _proc, + _begin, + _end, + _extern, + _const, + _var, + _case, + _of, + _type, + _record, + _union, + pipe, + to, + boolean, + null, + and, + _or, + _xor, + not, + _return, + _cast, + 
shift_left,
    shift_right,
    left_paren,
    right_paren,
    left_square,
    right_square,
    greater_equal,
    less_equal,
    greater_than,
    less_than,
    not_equal,
    equal,
    semicolon,
    dot,
    comma,
    plus,
    minus,
    multiplication,
    division,
    remainder,
    assignment,
    colon,
    hat,
    at,
    comment,
    integer,
    word,
    character,
    string,
    _defer,
    exclamation,
    arrow,
    trait,
    _program,
    _module,
    _import
  );

var
  (* Character class lookup table. The entry for a character with code c is
   * stored at index c + 1 (index 1 is NUL, index 49 is '0'). The table has 256
   * entries because initialize_classification fills indices 1 to 256; with
   * only 128 entries the upper half was written out of bounds. *)
  classification: [256]TransitionClass;
  (* One row per TransitionState, one column per TransitionClass. *)
  transitions: [16]TransitionClasses;

(* Fills the classification table: indices 1 to 128 are assigned one by one
 * (the trailing comment names the character), indices 129 to 256 are all
 * classified as TransitionClass.other. *)
proc initialize_classification();
var
  i: Word;
begin
  classification[1] := TransitionClass.eof; (* NUL *)
  classification[2] := TransitionClass.invalid; (* SOH *)
  classification[3] := TransitionClass.invalid; (* STX *)
  classification[4] := TransitionClass.invalid; (* ETX *)
  classification[5] := TransitionClass.invalid; (* EOT *)
  classification[6] := TransitionClass.invalid; (* ENQ *)
  classification[7] := TransitionClass.invalid; (* ACK *)
  classification[8] := TransitionClass.invalid; (* BEL *)
  classification[9] := TransitionClass.invalid; (* BS *)
  classification[10] := TransitionClass.space; (* HT *)
  classification[11] := TransitionClass.space; (* LF *)
  classification[12] := TransitionClass.invalid; (* VT *)
  classification[13] := TransitionClass.invalid; (* FF *)
  classification[14] := TransitionClass.space; (* CR *)
  classification[15] := TransitionClass.invalid; (* SO *)
  classification[16] := TransitionClass.invalid; (* SI *)
  classification[17] := TransitionClass.invalid; (* DLE *)
  classification[18] := TransitionClass.invalid; (* DC1 *)
  classification[19] := TransitionClass.invalid; (* DC2 *)
  classification[20] := TransitionClass.invalid; (* DC3 *)
  classification[21] := TransitionClass.invalid; (* DC4 *)
  classification[22] := TransitionClass.invalid; (* NAK *)
  classification[23] := TransitionClass.invalid; (* SYN *)
  classification[24] := TransitionClass.invalid; (* ETB *)
  classification[25] := TransitionClass.invalid; (* CAN *)
  classification[26] := TransitionClass.invalid; (* EM *)
  classification[27] := TransitionClass.invalid; (* SUB *)
  classification[28] := TransitionClass.invalid; (* ESC *)
  classification[29] := TransitionClass.invalid; (* FS *)
  classification[30] := TransitionClass.invalid; (* GS *)
  classification[31] := TransitionClass.invalid; (* RS *)
  classification[32] := TransitionClass.invalid; (* US *)
  classification[33] := TransitionClass.space; (* Space *)
  classification[34] := TransitionClass.single; (* ! *)
  classification[35] := TransitionClass.double_quote; (* " *)
  classification[36] := TransitionClass.other; (* # *)
  classification[37] := TransitionClass.other; (* $ *)
  classification[38] := TransitionClass.single; (* % *)
  classification[39] := TransitionClass.single; (* & *)
  classification[40] := TransitionClass.single_quote; (* ' *)
  classification[41] := TransitionClass.left_paren; (* ( *)
  classification[42] := TransitionClass.right_paren; (* ) *)
  classification[43] := TransitionClass.asterisk; (* * *)
  classification[44] := TransitionClass.single; (* + *)
  classification[45] := TransitionClass.single; (* , *)
  classification[46] := TransitionClass.minus; (* - *)
  classification[47] := TransitionClass.dot; (* . *)
  classification[48] := TransitionClass.single; (* / *)
  classification[49] := TransitionClass.zero; (* 0 *)
  classification[50] := TransitionClass.digit; (* 1 *)
  classification[51] := TransitionClass.digit; (* 2 *)
  classification[52] := TransitionClass.digit; (* 3 *)
  classification[53] := TransitionClass.digit; (* 4 *)
  classification[54] := TransitionClass.digit; (* 5 *)
  classification[55] := TransitionClass.digit; (* 6 *)
  classification[56] := TransitionClass.digit; (* 7 *)
  classification[57] := TransitionClass.digit; (* 8 *)
  classification[58] := TransitionClass.digit; (* 9 *)
  classification[59] := TransitionClass.colon; (* : *)
  classification[60] := TransitionClass.single; (* ; *)
  classification[61] := TransitionClass.less; (* < *)
  classification[62] := TransitionClass.equals; (* = *)
  classification[63] := TransitionClass.greater; (* > *)
  classification[64] := TransitionClass.other; (* ? *)
  classification[65] := TransitionClass.single; (* @ *)
  classification[66] := TransitionClass.alpha; (* A *)
  classification[67] := TransitionClass.alpha; (* B *)
  classification[68] := TransitionClass.alpha; (* C *)
  classification[69] := TransitionClass.alpha; (* D *)
  classification[70] := TransitionClass.alpha; (* E *)
  classification[71] := TransitionClass.alpha; (* F *)
  classification[72] := TransitionClass.alpha; (* G *)
  classification[73] := TransitionClass.alpha; (* H *)
  classification[74] := TransitionClass.alpha; (* I *)
  classification[75] := TransitionClass.alpha; (* J *)
  classification[76] := TransitionClass.alpha; (* K *)
  classification[77] := TransitionClass.alpha; (* L *)
  classification[78] := TransitionClass.alpha; (* M *)
  classification[79] := TransitionClass.alpha; (* N *)
  classification[80] := TransitionClass.alpha; (* O *)
  classification[81] := TransitionClass.alpha; (* P *)
  classification[82] := TransitionClass.alpha; (* Q *)
  classification[83] := TransitionClass.alpha; (* R *)
  classification[84] := TransitionClass.alpha; (* S *)
  classification[85] := TransitionClass.alpha; (* T *)
  classification[86] := TransitionClass.alpha; (* U *)
  classification[87] := TransitionClass.alpha; (* V *)
  classification[88] := TransitionClass.alpha; (* W *)
  classification[89] := TransitionClass.alpha; (* X *)
  classification[90] := TransitionClass.alpha; (* Y *)
  classification[91] := TransitionClass.alpha; (* Z *)
  classification[92] := TransitionClass.single; (* [ *)
  classification[93] := TransitionClass.other; (* \ *)
  classification[94] := TransitionClass.single; (* ] *)
  classification[95] := TransitionClass.single; (* ^ *)
  classification[96] := TransitionClass.underscore; (* _ *)
  classification[97] := TransitionClass.other; (* ` *)
  classification[98] := TransitionClass.hex; (* a *)
  classification[99] := TransitionClass.hex; (* b *)
  classification[100] := TransitionClass.hex; (* c *)
  classification[101] := TransitionClass.hex; (* d *)
  classification[102] := TransitionClass.hex; (* e *)
  classification[103] := TransitionClass.hex; (* f *)
  classification[104] := TransitionClass.alpha; (* g *)
  classification[105] := TransitionClass.alpha; (* h *)
  classification[106] := TransitionClass.alpha; (* i *)
  classification[107] := TransitionClass.alpha; (* j *)
  classification[108] := TransitionClass.alpha; (* k *)
  classification[109] := TransitionClass.alpha; (* l *)
  classification[110] := TransitionClass.alpha; (* m *)
  classification[111] := TransitionClass.alpha; (* n *)
  classification[112] := TransitionClass.alpha; (* o *)
  classification[113] := TransitionClass.alpha; (* p *)
  classification[114] := TransitionClass.alpha; (* q *)
  classification[115] := TransitionClass.alpha; (* r *)
  classification[116] := TransitionClass.alpha; (* s *)
  classification[117] := TransitionClass.alpha; (* t *)
  classification[118] := TransitionClass.alpha; (* u *)
  classification[119] := TransitionClass.alpha; (* v *)
  classification[120] := TransitionClass.alpha; (* w *)
  classification[121] := TransitionClass.x; (* x *)
  classification[122] := TransitionClass.alpha; (* y *)
  classification[123] := TransitionClass.alpha; (* z *)
  classification[124] := TransitionClass.other; (* { *)
  classification[125] := TransitionClass.single; (* | *)
  classification[126] := TransitionClass.other; (* } *)
  classification[127] := TransitionClass.single; (* ~ *)
  classification[128] := TransitionClass.invalid; (* DEL *)

  (* Everything above DEL (indices 129 to 256) falls into the "other" class. *)
  i := 129u;
  while i <= 256u do
    classification[i] := TransitionClass.other;
    i := i + 1u
  end
end;

(* Compares the token text in [token_start.iterator, token_end) with keyword.
 *
 * A keyword character matches a token character if they are equal or if the
 * lower-cased keyword character equals the token character, so a keyword
 * spelled in upper case also accepts its lower-case spelling.
 *
 * Returns true only if every keyword character matched and both the keyword
 * and the token were consumed completely. *)
proc compare_keyword(keyword: String, token_start: BufferPosition, token_end: ^Char) -> Bool;
var
  result: Bool;
  index: Word;
  continue: Bool;
begin
  index := 0u;
  result := true;
  continue := (index < keyword.length) & (token_start.iterator <> token_end);

  while continue & result do
    (* Comparisons are parenthesized so the result does not depend on the
     * relative precedence of "=", "or" and "&". *)
    result := (keyword[index] = token_start.iterator^)
      or (cast(tolower(cast(keyword[index]: Int)): Char) = token_start.iterator^);
    token_start.iterator := token_start.iterator + 1;
    index := index + 1u;
    continue := (index < keyword.length) & (token_start.iterator <> token_end)
  end;
  (* The whole keyword must have been consumed... *)
  result := result & (index = keyword.length);

  (* ...and so must the whole token. *)
  return result & (token_start.iterator = token_end)
end;

(* Reached the end of file. *)
proc transition_action_eof(lexer: ^Lexer, token: ^LexerToken);
begin
  token^.kind := LexerKind.unknown
end;

(* Moves the iterator of the given buffer position one character forward. The
 * location stored in the position is not touched. *)
proc increment(position: ^BufferPosition);
begin
  position^.iterator := position^.iterator + 1
end;

(* Add the character to the token currently read and advance to the next character. *)
proc transition_action_accumulate(lexer: ^Lexer, token: ^LexerToken);
begin
  increment(@lexer^.current)
end;

(* The current character is not a part of the token. Finish the token already
 * read. Don't advance to the next character.
*) +proc transition_action_finalize(lexer: ^Lexer, token: ^LexerToken); +begin + if lexer^.start.iterator^ = ':' then + token^.kind := LexerKind.colon + end; + if lexer^.start.iterator^ = '>' then + token^.kind := LexerKind.greater_than + end; + if lexer^.start.iterator^ = '<' then + token^.kind := LexerKind.less_than + end; + if lexer^.start.iterator^ = '(' then + token^.kind := LexerKind.left_paren + end; + if lexer^.start.iterator^ = '-' then + token^.kind := LexerKind.minus + end; + if lexer^.start.iterator^ = '.' then + token^.kind := LexerKind.dot + end +end; + +(* An action for tokens containing multiple characters. *) +proc transition_action_composite(lexer: ^Lexer, token: ^LexerToken); +begin + if lexer^.start.iterator^ = '<' then + if lexer^.current.iterator^ = '>' then + token^.kind := LexerKind.not_equal + end; + if lexer^.current.iterator^ = '=' then + token^.kind := LexerKind.less_equal + end + end; + if (lexer^.start.iterator^ = '>') & (lexer^.current.iterator^ = '=') then + token^.kind := LexerKind.greater_equal + end; + if (lexer^.start.iterator^ = ':') & (lexer^.current.iterator^ = '=') then + token^.kind := LexerKind.assignment + end; + if (lexer^.start.iterator^ = '-') & (lexer^.current.iterator^ = '>') then + token^.kind := LexerKind.arrow + end; + increment(@lexer^.current) +end; + +(* Skip a space. *) +proc transition_action_skip(lexer: ^Lexer, token: ^LexerToken); +begin + increment(@lexer^.start); + + if lexer^.start.iterator^ = '\n' then + lexer^.start.location.line := lexer^.start.location.line + 1u; + lexer^.start.location.column := 1u + end; + lexer^.current := lexer^.start +end; + +(* Delimited string action. 
*) +proc transition_action_delimited(lexer: ^Lexer, token: ^LexerToken); +var + text_length: Word; +begin + if lexer^.start.iterator^ = '(' then + token^.kind := LexerKind.comment + end; + if lexer^.start.iterator^ = '"' then + text_length := cast(lexer^.current.iterator - lexer^.start.iterator + 1: Word); + + token^.value.stringKind := String(cast(malloc(text_length): ^Char), text_length); + memcpy(cast(token^.value.stringKind.ptr: Pointer), cast(lexer^.start.iterator: Pointer), text_length); + + token^.kind := LexerKind.character + end; + if lexer^.start.iterator^ = '\'' then + text_length := cast(lexer^.current.iterator - lexer^.start.iterator + 1: Word); + + token^.value.stringKind := String(cast(malloc(text_length): ^Char), text_length); + memcpy(cast(token^.value.stringKind.ptr: Pointer), cast(lexer^.start.iterator: Pointer), text_length); + + token^.kind := LexerKind.string + end; + increment(@lexer^.current) +end; + +(* Finalize keyword or identifier. *) +proc transition_action_key_id(lexer: ^Lexer, token: ^LexerToken); +begin + token^.kind := LexerKind.identifier; + + token^.value.identifierKind[1] := cast(lexer^.current.iterator - lexer^.start.iterator: Char); + memcpy(cast(@token^.value.identifierKind[2]: Pointer), cast(lexer^.start.iterator: Pointer), cast(token^.value.identifierKind[1]: Word)); + + if compare_keyword("program", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._program + end; + if compare_keyword("import", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._import + end; + if compare_keyword("const", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._const + end; + if compare_keyword("var", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._var + end; + if compare_keyword("if", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._if + end; + if compare_keyword("then", lexer^.start, lexer^.current.iterator) then + token^.kind := 
LexerKind._then + end; + if compare_keyword("elsif", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._elsif + end; + if compare_keyword("else", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._else + end; + if compare_keyword("while", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._while + end; + if compare_keyword("do", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._do + end; + if compare_keyword("proc", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._proc + end; + if compare_keyword("begin", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._begin + end; + if compare_keyword("end", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._end + end; + if compare_keyword("type", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._type + end; + if compare_keyword("record", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._record + end; + if compare_keyword("union", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._union + end; + if compare_keyword("NIL", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind.null + end; + if compare_keyword("or", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._or + end; + if compare_keyword("return", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._return + end; + if compare_keyword("defer", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._defer + end; + if compare_keyword("TO", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind.to + end; + if compare_keyword("CASE", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._case + end; + if compare_keyword("OF", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._of + end; + if compare_keyword("module", lexer^.start, 
lexer^.current.iterator) then + token^.kind := LexerKind._module + end; + if compare_keyword("xor", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind._xor + end; + if compare_keyword("TRUE", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind.boolean; + token^.value.booleanKind := true + end; + if compare_keyword("FALSE", lexer^.start, lexer^.current.iterator) then + token^.kind := LexerKind.boolean; + token^.value.booleanKind := false + end +end; + +(* Action for tokens containing only one character. The character cannot be + * followed by other characters forming a composite token. *) +proc transition_action_single(lexer: ^Lexer, token: ^LexerToken); +begin + if lexer^.current.iterator^ = '&' then + token^.kind := LexerKind.and + end; + if lexer^.current.iterator^ = ';' then + token^.kind := LexerKind.semicolon + end; + if lexer^.current.iterator^ = ',' then + token^.kind := LexerKind.comma + end; + if lexer^.current.iterator^ = '~' then + token^.kind := LexerKind.not + end; + if lexer^.current.iterator^ = ')' then + token^.kind := LexerKind.right_paren + end; + if lexer^.current.iterator^ = '[' then + token^.kind := LexerKind.left_square + end; + if lexer^.current.iterator^ = ']' then + token^.kind := LexerKind.right_square + end; + if lexer^.current.iterator^ = '^' then + token^.kind := LexerKind.hat + end; + if lexer^.current.iterator^ = '=' then + token^.kind := LexerKind.equal + end; + if lexer^.current.iterator^ = '+' then + token^.kind := LexerKind.plus + end; + if lexer^.current.iterator^ = '*' then + token^.kind := LexerKind.multiplication + end; + if lexer^.current.iterator^ = '/' then + token^.kind := LexerKind.division + end; + if lexer^.current.iterator^ = '%' then + token^.kind := LexerKind.remainder + end; + if lexer^.current.iterator^ = '@' then + token^.kind := LexerKind.at + end; + if lexer^.current.iterator^ = '|' then + token^.kind := LexerKind.pipe + end; + increment(@lexer^.current) +end; + +(* Handle 
an integer literal. *) +proc transition_action_integer(lexer: ^Lexer, token: ^LexerToken); +var + buffer: String; + integer_length: Word; + found: Bool; +begin + token^.kind := LexerKind.integer; + + integer_length := cast(lexer^.current.iterator - lexer^.start.iterator: Word); + memset(cast(token^.value.identifierKind.ptr: Pointer), 0, #size(Identifier)); + memcpy(cast(@token^.value.identifierKind[1]: Pointer), cast(lexer^.start.iterator: Pointer), integer_length); + + token^.value.identifierKind[cast(token^.value.identifierKind[1]: Int) + 2] := '\0'; + token^.value.integerKind := atoi(@token^.value.identifierKind[2]) +end; + +proc set_default_transition(current_state: TransitionState, default_action: TransitionAction, next_state: TransitionState) -> Int; +var + default_transition: Transition; + state_index: Int; +begin + default_transition.action := default_action; + default_transition.next_state := next_state; + state_index := cast(current_state: Int) + 1; + + transitions[state_index][cast(TransitionClass.invalid: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.digit: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.alpha: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.space: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.colon: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.equals: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.left_paren: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.right_paren: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.underscore: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.single: Int) + 1] := default_transition; + 
transitions[state_index][cast(TransitionClass.hex: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.zero: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.x: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.eof: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.dot: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.minus: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.single_quote: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.double_quote: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.greater: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.less: Int) + 1] := default_transition; + transitions[state_index][cast(TransitionClass.other: Int) + 1] := default_transition; + + return state_index +end; + +(* + * The transition table describes transitions from one state to another, given + * a symbol (character class). + * + * The table has m rows and n columns, where m is the amount of states and n is + * the amount of classes. So given the current state and a classified character + * the table can be used to look up the next state. + * + * Each cell is a word long. + * - The least significant byte of the word is a row number (beginning with 0). + * It specifies the target state. "ff" means that this is an end state and no + * transition is possible. + * - The next byte is the action that should be performed when transitioning. + * For the meaning of actions see labels in the lex_next function, which + * handles each action. + *) +proc initialize_transitions(); +var + state_index: Int; +begin + (* Start state. 
*) + state_index := cast(TransitionState.start: Int) + 1; + + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.decimal; + + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.space: Int) + 1].action := transition_action_skip; + transitions[state_index][cast(TransitionClass.space: Int) + 1].next_state := TransitionState.start; + + transitions[state_index][cast(TransitionClass.colon: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.colon: Int) + 1].next_state := TransitionState.colon; + + transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_single; + transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.left_paren: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.left_paren: Int) + 1].next_state := TransitionState.left_paren; + + transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].action := transition_action_single; + transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_single; + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.finish; + + 
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.single: Int) + 1].action := transition_action_single; + transitions[state_index][cast(TransitionClass.single: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.leading_zero; + + transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := transition_action_eof; + transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.dot: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.dot: Int) + 1].next_state := TransitionState.dot; + + transitions[state_index][cast(TransitionClass.minus: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.minus: Int) + 1].next_state := TransitionState.minus; + + transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].next_state := TransitionState.character; + + transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].action := 
transition_action_accumulate; + transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].next_state := TransitionState.string; + + transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.greater; + + transitions[state_index][cast(TransitionClass.less: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.less: Int) + 1].next_state := TransitionState.less; + + transitions[state_index][cast(TransitionClass.other: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.other: Int) + 1].next_state := TransitionState.finish; + + (* Colon state. *) + state_index := set_default_transition(TransitionState.colon, transition_action_finalize, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite; + transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish; + + (* Identifier state. 
*) + state_index := set_default_transition(TransitionState.identifier, transition_action_key_id, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.identifier; + + transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.identifier; + + (* Decimal state. 
*) + state_index := set_default_transition(TransitionState.decimal, transition_action_integer, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.decimal; + + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.decimal_suffix; + + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.decimal_suffix; + + transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.decimal; + + transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.decimal_suffix; + + (* Greater state. *) + state_index := set_default_transition(TransitionState.greater, transition_action_finalize, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite; + transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish; + + (* Minus state. 
*) + state_index := set_default_transition(TransitionState.minus, transition_action_finalize, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_composite; + transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.finish; + + (* Left paren state. *) + state_index := set_default_transition(TransitionState.left_paren, transition_action_finalize, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.comment; + + (* Less state. *) + state_index := set_default_transition(TransitionState.less, transition_action_finalize, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite; + transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_composite; + transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.finish; + + (* Hexadecimal after 0x. *) + state_index := set_default_transition(TransitionState.dot, transition_action_finalize, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.dot: Int) + 1].action := transition_action_composite; + transitions[state_index][cast(TransitionClass.dot: Int) + 1].next_state := TransitionState.finish; + + (* Comment. 
*) + state_index := set_default_transition(TransitionState.comment, transition_action_accumulate, TransitionState.comment); + + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.closing_comment; + + transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish; + + (* Closing comment. *) + state_index := set_default_transition(TransitionState.closing_comment, transition_action_accumulate, TransitionState.comment); + + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].action := transition_action_delimited; + transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate; + transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.closing_comment; + + transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish; + + (* Character. 
*) + state_index := set_default_transition(TransitionState.character, transition_action_accumulate, TransitionState.character); + + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].action := transition_action_delimited; + transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].next_state := TransitionState.finish; + + (* String. *) + state_index := set_default_transition(TransitionState.string, transition_action_accumulate, TransitionState.string); + + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].action := transition_action_delimited; + transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].next_state := TransitionState.finish; + + (* Leading zero. 
*) + state_index := set_default_transition(TransitionState.leading_zero, transition_action_integer, TransitionState.finish); + + transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.finish; + + transitions[state_index][cast(TransitionClass.x: Int) + 1].action := nil; + transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.finish; + + (* Digit with a character suffix. 
*)
  state_index := set_default_transition(TransitionState.decimal_suffix, transition_action_integer, TransitionState.finish);

  transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := nil;
  transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.finish;

  transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := nil;
  transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.finish;

  transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := nil;
  transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.finish;

  transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := nil;
  transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.finish;

  transitions[state_index][cast(TransitionClass.x: Int) + 1].action := nil;
  transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.finish
end;

(*
 * Prepares a lexer for reading from the given input stream.
 *
 * Allocates a buffer of CHUNK_SIZE bytes, fills it with zeros and stores it
 * in the lexer together with its size. The length is set to 0, which
 * lexer_lex treats as "nothing loaded yet". The buffer is released by
 * lexer_destroy.
 *
 * NOTE(review): the malloc result is passed to memset without a nil check.
 *)
proc lexer_make*(lexer: ^Lexer, input: ^FILE);
begin
  (* Buffer first: allocate, clear, remember the capacity. *)
  lexer^.buffer := cast(malloc(CHUNK_SIZE): ^Char);
  memset(cast(lexer^.buffer: Pointer), 0, CHUNK_SIZE);
  lexer^.size := CHUNK_SIZE;

  (* Then the input stream; no bytes have been read from it so far. *)
  lexer^.input := input;
  lexer^.length := 0u
end;

(* Returns the last read token.
*)
(*
 * Scans the token that begins at lexer^.start and returns it.
 *
 * The current position is rewound to the start position, then the transition
 * table is followed until the finish state is reached. Each step classifies
 * the character under the iterator, looks up the transition for the
 * (state, class) pair and runs its action, if it has one. The token locations
 * are taken from the start position and from the current position as it
 * stands after the loop.
 *)
proc lexer_current*(lexer: ^Lexer) -> LexerToken;
var
  current_class: TransitionClass;
  current_state: TransitionState;
  current_transition: Transition;
  result: LexerToken;
  char_index: Word;
  state_row: Word;
  class_column: Word;
begin
  (* A transition without an action leaves the token untouched. Preset the
     kind so that such a path does not return an indeterminate value. *)
  result.kind := LexerKind.unknown;

  lexer^.current := lexer^.start;
  current_state := TransitionState.start;

  while current_state <> TransitionState.finish do
    (* Every table index is the ordinal value plus one. *)
    char_index := cast(lexer^.current.iterator^: Word) + 1u;
    current_class := classification[char_index];

    state_row := cast(current_state: Word) + 1u;
    class_column := cast(current_class: Word) + 1u;

    current_transition := transitions[state_row][class_column];
    if current_transition.action <> nil then
      current_transition.action(lexer, @result)
    end;
    current_state := current_transition.next_state
  end;
  result.start_location := lexer^.start.location;
  result.end_location := lexer^.current.location;

  return result
end;

(*
 * Read and return the next token.
 *
 * While lexer^.length is 0 the input is loaded into the buffer and the current
 * position is reset to the buffer start at line 1, column 1. The token is then
 * scanned from the current position by lexer_current.
 *)
proc lexer_lex*(lexer: ^Lexer) -> LexerToken;
var
  result: LexerToken;
begin
  if lexer^.length = 0u then
    (* fread returns the number of complete items read. Items are requested as
       single bytes so that the result is a byte count; asking for one item of
       CHUNK_SIZE bytes yields 0 for every shorter input, which would make
       each call reset the position to the buffer start.

       One byte less than the buffer size is read, so the last byte keeps the
       zero written by lexer_make. lexer_current has no bounds check of its
       own; presumably the zero byte is what ends the scan - confirm against
       the classification table. *)
    lexer^.length := fread(cast(lexer^.buffer: Pointer), 1u, CHUNK_SIZE - 1u, lexer^.input);
    lexer^.current.location.column := 1u;
    lexer^.current.location.line := 1u;
    lexer^.current.iterator := lexer^.buffer
  end;
  lexer^.start := lexer^.current;

  result := lexer_current(lexer);
  return result
end;

(*
 * Releases the buffer allocated by lexer_make. The input stream is not
 * closed here.
 *)
proc lexer_destroy*(lexer: ^Lexer);
begin
  free(cast(lexer^.buffer: Pointer))
end;

(*
 * Runs initialize_classification and initialize_transitions. lexer_current
 * indexes the classification and transitions tables, so this presumably has
 * to run before the first token is read.
 *
 * NOTE(review): unlike the other lexer_* procedures this one is not exported;
 * confirm how callers are expected to run it.
 *)
proc lexer_initialize();
begin
  initialize_classification();
  initialize_transitions()
end;

end.
