965 lines
39 KiB
Plaintext
965 lines
39 KiB
Plaintext
(* This Source Code Form is subject to the terms of the Mozilla Public License,
|
|
v. 2.0. If a copy of the MPL was not distributed with this file, You can
|
|
obtain one at https://mozilla.org/MPL/2.0/. *)
|
|
module;
|
|
|
|
import cstdio, cstring, common;
|
|
|
|
const
  (* Size of one chunk; presumably the number of bytes of lexer input read at
   * once - TODO confirm against the code that uses it. *)
  CHUNK_SIZE := 85536;
|
|
|
|
type
|
|
(*
 * Character classes.
 *
 * The classification table assigns each possible character to a class. All
 * characters of the same class are handled equivalently by the transition
 * table.
 *
 * The declaration order is significant: the ordinal of a member plus one is
 * used as the column index into a row of the transition table.
 *)
TransitionClass = (
  invalid,
  digit,
  alpha,
  space,
  colon,
  equals,
  left_paren,
  right_paren,
  asterisk,
  underscore,
  single,
  hex,
  zero,
  x,
  eof,
  dot,
  minus,
  single_quote,
  double_quote,
  greater,
  less,
  other
);
|
|
(*
 * States of the lexer state machine.
 *
 * The declaration order is significant: the ordinal of a member plus one is
 * used as the row index into the transition table.
 *)
TransitionState = (
  start,
  colon,
  identifier,
  decimal,
  greater,
  minus,
  left_paren,
  less,
  dot,
  comment,
  closing_comment,
  character,
  string,
  leading_zero,
  decimal_suffix,
  finish
);
|
|
(*
 * A token produced by the lexer.
 *
 * kind tells which member of the value union, if any, carries data: the
 * transition actions fill booleanKind for boolean tokens, identifierKind for
 * identifiers, integerKind for integers and stringKind for delimited text.
 *)
LexerToken = record
  kind: LexerKind;
  value: union
    booleanKind: Bool;
    identifierKind: Identifier;
    integerKind: Int;
    stringKind: String
  end;
  (* Token boundaries; they are not assigned in this part of the file. *)
  start_location: TextLocation;
  end_location: TextLocation
end;
|
|
(* Procedure performed when a transition is taken. It receives the lexer and
 * the token being built. *)
TransitionAction = proc(^Lexer, ^LexerToken);

(* One cell of the transition table: what to do and which state follows. *)
Transition = record
  action: TransitionAction;
  next_state: TransitionState
end;

(* One row of the transition table: a transition for each of the 22 character
 * classes. *)
TransitionClasses = [22]Transition;
|
|
|
|
(* A position in the lexer input: a pointer to a character together with the
 * text location of that character. *)
BufferPosition* = record
  iterator: ^Char;
  location: TextLocation
end;
|
|
(*
 * Lexer state.
 *
 * start marks the first character of the token being read and current the
 * character that is examined next; the transition actions move both.
 *)
Lexer* = record
  (* Input file and its buffer. size and length are not used in this part of
   * the file; presumably the buffer capacity and the number of buffered
   * characters - TODO confirm. *)
  input: ^FILE;
  buffer: ^Char;
  size: Word;
  length: Word;
  start: BufferPosition;
  current: BufferPosition
end;
|
|
(*
 * Kinds of tokens the lexer can produce.
 *
 * Members that would collide with a keyword of the language carry a leading
 * underscore. The member for "[" is spelled left_square, matching both its
 * sibling right_square and the reference in transition_action_single.
 *)
LexerKind* = (
  eof,
  identifier,
  _if,
  _then,
  _else,
  _elsif,
  _while,
  _do,
  _proc,
  _begin,
  _end,
  _xor,
  _const,
  _var,
  _case,
  _of,
  _type,
  _record,
  _union,
  pipe,
  to,
  boolean,
  null,
  and,
  _or,
  tilde,
  _return,
  _defer,
  range,
  left_paren,
  right_paren,
  left_square,
  right_square,
  greater_equal,
  less_equal,
  greater_than,
  less_than,
  not_equal,
  equal,
  semicolon,
  dot,
  comma,
  plus,
  minus,
  asterisk,
  division,
  remainder,
  assignment,
  colon,
  hat,
  at,
  comment,
  integer,
  word,
  character,
  string,
  from,
  pointer,
  array,
  arrow,
  _program,
  _module,
  _import
);
|
|
|
|
var
  (* Character class of every byte value, indexed by the character code plus
   * one. It needs 256 entries: initialize_classification fills the entries
   * 1 to 128 for ASCII and 129 to 256 for the remaining byte values. *)
  classification: [256]TransitionClass;
  (* Transition table: one row per lexer state, one cell per character class.
   * Both indices are the ordinal of the enumeration member plus one. *)
  transitions: [16]TransitionClasses;
|
|
|
|
(*
 * Fills the classification table.
 *
 * The table is indexed by the character code plus one, so entry 1 belongs to
 * NUL, entry 33 to the space character and entry 128 to DEL.
 *)
proc initialize_classification();
var
  code: Word;
begin
  (* Control characters NUL to US are invalid unless overridden below. *)
  code := 1u;
  while code <= 32u do
    classification[code] := TransitionClass.invalid;
    code := code + 1u
  end;
  classification[128] := TransitionClass.invalid; (* DEL *)

  (* NUL marks the end of the input. *)
  classification[1] := TransitionClass.eof; (* NUL *)

  (* Whitespace. *)
  classification[10] := TransitionClass.space; (* HT *)
  classification[11] := TransitionClass.space; (* LF *)
  classification[14] := TransitionClass.space; (* CR *)
  classification[33] := TransitionClass.space; (* Space *)

  (* Decimal digits; zero has a class of its own. *)
  classification[49] := TransitionClass.zero; (* 0 *)
  code := 50u;
  while code <= 58u do
    classification[code] := TransitionClass.digit; (* 1 to 9 *)
    code := code + 1u
  end;

  (* Upper case letters A to Z. *)
  code := 66u;
  while code <= 91u do
    classification[code] := TransitionClass.alpha;
    code := code + 1u
  end;

  (* Lower case letters: a to f are hexadecimal digits, x has a class of its
   * own, all the others are alphabetic. *)
  code := 98u;
  while code <= 103u do
    classification[code] := TransitionClass.hex;
    code := code + 1u
  end;
  code := 104u;
  while code <= 123u do
    classification[code] := TransitionClass.alpha;
    code := code + 1u
  end;
  classification[121] := TransitionClass.x; (* x *)

  (* Characters with a class of their own. *)
  classification[35] := TransitionClass.double_quote; (* " *)
  classification[40] := TransitionClass.single_quote; (* ' *)
  classification[41] := TransitionClass.left_paren; (* ( *)
  classification[42] := TransitionClass.right_paren; (* ) *)
  classification[43] := TransitionClass.asterisk; (* * *)
  classification[46] := TransitionClass.minus; (* - *)
  classification[47] := TransitionClass.dot; (* . *)
  classification[59] := TransitionClass.colon; (* : *)
  classification[61] := TransitionClass.less; (* < *)
  classification[62] := TransitionClass.equals; (* = *)
  classification[63] := TransitionClass.greater; (* > *)
  classification[96] := TransitionClass.underscore; (* _ *)

  (* Characters of the class "single". *)
  classification[34] := TransitionClass.single; (* ! *)
  classification[38] := TransitionClass.single; (* % *)
  classification[39] := TransitionClass.single; (* & *)
  classification[44] := TransitionClass.single; (* + *)
  classification[45] := TransitionClass.single; (* , *)
  classification[48] := TransitionClass.single; (* / *)
  classification[60] := TransitionClass.single; (* ; *)
  classification[65] := TransitionClass.single; (* @ *)
  classification[92] := TransitionClass.single; (* [ *)
  classification[94] := TransitionClass.single; (* ] *)
  classification[95] := TransitionClass.single; (* ^ *)
  classification[125] := TransitionClass.single; (* | *)
  classification[127] := TransitionClass.single; (* ~ *)

  (* Remaining printable ASCII characters. *)
  classification[36] := TransitionClass.other; (* # *)
  classification[37] := TransitionClass.other; (* $ *)
  classification[64] := TransitionClass.other; (* ? *)
  classification[93] := TransitionClass.other; (* \ *)
  classification[97] := TransitionClass.other; (* ` *)
  classification[124] := TransitionClass.other; (* { *)
  classification[126] := TransitionClass.other; (* } *)

  (* Entries 129 to 256, that is the byte values 128 to 255. *)
  code := 129u;
  while code <= 256u do
    classification[code] := TransitionClass.other;
    code := code + 1u
  end
end;
|
|
|
|
(*
 * Tells whether the token text is the given keyword.
 *
 * The token text begins at token_start.iterator and ends right before
 * token_end. Each token character has to be equal either to the keyword
 * character at the same position or to the result of Lower applied to that
 * keyword character. The token and the keyword must also have the same length.
 *
 * token_start is advanced while comparing; the lexer positions of the caller
 * are not written to by this procedure.
 *)
proc compare_keyword(keyword: String, token_start: BufferPosition, token_end: ^Char) -> Bool;
var
  matched: Bool;
  position: Word;
  keyword_length: Word;
begin
  position := 0u;
  matched := true;
  keyword_length := Length(keyword);

  (* Stop at the first mismatch or as soon as either text is exhausted. *)
  while matched & (position < keyword_length) & (token_start.iterator <> token_end) do
    matched := (keyword[position] = token_start.iterator^) or (Lower(keyword[position]) = token_start.iterator^);
    token_start.iterator := token_start.iterator + 1;
    position := position + 1u
  end;

  (* Both the keyword and the token have to be consumed completely. *)
  return matched & (position = keyword_length) & (token_start.iterator = token_end)
end;
|
|
|
|
(* End of input: marks the token as the end of file token. The lexer positions
 * are left untouched. *)
proc transition_action_eof(lexer: ^Lexer, token: ^LexerToken);
begin
  token^.kind := LexerKind.eof
end;
|
|
|
|
(* Moves the iterator of the given position to the next character. The text
 * location stored in the position is not changed. *)
proc increment(position: ^BufferPosition);
begin
  position^.iterator := position^.iterator + 1
end;
|
|
|
|
(* The current character belongs to the token being read: keep it by moving
 * the current position one character forward. The token is not modified. *)
proc transition_action_accumulate(lexer: ^Lexer, token: ^LexerToken);
begin
  increment(@lexer^.current)
end;
|
|
|
|
(*
 * The current character is not a part of the token. Finishes the token
 * already read and doesn't advance to the next character.
 *
 * The token kind is chosen by the first character of the token. If that
 * character is none of the characters below, the kind is left unchanged.
 *)
proc transition_action_finalize(lexer: ^Lexer, token: ^LexerToken);
var
  first: Char;
begin
  first := lexer^.start.iterator^;

  if first = ':' then
    token^.kind := LexerKind.colon
  end;
  if first = '>' then
    token^.kind := LexerKind.greater_than
  end;
  if first = '<' then
    token^.kind := LexerKind.less_than
  end;
  if first = '(' then
    token^.kind := LexerKind.left_paren
  end;
  if first = '-' then
    token^.kind := LexerKind.minus
  end;
  if first = '.' then
    token^.kind := LexerKind.dot
  end
end;
|
|
|
|
(*
 * An action for tokens made of two characters.
 *
 * The first character is the one at the token start, the second one is the
 * current character. The current character is consumed afterwards. If the
 * pair is none of the combinations below, the token kind is left unchanged.
 *)
proc transition_action_composite(lexer: ^Lexer, token: ^LexerToken);
var
  first: Char;
  second: Char;
begin
  first := lexer^.start.iterator^;
  second := lexer^.current.iterator^;

  if (first = '<') & (second = '>') then
    token^.kind := LexerKind.not_equal
  end;
  if (first = '<') & (second = '=') then
    token^.kind := LexerKind.less_equal
  end;
  if (first = '>') & (second = '=') then
    token^.kind := LexerKind.greater_equal
  end;
  if (first = '.') & (second = '.') then
    token^.kind := LexerKind.range
  end;
  if (first = ':') & (second = '=') then
    token^.kind := LexerKind.assignment
  end;
  if (first = '-') & (second = '>') then
    token^.kind := LexerKind.arrow
  end;

  increment(@lexer^.current)
end;
|
|
|
|
(*
 * Skips one whitespace character at the token start.
 *
 * If the skipped character is a line feed, the line number of the start
 * location is incremented and its column is reset to 1. The character has to
 * be examined before the position is advanced: looking at it afterwards would
 * test the following character, so that a line feed directly after a token
 * would never be counted.
 *
 * Finally the current position is set to the new token start. The token is
 * not modified.
 *)
proc transition_action_skip(lexer: ^Lexer, token: ^LexerToken);
begin
  if lexer^.start.iterator^ = '\n' then
    lexer^.start.location.line := lexer^.start.location.line + 1u;
    lexer^.start.location.column := 1u
  end;
  increment(@lexer^.start);

  lexer^.current := lexer^.start
end;
|
|
|
|
(*
 * Action for delimited tokens: comments, strings and character literals.
 *
 * The token kind is chosen by the first character of the token:
 * - "(" gives a comment,
 * - a double quote gives a string, in line with the start state, which enters
 *   TransitionState.string on a double quote,
 * - a single quote gives a character, in line with the start state, which
 *   enters TransitionState.character on a single quote.
 *
 * For strings and characters the token text, from the opening delimiter up to
 * and including the current character, is copied into a newly allocated
 * buffer owned by token^.stringKind. The copy goes into the buffer the ptr
 * member points to, not over the ptr member itself.
 *
 * The current character is consumed afterwards.
 *)
proc transition_action_delimited(lexer: ^Lexer, token: ^LexerToken);
var
  text_length: Word;
begin
  if lexer^.start.iterator^ = '(' then
    token^.kind := LexerKind.comment
  end;
  if lexer^.start.iterator^ = '"' then
    (* The current character still belongs to the token, hence the + 1. *)
    text_length := cast(lexer^.current.iterator - lexer^.start.iterator + 1: Word);

    token^.stringKind := String(malloc(text_length), text_length);
    memcpy(token^.stringKind.ptr, lexer^.start.iterator, text_length);

    token^.kind := LexerKind.string
  end;
  (* NOTE(review): the single quote is written as a string literal here while
   * the other comparisons use character literals - confirm that this is
   * accepted by the compiler. *)
  if lexer^.start.iterator^ = "'" then
    text_length := cast(lexer^.current.iterator - lexer^.start.iterator + 1: Word);

    token^.stringKind := String(malloc(text_length), text_length);
    memcpy(token^.stringKind.ptr, lexer^.start.iterator, text_length);

    token^.kind := LexerKind.character
  end;
  increment(@lexer^.current)
end;
|
|
|
|
(*
 * Finalizes a keyword or an identifier.
 *
 * The token text reaches from the token start up to, but not including, the
 * current character. It is stored in token^.identifierKind with a length
 * prefix: element 1 holds the length converted to a character, the text
 * itself begins at element 2.
 *
 * The token is an identifier unless its text matches one of the keywords
 * below (see compare_keyword for the matching rule). For TRUE and FALSE the
 * boolean value is stored as well.
 *)
proc transition_action_key_id(lexer: ^Lexer, token: ^LexerToken);
var
  token_end: ^Char;
begin
  token_end := lexer^.current.iterator;

  token^.kind := LexerKind.identifier;
  token^.identifierKind[1] := cast(token_end - lexer^.start.iterator: Char);
  memcpy(@token^.identifierKind[2], lexer^.start.iterator, ORD(token^.identifierKind[1]));

  (* Compilation unit structure. *)
  if compare_keyword("program", lexer^.start, token_end) then
    token^.kind := LexerKind._program
  end;
  if compare_keyword("module", lexer^.start, token_end) then
    token^.kind := LexerKind._module
  end;
  if compare_keyword("import", lexer^.start, token_end) then
    token^.kind := LexerKind._import
  end;
  if compare_keyword("FROM", lexer^.start, token_end) then
    token^.kind := LexerKind.from
  end;

  (* Declarations and types. *)
  if compare_keyword("const", lexer^.start, token_end) then
    token^.kind := LexerKind._const
  end;
  if compare_keyword("var", lexer^.start, token_end) then
    token^.kind := LexerKind._var
  end;
  if compare_keyword("type", lexer^.start, token_end) then
    token^.kind := LexerKind._type
  end;
  if compare_keyword("proc", lexer^.start, token_end) then
    token^.kind := LexerKind._proc
  end;
  if compare_keyword("record", lexer^.start, token_end) then
    token^.kind := LexerKind._record
  end;
  if compare_keyword("union", lexer^.start, token_end) then
    token^.kind := LexerKind._union
  end;
  if compare_keyword("POINTER", lexer^.start, token_end) then
    token^.kind := LexerKind.pointer
  end;
  if compare_keyword("ARRAY", lexer^.start, token_end) then
    token^.kind := LexerKind.array
  end;
  if compare_keyword("TO", lexer^.start, token_end) then
    token^.kind := LexerKind.to
  end;
  if compare_keyword("OF", lexer^.start, token_end) then
    token^.kind := LexerKind._of
  end;

  (* Statements. *)
  if compare_keyword("begin", lexer^.start, token_end) then
    token^.kind := LexerKind._begin
  end;
  if compare_keyword("end", lexer^.start, token_end) then
    token^.kind := LexerKind._end
  end;
  if compare_keyword("if", lexer^.start, token_end) then
    token^.kind := LexerKind._if
  end;
  if compare_keyword("then", lexer^.start, token_end) then
    token^.kind := LexerKind._then
  end;
  if compare_keyword("elsif", lexer^.start, token_end) then
    token^.kind := LexerKind._elsif
  end;
  if compare_keyword("else", lexer^.start, token_end) then
    token^.kind := LexerKind._else
  end;
  if compare_keyword("while", lexer^.start, token_end) then
    token^.kind := LexerKind._while
  end;
  if compare_keyword("do", lexer^.start, token_end) then
    token^.kind := LexerKind._do
  end;
  if compare_keyword("CASE", lexer^.start, token_end) then
    token^.kind := LexerKind._case
  end;
  if compare_keyword("return", lexer^.start, token_end) then
    token^.kind := LexerKind._return
  end;
  if compare_keyword("defer", lexer^.start, token_end) then
    token^.kind := LexerKind._defer
  end;

  (* Operators written as words. *)
  if compare_keyword("or", lexer^.start, token_end) then
    token^.kind := LexerKind._or
  end;
  if compare_keyword("xor", lexer^.start, token_end) then
    token^.kind := LexerKind._xor
  end;

  (* Literals. *)
  if compare_keyword("NIL", lexer^.start, token_end) then
    token^.kind := LexerKind.null
  end;
  if compare_keyword("TRUE", lexer^.start, token_end) then
    token^.kind := LexerKind.boolean;
    token^.booleanKind := true
  end;
  if compare_keyword("FALSE", lexer^.start, token_end) then
    token^.kind := LexerKind.boolean;
    token^.booleanKind := false
  end
end;
|
|
|
|
(*
 * Action for tokens containing only one character. The character cannot be
 * followed by other characters forming a composite token.
 *
 * The token kind is chosen by the current character, which is consumed
 * afterwards. If the character is none of the characters below, the token
 * kind is left unchanged.
 *)
proc transition_action_single(lexer: ^Lexer, token: ^LexerToken);
var
  symbol: Char;
begin
  symbol := lexer^.current.iterator^;

  if symbol = '&' then
    token^.kind := LexerKind.and
  end;
  if symbol = ';' then
    token^.kind := LexerKind.semicolon
  end;
  if symbol = ',' then
    token^.kind := LexerKind.comma
  end;
  if symbol = '~' then
    token^.kind := LexerKind.tilde
  end;
  if symbol = ')' then
    token^.kind := LexerKind.right_paren
  end;
  if symbol = '[' then
    token^.kind := LexerKind.left_square
  end;
  if symbol = ']' then
    token^.kind := LexerKind.right_square
  end;
  if symbol = '^' then
    token^.kind := LexerKind.hat
  end;
  if symbol = '=' then
    token^.kind := LexerKind.equal
  end;
  if symbol = '+' then
    token^.kind := LexerKind.plus
  end;
  if symbol = '*' then
    token^.kind := LexerKind.asterisk
  end;
  if symbol = '/' then
    token^.kind := LexerKind.division
  end;
  if symbol = '%' then
    token^.kind := LexerKind.remainder
  end;
  if symbol = '@' then
    token^.kind := LexerKind.at
  end;
  if symbol = '|' then
    token^.kind := LexerKind.pipe
  end;

  (* increment expects a pointer to the whole buffer position, not to its
   * iterator member. *)
  increment(@lexer^.current)
end;
|
|
|
|
(*
 * Handles an integer literal.
 *
 * The literal text reaches from the token start up to, but not including, the
 * current character. token^.identifierKind serves as a scratch buffer: it is
 * zeroed first and the literal is copied to its beginning. A temporary string
 * is built from that buffer, converted with StringToInteger using base 10 and
 * released again. The converted value is stored in token^.integerKind.
 *
 * The flag passed to StringToInteger is not examined afterwards.
 *)
proc transition_action_integer(lexer: ^Lexer, token: ^LexerToken);
var
  digits: String;
  digit_count: Int;
  converted: Bool;
begin
  digit_count := lexer^.current.iterator - lexer^.start.iterator;

  memset(@token^.identifierKind, 0, #size(Identifier));
  memcpy(@token^.identifierKind[1], lexer^.start.iterator, digit_count);

  digits := InitStringCharStar(@token^.identifierKind[1]);
  token^.integerKind := StringToInteger(digits, 10, converted);
  digits := KillString(digits);

  token^.kind := LexerKind.integer
end;
|
|
|
|
(*
 * Fills the whole transition table row of a state with the same transition.
 *
 * current_state is the state whose row is filled, default_action and
 * next_state form the transition written into the cell of every character
 * class, from TransitionClass.invalid to TransitionClass.other.
 *
 * Returns the index of the row in the transitions table, so that the caller
 * can override single cells afterwards.
 *)
proc set_default_transition(current_state: TransitionState, default_action: TransitionAction, next_state: TransitionState) -> Int;
var
  default_transition: Transition;
  state_index: Int;
  class_index: Int;
  last_class_index: Int;
begin
  default_transition.action := default_action;
  default_transition.next_state := next_state;
  state_index := cast(current_state: Int) + 1;

  (* Columns are the class ordinals plus one; invalid is the first class and
   * other the last one. *)
  class_index := cast(TransitionClass.invalid: Int) + 1;
  last_class_index := cast(TransitionClass.other: Int) + 1;

  while class_index <= last_class_index do
    transitions[state_index][class_index] := default_transition;
    class_index := class_index + 1
  end;

  return state_index
end;
|
|
|
|
(*
|
|
* The transition table describes transitions from one state to another, given
|
|
* a symbol (character class).
|
|
*
|
|
* The table has m rows and n columns, where m is the amount of states and n is
|
|
* the amount of classes. So given the current state and a classified character
|
|
* the table can be used to look up the next state.
|
|
*
|
|
* Each cell is a Transition record.
* - next_state is the target state. Cells whose action completes a token
*   point to TransitionState.finish.
* - action is the procedure that should be performed when transitioning; it
*   can be nil. For the meaning of the actions see the transition_action_*
*   procedures.
|
|
*)
|
|
proc initialize_transitions();
|
|
var
|
|
state_index: Int;
|
|
begin
|
|
(* Start state. *)
|
|
state_index := cast(TransitionState.start: Int) + 1;
|
|
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.decimal;
|
|
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.space: Int) + 1].action := transition_action_skip;
|
|
transitions[state_index][cast(TransitionClass.space: Int) + 1].next_state := TransitionState.start;
|
|
|
|
transitions[state_index][cast(TransitionClass.colon: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.colon: Int) + 1].next_state := TransitionState.colon;
|
|
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_single;
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.left_paren: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.left_paren: Int) + 1].next_state := TransitionState.left_paren;
|
|
|
|
transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].action := transition_action_single;
|
|
transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_single;
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.single: Int) + 1].action := transition_action_single;
|
|
transitions[state_index][cast(TransitionClass.single: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.leading_zero;
|
|
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := transition_action_eof;
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.dot: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.dot: Int) + 1].next_state := TransitionState.dot;
|
|
|
|
transitions[state_index][cast(TransitionClass.minus: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.minus: Int) + 1].next_state := TransitionState.minus;
|
|
|
|
transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].next_state := TransitionState.character;
|
|
|
|
transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].next_state := TransitionState.string;
|
|
|
|
transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.greater;
|
|
|
|
transitions[state_index][cast(TransitionClass.less: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.less: Int) + 1].next_state := TransitionState.less;
|
|
|
|
transitions[state_index][cast(TransitionClass.other: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.other: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Colon state. *)
|
|
state_index := set_default_transition(TransitionState.colon, transition_action_finalize, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite;
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Identifier state. *)
|
|
state_index := set_default_transition(TransitionState.identifier, transition_action_key_id, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.identifier;
|
|
|
|
(* Decimal state. *)
|
|
state_index := set_default_transition(TransitionState.decimal, transition_action_integer, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.decimal;
|
|
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.decimal_suffix;
|
|
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.decimal_suffix;
|
|
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.decimal;
|
|
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.decimal_suffix;
|
|
|
|
(* Greater state. *)
|
|
state_index := set_default_transition(TransitionState.greater, transition_action_finalize, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite;
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Minus state. *)
|
|
state_index := set_default_transition(TransitionState.minus, transition_action_finalize, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_composite;
|
|
transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Left paren state. *)
|
|
state_index := set_default_transition(TransitionState.left_paren, transition_action_finalize, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.comment;
|
|
|
|
(* Less state. *)
|
|
state_index := set_default_transition(TransitionState.less, transition_action_finalize, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].action := transition_action_composite;
|
|
transitions[state_index][cast(TransitionClass.equals: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.greater: Int) + 1].action := transition_action_composite;
|
|
transitions[state_index][cast(TransitionClass.greater: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Dot state. *)
|
|
state_index := set_default_transition(TransitionState.dot, transition_action_finalize, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.dot: Int) + 1].action := transition_action_composite;
|
|
transitions[state_index][cast(TransitionClass.dot: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Comment. *)
|
|
state_index := set_default_transition(TransitionState.comment, transition_action_accumulate, TransitionState.comment);
|
|
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.closing_comment;
|
|
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Closing comment. *)
|
|
state_index := set_default_transition(TransitionState.closing_comment, transition_action_accumulate, TransitionState.comment);
|
|
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].action := transition_action_delimited;
|
|
transitions[state_index][cast(TransitionClass.right_paren: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].action := transition_action_accumulate;
|
|
transitions[state_index][cast(TransitionClass.asterisk: Int) + 1].next_state := TransitionState.closing_comment;
|
|
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Character. *)
|
|
state_index := set_default_transition(TransitionState.character, transition_action_accumulate, TransitionState.character);
|
|
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].action := transition_action_delimited;
|
|
transitions[state_index][cast(TransitionClass.single_quote: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* String. *)
|
|
state_index := set_default_transition(TransitionState.string, transition_action_accumulate, TransitionState.string);
|
|
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.invalid: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.eof: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].action := transition_action_delimited;
|
|
transitions[state_index][cast(TransitionClass.double_quote: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Leading zero. *)
|
|
state_index := set_default_transition(TransitionState.leading_zero, transition_action_integer, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.underscore: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
(* Digit with a character suffix. *)
|
|
state_index := set_default_transition(TransitionState.decimal_suffix, transition_action_integer, TransitionState.finish);
|
|
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.alpha: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.digit: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.hex: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.zero: Int) + 1].next_state := TransitionState.finish;
|
|
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].action := nil;
|
|
transitions[state_index][cast(TransitionClass.x: Int) + 1].next_state := TransitionState.finish
|
|
end;
|
|
|
|
(*
 * Prepares a lexer for reading from the given input file.
 *
 * A buffer of CHUNK_SIZE bytes is allocated and filled with zeros; the
 * recorded length is set to zero, meaning nothing has been read so far.
 *)
proc lexer_make*(lexer: ^Lexer, input: ^FILE);
begin
    (* Allocate the buffer and clear it before any input is read into it. *)
    lexer^.size := CHUNK_SIZE;
    lexer^.buffer := malloc(CHUNK_SIZE);
    memset(lexer^.buffer, 0, CHUNK_SIZE);

    (* Remember the source; no bytes have been loaded from it yet. *)
    lexer^.length := 0;
    lexer^.input := input
end;
|
|
|
|
(*
 * Scans the token that begins at the lexer's start position and returns it.
 *
 * The cursor is rewound to the start position and the transition table is
 * followed, one step per iteration, until the finish state is reached. The
 * returned token carries the start location and the location of the cursor
 * after the last step.
 *)
proc lexer_current*(lexer: ^Lexer) -> LexerToken;
var
    token: LexerToken;
    active_state: TransitionState;
    transition: Transition;
    character_index: Word;
    state_row: Word;
    class_column: Word;
begin
    active_state := TransitionState.start;
    lexer^.current := lexer^.start;

    while active_state <> TransitionState.finish do
        (* Both tables are indexed with the ordinal value plus one. *)
        character_index := cast(lexer^.current.iterator^: Word) + 1u;
        state_row := cast(active_state: Word) + 1u;
        class_column := cast(classification[character_index]: Word) + 1u;

        transition := transitions[state_row][class_column];

        (* A transition without an action only changes the state. *)
        if transition.action <> nil then
            transition.action(lexer, @token)
        end;
        active_state := transition.next_state
    end;
    token.end_location := lexer^.current.location;
    token.start_location := lexer^.start.location;

    return token
end;
|
|
|
|
(*
 * Reads and returns the next token.
 *
 * While the recorded length is zero (nothing loaded yet) the buffer is filled
 * from the input file and the cursor is placed at the beginning of the buffer,
 * line 1, column 1. The token is then scanned starting at the cursor, that is
 * where the previous token ended.
 *)
proc lexer_lex*(lexer: ^Lexer) -> LexerToken;
begin
    if lexer^.length = 0 then
        (*
         * Read one byte less than the buffer holds. lexer_make zero-fills the
         * buffer and the scanning loop in lexer_current performs no length
         * check, so the last byte has to stay zero; otherwise an input of
         * CHUNK_SIZE bytes or more would let the scanner run past the
         * allocation.
         * NOTE(review): presumably the zero byte is classified as eof; the
         * classification table is not visible here - confirm.
         *)
        lexer^.length := ReadNBytes(lexer^.input, CHUNK_SIZE - 1, lexer^.buffer);
        lexer^.current.location.column := 1;
        lexer^.current.location.line := 1;
        lexer^.current.iterator := lexer^.buffer
    end;
    (* The new token starts where the cursor currently stands. *)
    lexer^.start := lexer^.current;

    return lexer_current(lexer)
end;
|
|
|
|
(*
 * Releases the buffer owned by the lexer.
 *
 * After freeing, the buffer pointer is reset to nil and the recorded size to
 * zero, so the record does not keep a dangling pointer and a repeated call
 * does not free the same memory twice. The input file is not closed here.
 *)
proc lexer_destroy*(lexer: ^Lexer);
begin
    free(lexer^.buffer);
    lexer^.buffer := nil;
    lexer^.size := 0
end;
|
|
|
|
(*
 * Runs the set-up procedures for the lexer tables: first the character
 * classification, then the state transitions.
 *)
proc lexer_initialize();
begin
    initialize_classification();
    initialize_transitions()
end;
|
|
|
|
end.
|