Handle ASCII codes > 128 in the tokenizer
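This change fills in the tokenizer's character-classification table for byte values above the ASCII range. The table is 1-based (classification[1] covers NUL, classification[128] covers DEL), so the new loop walks slots 129 through 256, i.e. byte values 128..255, and classifies them as transitionClassOther; previously these slots received no explicit classification. initialize_classification also gains the loop counter i, declared in the first hunk. The annotated copy below is a reading aid only, not additional code from the commit: all identifiers come from the diff that follows, and the slot-to-byte mapping is inferred from the table's own comments.

    (* Added initialization, annotated: slot i corresponds to byte value i - 1,
       so slots 129..256 cover the non-ASCII byte values 128..255. *)
    i := 129;
    while i <= 256 do
      classification[i] := transitionClassOther;
      i := i + 1
    end
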
@@ -72,6 +72,8 @@ var
   transitions: [16]TransitionClasses;

 proc initialize_classification();
+var
+  i: CARDINAL;
 begin
   classification[1] := transitionClassEof; (* NUL *)
   classification[2] := transitionClassInvalid; (* SOH *)
@@ -200,265 +202,274 @@ begin
   classification[125] := transitionClassSingle; (* | *)
   classification[126] := transitionClassOther; (* } *)
   classification[127] := transitionClassSingle; (* ~ *)
-  classification[128] := transitionClassInvalid (* DEL *)
+  classification[128] := transitionClassInvalid; (* DEL *)
+
+  i := 129;
+  while i <= 256 do
+    classification[i] := transitionClassOther;
+    i := i + 1
+  end
 end;

-proc compare_keyword(Keyword: ARRAY OF CHAR, TokenStart: PLexerBuffer, TokenEnd: PLexerBuffer): BOOLEAN;
+proc compare_keyword(Keyword: ARRAY OF CHAR, TokenStart: PLexerBuffer, TokenEnd: PLexerBuffer) -> BOOLEAN;
 var
-  Result: BOOLEAN;
-  Index: CARDINAL;
+  result: BOOLEAN;
+  index: CARDINAL;
 begin
-  Index := 0;
-  Result := TRUE;
+  index := 0;
+  result := TRUE;

-  while (Index < Length(Keyword)) & (TokenStart <> TokenEnd) & Result DO
-    Result := (Keyword[Index] = TokenStart^) or (Lower(Keyword[Index]) = TokenStart^);
+  while (index < Length(Keyword)) & (TokenStart <> TokenEnd) & result DO
+    result := (Keyword[index] = TokenStart^) or (Lower(Keyword[index]) = TokenStart^);
     INC(TokenStart);
-    INC(Index)
+    INC(index)
   end;
-  Result := (Index = Length(Keyword)) & (TokenStart = TokenEnd) & Result;
-  return Result
+  result := (index = Length(Keyword)) & (TokenStart = TokenEnd) & result;
+  return result
 end;

 (* Reached the end of file. *)
-proc transition_action_eof(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_eof(lexer: PLexer, token: PLexerToken);
 begin
-  AToken^.Kind := lexerKindEof
+  token^.kind := lexerKindEof
 end;

 (* Add the character to the token currently read and advance to the next character. *)
-proc transition_action_accumulate(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_accumulate(lexer: PLexer, token: PLexerToken);
 begin
   INC(lexer^.Current)
 end;

 (* The current character is not a part of the token. Finish the token already
  * read. Don't advance to the next character. *)
-proc transition_action_finalize(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_finalize(lexer: PLexer, token: PLexerToken);
 begin
   if lexer^.Start^ = ':' then
-    AToken^.Kind := lexerKindColon
+    token^.kind := lexerKindColon
   end;
   if lexer^.Start^ = '>' then
-    AToken^.Kind := lexerKindGreaterThan
+    token^.kind := lexerKindGreaterThan
   end;
   if lexer^.Start^ = '<' then
-    AToken^.Kind := lexerKindLessThan
+    token^.kind := lexerKindLessThan
   end;
   if lexer^.Start^ = '(' then
-    AToken^.Kind := lexerKindLeftParen
+    token^.kind := lexerKindLeftParen
   end;
   if lexer^.Start^ = '-' then
-    AToken^.Kind := lexerKindLeftParen
+    token^.kind := lexerKindLeftParen
   end;
   if lexer^.Start^ = '.' then
-    AToken^.Kind := lexerKindDot
+    token^.kind := lexerKindDot
   end
 end;

 (* An action for tokens containing multiple characters. *)
-proc transition_action_composite(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_composite(lexer: PLexer, token: PLexerToken);
 begin
   if lexer^.Start^ = '<' then
     if lexer^.Current^ = '>' then
-      AToken^.Kind := lexerKindNotEqual
+      token^.kind := lexerKindNotEqual
     end;
     if lexer^.Current^ = '=' then
-      AToken^.Kind := lexerKindLessEqual
+      token^.kind := lexerKindLessEqual
     end
   end;
   if (lexer^.Start^ = '>') & (lexer^.Current^ = '=') then
-    AToken^.Kind := lexerKindGreaterEqual
+    token^.kind := lexerKindGreaterEqual
   end;
   if (lexer^.Start^ = '.') & (lexer^.Current^ = '.') then
-    AToken^.Kind := lexerKindRange
+    token^.kind := lexerKindRange
   end;
   if (lexer^.Start^ = ':') & (lexer^.Current^ = '=') then
-    AToken^.Kind := lexerKindAssignment
+    token^.kind := lexerKindAssignment
   end;
+  if (lexer^.Start^ = '-') & (lexer^.Current^ = '>') then
+    token^.kind := lexerKindArrow
+  end;
   INC(lexer^.Current)
 end;

 (* Skip a space. *)
-proc transition_action_skip(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_skip(lexer: PLexer, token: PLexerToken);
 begin
   INC(lexer^.Current);
   INC(lexer^.Start)
 end;

 (* Delimited string action. *)
-proc transition_action_delimited(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_delimited(lexer: PLexer, token: PLexerToken);
 begin
   if lexer^.Start^ = '(' then
-    AToken^.Kind := lexerKindComment
+    token^.kind := lexerKindComment
   end;
   if lexer^.Start^ = '"' then
-    AToken^.Kind := lexerKindCharacter
+    token^.kind := lexerKindCharacter
   end;
   if lexer^.Start^ = "'" then
-    AToken^.Kind := lexerKindString
+    token^.kind := lexerKindString
   end;
   INC(lexer^.Current)
 end;

 (* Finalize keyword or identifier. *)
-proc transition_action_key_id(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_key_id(lexer: PLexer, token: PLexerToken);
 begin
-  AToken^.Kind := lexerKindIdentifier;
+  token^.kind := lexerKindIdentifier;

-  AToken^.identifierKind[1] := lexer^.Current - lexer^.Start;
-  MemCopy(lexer^.Start, ORD(AToken^.identifierKind[1]), ADR(AToken^.identifierKind[2]));
+  token^.identifierKind[1] := lexer^.Current - lexer^.Start;
+  MemCopy(lexer^.Start, ORD(token^.identifierKind[1]), ADR(token^.identifierKind[2]));

   if compare_keyword('PROGRAM', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindProgram
+    token^.kind := lexerKindProgram
   end;
   if compare_keyword('IMPORT', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindImport
+    token^.kind := lexerKindImport
   end;
   if compare_keyword('CONST', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindConst
+    token^.kind := lexerKindConst
   end;
   if compare_keyword('VAR', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindVar
+    token^.kind := lexerKindVar
   end;
   if compare_keyword('IF', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindIf
+    token^.kind := lexerKindIf
   end;
   if compare_keyword('THEN', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindThen
+    token^.kind := lexerKindThen
   end;
   if compare_keyword('ELSIF', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindElsif
+    token^.kind := lexerKindElsif
   end;
   if compare_keyword('ELSE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindElse
+    token^.kind := lexerKindElse
   end;
   if compare_keyword('WHILE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindWhile
+    token^.kind := lexerKindWhile
   end;
   if compare_keyword('DO', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindDo
+    token^.kind := lexerKindDo
   end;
   if compare_keyword('proc', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindProc
+    token^.kind := lexerKindProc
   end;
   if compare_keyword('BEGIN', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindBegin
+    token^.kind := lexerKindBegin
   end;
   if compare_keyword('END', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindEnd
+    token^.kind := lexerKindEnd
   end;
   if compare_keyword('TYPE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindType
+    token^.kind := lexerKindType
   end;
   if compare_keyword('RECORD', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindRecord
+    token^.kind := lexerKindRecord
   end;
   if compare_keyword('UNION', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindUnion
+    token^.kind := lexerKindUnion
   end;
   if compare_keyword('NIL', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindNull
+    token^.kind := lexerKindNull
   end;
   if compare_keyword('AND', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindAnd
+    token^.kind := lexerKindAnd
   end;
   if compare_keyword('OR', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindOr
+    token^.kind := lexerKindOr
   end;
   if compare_keyword('RETURN', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindReturn
+    token^.kind := lexerKindReturn
   end;
   if compare_keyword('DEFINITION', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindDefinition
+    token^.kind := lexerKindDefinition
   end;
   if compare_keyword('TO', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindTo
+    token^.kind := lexerKindTo
   end;
   if compare_keyword('CASE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindCase
+    token^.kind := lexerKindCase
   end;
   if compare_keyword('OF', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindOf
+    token^.kind := lexerKindOf
   end;
   if compare_keyword('FROM', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindFrom
+    token^.kind := lexerKindFrom
   end;
   if compare_keyword('MODULE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindModule
+    token^.kind := lexerKindModule
   end;
   if compare_keyword('IMPLEMENTATION', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindImplementation
+    token^.kind := lexerKindImplementation
   end;
   if compare_keyword('POINTER', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindPointer
+    token^.kind := lexerKindPointer
   end;
   if compare_keyword('ARRAY', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindArray
+    token^.kind := lexerKindArray
   end;
   if compare_keyword('TRUE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindBoolean;
-    AToken^.booleanKind := TRUE
+    token^.kind := lexerKindBoolean;
+    token^.booleanKind := TRUE
   end;
   if compare_keyword('FALSE', lexer^.Start, lexer^.Current) then
-    AToken^.Kind := lexerKindBoolean;
-    AToken^.booleanKind := FALSE
+    token^.kind := lexerKindBoolean;
+    token^.booleanKind := FALSE
   end
 end;

 (* Action for tokens containing only one character. The character cannot be
  * followed by other characters forming a composite token. *)
-proc transition_action_single(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_single(lexer: PLexer, token: PLexerToken);
 begin
   if lexer^.Current^ = '&' then
-    AToken^.Kind := lexerKindAnd
+    token^.kind := lexerKindAnd
   end;
   if lexer^.Current^ = ';' then
-    AToken^.Kind := lexerKindSemicolon
+    token^.kind := lexerKindSemicolon
   end;
   if lexer^.Current^ = ',' then
-    AToken^.Kind := lexerKindComma
+    token^.kind := lexerKindComma
   end;
   if lexer^.Current^ = ',' then
-    AToken^.Kind := lexerKindComma
+    token^.kind := lexerKindComma
   end;
   if lexer^.Current^ = ')' then
-    AToken^.Kind := lexerKindRightParen
+    token^.kind := lexerKindRightParen
   end;
   if lexer^.Current^ = '[' then
-    AToken^.Kind := lexerKindLeftSquare
+    token^.kind := lexerKindLeftSquare
   end;
   if lexer^.Current^ = ']' then
-    AToken^.Kind := lexerKindRightSquare
+    token^.kind := lexerKindRightSquare
   end;
   if lexer^.Current^ = '^' then
-    AToken^.Kind := lexerKindHat
+    token^.kind := lexerKindHat
   end;
   if lexer^.Current^ = '=' then
-    AToken^.Kind := lexerKindEqual
+    token^.kind := lexerKindEqual
   end;
   if lexer^.Current^ = '+' then
-    AToken^.Kind := lexerKindPlus
+    token^.kind := lexerKindPlus
   end;
   if lexer^.Current^ = '/' then
-    AToken^.Kind := lexerKindDivision
+    token^.kind := lexerKindDivision
   end;
   if lexer^.Current^ = '%' then
-    AToken^.Kind := lexerKindRemainder
+    token^.kind := lexerKindRemainder
   end;
   if lexer^.Current^ = '@' then
-    AToken^.Kind := lexerKindAt
+    token^.kind := lexerKindAt
   end;
   if lexer^.Current^ = '|' then
-    AToken^.Kind := lexerKindPipe
+    token^.kind := lexerKindPipe
   end;
   INC(lexer^.Current)
 end;

 (* Handle an integer literal. *)
-proc transition_action_integer(lexer: PLexer, AToken: PLexerToken);
+proc transition_action_integer(lexer: PLexer, token: PLexerToken);
 begin
-  AToken^.Kind := lexerKindInteger
+  token^.kind := lexerKindInteger
 end;

 proc set_default_transition(CurrentState: TransitionState, DefaultAction: TransitionAction, NextState: TransitionState);
@@ -756,12 +767,12 @@ begin
   lexer^.Size := CHUNK_SIZE
 end;

-proc lexer_current(lexer: PLexer): LexerToken;
+proc lexer_current(lexer: PLexer) -> LexerToken;
 var
   CurrentClass: TransitionClass;
   CurrentState: TransitionState;
   CurrentTransition: Transition;
-  Result: LexerToken;
+  result: LexerToken;
 begin
   lexer^.Current := lexer^.Start;
   CurrentState := transitionStateStart;
@@ -771,16 +782,16 @@ begin

     CurrentTransition := transitions[ORD(CurrentState) + 1][ORD(CurrentClass) + 1];
     if CurrentTransition.Action <> nil then
-      CurrentTransition.Action(lexer, ADR(Result))
+      CurrentTransition.Action(lexer, ADR(result))
     end;
     CurrentState := CurrentTransition.NextState
   end;
-  return Result
+  return result
 end;

-proc lexer_lex(lexer: PLexer): LexerToken;
+proc lexer_lex(lexer: PLexer) -> LexerToken;
 var
-  Result: LexerToken;
+  result: LexerToken;
 begin
   if lexer^.Length = 0 then
     lexer^.Length := ReadNBytes(lexer^.Input, CHUNK_SIZE, lexer^.Buffer);
@@ -788,8 +799,8 @@ begin
   end;
   lexer^.Start := lexer^.Current;

-  Result := lexer_current(lexer);
-  return Result
+  result := lexer_current(lexer);
+  return result
 end;

 proc lexer_destroy(lexer: PLexer);
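
Alongside the table change, the diff renames the AToken parameters to token (with a lowercase kind field), switches procedure headings from the ': Type;' form to the '-> Type' return syntax, and adds a case to transition_action_composite so that '-' followed by '>' produces lexerKindArrow. As a rough sketch of the effect, a heading written in the new style, such as the one from this diff, is expected to tokenize as follows; the kind sequence is an assumption derived from the actions above, not recorded lexer output.

    proc lexer_current(lexer: PLexer) -> LexerToken;
    (* Expected kinds, in order: lexerKindProc, lexerKindIdentifier,
       lexerKindLeftParen, lexerKindIdentifier, lexerKindColon,
       lexerKindIdentifier, lexerKindRightParen, lexerKindArrow,
       lexerKindIdentifier, lexerKindSemicolon *)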