Handle character codes >= 128 (non-ASCII bytes) in the tokenizer

This commit is contained in:
2025-05-30 19:51:40 +02:00
parent 4eccc147ba
commit 6e415e474f
4 changed files with 287 additions and 266 deletions

View File

@ -72,6 +72,8 @@ var
transitions: [16]TransitionClasses;
proc initialize_classification();
var
i: CARDINAL;
begin
classification[1] := transitionClassEof; (* NUL *)
classification[2] := transitionClassInvalid; (* SOH *)
@ -200,265 +202,274 @@ begin
classification[125] := transitionClassSingle; (* | *)
classification[126] := transitionClassOther; (* } *)
classification[127] := transitionClassSingle; (* ~ *)
classification[128] := transitionClassInvalid (* DEL *)
classification[128] := transitionClassInvalid; (* DEL *)
i := 129;
while i <= 256 do
classification[i] := transitionClassOther;
i := i + 1
end
end;
(* Compares a keyword with the token text delimited by TokenStart (inclusive)
* and TokenEnd (exclusive).
*
* A token character matches when it equals the keyword character as written
* or Lower() of that character (Lower is defined elsewhere; presumably it
* yields the lowercase form - confirm).
*
* Returns TRUE only when every compared character matched and both the
* keyword and the token were consumed completely. *)
proc compare_keyword(Keyword: ARRAY OF CHAR, TokenStart: PLexerBuffer, TokenEnd: PLexerBuffer): BOOLEAN;
proc compare_keyword(Keyword: ARRAY OF CHAR, TokenStart: PLexerBuffer, TokenEnd: PLexerBuffer) -> BOOLEAN;
var
Result: BOOLEAN;
Index: CARDINAL;
result: BOOLEAN;
index: CARDINAL;
begin
Index := 0;
Result := TRUE;
index := 0;
result := TRUE;
(* Stop at the first mismatch, or as soon as either the keyword or the token
* runs out of characters. *)
while (Index < Length(Keyword)) & (TokenStart <> TokenEnd) & Result DO
Result := (Keyword[Index] = TokenStart^) or (Lower(Keyword[Index]) = TokenStart^);
while (index < Length(Keyword)) & (TokenStart <> TokenEnd) & result DO
result := (Keyword[index] = TokenStart^) or (Lower(Keyword[index]) = TokenStart^);
INC(TokenStart);
INC(Index)
INC(index)
end;
(* Both sequences must be exhausted: a token that is only a prefix of the
* keyword (or the other way round) is rejected. *)
Result := (Index = Length(Keyword)) & (TokenStart = TokenEnd) & Result;
return Result
result := (index = Length(Keyword)) & (TokenStart = TokenEnd) & result;
return result
end;
(* Reached the end of file: mark the token as the end-of-file token.
* Only the token kind is written; the lexer positions are not changed. *)
proc transition_action_eof(lexer: PLexer, AToken: PLexerToken);
proc transition_action_eof(lexer: PLexer, token: PLexerToken);
begin
AToken^.Kind := lexerKindEof
token^.kind := lexerKindEof
end;
(* Add the character to the token currently read and advance to the next
* character. This is done solely by moving lexer^.Current forward by one;
* the token argument is not touched by this action. *)
proc transition_action_accumulate(lexer: PLexer, AToken: PLexerToken);
proc transition_action_accumulate(lexer: PLexer, token: PLexerToken);
begin
INC(lexer^.Current)
end;
(* The current character is not a part of the token. Finish the token already
* read. Don't advance to the next character.
*
* The token kind is chosen from the character at lexer^.Start. A character
* that is not listed below leaves the token kind unchanged. *)
proc transition_action_finalize(lexer: PLexer, AToken: PLexerToken);
proc transition_action_finalize(lexer: PLexer, token: PLexerToken);
begin
if lexer^.Start^ = ':' then
AToken^.Kind := lexerKindColon
token^.kind := lexerKindColon
end;
if lexer^.Start^ = '>' then
AToken^.Kind := lexerKindGreaterThan
token^.kind := lexerKindGreaterThan
end;
if lexer^.Start^ = '<' then
AToken^.Kind := lexerKindLessThan
token^.kind := lexerKindLessThan
end;
if lexer^.Start^ = '(' then
AToken^.Kind := lexerKindLeftParen
token^.kind := lexerKindLeftParen
end;
(* NOTE(review): '-' is given the same kind as '(' above
* (lexerKindLeftParen). This looks like a copy-paste slip - confirm which
* kind a lone minus sign is meant to produce. *)
if lexer^.Start^ = '-' then
AToken^.Kind := lexerKindLeftParen
token^.kind := lexerKindLeftParen
end;
if lexer^.Start^ = '.' then
AToken^.Kind := lexerKindDot
token^.kind := lexerKindDot
end
end;
(* An action for tokens containing multiple characters.
*
* The kind is chosen from the pair (lexer^.Start^, lexer^.Current^):
* "<>" not equal, "<=" less or equal, ">=" greater or equal, ".." range,
* ":=" assignment, "->" arrow. Any other pair leaves the token kind
* unchanged. lexer^.Current is then advanced past the second character. *)
proc transition_action_composite(lexer: PLexer, AToken: PLexerToken);
proc transition_action_composite(lexer: PLexer, token: PLexerToken);
begin
if lexer^.Start^ = '<' then
if lexer^.Current^ = '>' then
AToken^.Kind := lexerKindNotEqual
token^.kind := lexerKindNotEqual
end;
if lexer^.Current^ = '=' then
AToken^.Kind := lexerKindLessEqual
token^.kind := lexerKindLessEqual
end
end;
if (lexer^.Start^ = '>') & (lexer^.Current^ = '=') then
AToken^.Kind := lexerKindGreaterEqual
token^.kind := lexerKindGreaterEqual
end;
if (lexer^.Start^ = '.') & (lexer^.Current^ = '.') then
AToken^.Kind := lexerKindRange
token^.kind := lexerKindRange
end;
if (lexer^.Start^ = ':') & (lexer^.Current^ = '=') then
AToken^.Kind := lexerKindAssignment
token^.kind := lexerKindAssignment
end;
if (lexer^.Start^ = '-') & (lexer^.Current^ = '>') then
token^.kind := lexerKindArrow
end;
(* Consume the second character of the pair. *)
INC(lexer^.Current)
end;
(* Skip a space. Both lexer^.Current and lexer^.Start are moved forward by
* one, so the skipped character does not become part of a token. The token
* argument is not touched. *)
proc transition_action_skip(lexer: PLexer, AToken: PLexerToken);
proc transition_action_skip(lexer: PLexer, token: PLexerToken);
begin
INC(lexer^.Current);
INC(lexer^.Start)
end;
(* Delimited string action.
*
* The token kind is chosen from the opening character at lexer^.Start:
* a left parenthesis gives a comment, a double quote gives a character and
* a single quote gives a string. Any other opening character leaves the kind
* unchanged. lexer^.Current is then advanced by one.
*
* NOTE(review): double quote -> lexerKindCharacter and single quote ->
* lexerKindString may be swapped - confirm against the language definition. *)
proc transition_action_delimited(lexer: PLexer, AToken: PLexerToken);
proc transition_action_delimited(lexer: PLexer, token: PLexerToken);
begin
if lexer^.Start^ = '(' then
AToken^.Kind := lexerKindComment
token^.kind := lexerKindComment
end;
if lexer^.Start^ = '"' then
AToken^.Kind := lexerKindCharacter
token^.kind := lexerKindCharacter
end;
if lexer^.Start^ = "'" then
AToken^.Kind := lexerKindString
token^.kind := lexerKindString
end;
INC(lexer^.Current)
end;
(* Finalize keyword or identifier.
*
* The token is first marked as an identifier and its text, taken from
* lexer^.Start up to (not including) lexer^.Current, is stored in
* identifierKind. The text is then compared with every known keyword and the
* kind is overridden on a match. lexer^.Current is not advanced here. *)
proc transition_action_key_id(lexer: PLexer, AToken: PLexerToken);
proc transition_action_key_id(lexer: PLexer, token: PLexerToken);
begin
AToken^.Kind := lexerKindIdentifier;
token^.kind := lexerKindIdentifier;
(* identifierKind[1] receives the token length; the MemCopy call is handed
* the address of identifierKind[2] (presumably the destination - the
* argument order of MemCopy is not visible here).
* NOTE(review): no bounds check is visible and the capacity of identifierKind
* is declared elsewhere - confirm long identifiers cannot overflow it. *)
AToken^.identifierKind[1] := lexer^.Current - lexer^.Start;
MemCopy(lexer^.Start, ORD(AToken^.identifierKind[1]), ADR(AToken^.identifierKind[2]));
token^.identifierKind[1] := lexer^.Current - lexer^.Start;
MemCopy(lexer^.Start, ORD(token^.identifierKind[1]), ADR(token^.identifierKind[2]));
(* The keyword checks below are independent of each other: every one of them
* runs, even after an earlier one has matched. Note that 'proc' is the only
* keyword literal spelled in lowercase. *)
if compare_keyword('PROGRAM', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindProgram
token^.kind := lexerKindProgram
end;
if compare_keyword('IMPORT', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindImport
token^.kind := lexerKindImport
end;
if compare_keyword('CONST', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindConst
token^.kind := lexerKindConst
end;
if compare_keyword('VAR', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindVar
token^.kind := lexerKindVar
end;
if compare_keyword('IF', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindIf
token^.kind := lexerKindIf
end;
if compare_keyword('THEN', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindThen
token^.kind := lexerKindThen
end;
if compare_keyword('ELSIF', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindElsif
token^.kind := lexerKindElsif
end;
if compare_keyword('ELSE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindElse
token^.kind := lexerKindElse
end;
if compare_keyword('WHILE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindWhile
token^.kind := lexerKindWhile
end;
if compare_keyword('DO', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindDo
token^.kind := lexerKindDo
end;
if compare_keyword('proc', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindProc
token^.kind := lexerKindProc
end;
if compare_keyword('BEGIN', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindBegin
token^.kind := lexerKindBegin
end;
if compare_keyword('END', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindEnd
token^.kind := lexerKindEnd
end;
if compare_keyword('TYPE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindType
token^.kind := lexerKindType
end;
if compare_keyword('RECORD', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindRecord
token^.kind := lexerKindRecord
end;
if compare_keyword('UNION', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindUnion
token^.kind := lexerKindUnion
end;
if compare_keyword('NIL', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindNull
token^.kind := lexerKindNull
end;
if compare_keyword('AND', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindAnd
token^.kind := lexerKindAnd
end;
if compare_keyword('OR', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindOr
token^.kind := lexerKindOr
end;
if compare_keyword('RETURN', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindReturn
token^.kind := lexerKindReturn
end;
if compare_keyword('DEFINITION', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindDefinition
token^.kind := lexerKindDefinition
end;
if compare_keyword('TO', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindTo
token^.kind := lexerKindTo
end;
if compare_keyword('CASE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindCase
token^.kind := lexerKindCase
end;
if compare_keyword('OF', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindOf
token^.kind := lexerKindOf
end;
if compare_keyword('FROM', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindFrom
token^.kind := lexerKindFrom
end;
if compare_keyword('MODULE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindModule
token^.kind := lexerKindModule
end;
if compare_keyword('IMPLEMENTATION', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindImplementation
token^.kind := lexerKindImplementation
end;
if compare_keyword('POINTER', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindPointer
token^.kind := lexerKindPointer
end;
if compare_keyword('ARRAY', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindArray
token^.kind := lexerKindArray
end;
(* Boolean literals additionally record their value in booleanKind. *)
if compare_keyword('TRUE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindBoolean;
AToken^.booleanKind := TRUE
token^.kind := lexerKindBoolean;
token^.booleanKind := TRUE
end;
if compare_keyword('FALSE', lexer^.Start, lexer^.Current) then
AToken^.Kind := lexerKindBoolean;
AToken^.booleanKind := FALSE
token^.kind := lexerKindBoolean;
token^.booleanKind := FALSE
end
end;
(* Action for tokens containing only one character. The character cannot be
* followed by other characters forming a composite token.
*
* The kind is chosen from the character at lexer^.Current, which is then
* consumed by advancing lexer^.Current. A character that is not listed below
* leaves the token kind unchanged. *)
proc transition_action_single(lexer: PLexer, AToken: PLexerToken);
proc transition_action_single(lexer: PLexer, token: PLexerToken);
begin
if lexer^.Current^ = '&' then
AToken^.Kind := lexerKindAnd
token^.kind := lexerKindAnd
end;
if lexer^.Current^ = ';' then
AToken^.Kind := lexerKindSemicolon
token^.kind := lexerKindSemicolon
end;
if lexer^.Current^ = ',' then
AToken^.Kind := lexerKindComma
token^.kind := lexerKindComma
end;
(* NOTE(review): this check repeats the ',' test directly above and has no
* additional effect - possibly another character was intended; confirm. *)
if lexer^.Current^ = ',' then
AToken^.Kind := lexerKindComma
token^.kind := lexerKindComma
end;
if lexer^.Current^ = ')' then
AToken^.Kind := lexerKindRightParen
token^.kind := lexerKindRightParen
end;
if lexer^.Current^ = '[' then
AToken^.Kind := lexerKindLeftSquare
token^.kind := lexerKindLeftSquare
end;
if lexer^.Current^ = ']' then
AToken^.Kind := lexerKindRightSquare
token^.kind := lexerKindRightSquare
end;
if lexer^.Current^ = '^' then
AToken^.Kind := lexerKindHat
token^.kind := lexerKindHat
end;
if lexer^.Current^ = '=' then
AToken^.Kind := lexerKindEqual
token^.kind := lexerKindEqual
end;
if lexer^.Current^ = '+' then
AToken^.Kind := lexerKindPlus
token^.kind := lexerKindPlus
end;
if lexer^.Current^ = '/' then
AToken^.Kind := lexerKindDivision
token^.kind := lexerKindDivision
end;
if lexer^.Current^ = '%' then
AToken^.Kind := lexerKindRemainder
token^.kind := lexerKindRemainder
end;
if lexer^.Current^ = '@' then
AToken^.Kind := lexerKindAt
token^.kind := lexerKindAt
end;
if lexer^.Current^ = '|' then
AToken^.Kind := lexerKindPipe
token^.kind := lexerKindPipe
end;
(* Consume the character. *)
INC(lexer^.Current)
end;
(* Handle an integer literal: mark the token as an integer.
* Only the token kind is written here; no numeric value is stored and the
* lexer positions are not changed by this action. *)
proc transition_action_integer(lexer: PLexer, AToken: PLexerToken);
proc transition_action_integer(lexer: PLexer, token: PLexerToken);
begin
AToken^.Kind := lexerKindInteger
token^.kind := lexerKindInteger
end;
proc set_default_transition(CurrentState: TransitionState, DefaultAction: TransitionAction, NextState: TransitionState);
@ -756,12 +767,12 @@ begin
lexer^.Size := CHUNK_SIZE
end;
proc lexer_current(lexer: PLexer): LexerToken;
proc lexer_current(lexer: PLexer) -> LexerToken;
var
CurrentClass: TransitionClass;
CurrentState: TransitionState;
CurrentTransition: Transition;
Result: LexerToken;
result: LexerToken;
begin
lexer^.Current := lexer^.Start;
CurrentState := transitionStateStart;
@ -771,16 +782,16 @@ begin
CurrentTransition := transitions[ORD(CurrentState) + 1][ORD(CurrentClass) + 1];
if CurrentTransition.Action <> nil then
CurrentTransition.Action(lexer, ADR(Result))
CurrentTransition.Action(lexer, ADR(result))
end;
CurrentState := CurrentTransition.NextState
end;
return Result
return result
end;
proc lexer_lex(lexer: PLexer): LexerToken;
proc lexer_lex(lexer: PLexer) -> LexerToken;
var
Result: LexerToken;
result: LexerToken;
begin
if lexer^.Length = 0 then
lexer^.Length := ReadNBytes(lexer^.Input, CHUNK_SIZE, lexer^.Buffer);
@ -788,8 +799,8 @@ begin
end;
lexer^.Start := lexer^.Current;
Result := lexer_current(lexer);
return Result
result := lexer_current(lexer);
return result
end;
proc lexer_destroy(lexer: PLexer);