Handle character codes above 127 (non-ASCII) in the tokenizer
This commit is contained in:
190
source/Lexer.mod
190
source/Lexer.mod
@ -72,6 +72,8 @@ VAR
|
||||
transitions: ARRAY[1..16] OF TransitionClasses;
|
||||
|
||||
PROCEDURE initialize_classification();
|
||||
VAR
|
||||
i: CARDINAL;
|
||||
BEGIN
|
||||
classification[1] := transitionClassEof; (* NUL *)
|
||||
classification[2] := transitionClassInvalid; (* SOH *)
|
||||
@ -200,255 +202,261 @@ BEGIN
|
||||
classification[125] := transitionClassSingle; (* | *)
|
||||
classification[126] := transitionClassOther; (* } *)
|
||||
classification[127] := transitionClassSingle; (* ~ *)
|
||||
classification[128] := transitionClassInvalid (* DEL *)
|
||||
classification[128] := transitionClassInvalid; (* DEL *)
|
||||
|
||||
i := 129;
|
||||
WHILE i <= 256 DO
|
||||
classification[i] := transitionClassOther;
|
||||
i := i + 1
|
||||
END
|
||||
END initialize_classification;
|
||||
(* Compare a keyword with the token text in the range TokenStart..TokenEnd,
 * where TokenEnd points just past the last token character (it is never
 * dereferenced).
 *
 * A keyword character matches the token character when the token holds it
 * either exactly as written in Keyword or as Lower() of it.
 *
 * Returns TRUE only if every character matched and the keyword and the token
 * ended at the same time. *)
PROCEDURE compare_keyword(Keyword: ARRAY OF CHAR; TokenStart: PLexerBuffer; TokenEnd: PLexerBuffer): BOOLEAN;
VAR
  result: BOOLEAN;
  index: CARDINAL;
  keyword_length: CARDINAL;
BEGIN
  index := 0;
  result := TRUE;
  (* Keyword is not modified here, so its length is computed only once
   * instead of on every loop iteration. *)
  keyword_length := Length(Keyword);

  WHILE (index < keyword_length) AND (TokenStart <> TokenEnd) AND result DO
    result := (Keyword[index] = TokenStart^) OR (Lower(Keyword[index]) = TokenStart^);
    (* Both positions advance even after a mismatch; result is already FALSE
     * in that case and stays FALSE below. *)
    INC(TokenStart);
    INC(index)
  END;
  (* A prefix match is not enough: both sequences have to be exhausted. *)
  result := (index = keyword_length) AND (TokenStart = TokenEnd) AND result;
  RETURN result
END compare_keyword;
|
||||
(* End of input reached: mark the token as the end-of-file token.
 * The lexer itself is not touched. *)
PROCEDURE transition_action_eof(lexer: PLexer; token: PLexerToken);
BEGIN
  WITH token^ DO
    Kind := lexerKindEof
  END
END transition_action_eof;
|
||||
(* Keep the current character as a part of the token being read by moving
 * the current position one character forward. The token is not modified. *)
PROCEDURE transition_action_accumulate(lexer: PLexer; token: PLexerToken);
BEGIN
  WITH lexer^ DO
    INC(Current)
  END
END transition_action_accumulate;
|
||||
(* The current character is not a part of the token. Finish the token already
 * read, choosing its kind from the first character of the token
 * (lexer^.Start^). The current position is not advanced.
 *
 * If the first character is none of the characters handled below, the token
 * kind is left unchanged. *)
PROCEDURE transition_action_finalize(lexer: PLexer; token: PLexerToken);
BEGIN
  IF lexer^.Start^ = ':' THEN
    token^.Kind := lexerKindColon
  ELSIF lexer^.Start^ = '>' THEN
    token^.Kind := lexerKindGreaterThan
  ELSIF lexer^.Start^ = '<' THEN
    token^.Kind := lexerKindLessThan
  ELSIF lexer^.Start^ = '(' THEN
    token^.Kind := lexerKindLeftParen
  ELSIF lexer^.Start^ = '-' THEN
    (* This branch used to assign lexerKindLeftParen, copied from the '('
     * branch above, so a lone '-' was reported as a left parenthesis.
     * NOTE(review): assumes lexerKindMinus is declared together with the
     * other lexer kinds - confirm the exact name. *)
    token^.Kind := lexerKindMinus
  ELSIF lexer^.Start^ = '.' THEN
    token^.Kind := lexerKindDot
  END
END transition_action_finalize;
|
||||
(* Action for two-character tokens. The kind is chosen from the first
 * character of the token (lexer^.Start^) and the current character
 * (lexer^.Current^):
 *
 *   <>  not equal          >=  greater or equal
 *   <=  less or equal      ..  range
 *   :=  assignment
 *
 * Any other combination leaves the token kind unchanged. The current
 * position is advanced past the current character in every case. *)
PROCEDURE transition_action_composite(lexer: PLexer; token: PLexerToken);
BEGIN
  IF lexer^.Start^ = '<' THEN
    IF lexer^.Current^ = '>' THEN
      token^.Kind := lexerKindNotEqual
    ELSIF lexer^.Current^ = '=' THEN
      token^.Kind := lexerKindLessEqual
    END
  ELSIF (lexer^.Start^ = '>') AND (lexer^.Current^ = '=') THEN
    token^.Kind := lexerKindGreaterEqual
  ELSIF (lexer^.Start^ = '.') AND (lexer^.Current^ = '.') THEN
    token^.Kind := lexerKindRange
  ELSIF (lexer^.Start^ = ':') AND (lexer^.Current^ = '=') THEN
    token^.Kind := lexerKindAssignment
  END;
  INC(lexer^.Current)
END transition_action_composite;
|
||||
(* Skip one character: both the token start and the current position move
 * one character forward. The token is not modified. *)
PROCEDURE transition_action_skip(lexer: PLexer; token: PLexerToken);
BEGIN
  WITH lexer^ DO
    INC(Start);
    INC(Current)
  END
END transition_action_skip;
|
||||
(* Action for delimited tokens. The kind is chosen from the first character
 * of the token (lexer^.Start^):
 *
 *   (  comment
 *   "  lexerKindCharacter
 *   '  lexerKindString
 *
 * Any other first character leaves the kind unchanged. The current position
 * is advanced by one character in every case.
 *
 * NOTE(review): a double quote producing the character kind and a single
 * quote producing the string kind looks swapped - confirm this is intended. *)
PROCEDURE transition_action_delimited(lexer: PLexer; token: PLexerToken);
BEGIN
  IF lexer^.Start^ = '(' THEN
    token^.Kind := lexerKindComment
  ELSIF lexer^.Start^ = '"' THEN
    token^.Kind := lexerKindCharacter
  ELSIF lexer^.Start^ = "'" THEN
    token^.Kind := lexerKindString
  END;
  INC(lexer^.Current)
END transition_action_delimited;
|
||||
(* Finish a token that is either a keyword or an identifier.
 *
 * The token is first set up as an identifier: its length
 * (lexer^.Current - lexer^.Start) is stored in identifierKind[1] and the
 * token text is copied into identifierKind starting at index 2. If the text
 * matches one of the keywords below, the kind is replaced with that
 * keyword's kind; TRUE and FALSE additionally set booleanKind.
 *
 * The current position is not advanced.
 *
 * NOTE(review): nothing here checks that the token fits into
 * identifierKind - confirm its capacity against the longest possible
 * identifier. *)
PROCEDURE transition_action_key_id(lexer: PLexer; token: PLexerToken);
BEGIN
  token^.Kind := lexerKindIdentifier;

  token^.identifierKind[1] := lexer^.Current - lexer^.Start;
  MemCopy(lexer^.Start, ORD(token^.identifierKind[1]), ADR(token^.identifierKind[2]));

  (* The keywords are pairwise different, so at most one comparison can
   * succeed and a single chain selects the kind. *)
  IF compare_keyword('PROGRAM', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindProgram
  ELSIF compare_keyword('IMPORT', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindImport
  ELSIF compare_keyword('CONST', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindConst
  ELSIF compare_keyword('VAR', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindVar
  ELSIF compare_keyword('IF', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindIf
  ELSIF compare_keyword('THEN', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindThen
  ELSIF compare_keyword('ELSIF', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindElsif
  ELSIF compare_keyword('ELSE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindElse
  ELSIF compare_keyword('WHILE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindWhile
  ELSIF compare_keyword('DO', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindDo
  ELSIF compare_keyword('proc', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindProc
  ELSIF compare_keyword('BEGIN', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindBegin
  ELSIF compare_keyword('END', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindEnd
  ELSIF compare_keyword('TYPE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindType
  ELSIF compare_keyword('RECORD', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindRecord
  ELSIF compare_keyword('UNION', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindUnion
  ELSIF compare_keyword('NIL', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindNull
  ELSIF compare_keyword('AND', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindAnd
  ELSIF compare_keyword('OR', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindOr
  ELSIF compare_keyword('RETURN', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindReturn
  ELSIF compare_keyword('DEFINITION', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindDefinition
  ELSIF compare_keyword('TO', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindTo
  ELSIF compare_keyword('CASE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindCase
  ELSIF compare_keyword('OF', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindOf
  ELSIF compare_keyword('FROM', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindFrom
  ELSIF compare_keyword('MODULE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindModule
  ELSIF compare_keyword('IMPLEMENTATION', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindImplementation
  ELSIF compare_keyword('POINTER', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindPointer
  ELSIF compare_keyword('ARRAY', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindArray
  ELSIF compare_keyword('TRUE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindBoolean;
    token^.booleanKind := TRUE
  ELSIF compare_keyword('FALSE', lexer^.Start, lexer^.Current) THEN
    token^.Kind := lexerKindBoolean;
    token^.booleanKind := FALSE
  END
END transition_action_key_id;
|
||||
(* Action for tokens containing only one character. The character cannot be
 * followed by other characters forming a composite token.
 *
 * The kind is chosen from the current character (lexer^.Current^). A
 * character without a label below leaves the token kind unchanged. The
 * current position is advanced past the character in every case.
 *
 * The ',' test used to appear twice in a row with the same assignment; the
 * redundant copy has been removed.
 *
 * NOTE(review): the duplicate may have been meant for another character.
 * '~', for example, is classified as a single-character class in
 * initialize_classification but has no label here - confirm whether a kind
 * is missing. *)
PROCEDURE transition_action_single(lexer: PLexer; token: PLexerToken);
BEGIN
  CASE lexer^.Current^ OF
    '&': token^.Kind := lexerKindAnd |
    ';': token^.Kind := lexerKindSemicolon |
    ',': token^.Kind := lexerKindComma |
    ')': token^.Kind := lexerKindRightParen |
    '[': token^.Kind := lexerKindLeftSquare |
    ']': token^.Kind := lexerKindRightSquare |
    '^': token^.Kind := lexerKindHat |
    '=': token^.Kind := lexerKindEqual |
    '+': token^.Kind := lexerKindPlus |
    '/': token^.Kind := lexerKindDivision |
    '%': token^.Kind := lexerKindRemainder |
    '@': token^.Kind := lexerKindAt |
    '|': token^.Kind := lexerKindPipe
  ELSE
    (* Any other character: the kind stays as it was, like in the former
     * chain of independent IF statements. *)
  END;
  INC(lexer^.Current)
END transition_action_single;
|
||||
(* Finish an integer literal: only the token kind is set. The lexer is not
 * touched and the current position is not advanced. *)
PROCEDURE transition_action_integer(lexer: PLexer; token: PLexerToken);
BEGIN
  WITH token^ DO
    Kind := lexerKindInteger
  END
END transition_action_integer;
|
||||
PROCEDURE set_default_transition(CurrentState: TransitionState; DefaultAction: TransitionAction; NextState: TransitionState);
|
||||
VAR
|
||||
@ -747,7 +755,7 @@ VAR
|
||||
CurrentClass: TransitionClass;
|
||||
CurrentState: TransitionState;
|
||||
CurrentTransition: Transition;
|
||||
Result: LexerToken;
|
||||
result: LexerToken;
|
||||
BEGIN
|
||||
lexer^.Current := lexer^.Start;
|
||||
CurrentState := transitionStateStart;
|
||||
@ -757,15 +765,15 @@ BEGIN
|
||||
|
||||
CurrentTransition := transitions[ORD(CurrentState) + 1][ORD(CurrentClass) + 1];
|
||||
IF CurrentTransition.Action <> NIL THEN
|
||||
CurrentTransition.Action(lexer, ADR(Result))
|
||||
CurrentTransition.Action(lexer, ADR(result))
|
||||
END;
|
||||
CurrentState := CurrentTransition.NextState
|
||||
END;
|
||||
RETURN Result
|
||||
RETURN result
|
||||
END lexer_current;
|
||||
PROCEDURE lexer_lex(lexer: PLexer): LexerToken;
|
||||
VAR
|
||||
Result: LexerToken;
|
||||
result: LexerToken;
|
||||
BEGIN
|
||||
IF lexer^.Length = 0 THEN
|
||||
lexer^.Length := ReadNBytes(lexer^.Input, CHUNK_SIZE, lexer^.Buffer);
|
||||
@ -773,8 +781,8 @@ BEGIN
|
||||
END;
|
||||
lexer^.Start := lexer^.Current;
|
||||
|
||||
Result := lexer_current(lexer);
|
||||
RETURN Result
|
||||
result := lexer_current(lexer);
|
||||
RETURN result
|
||||
END lexer_lex;
|
||||
PROCEDURE lexer_destroy(lexer: PLexer);
|
||||
BEGIN
|
||||
|
Reference in New Issue
Block a user