Lex multiline comments

This commit is contained in:
Eugen Wissner 2025-01-30 01:03:16 +01:00
parent 5178027d9f
commit 1b90829299
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
7 changed files with 339 additions and 72 deletions

View File

@ -1,7 +1,8 @@
type type
TokenValue = union TokenValue = union
intValue: Int; int_value: Int;
stringValue: pointer to Char string_value: pointer to Char;
boolean_value: Bool
end, end,
Token = record Token = record
kind: Int; kind: Int;
@ -15,11 +16,22 @@ const
SEEK_SET = 0, SEEK_CUR = 1, SEEK_END = 2, SEEK_SET = 0, SEEK_CUR = 1, SEEK_END = 2,
TOKEN_IDENTIFIER = 1, TOKEN_IF = 2, TOKEN_THEN = 3, TOKEN_ELSE = 4, TOKEN_ELSIF = 5, TOKEN_IDENTIFIER = 1, TOKEN_IF = 2, TOKEN_THEN = 3, TOKEN_ELSE = 4, TOKEN_ELSIF = 5,
TOKEN_WHILE = 6, TOKEN_DO = 7; TOKEN_WHILE = 6, TOKEN_DO = 7, TOKEN_PROC = 8, TOKEN_BEGIN = 9, TOKEN_END = 10,
TOKEN_EXTERN = 11, TOKEN_CONST = 12, TOKEN_VAR = 13, TOKEN_ARRAY = 14, TOKEN_OF = 15,
TOKEN_TYPE = 16, TOKEN_RECORD = 17, TOKEN_UNION = 18, TOKEN_POINTER = 19, TOKEN_TO = 20,
TOKEN_BOOLEAN = 21, TOKEN_NIL = 22, TOKEN_AND = 23, TOKEN_OR = 24, TOKEN_NOT = 25,
TOKEN_RETURN = 26, TOKEN_CAST = 27, TOKEN_AS = 28, TOKEN_SIZEOF = 29,
TOKEN_LEFT_PAREN = 30, TOKEN_RIGHT_PAREN = 31, TOKEN_LEFT_SQUARE = 32,
TOKEN_RIGHT_SQUARE = 33, TOKEN_GREATER_EQUAL = 34, TOKEN_LESS_EQUAL = 35,
TOKEN_GREATER_THAN = 36, TOKEN_LESS_THAN = 37, TOKEN_NOT_EQUAL = 38, TOKEN_EQUAL = 39,
TOKEN_SEMICOLON = 40, TOKEN_DOT = 41, TOKEN_COMMA = 42,
TOKEN_PLUS = 43, TOKEN_MINUS = 44, TOKEN_MULTIPLICATION = 45, TOKEN_DIVISION = 46,
TOKEN_REMAINDER = 47, TOKEN_ASSIGNMENT = 48, TOKEN_COLON = 49, TOKEN_HAT = 50,
TOKEN_AT = 51;
-- (*
-- External procedures. External procedures.
-- *)
proc fopen(pathname: String, mode: String): pointer to FILE; extern; proc fopen(pathname: String, mode: String): pointer to FILE; extern;
proc fclose(stream: pointer to FILE): Int; extern; proc fclose(stream: pointer to FILE): Int; extern;
proc fseek(stream: pointer to FILE, off: Int, whence: Int): Int; extern; proc fseek(stream: pointer to FILE, off: Int, whence: Int): Int; extern;
@ -41,9 +53,9 @@ proc strlen(ptr: pointer to Char): Word; extern;
proc exit(code: Int); extern; proc exit(code: Int); extern;
-- (*
-- Standard procedures. Standard procedures.
-- *)
proc write_s(value: String); proc write_s(value: String);
begin begin
write(0, value, strlen(value)) write(0, value, strlen(value))
@ -71,7 +83,7 @@ begin
n := 9; n := 9;
buffer[9] := '0'; buffer[9] := '0';
while value /= 0 do while value <> 0 do
digit := value % 10; digit := value % 10;
value := value / 10; value := value / 10;
@ -109,9 +121,9 @@ begin
return c = ' ' or c = '\n' return c = ' ' or c = '\n'
end; end;
-- (*
-- End of standard procedures. End of standard procedures.
-- *)
proc test_record(); proc test_record();
var r: Token; var r: Token;
@ -119,9 +131,9 @@ begin
write_s("\nTest record:\n"); write_s("\nTest record:\n");
r.kind := 4; r.kind := 4;
r.value.intValue := 8; r.value.int_value := 8;
write_i(r.value.intValue) write_i(r.value.int_value)
end; end;
proc test_primitive(); proc test_primitive();
@ -173,58 +185,11 @@ begin
return input return input
end; end;
proc compile(); proc print_tokens(tokens: pointer to Token, tokens_size: Int);
var var
input: pointer to Char,
input_pointer: pointer to Char,
token_end: pointer to Char,
token_length: Int,
tokens: pointer to Token,
current_token: pointer to Token, current_token: pointer to Token,
tokens_size: Int,
i: Int; i: Int;
begin begin
tokens_size := 0;
tokens := cast(0 as pointer to Token);
input := read_source("example.elna");
input_pointer := skip_spaces(input);
while input_pointer^ /= '\0' do
if is_alpha(input_pointer^) or input_pointer^ = '_' then
token_end := lex_identifier(input_pointer + 1);
token_length := cast(token_end as Int) - cast(input_pointer as Int);
tokens := cast(realloc(tokens, tokens_size + sizeof(Token)) as pointer to Token);
current_token := tokens + tokens_size;
if strncmp("if", input_pointer, token_length) = 0 then
current_token^.kind := TOKEN_IF
elsif strncmp("then", input_pointer, token_length) = 0 then
current_token^.kind := TOKEN_THEN
elsif strncmp("else", input_pointer, token_length) = 0 then
current_token^.kind := TOKEN_ELSE
elsif strncmp("elsif", input_pointer, token_length) = 0 then
current_token^.kind := TOKEN_ELSIF
elsif strncmp("while", input_pointer, token_length) = 0 then
current_token^.kind := TOKEN_WHILE
elsif strncmp("do", input_pointer, token_length) = 0 then
current_token^.kind := TOKEN_DO
else
current_token^.kind := TOKEN_IDENTIFIER;
current_token^.value.stringValue := cast(calloc(token_length + 1, 1) as pointer to Char);
strncpy(current_token^.value.stringValue, input_pointer, token_length)
end;
tokens_size := tokens_size + sizeof(Token);
input_pointer := token_end
else
input_pointer := input_pointer + 1
end
end;
i := 0; i := 0;
while i < tokens_size do while i < tokens_size do
current_token := tokens + i; current_token := tokens + i;
@ -241,10 +206,100 @@ begin
write_s("WHILE") write_s("WHILE")
elsif current_token^.kind = TOKEN_DO then elsif current_token^.kind = TOKEN_DO then
write_s("DO") write_s("DO")
elsif current_token^.kind = TOKEN_IDENTIFIER then elsif current_token^.kind = TOKEN_PROC then
write_s("IDENTIFIER<"); write_s("PROC")
write_s(current_token^.value.stringValue); elsif current_token^.kind = TOKEN_BEGIN then
write_s("BEGIN")
elsif current_token^.kind = TOKEN_END then
write_s("END")
elsif current_token^.kind = TOKEN_EXTERN then
write_s("EXTERN")
elsif current_token^.kind = TOKEN_CONST then
write_s("CONST")
elsif current_token^.kind = TOKEN_VAR then
write_s("VAR")
elsif current_token^.kind = TOKEN_ARRAY then
write_s("ARRAY")
elsif current_token^.kind = TOKEN_OF then
write_s("OF")
elsif current_token^.kind = TOKEN_TYPE then
write_s("TYPE")
elsif current_token^.kind = TOKEN_RECORD then
write_s("RECORD")
elsif current_token^.kind = TOKEN_UNION then
write_s("UNION")
elsif current_token^.kind = TOKEN_POINTER then
write_s("POINTER")
elsif current_token^.kind = TOKEN_TO then
write_s("TO")
elsif current_token^.kind = TOKEN_BOOLEAN then
write_s("BOOLEAN<");
write_b(current_token^.value.boolean_value);
write_c('>') write_c('>')
elsif current_token^.kind = TOKEN_NIL then
write_s("NIL")
elsif current_token^.kind = TOKEN_AND then
write_s("AND")
elsif current_token^.kind = TOKEN_OR then
write_s("OR")
elsif current_token^.kind = TOKEN_NOT then
write_s("NOT")
elsif current_token^.kind = TOKEN_RETURN then
write_s("RETURN")
elsif current_token^.kind = TOKEN_CAST then
write_s("CAST")
elsif current_token^.kind = TOKEN_AS then
write_s("AS")
elsif current_token^.kind = TOKEN_SIZEOF then
write_s("SIZEOF")
elsif current_token^.kind = TOKEN_IDENTIFIER then
write_c('<');
write_s(current_token^.value.string_value);
write_c('>')
elsif current_token^.kind = TOKEN_LEFT_PAREN then
write_s("(")
elsif current_token^.kind = TOKEN_RIGHT_PAREN then
write_s(")")
elsif current_token^.kind = TOKEN_LEFT_SQUARE then
write_s("[")
elsif current_token^.kind = TOKEN_RIGHT_SQUARE then
write_s("]")
elsif current_token^.kind = TOKEN_GREATER_EQUAL then
write_s(">=")
elsif current_token^.kind = TOKEN_LESS_EQUAL then
write_s("<=")
elsif current_token^.kind = TOKEN_GREATER_THAN then
write_s(">")
elsif current_token^.kind = TOKEN_LESS_THAN then
write_s("<")
elsif current_token^.kind = TOKEN_EQUAL then
write_s("=")
elsif current_token^.kind = TOKEN_NOT_EQUAL then
write_s("<>")
elsif current_token^.kind = TOKEN_SEMICOLON then
write_s(";")
elsif current_token^.kind = TOKEN_DOT then
write_s(".")
elsif current_token^.kind = TOKEN_COMMA then
write_s(",")
elsif current_token^.kind = TOKEN_PLUS then
write_s("+")
elsif current_token^.kind = TOKEN_MINUS then
write_s("-")
elsif current_token^.kind = TOKEN_MULTIPLICATION then
write_s("*")
elsif current_token^.kind = TOKEN_DIVISION then
write_s("/")
elsif current_token^.kind = TOKEN_REMAINDER then
write_s("%")
elsif current_token^.kind = TOKEN_ASSIGNMENT then
write_s(":=")
elsif current_token^.kind = TOKEN_COLON then
write_s(":")
elsif current_token^.kind = TOKEN_HAT then
write_s("^")
elsif current_token^.kind = TOKEN_AT then
write_s("@")
else else
write_s("UNKNOWN<"); write_s("UNKNOWN<");
write_i(current_token^.kind); write_i(current_token^.kind);
@ -253,8 +308,193 @@ begin
write_c(' '); write_c(' ');
i := i + sizeof(Token) i := i + sizeof(Token)
end
end;
(*
  Reports whether the lexeme of token_length characters that starts at
  input_pointer spells exactly the given keyword.

  strncmp on its own only looks at the first token_length characters, so a
  lexeme that is a proper prefix of a keyword ("i" for "if", "n" for "nil",
  "els" for "else") would compare equal. Requiring the lengths to agree as
  well rules those false positives out.
*)
proc keyword_equals(keyword: String, input_pointer: pointer to Char, token_length: Int): Bool;
begin
  return (cast(strlen(keyword) as Int) = token_length)
    and (strncmp(keyword, input_pointer, token_length) = 0)
end;

(*
  Classifies an identifier-like lexeme as a keyword, a boolean literal or a
  plain identifier.

  input_pointer points at the first character of the lexeme and token_length
  is its length in characters; the lexeme is not expected to be
  NUL-terminated.

  Returns a Token whose kind is one of the TOKEN_* constants. The value
  field is only set for TOKEN_BOOLEAN (boolean_value) and TOKEN_IDENTIFIER
  (string_value, a freshly allocated NUL-terminated copy of the lexeme).
*)
proc categorize_identifier(input_pointer: pointer to Char, token_length: Int): Token;
var
  current_token: Token;
begin
  if keyword_equals("if", input_pointer, token_length) then
    current_token.kind := TOKEN_IF
  elsif keyword_equals("then", input_pointer, token_length) then
    current_token.kind := TOKEN_THEN
  elsif keyword_equals("else", input_pointer, token_length) then
    current_token.kind := TOKEN_ELSE
  elsif keyword_equals("elsif", input_pointer, token_length) then
    current_token.kind := TOKEN_ELSIF
  elsif keyword_equals("while", input_pointer, token_length) then
    current_token.kind := TOKEN_WHILE
  elsif keyword_equals("do", input_pointer, token_length) then
    current_token.kind := TOKEN_DO
  elsif keyword_equals("proc", input_pointer, token_length) then
    current_token.kind := TOKEN_PROC
  elsif keyword_equals("begin", input_pointer, token_length) then
    current_token.kind := TOKEN_BEGIN
  elsif keyword_equals("end", input_pointer, token_length) then
    current_token.kind := TOKEN_END
  elsif keyword_equals("extern", input_pointer, token_length) then
    current_token.kind := TOKEN_EXTERN
  elsif keyword_equals("const", input_pointer, token_length) then
    current_token.kind := TOKEN_CONST
  elsif keyword_equals("var", input_pointer, token_length) then
    current_token.kind := TOKEN_VAR
  elsif keyword_equals("array", input_pointer, token_length) then
    current_token.kind := TOKEN_ARRAY
  elsif keyword_equals("of", input_pointer, token_length) then
    current_token.kind := TOKEN_OF
  elsif keyword_equals("type", input_pointer, token_length) then
    current_token.kind := TOKEN_TYPE
  elsif keyword_equals("record", input_pointer, token_length) then
    current_token.kind := TOKEN_RECORD
  elsif keyword_equals("union", input_pointer, token_length) then
    current_token.kind := TOKEN_UNION
  elsif keyword_equals("pointer", input_pointer, token_length) then
    current_token.kind := TOKEN_POINTER
  elsif keyword_equals("to", input_pointer, token_length) then
    current_token.kind := TOKEN_TO
  elsif keyword_equals("true", input_pointer, token_length) then
    current_token.kind := TOKEN_BOOLEAN;
    current_token.value.boolean_value := true
  elsif keyword_equals("false", input_pointer, token_length) then
    current_token.kind := TOKEN_BOOLEAN;
    current_token.value.boolean_value := false
  elsif keyword_equals("nil", input_pointer, token_length) then
    current_token.kind := TOKEN_NIL
  elsif keyword_equals("and", input_pointer, token_length) then
    current_token.kind := TOKEN_AND
  elsif keyword_equals("or", input_pointer, token_length) then
    current_token.kind := TOKEN_OR
  elsif keyword_equals("not", input_pointer, token_length) then
    current_token.kind := TOKEN_NOT
  elsif keyword_equals("return", input_pointer, token_length) then
    current_token.kind := TOKEN_RETURN
  elsif keyword_equals("cast", input_pointer, token_length) then
    current_token.kind := TOKEN_CAST
  elsif keyword_equals("as", input_pointer, token_length) then
    current_token.kind := TOKEN_AS
  elsif keyword_equals("sizeof", input_pointer, token_length) then
    current_token.kind := TOKEN_SIZEOF
  else
    current_token.kind := TOKEN_IDENTIFIER;
    (* calloc zero-fills token_length + 1 bytes, so the copy stays NUL-terminated. *)
    current_token.value.string_value := cast(calloc(token_length + 1, 1) as pointer to Char);
    strncpy(current_token.value.string_value, input_pointer, token_length)
  end;
  return current_token
end;
(*
  Reads "example.elna", splits it into tokens and prints them with
  print_tokens.

  The tokens live in one realloc-grown buffer. tokens_size is advanced by
  sizeof(Token) per stored token and is added directly to the tokens
  pointer, so it is used as an offset rather than as an element count.
*)
proc compile();
var
input: pointer to Char,
input_pointer: pointer to Char,
token_end: pointer to Char,
token_length: Int,
tokens: pointer to Token,
current_token: pointer to Token,
tokens_size: Int;
begin
tokens_size := 0;
tokens := cast(nil as pointer to Token);
input := read_source("example.elna");
input_pointer := skip_spaces(input);
while input_pointer^ <> '\0' do
(* Make room for one more token before looking at the next character. *)
(* NOTE(review): the result of realloc is used without a nil check. *)
tokens := cast(realloc(tokens, tokens_size + sizeof(Token)) as pointer to Token);
current_token := tokens + tokens_size;
if is_alpha(input_pointer^) or input_pointer^ = '_' then
(* Identifier or keyword: lex_identifier returns the position after the lexeme. *)
token_end := lex_identifier(input_pointer + 1);
token_length := cast(token_end as Int) - cast(input_pointer as Int);
current_token^ := categorize_identifier(input_pointer, token_length);
input_pointer := token_end
elsif input_pointer^ = '(' then
current_token^.kind := TOKEN_LEFT_PAREN;
input_pointer := input_pointer + 1
elsif input_pointer^ = ')' then
current_token^.kind := TOKEN_RIGHT_PAREN;
input_pointer := input_pointer + 1
elsif input_pointer^ = '[' then
current_token^.kind := TOKEN_LEFT_SQUARE;
input_pointer := input_pointer + 1
elsif input_pointer^ = ']' then
current_token^.kind := TOKEN_RIGHT_SQUARE;
input_pointer := input_pointer + 1
elsif input_pointer^ = '>' then
(* Consume '>' first, then peek at the following character for ">=". *)
input_pointer := input_pointer + 1;
if input_pointer^ = '=' then
current_token^.kind := TOKEN_GREATER_EQUAL;
input_pointer := input_pointer + 1
else
current_token^.kind := TOKEN_GREATER_THAN
end
elsif input_pointer^ = '<' then
(* Consume '<' first, then peek for "<=" or "<>". *)
input_pointer := input_pointer + 1;
if input_pointer^ = '=' then
current_token^.kind := TOKEN_LESS_EQUAL;
input_pointer := input_pointer + 1
elsif input_pointer^ = '>' then
current_token^.kind := TOKEN_NOT_EQUAL;
input_pointer := input_pointer + 1
else
current_token^.kind := TOKEN_LESS_THAN
end
elsif input_pointer^ = '=' then
current_token^.kind := TOKEN_EQUAL;
input_pointer := input_pointer + 1
elsif input_pointer^ = ';' then
current_token^.kind := TOKEN_SEMICOLON;
input_pointer := input_pointer + 1
elsif input_pointer^ = '.' then
current_token^.kind := TOKEN_DOT;
input_pointer := input_pointer + 1
elsif input_pointer^ = ',' then
current_token^.kind := TOKEN_COMMA;
input_pointer := input_pointer + 1
elsif input_pointer^ = '+' then
current_token^.kind := TOKEN_PLUS;
input_pointer := input_pointer + 1
elsif input_pointer^ = '-' then
current_token^.kind := TOKEN_MINUS;
input_pointer := input_pointer + 1
elsif input_pointer^ = '*' then
current_token^.kind := TOKEN_MULTIPLICATION;
input_pointer := input_pointer + 1
elsif input_pointer^ = '/' then
current_token^.kind := TOKEN_DIVISION;
input_pointer := input_pointer + 1
elsif input_pointer^ = '%' then
current_token^.kind := TOKEN_REMAINDER;
input_pointer := input_pointer + 1
elsif input_pointer^ = ':' then
(* Consume ':' first, then peek at the following character for ":=". *)
input_pointer := input_pointer + 1;
if input_pointer^ = '=' then
current_token^.kind := TOKEN_ASSIGNMENT;
input_pointer := input_pointer + 1
else
current_token^.kind := TOKEN_COLON
end
elsif input_pointer^ = '^' then
current_token^.kind := TOKEN_HAT;
input_pointer := input_pointer + 1
elsif input_pointer^ = '@' then
current_token^.kind := TOKEN_AT;
input_pointer := input_pointer + 1
else
(* Any other character, including whitespace between tokens, yields no token: *)
(* kind 0 marks the slot as unused and the character is skipped. *)
current_token^.kind := 0;
input_pointer := input_pointer + 1
end;
(* Keep the slot only if a token was recognised; otherwise it is reused next time. *)
if current_token^.kind <> 0 then
tokens_size := tokens_size + sizeof(Token)
end
end;
print_tokens(tokens, tokens_size);
(* NOTE(review): only the source buffer is released here; the token buffer is not freed in this procedure. *)
free(input) free(input)
end; end;

View File

@ -235,6 +235,11 @@ namespace gcc
this->current_expression = build_int_cstu(elna_char_type_node, character->number()); this->current_expression = build_int_cstu(elna_char_type_node, character->number());
} }
/**
 * Lowers a nil literal to GCC's null pointer constant.
 *
 * The literal carries no payload, so the parameter is left unnamed. The
 * template argument is spelled std::nullptr_t to match the declaration in
 * the class and in the base visitor; the unqualified name is only visible
 * when some C header happens to export it into the global namespace.
 */
void generic_visitor::visit(source::number_literal<std::nullptr_t> *)
{
    this->current_expression = null_pointer_node;
}
void generic_visitor::visit(source::string_literal *string) void generic_visitor::visit(source::string_literal *string)
{ {
this->current_expression = build_string_literal(string->string().size() + 1, string->string().c_str()); this->current_expression = build_string_literal(string->string().size() + 1, string->string().c_str());

View File

@ -49,6 +49,7 @@ namespace gcc
void visit(source::number_literal<double> *literal) override; void visit(source::number_literal<double> *literal) override;
void visit(source::number_literal<bool> *boolean) override; void visit(source::number_literal<bool> *boolean) override;
void visit(source::number_literal<unsigned char> *character) override; void visit(source::number_literal<unsigned char> *character) override;
void visit(source::number_literal<std::nullptr_t> *) override;
void visit(source::string_literal *string) override; void visit(source::string_literal *string) override;
void visit(source::binary_expression *expression) override; void visit(source::binary_expression *expression) override;
void visit(source::unary_expression *expression) override; void visit(source::unary_expression *expression) override;

View File

@ -63,7 +63,6 @@ namespace source
class dereference_expression; class dereference_expression;
template<typename T> template<typename T>
class number_literal; class number_literal;
class char_literal;
class string_literal; class string_literal;
/** /**
@ -101,6 +100,7 @@ namespace source
virtual void visit(number_literal<double> *) = 0; virtual void visit(number_literal<double> *) = 0;
virtual void visit(number_literal<bool> *) = 0; virtual void visit(number_literal<bool> *) = 0;
virtual void visit(number_literal<unsigned char> *) = 0; virtual void visit(number_literal<unsigned char> *) = 0;
virtual void visit(number_literal<std::nullptr_t> *) = 0;
virtual void visit(string_literal *) = 0; virtual void visit(string_literal *) = 0;
}; };
@ -139,6 +139,7 @@ namespace source
virtual void visit(number_literal<double> *) override; virtual void visit(number_literal<double> *) override;
virtual void visit(number_literal<bool> *) override; virtual void visit(number_literal<bool> *) override;
virtual void visit(number_literal<unsigned char> *) override; virtual void visit(number_literal<unsigned char> *) override;
virtual void visit(number_literal<std::nullptr_t> *) override;
virtual void visit(string_literal *) override; virtual void visit(string_literal *) override;
}; };

View File

@ -192,6 +192,10 @@ namespace source
{ {
} }
/* nil literal: the empty visitor does nothing for it. */
void empty_visitor::visit(number_literal<std::nullptr_t> *)
{
}
void empty_visitor::visit(string_literal *) void empty_visitor::visit(string_literal *)
{ {
} }

View File

@ -18,15 +18,24 @@
%option c++ noyywrap never-interactive %option c++ noyywrap never-interactive
%option yyclass="elna::source::lexer" %option yyclass="elna::source::lexer"
%x IN_COMMENT
%% %%
%{ %{
this->location.step(); this->location.step();
%} %}
\-\-.* { <IN_COMMENT>{
/* Skip the comment */ \*\) BEGIN(INITIAL);
[^*\n]+ ; /* Eat comment in chunks. */
\* ; /* Eat the lone star. */
\n+ {
this->location.lines(yyleng);
this->location.step();
} }
[\ \t\r] ; /* Skip the whitespaces */ }
\(\* BEGIN(IN_COMMENT);
[\ \t\r] ; /* Skip the whitespaces. */
\n+ { \n+ {
this->location.lines(yyleng); this->location.lines(yyleng);
this->location.step(); this->location.step();
@ -94,6 +103,9 @@ true {
false { false {
return yy::parser::make_BOOLEAN(false, this->location); return yy::parser::make_BOOLEAN(false, this->location);
} }
nil {
/* The nil keyword: the token is built from the current location only. */
return yy::parser::make_NIL(this->location);
}
and { and {
return yy::parser::make_AND(this->location); return yy::parser::make_AND(this->location);
} }
@ -222,7 +234,7 @@ sizeof {
\< { \< {
return yy::parser::make_LESS_THAN(this->location); return yy::parser::make_LESS_THAN(this->location);
} }
\/= { \<\> {
return yy::parser::make_NOT_EQUAL(this->location); return yy::parser::make_NOT_EQUAL(this->location);
} }
= { = {

View File

@ -70,7 +70,7 @@
%token AND OR NOT CAST AS SIZEOF %token AND OR NOT CAST AS SIZEOF
%token GREATER_EQUAL LESS_EQUAL LESS_THAN GREATER_THAN NOT_EQUAL EQUALS %token GREATER_EQUAL LESS_EQUAL LESS_THAN GREATER_THAN NOT_EQUAL EQUALS
%token PLUS MINUS MULTIPLICATION DIVISION REMAINDER %token PLUS MINUS MULTIPLICATION DIVISION REMAINDER
%token ASSIGNMENT COLON HAT AT %token ASSIGNMENT COLON HAT AT NIL
%type <elna::source::literal *> literal; %type <elna::source::literal *> literal;
%type <elna::source::constant_definition *> constant_definition; %type <elna::source::constant_definition *> constant_definition;
@ -245,6 +245,10 @@ literal:
{ {
$$ = new elna::source::number_literal<unsigned char>(elna::source::make_position(@1), $1.at(0)); $$ = new elna::source::number_literal<unsigned char>(elna::source::make_position(@1), $1.at(0));
} }
| NIL
{
$$ = new elna::source::number_literal<std::nullptr_t>(elna::source::make_position(@1), nullptr);
}
| STRING | STRING
{ {
$$ = new elna::source::string_literal(elna::source::make_position(@1), $1); $$ = new elna::source::string_literal(elna::source::make_position(@1), $1);