From 1b908292998c782038a9bb04bddb507368f27391 Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Thu, 30 Jan 2025 01:03:16 +0100 Subject: [PATCH] Lex multiline comments --- example.elna | 372 ++++++++++++++++++++++++++------ gcc/elna-generic.cc | 5 + include/elna/gcc/elna-generic.h | 1 + include/elna/source/ast.h | 3 +- source/ast.cc | 4 + source/lexer.ll | 20 +- source/parser.yy | 6 +- 7 files changed, 339 insertions(+), 72 deletions(-) diff --git a/example.elna b/example.elna index 2f856aa..5302c62 100644 --- a/example.elna +++ b/example.elna @@ -1,7 +1,8 @@ type TokenValue = union - intValue: Int; - stringValue: pointer to Char + int_value: Int; + string_value: pointer to Char; + boolean_value: Bool end, Token = record kind: Int; @@ -15,11 +16,22 @@ const SEEK_SET = 0, SEEK_CUR = 1, SEEK_END = 2, TOKEN_IDENTIFIER = 1, TOKEN_IF = 2, TOKEN_THEN = 3, TOKEN_ELSE = 4, TOKEN_ELSIF = 5, - TOKEN_WHILE = 6, TOKEN_DO = 7; + TOKEN_WHILE = 6, TOKEN_DO = 7, TOKEN_PROC = 8, TOKEN_BEGIN = 9, TOKEN_END = 10, + TOKEN_EXTERN = 11, TOKEN_CONST = 12, TOKEN_VAR = 13, TOKEN_ARRAY = 14, TOKEN_OF = 15, + TOKEN_TYPE = 16, TOKEN_RECORD = 17, TOKEN_UNION = 18, TOKEN_POINTER = 19, TOKEN_TO = 20, + TOKEN_BOOLEAN = 21, TOKEN_NIL = 22, TOKEN_AND = 23, TOKEN_OR = 24, TOKEN_NOT = 25, + TOKEN_RETURN = 26, TOKEN_CAST = 27, TOKEN_AS = 28, TOKEN_SIZEOF = 29, + TOKEN_LEFT_PAREN = 30, TOKEN_RIGHT_PAREN = 31, TOKEN_LEFT_SQUARE = 32, + TOKEN_RIGHT_SQUARE = 33, TOKEN_GREATER_EQUAL = 34, TOKEN_LESS_EQUAL = 35, + TOKEN_GREATER_THAN = 36, TOKEN_LESS_THAN = 37, TOKEN_NOT_EQUAL = 38, TOKEN_EQUAL = 39, + TOKEN_SEMICOLON = 40, TOKEN_DOT = 41, TOKEN_COMMA = 42, + TOKEN_PLUS = 43, TOKEN_MINUS = 44, TOKEN_MULTIPLICATION = 45, TOKEN_DIVISION = 46, + TOKEN_REMAINDER = 47, TOKEN_ASSIGNMENT = 48, TOKEN_COLON = 49, TOKEN_HAT = 50, + TOKEN_AT = 51; --- --- External procedures. --- +(* + External procedures. +*) proc fopen(pathname: String, mode: String): pointer to FILE; extern; proc fclose(stream: pointer to FILE): Int; extern; proc fseek(stream: pointer to FILE, off: Int, whence: Int): Int; extern; @@ -41,9 +53,9 @@ proc strlen(ptr: pointer to Char): Word; extern; proc exit(code: Int); extern; --- --- Standard procedures. --- +(* + Standard procedures. +*) proc write_s(value: String); begin write(0, value, strlen(value)) @@ -71,7 +83,7 @@ begin n := 9; buffer[9] := '0'; - while value /= 0 do + while value <> 0 do digit := value % 10; value := value / 10; @@ -109,9 +121,9 @@ begin return c = ' ' or c = '\n' end; --- --- End of standard procedures. --- +(* + End of standard procedures. +*) proc test_record(); var r: Token; @@ -119,9 +131,9 @@ begin write_s("\nTest record:\n"); r.kind := 4; - r.value.intValue := 8; + r.value.int_value := 8; - write_i(r.value.intValue) + write_i(r.value.int_value) end; proc test_primitive(); @@ -173,58 +185,11 @@ begin return input end; -proc compile(); +proc print_tokens(tokens: pointer to Token, tokens_size: Int); var - input: pointer to Char, - input_pointer: pointer to Char, - token_end: pointer to Char, - token_length: Int, - tokens: pointer to Token, current_token: pointer to Token, - tokens_size: Int, i: Int; begin - tokens_size := 0; - tokens := cast(0 as pointer to Token); - - input := read_source("example.elna"); - - input_pointer := skip_spaces(input); - - while input_pointer^ /= '\0' do - if is_alpha(input_pointer^) or input_pointer^ = '_' then - token_end := lex_identifier(input_pointer + 1); - token_length := cast(token_end as Int) - cast(input_pointer as Int); - - tokens := cast(realloc(tokens, tokens_size + sizeof(Token)) as pointer to Token); - current_token := tokens + tokens_size; - - if strncmp("if", input_pointer, token_length) = 0 then - current_token^.kind := TOKEN_IF - elsif strncmp("then", input_pointer, token_length) = 0 then - current_token^.kind := TOKEN_THEN - elsif strncmp("else", input_pointer, token_length) = 0 then - current_token^.kind := TOKEN_ELSE - elsif strncmp("elsif", input_pointer, token_length) = 0 then - current_token^.kind := TOKEN_ELSIF - elsif strncmp("while", input_pointer, token_length) = 0 then - current_token^.kind := TOKEN_WHILE - elsif strncmp("do", input_pointer, token_length) = 0 then - current_token^.kind := TOKEN_DO - else - current_token^.kind := TOKEN_IDENTIFIER; - current_token^.value.stringValue := cast(calloc(token_length + 1, 1) as pointer to Char); - strncpy(current_token^.value.stringValue, input_pointer, token_length) - end; - - tokens_size := tokens_size + sizeof(Token); - - input_pointer := token_end - else - input_pointer := input_pointer + 1 - end - end; - i := 0; while i < tokens_size do current_token := tokens + i; @@ -241,10 +206,100 @@ begin write_s("WHILE") elsif current_token^.kind = TOKEN_DO then write_s("DO") - elsif current_token^.kind = TOKEN_IDENTIFIER then - write_s("IDENTIFIER<"); - write_s(current_token^.value.stringValue); + elsif current_token^.kind = TOKEN_PROC then + write_s("PROC") + elsif current_token^.kind = TOKEN_BEGIN then + write_s("BEGIN") + elsif current_token^.kind = TOKEN_END then + write_s("END") + elsif current_token^.kind = TOKEN_EXTERN then + write_s("EXTERN") + elsif current_token^.kind = TOKEN_CONST then + write_s("CONST") + elsif current_token^.kind = TOKEN_VAR then + write_s("VAR") + elsif current_token^.kind = TOKEN_ARRAY then + write_s("ARRAY") + elsif current_token^.kind = TOKEN_OF then + write_s("OF") + elsif current_token^.kind = TOKEN_TYPE then + write_s("TYPE") + elsif current_token^.kind = TOKEN_RECORD then + write_s("RECORD") + elsif current_token^.kind = TOKEN_UNION then + write_s("UNION") + elsif current_token^.kind = TOKEN_POINTER then + write_s("POINTER") + elsif current_token^.kind = TOKEN_TO then + write_s("TO") + elsif current_token^.kind = TOKEN_BOOLEAN then + write_s("BOOLEAN<"); + write_b(current_token^.value.boolean_value); write_c('>') + elsif current_token^.kind = TOKEN_NIL then + write_s("NIL") + elsif current_token^.kind = TOKEN_AND then + write_s("AND") + elsif current_token^.kind = TOKEN_OR then + write_s("OR") + elsif current_token^.kind = TOKEN_NOT then + write_s("NOT") + elsif current_token^.kind = TOKEN_RETURN then + write_s("RETURN") + elsif current_token^.kind = TOKEN_CAST then + write_s("CAST") + elsif current_token^.kind = TOKEN_AS then + write_s("AS") + elsif current_token^.kind = TOKEN_SIZEOF then + write_s("SIZEOF") + elsif current_token^.kind = TOKEN_IDENTIFIER then + write_c('<'); + write_s(current_token^.value.string_value); + write_c('>') + elsif current_token^.kind = TOKEN_LEFT_PAREN then + write_s("(") + elsif current_token^.kind = TOKEN_RIGHT_PAREN then + write_s(")") + elsif current_token^.kind = TOKEN_LEFT_SQUARE then + write_s("[") + elsif current_token^.kind = TOKEN_RIGHT_SQUARE then + write_s("]") + elsif current_token^.kind = TOKEN_GREATER_EQUAL then + write_s(">=") + elsif current_token^.kind = TOKEN_LESS_EQUAL then + write_s("<=") + elsif current_token^.kind = TOKEN_GREATER_THAN then + write_s(">") + elsif current_token^.kind = TOKEN_LESS_THAN then + write_s("<") + elsif current_token^.kind = TOKEN_EQUAL then + write_s("=") + elsif current_token^.kind = TOKEN_NOT_EQUAL then + write_s("<>") + elsif current_token^.kind = TOKEN_SEMICOLON then + write_s(";") + elsif current_token^.kind = TOKEN_DOT then + write_s(".") + elsif current_token^.kind = TOKEN_COMMA then + write_s(",") + elsif current_token^.kind = TOKEN_PLUS then + write_s("+") + elsif current_token^.kind = TOKEN_MINUS then + write_s("-") + elsif current_token^.kind = TOKEN_MULTIPLICATION then + write_s("*") + elsif current_token^.kind = TOKEN_DIVISION then + write_s("/") + elsif current_token^.kind = TOKEN_REMAINDER then + write_s("%") + elsif current_token^.kind = TOKEN_ASSIGNMENT then + write_s(":=") + elsif current_token^.kind = TOKEN_COLON then + write_s(":") + elsif current_token^.kind = TOKEN_HAT then + write_s("^") + elsif current_token^.kind = TOKEN_AT then + write_s("@") else write_s("UNKNOWN<"); write_i(current_token^.kind); @@ -253,8 +308,193 @@ begin write_c(' '); i := i + sizeof(Token) + end +end; + +proc categorize_identifier(input_pointer: pointer to Char, token_length: Int): Token; +var + current_token: Token; +begin + if strncmp("if", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_IF + elsif strncmp("then", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_THEN + elsif strncmp("else", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_ELSE + elsif strncmp("elsif", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_ELSIF + elsif strncmp("while", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_WHILE + elsif strncmp("do", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_DO + elsif strncmp("proc", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_PROC + elsif strncmp("begin", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_BEGIN + elsif strncmp("end", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_END + elsif strncmp("extern", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_EXTERN + elsif strncmp("const", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_CONST + elsif strncmp("var", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_VAR + elsif strncmp("array", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_ARRAY + elsif strncmp("of", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_OF + elsif strncmp("type", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_TYPE + elsif strncmp("record", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_RECORD + elsif strncmp("union", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_UNION + elsif strncmp("pointer", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_POINTER + elsif strncmp("to", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_TO + elsif strncmp("true", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_BOOLEAN; + current_token.value.boolean_value := true + elsif strncmp("false", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_BOOLEAN; + current_token.value.boolean_value := false + elsif strncmp("nil", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_NIL + elsif strncmp("and", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_AND + elsif strncmp("or", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_OR + elsif strncmp("not", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_NOT + elsif strncmp("return", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_RETURN + elsif strncmp("cast", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_CAST + elsif strncmp("as", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_AS + elsif strncmp("sizeof", input_pointer, token_length) = 0 then + current_token.kind := TOKEN_SIZEOF + else + current_token.kind := TOKEN_IDENTIFIER; + current_token.value.string_value := cast(calloc(token_length + 1, 1) as pointer to Char); + strncpy(current_token.value.string_value, input_pointer, token_length) end; + return current_token +end; + +proc compile(); +var + input: pointer to Char, + input_pointer: pointer to Char, + token_end: pointer to Char, + token_length: Int, + tokens: pointer to Token, + current_token: pointer to Token, + tokens_size: Int; +begin + tokens_size := 0; + tokens := cast(nil as pointer to Token); + + input := read_source("example.elna"); + + input_pointer := skip_spaces(input); + + while input_pointer^ <> '\0' do + tokens := cast(realloc(tokens, tokens_size + sizeof(Token)) as pointer to Token); + current_token := tokens + tokens_size; + + if is_alpha(input_pointer^) or input_pointer^ = '_' then + token_end := lex_identifier(input_pointer + 1); + token_length := cast(token_end as Int) - cast(input_pointer as Int); + + current_token^ := categorize_identifier(input_pointer, token_length); + + input_pointer := token_end + elsif input_pointer^ = '(' then + current_token^.kind := TOKEN_LEFT_PAREN; + input_pointer := input_pointer + 1 + elsif input_pointer^ = ')' then + current_token^.kind := TOKEN_RIGHT_PAREN; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '[' then + current_token^.kind := TOKEN_LEFT_SQUARE; + input_pointer := input_pointer + 1 + elsif input_pointer^ = ']' then + current_token^.kind := TOKEN_RIGHT_SQUARE; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '>' then + input_pointer := input_pointer + 1; + if input_pointer^ = '=' then + current_token^.kind := TOKEN_GREATER_EQUAL; + input_pointer := input_pointer + 1 + else + current_token^.kind := TOKEN_GREATER_THAN + end + elsif input_pointer^ = '<' then + input_pointer := input_pointer + 1; + if input_pointer^ = '=' then + current_token^.kind := TOKEN_LESS_EQUAL; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '>' then + current_token^.kind := TOKEN_NOT_EQUAL; + input_pointer := input_pointer + 1 + else + current_token^.kind := TOKEN_LESS_THAN + end + elsif input_pointer^ = '=' then + current_token^.kind := TOKEN_EQUAL; + input_pointer := input_pointer + 1 + elsif input_pointer^ = ';' then + current_token^.kind := TOKEN_SEMICOLON; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '.' then + current_token^.kind := TOKEN_DOT; + input_pointer := input_pointer + 1 + elsif input_pointer^ = ',' then + current_token^.kind := TOKEN_COMMA; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '+' then + current_token^.kind := TOKEN_PLUS; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '-' then + current_token^.kind := TOKEN_MINUS; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '*' then + current_token^.kind := TOKEN_MULTIPLICATION; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '/' then + current_token^.kind := TOKEN_DIVISION; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '%' then + current_token^.kind := TOKEN_REMAINDER; + input_pointer := input_pointer + 1 + elsif input_pointer^ = ':' then + input_pointer := input_pointer + 1; + if input_pointer^ = '=' then + current_token^.kind := TOKEN_ASSIGNMENT; + input_pointer := input_pointer + 1 + else + current_token^.kind := TOKEN_COLON + end + elsif input_pointer^ = '^' then + current_token^.kind := TOKEN_HAT; + input_pointer := input_pointer + 1 + elsif input_pointer^ = '@' then + current_token^.kind := TOKEN_AT; + input_pointer := input_pointer + 1 + else + current_token^.kind := 0; + input_pointer := input_pointer + 1 + end; + + if current_token^.kind <> 0 then + tokens_size := tokens_size + sizeof(Token) + end + end; + print_tokens(tokens, tokens_size); + free(input) end; diff --git a/gcc/elna-generic.cc b/gcc/elna-generic.cc index ca07c1c..7e10b1b 100644 --- a/gcc/elna-generic.cc +++ b/gcc/elna-generic.cc @@ -235,6 +235,11 @@ namespace gcc this->current_expression = build_int_cstu(elna_char_type_node, character->number()); } + void generic_visitor::visit(source::number_literal *) + { + this->current_expression = null_pointer_node; + } + void generic_visitor::visit(source::string_literal *string) { this->current_expression = build_string_literal(string->string().size() + 1, string->string().c_str()); diff --git a/include/elna/gcc/elna-generic.h b/include/elna/gcc/elna-generic.h index b994f3c..0ce0308 100644 --- a/include/elna/gcc/elna-generic.h +++ b/include/elna/gcc/elna-generic.h @@ -49,6 +49,7 @@ namespace gcc void visit(source::number_literal *literal) override; void visit(source::number_literal *boolean) override; void visit(source::number_literal *character) override; + void visit(source::number_literal *) override; void visit(source::string_literal *string) override; void visit(source::binary_expression *expression) override; void visit(source::unary_expression *expression) override; diff --git a/include/elna/source/ast.h b/include/elna/source/ast.h index c31e768..bfb1ee7 100644 --- a/include/elna/source/ast.h +++ b/include/elna/source/ast.h @@ -63,7 +63,6 @@ namespace source class dereference_expression; template class number_literal; - class char_literal; class string_literal; /** @@ -101,6 +100,7 @@ namespace source virtual void visit(number_literal *) = 0; virtual void visit(number_literal *) = 0; virtual void visit(number_literal *) = 0; + virtual void visit(number_literal *) = 0; virtual void visit(string_literal *) = 0; }; @@ -139,6 +139,7 @@ namespace source virtual void visit(number_literal *) override; virtual void visit(number_literal *) override; virtual void visit(number_literal *) override; + virtual void visit(number_literal *) override; virtual void visit(string_literal *) override; }; diff --git a/source/ast.cc b/source/ast.cc index ff6e6b7..5395ee3 100644 --- a/source/ast.cc +++ b/source/ast.cc @@ -192,6 +192,10 @@ namespace source { } + void empty_visitor::visit(number_literal *) + { + } + void empty_visitor::visit(string_literal *) { } diff --git a/source/lexer.ll b/source/lexer.ll index 8cf9fcf..02cf0a5 100644 --- a/source/lexer.ll +++ b/source/lexer.ll @@ -18,15 +18,24 @@ %option c++ noyywrap never-interactive %option yyclass="elna::source::lexer" +%x IN_COMMENT + %% %{ this->location.step(); %} -\-\-.* { - /* Skip the comment */ +{ +\*\) BEGIN(INITIAL); +[^*\n]+ ; /* Eat comment in chunks. */ +\* ; /* Eat the lone star. */ +\n+ { + this->location.lines(yyleng); + this->location.step(); } -[\ \t\r] ; /* Skip the whitespaces */ +} +\(\* BEGIN(IN_COMMENT); +[\ \t\r] ; /* Skip the whitespaces. */ \n+ { this->location.lines(yyleng); this->location.step(); @@ -94,6 +103,9 @@ true { false { return yy::parser::make_BOOLEAN(false, this->location); } +nil { + return yy::parser::make_NIL(this->location); + } and { return yy::parser::make_AND(this->location); } @@ -222,7 +234,7 @@ sizeof { \< { return yy::parser::make_LESS_THAN(this->location); } -\/= { +\<\> { return yy::parser::make_NOT_EQUAL(this->location); } = { diff --git a/source/parser.yy b/source/parser.yy index e0ffa81..322fecd 100644 --- a/source/parser.yy +++ b/source/parser.yy @@ -70,7 +70,7 @@ %token AND OR NOT CAST AS SIZEOF %token GREATER_EQUAL LESS_EQUAL LESS_THAN GREATER_THAN NOT_EQUAL EQUALS %token PLUS MINUS MULTIPLICATION DIVISION REMAINDER -%token ASSIGNMENT COLON HAT AT +%token ASSIGNMENT COLON HAT AT NIL %type literal; %type constant_definition; @@ -245,6 +245,10 @@ literal: { $$ = new elna::source::number_literal(elna::source::make_position(@1), $1.at(0)); } + | NIL + { + $$ = new elna::source::number_literal(elna::source::make_position(@1), nullptr); + } | STRING { $$ = new elna::source::string_literal(elna::source::make_position(@1), $1);