From aa3df7f4d30cb7b8ad2446b241553f68fcc18400 Mon Sep 17 00:00:00 2001 From: Eugen Wissner Date: Sat, 6 Apr 2024 16:10:07 +0200 Subject: [PATCH] Tokenize the input with flex --- CMakeLists.txt | 4 + TODO | 6 - cli/cl.cpp | 73 ++------ include/elna/cli/cl.hpp | 9 - include/elna/source/lexer.hpp | 40 +---- include/elna/source/parser.hpp | 43 +++-- source/lexer.cpp | 260 +++------------------------ source/parser.cpp | 9 +- source/scanner.l | 194 ++++++++++++++++++++ tests/failures/missing_semicolon.txt | 1 + tests/missing_semicolon.eln | 4 + 11 files changed, 274 insertions(+), 369 deletions(-) create mode 100644 source/scanner.l create mode 100644 tests/failures/missing_semicolon.txt create mode 100644 tests/missing_semicolon.eln diff --git a/CMakeLists.txt b/CMakeLists.txt index 1fadc02..fb582fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,11 +6,14 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_CXX_STANDARD 17) find_package(Boost COMPONENTS program_options REQUIRED) +find_package(FLEX) include_directories(${Boost_INCLUDE_DIR}) add_executable(tester tests/tester.cpp include/elna/tester.hpp) target_include_directories(tester PRIVATE include) +FLEX_TARGET(scanner source/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) + add_executable(elnsh shell/main.cpp shell/interactive.cpp include/elna/shell/interactive.hpp shell/history.cpp include/elna/shell/history.hpp @@ -26,6 +29,7 @@ add_executable(elna cli/main.cpp backend/riscv.cpp include/elna/backend/riscv.hpp backend/target.cpp include/elna/backend/target.hpp cli/cl.cpp include/elna/cli/cl.hpp + ${FLEX_scanner_OUTPUTS} ) target_include_directories(elna PRIVATE include) target_link_libraries(elna LINK_PUBLIC ${Boost_LIBRARIES}) diff --git a/TODO b/TODO index de7ba8f..95bfc5c 100644 --- a/TODO +++ b/TODO @@ -1,20 +1,14 @@ # Compiler -- Whitespaces are checked twice, in the source class and by lexing. - Catch exceptions thrown by the argument parser and print them normally. -- Parser should be able to collect errors. - Provide position information on parse tree nodes. -- Move constants to the symbol table, so we can check at parse time for duplicates. -- Don't pass raw pointers to the visitor methods. - While loop. - Type checking. -- Procedures. - Calculate additional stack space needed for subexpressions in the allocator visitor and not in the backend. - Support immediates greater than 12 bits. - It seems instructions are correctly encoded only if the compiler is running on a little endian architecture. -- Merge declaration and definition nodes. - Pointer. - Static array. diff --git a/cli/cl.cpp b/cli/cl.cpp index 2a9f31a..6e6ad28 100644 --- a/cli/cl.cpp +++ b/cli/cl.cpp @@ -1,41 +1,10 @@ #include "elna/cli/cl.hpp" #include "elna/backend/target.hpp" #include "elna/source/semantic.hpp" -#include -#include #include namespace elna::cli { - std::string read_source(const char *source) - { - constexpr std::size_t buffer_size = 4096; - - std::ifstream input_stream{ source, std::ios::binary | std::ios::in }; - std::string output; - if (input_stream.fail()) - { - throw std::ios_base::failure("File does not exist"); - } - while (true) - { - const std::size_t old_size = output.size(); - output.resize(old_size + buffer_size); - input_stream.read(&output[old_size], buffer_size); - - if (input_stream.eof()) - { - output.resize(old_size + input_stream.gcount()); - break; - } - else if (input_stream.fail()) - { - throw std::ios_base::failure("Unable to complete reading the source file"); - } - } - return output; - } - void print_error(const std::unique_ptr& compile_error) { std::cerr << compile_error->path().string() << ":" @@ -45,36 +14,32 @@ namespace elna::cli int compile(const std::filesystem::path& in_file, const std::filesystem::path& out_file) { - std::string source_text; try { - source_text = read_source(in_file.c_str()); + source::result lex_result = source::tokenize(in_file); + + if (lex_result.has_errors()) + { + print_errors(lex_result.errors().cbegin(), lex_result.errors().cend()); + return 1; + } + source::parser parser{ std::move(lex_result.success()) }; + auto ast = parser.parse(); + if (ast == nullptr) + { + print_errors(parser.errors().cbegin(), parser.errors().cend()); + return 2; + } + auto global_scope = std::make_shared(); + source::name_analysis_visitor(global_scope).visit(ast.get()); + source::type_analysis_visitor().visit(ast.get()); + source::allocator_visitor(global_scope).visit(ast.get()); + riscv::riscv32_elf(ast.get(), global_scope, out_file); } catch (std::ios_base::failure&) { return 3; } - size_t tokensCount{ 0 }; - auto lex_result = source::lex(source_text, in_file); - - if (lex_result.has_errors()) - { - print_errors(lex_result.errors().cbegin(), lex_result.errors().cend()); - return 1; - } - source::parser parser{ std::move(lex_result.success()) }; - auto ast = parser.parse(); - if (ast == nullptr) - { - print_errors(parser.errors().cbegin(), parser.errors().cend()); - return 2; - } - auto global_scope = std::make_shared(); - source::name_analysis_visitor(global_scope).visit(ast.get()); - source::type_analysis_visitor().visit(ast.get()); - source::allocator_visitor(global_scope).visit(ast.get()); - riscv::riscv32_elf(ast.get(), global_scope, out_file); - return 0; } } diff --git a/include/elna/cli/cl.hpp b/include/elna/cli/cl.hpp index e98672c..89492e2 100644 --- a/include/elna/cli/cl.hpp +++ b/include/elna/cli/cl.hpp @@ -6,15 +6,6 @@ namespace elna::cli { - /** - * Reads an input file and returns its contents. - * - * \param source Input file. - * - * \return File contents. - */ - std::string read_source(const char *source); - /** * Formats and prints the given error. * diff --git a/include/elna/source/lexer.hpp b/include/elna/source/lexer.hpp index 44f8294..ac8197c 100644 --- a/include/elna/source/lexer.hpp +++ b/include/elna/source/lexer.hpp @@ -9,40 +9,6 @@ namespace elna::source { - /** - * Range over the source text that keeps track of the current position. - */ - class text_iterator - { - std::string::const_iterator m_buffer; - elna::source::position m_position; - - text_iterator(std::string::const_iterator buffer, - const elna::source::position start_position = elna::source::position()); - - public: - using iterator_category = std::forward_iterator_tag; - using difference_type = ptrdiff_t; - using value_type = char; - using pointer = const value_type *; - using reference = const value_type&; - - const elna::source::position& position() const noexcept; - - reference operator*() const noexcept; - pointer operator->() const noexcept; - text_iterator& operator++(); - text_iterator& operator++(int); - bool operator==(const text_iterator& that) const noexcept; - bool operator!=(const text_iterator& that) const noexcept; - text_iterator operator+(std::size_t step); - - friend std::pair text_iterators(const std::string& buffer); - }; - - std::pair - text_iterators(const std::string &buffer); - /** * Union type representing a single token. */ @@ -53,6 +19,7 @@ namespace elna::source */ enum class type : std::uint16_t { + dot, number, boolean, term_operator, @@ -63,7 +30,6 @@ namespace elna::source semicolon, left_paren, right_paren, - dot, comma, factor_operator, eof, @@ -97,6 +63,7 @@ namespace elna::source token(type of, elna::source::position position); token(type of, std::int32_t value, const elna::source::position position); token(type of, const std::string& value, const elna::source::position position); + token(type of, value&& value, const elna::source::position position); token(const token& that); token(token&& that); ~token(); @@ -236,9 +203,8 @@ namespace elna::source /** * Splits the source text into tokens. * - * \param buffer Source text. * \param path Source file location. * \return Tokens or error. */ - elna::source::result lex(const std::string& buffer, const std::filesystem::path& path); + elna::source::result tokenize(const std::filesystem::path& path); } diff --git a/include/elna/source/parser.hpp b/include/elna/source/parser.hpp index dd65431..1adc8eb 100644 --- a/include/elna/source/parser.hpp +++ b/include/elna/source/parser.hpp @@ -89,35 +89,44 @@ namespace elna::source }; /** - * Variable declaration. - */ - class declaration : public node - { - std::string m_identifier; - std::string m_type; - - public: - declaration(const std::string& identifier, const std::string& type); - virtual void accept(parser_visitor *visitor) override; - - std::string& identifier() noexcept; - std::string& type() noexcept; - }; - - /** - * Constant definition. + * Symbol definition. */ class definition : public node { std::string m_identifier; protected: + /** + * Constructs a definition identified by some name. + * + * \param identifier Definition name. + */ definition(const std::string& identifier); public: + /** + * \return Definition name. + */ std::string& identifier() noexcept; }; + /** + * Variable declaration. + */ + class declaration : public definition + { + std::string m_type; + + public: + declaration(const std::string& identifier, const std::string& type); + virtual void accept(parser_visitor *visitor) override; + + std::string& type() noexcept; + }; + + /** + * Constant definition. + */ class constant_definition : public definition { std::unique_ptr m_body; diff --git a/source/lexer.cpp b/source/lexer.cpp index 82370b0..cc414de 100644 --- a/source/lexer.cpp +++ b/source/lexer.cpp @@ -1,79 +1,12 @@ #include "elna/source/lexer.hpp" #include #include -#include namespace elna::source { using source_position = elna::source::position; using source_error = elna::source::error; - std::pair text_iterators(const std::string &buffer) - { - return std::make_pair<>(text_iterator(std::cbegin(buffer)), - text_iterator(std::cend(buffer), position{0, 0})); - } - - text_iterator::text_iterator(std::string::const_iterator buffer, - const source_position start_position) - : m_buffer(buffer), m_position(start_position) - { - } - - const source_position& text_iterator::position() const noexcept - { - return this->m_position; - } - - text_iterator::reference text_iterator::operator*() const noexcept - { - return *m_buffer; - } - - text_iterator::pointer text_iterator::operator->() const noexcept - { - return m_buffer.base(); - } - - text_iterator& text_iterator::operator++() - { - if (*this->m_buffer == '\n') - { - this->m_position.column = 1; - ++this->m_position.line; - } - else - { - ++this->m_position.column; - } - std::advance(this->m_buffer, 1); - - return *this; - } - - text_iterator& text_iterator::operator++(int) - { - auto tmp = *this; - ++(*this); - return *this; - } - - text_iterator text_iterator::operator+(std::size_t step) - { - auto result = *this; - return ++result; - } - - bool text_iterator::operator==(const text_iterator& that) const noexcept - { - return this->m_buffer == that.m_buffer; - } - - bool text_iterator::operator!=(const text_iterator& that) const noexcept - { - return !(*this == that); - } - token::value::value() : nil(nullptr) { @@ -103,6 +36,23 @@ namespace elna::source { } + token::token(type of, value&& value, const elna::source::position position) + : m_type(of), m_position(position) + { + if (has_identifier()) + { + new((void *) &m_value.identifier) std::string(std::move(value.identifier)); + } + else if (is_numeric()) + { + m_value.number = value.number; + } + else + { + m_value.nil = nullptr; + } + } + token::token(const type of, source_position position) : m_type(of), m_position(position) { @@ -278,11 +228,12 @@ namespace elna::source std::string unexpected_character::what() const { - std::stringstream ss{ "Unexpected character '" }; + std::string ss{ "Unexpected character '" }; - ss << character << "'"; + ss.insert(ss.cend(), character.cbegin(), character.cend()); + ss.push_back('\''); - return ss.str(); + return ss; } unexpected_token::unexpected_token(const token& token, const std::filesystem::path& path) @@ -371,173 +322,4 @@ namespace elna::source { return m_errors; } - - result lex(const std::string& buffer, const std::filesystem::path& path) - { - std::vector tokens; - auto [iterator, text_end] = text_iterators(buffer); - - while (iterator != text_end) - { - if (*iterator == ' ' || *iterator == '\n') - { - } - else if (std::isdigit(*iterator)) - { - tokens.emplace_back( - token::type::number, - static_cast(*iterator - '0'), - iterator.position() - ); - } - else if (*iterator == '=') - { - tokens.emplace_back(token::type::equals, iterator.position()); - } - else if (*iterator == '(') - { - tokens.emplace_back(token::type::left_paren, iterator.position()); - } - else if (*iterator == ')') - { - tokens.emplace_back(token::type::right_paren, iterator.position()); - } - else if (*iterator == ';') - { - tokens.emplace_back(token::type::semicolon, iterator.position()); - } - else if (*iterator == ',') - { - tokens.emplace_back(token::type::comma, iterator.position()); - } - else if (*iterator == '.') - { - tokens.emplace_back(token::type::dot, iterator.position()); - } - else if (std::isalpha(*iterator)) - { - std::string word; - auto i = iterator; - while (i != text_end && std::isalpha(*i)) - { - word.push_back(*i); - ++i; - } - if (word == "const") - { - tokens.emplace_back(token::type::let, iterator.position()); - } - else if (word == "var") - { - tokens.emplace_back(token::type::var, iterator.position()); - } - else if (word == "begin") - { - tokens.emplace_back(token::type::begin, iterator.position()); - } - else if (word == "end") - { - tokens.emplace_back(token::type::end, iterator.position()); - } - else if (word == "if") - { - tokens.emplace_back(token::type::when, iterator.position()); - } - else if (word == "then") - { - tokens.emplace_back(token::type::then, iterator.position()); - } - else if (word == "while") - { - tokens.emplace_back(token::type::loop, iterator.position()); - } - else if (word == "do") - { - tokens.emplace_back(token::type::_do, iterator.position()); - } - else if (word == "True") - { - tokens.emplace_back(token::type::boolean, 1, iterator.position()); - } - else if (word == "False") - { - tokens.emplace_back(token::type::boolean, 0, iterator.position()); - } - else if (word == "proc") - { - tokens.emplace_back(token::type::procedure, 0, iterator.position()); - } - else - { - tokens.emplace_back(token::type::identifier, word.c_str(), iterator.position()); - } - iterator = i; - continue; - } - else if (*iterator == '+' || *iterator == '-') - { - std::string _operator{ *iterator }; - - tokens.emplace_back(token::type::term_operator, _operator.c_str(), iterator.position()); - } - else if (*iterator == '/' && iterator + 1 != text_end && *(iterator + 1) == '=') - { - tokens.emplace_back(token::type::comparison_operator, "n", iterator.position()); - ++iterator; - } - else if (*iterator == '*' || *iterator == '/') - { - std::string _operator{ *iterator }; - - tokens.emplace_back(token::type::factor_operator, _operator.c_str(), iterator.position()); - } - else if (*iterator == '<') - { - std::string _operator; - auto operator_position = iterator.position(); - - if (iterator + 1 == text_end || *(iterator + 1) != '=') - { - _operator.push_back(*iterator); - } - else - { - ++iterator; - _operator.push_back('l'); - } - tokens.emplace_back(token::type::comparison_operator, _operator.c_str(), operator_position); - } - else if (*iterator == '>') - { - std::string _operator; - auto operator_position = iterator.position(); - - if (iterator + 1 == text_end || *(iterator + 1) != '=') - { - _operator.push_back(*iterator); - } - else - { - ++iterator; - _operator.push_back('g'); - } - tokens.emplace_back(token::type::comparison_operator, _operator.c_str(), operator_position); - } - else if (*iterator == ':' && iterator + 1 != text_end && *(iterator + 1) == '=') - { - tokens.emplace_back(token::type::assignment, iterator.position()); - ++iterator; - } - else if (*iterator == ':') - { - tokens.emplace_back(token::type::colon, iterator.position()); - } - else - { - return result(unexpected_character{ std::string{ *iterator }, path, iterator.position() }); - } - ++iterator; - } - return result(std::in_place, std::move(tokens), iterator.position(), path); - } } diff --git a/source/parser.cpp b/source/parser.cpp index a59c800..3171cb3 100644 --- a/source/parser.cpp +++ b/source/parser.cpp @@ -95,7 +95,7 @@ namespace elna::source } declaration::declaration(const std::string& identifier, const std::string& type) - : m_identifier(identifier), m_type(type) + : definition(identifier), m_type(type) { } @@ -104,11 +104,6 @@ namespace elna::source visitor->visit(this); } - std::string& declaration::identifier() noexcept - { - return m_identifier; - } - std::string& declaration::type() noexcept { return m_type; @@ -710,7 +705,7 @@ namespace elna::source else { iterator.add_error(*iterator); - break; + return nullptr; } } diff --git a/source/scanner.l b/source/scanner.l new file mode 100644 index 0000000..140f8a4 --- /dev/null +++ b/source/scanner.l @@ -0,0 +1,194 @@ +%{ +#define YY_NO_UNISTD_H +#define YY_USER_ACTION token_position = elna::source::position{ line_no, column_no }; column_no += yyleng; + +#include +#include "elna/source/lexer.hpp" + +elna::source::token::value yylval{}; +elna::source::position token_position{}; +static std::size_t column_no = 1; +static std::size_t line_no = 1; +%} + +%option noyywrap +%option never-interactive +%% +\-\-.* { + /* Skip the comment */ + } +[\ \t\r] { + /* Skip the whitespaces */ + } +\n { + ++line_no; + column_no = 1; + } +if { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::when); + } +then { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::then); + } +while { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::loop); + } +do { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::_do); + } +proc { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::procedure); + } +begin { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::begin); + } +end { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::end); + } +const { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::let); + } +var { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::var); + } +True { + yylval.number = 1; + + return static_cast(elna::source::token::type::boolean); + } +False { + yylval.number = 0; + + return static_cast(elna::source::token::type::boolean); + } +[A-Za-z_][A-Za-z0-9_]* { + new((void *) &yylval.identifier) std::string(yytext); + + return static_cast(elna::source::token::type::identifier); + } +[0-9]+ { + yylval.number = strtol(yytext, NULL, 10); + + return static_cast(elna::source::token::type::number); + } +\( { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::left_paren); + } +\) { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::right_paren); + } +\>= { + new((void *) &yylval.identifier) std::string(1, 'g'); + + return static_cast(elna::source::token::type::comparison_operator); + } +\<= { + new((void *) &yylval.identifier) std::string(1, 'l'); + + return static_cast(elna::source::token::type::comparison_operator); + } +(>|<) { + new((void *) &yylval.identifier) std::string(yytext); + + return static_cast(elna::source::token::type::comparison_operator); + } +\/= { + new((void *) &yylval.identifier) std::string(1, 'n'); + + return static_cast(elna::source::token::type::comparison_operator); + } += { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::equals); + } +; { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::semicolon); + } +\. { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::dot); + } +, { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::comma); + } +(\+|\-) { + new((void *) &yylval.identifier) std::string(yytext); + + return static_cast(elna::source::token::type::term_operator); + } +(\*|\/) { + new((void *) &yylval.identifier) std::string(yytext); + + return static_cast(elna::source::token::type::factor_operator); + } +:= { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::assignment); + } +: { + yylval.nil = nullptr; + + return static_cast(elna::source::token::type::colon); + } +. { + return -1; + } +%% +namespace elna::source +{ + +result tokenize(const std::filesystem::path& path) +{ + int yytoken; + std::vector tokens; + + yyin = fopen(path.c_str(), "rb"); + if (yyin == nullptr) + { + throw std::ios_base::failure("File does not exist"); + } + do + { + yytoken = yylex(); + + if (yytoken < 0) + { + return result(unexpected_character{ std::string{ yytext[0] }, path, token_position }); + } + tokens.emplace_back(static_cast(yytoken), std::move(yylval), token_position); + } + while (yytoken != 0); + + return result(std::in_place, std::move(tokens), position{ line_no, column_no }, path); +} + +} diff --git a/tests/failures/missing_semicolon.txt b/tests/failures/missing_semicolon.txt new file mode 100644 index 0000000..00953c6 --- /dev/null +++ b/tests/failures/missing_semicolon.txt @@ -0,0 +1 @@ +tests/missing_semicolon.eln:3:3: Unexpected token «identifier» diff --git a/tests/missing_semicolon.eln b/tests/missing_semicolon.eln new file mode 100644 index 0000000..02cd556 --- /dev/null +++ b/tests/missing_semicolon.eln @@ -0,0 +1,4 @@ +begin + writei(1) + writei(2) +end.