Tokenize the input with flex

This commit is contained in:
Eugen Wissner 2024-04-06 16:10:07 +02:00
parent 4251c361c7
commit aa3df7f4d3
Signed by: belka
GPG Key ID: A27FDC1E8EE902C0
11 changed files with 274 additions and 369 deletions

View File

@ -6,11 +6,14 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
find_package(Boost COMPONENTS program_options REQUIRED) find_package(Boost COMPONENTS program_options REQUIRED)
find_package(FLEX)
include_directories(${Boost_INCLUDE_DIR}) include_directories(${Boost_INCLUDE_DIR})
add_executable(tester tests/tester.cpp include/elna/tester.hpp) add_executable(tester tests/tester.cpp include/elna/tester.hpp)
target_include_directories(tester PRIVATE include) target_include_directories(tester PRIVATE include)
FLEX_TARGET(scanner source/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp)
add_executable(elnsh shell/main.cpp add_executable(elnsh shell/main.cpp
shell/interactive.cpp include/elna/shell/interactive.hpp shell/interactive.cpp include/elna/shell/interactive.hpp
shell/history.cpp include/elna/shell/history.hpp shell/history.cpp include/elna/shell/history.hpp
@ -26,6 +29,7 @@ add_executable(elna cli/main.cpp
backend/riscv.cpp include/elna/backend/riscv.hpp backend/riscv.cpp include/elna/backend/riscv.hpp
backend/target.cpp include/elna/backend/target.hpp backend/target.cpp include/elna/backend/target.hpp
cli/cl.cpp include/elna/cli/cl.hpp cli/cl.cpp include/elna/cli/cl.hpp
${FLEX_scanner_OUTPUTS}
) )
target_include_directories(elna PRIVATE include) target_include_directories(elna PRIVATE include)
target_link_libraries(elna LINK_PUBLIC ${Boost_LIBRARIES}) target_link_libraries(elna LINK_PUBLIC ${Boost_LIBRARIES})

6
TODO
View File

@ -1,20 +1,14 @@
# Compiler # Compiler
- Whitespaces are checked twice, in the source class and by lexing.
- Catch exceptions thrown by the argument parser and print them normally. - Catch exceptions thrown by the argument parser and print them normally.
- Parser should be able to collect errors.
- Provide position information on parse tree nodes. - Provide position information on parse tree nodes.
- Move constants to the symbol table, so we can check at parse time for duplicates.
- Don't pass raw pointers to the visitor methods.
- While loop. - While loop.
- Type checking. - Type checking.
- Procedures.
- Calculate additional stack space needed for subexpressions in the allocator - Calculate additional stack space needed for subexpressions in the allocator
visitor and not in the backend. visitor and not in the backend.
- Support immediates greater than 12 bits. - Support immediates greater than 12 bits.
- It seems instructions are correctly encoded only if the compiler is running - It seems instructions are correctly encoded only if the compiler is running
on a little endian architecture. on a little endian architecture.
- Merge declaration and definition nodes.
- Pointer. - Pointer.
- Static array. - Static array.

View File

@ -1,41 +1,10 @@
#include "elna/cli/cl.hpp" #include "elna/cli/cl.hpp"
#include "elna/backend/target.hpp" #include "elna/backend/target.hpp"
#include "elna/source/semantic.hpp" #include "elna/source/semantic.hpp"
#include <cstddef>
#include <fstream>
#include <iostream> #include <iostream>
namespace elna::cli namespace elna::cli
{ {
/**
 * Reads an input file and returns its contents.
 *
 * The file is opened in binary mode and consumed in fixed-size chunks that
 * are appended to the result.
 *
 * \param source Input file.
 *
 * \return File contents.
 */
std::string read_source(const char *source)
{
    constexpr std::size_t chunk_size = 4096;
    std::ifstream file{ source, std::ios::binary | std::ios::in };

    if (!file)
    {
        throw std::ios_base::failure("File does not exist");
    }
    std::string contents;

    for (;;)
    {
        // Grow the string by one chunk and read straight into the new tail.
        const std::size_t filled = contents.size();

        contents.resize(filled + chunk_size);
        file.read(&contents[filled], chunk_size);

        // End of file is tested first: a short final read sets both eofbit
        // and failbit, and must not be reported as an error.
        if (file.eof())
        {
            // Drop the part of the last chunk that was not filled.
            contents.resize(filled + file.gcount());
            return contents;
        }
        if (file.fail())
        {
            throw std::ios_base::failure("Unable to complete reading the source file");
        }
    }
}
void print_error(const std::unique_ptr<source::error>& compile_error) void print_error(const std::unique_ptr<source::error>& compile_error)
{ {
std::cerr << compile_error->path().string() << ":" std::cerr << compile_error->path().string() << ":"
@ -45,17 +14,9 @@ namespace elna::cli
int compile(const std::filesystem::path& in_file, const std::filesystem::path& out_file) int compile(const std::filesystem::path& in_file, const std::filesystem::path& out_file)
{ {
std::string source_text;
try try
{ {
source_text = read_source(in_file.c_str()); source::result<source::lexer> lex_result = source::tokenize(in_file);
}
catch (std::ios_base::failure&)
{
return 3;
}
size_t tokensCount{ 0 };
auto lex_result = source::lex(source_text, in_file);
if (lex_result.has_errors()) if (lex_result.has_errors())
{ {
@ -74,7 +35,11 @@ namespace elna::cli
source::type_analysis_visitor().visit(ast.get()); source::type_analysis_visitor().visit(ast.get());
source::allocator_visitor(global_scope).visit(ast.get()); source::allocator_visitor(global_scope).visit(ast.get());
riscv::riscv32_elf(ast.get(), global_scope, out_file); riscv::riscv32_elf(ast.get(), global_scope, out_file);
}
catch (std::ios_base::failure&)
{
return 3;
}
return 0; return 0;
} }
} }

View File

@ -6,15 +6,6 @@
namespace elna::cli namespace elna::cli
{ {
/**
* Reads an input file and returns its contents.
*
* \param source Input file.
*
* \return File contents.
*/
std::string read_source(const char *source);
/** /**
* Formats and prints the given error. * Formats and prints the given error.
* *

View File

@ -9,40 +9,6 @@
namespace elna::source namespace elna::source
{ {
/**
* Forward iterator over the source text that keeps track of the current
* source position (line and column) while it advances.
*/
class text_iterator
{
// Current location in the underlying source string.
std::string::const_iterator m_buffer;
// Source position stored alongside m_buffer.
elna::source::position m_position;
// Declared before "public:", hence private; the friend function
// text_iterators() declared below has access to it.
text_iterator(std::string::const_iterator buffer,
const elna::source::position start_position = elna::source::position());
public:
// Member types conventionally expected of an iterator.
using iterator_category = std::forward_iterator_tag;
using difference_type = ptrdiff_t;
using value_type = char;
using pointer = const value_type *;
using reference = const value_type&;
/**
* \return Source position stored in this iterator.
*/
const elna::source::position& position() const noexcept;
reference operator*() const noexcept;
pointer operator->() const noexcept;
text_iterator& operator++();
// NOTE(review): declared to return a reference, whereas a conventional
// postfix increment returns the previous state by value - confirm intent.
text_iterator& operator++(int);
bool operator==(const text_iterator& that) const noexcept;
bool operator!=(const text_iterator& that) const noexcept;
// Non-const member; presumably returns an iterator advanced by step - verify
// against the definition.
text_iterator operator+(std::size_t step);
friend std::pair<text_iterator, text_iterator> text_iterators(const std::string& buffer);
};
/**
* \param buffer Source text.
* \return Pair of text iterators for \a buffer (presumably begin and end -
* see the definition).
*/
std::pair<text_iterator, text_iterator>
text_iterators(const std::string &buffer);
/** /**
* Union type representing a single token. * Union type representing a single token.
*/ */
@ -53,6 +19,7 @@ namespace elna::source
*/ */
enum class type : std::uint16_t enum class type : std::uint16_t
{ {
dot,
number, number,
boolean, boolean,
term_operator, term_operator,
@ -63,7 +30,6 @@ namespace elna::source
semicolon, semicolon,
left_paren, left_paren,
right_paren, right_paren,
dot,
comma, comma,
factor_operator, factor_operator,
eof, eof,
@ -97,6 +63,7 @@ namespace elna::source
token(type of, elna::source::position position); token(type of, elna::source::position position);
token(type of, std::int32_t value, const elna::source::position position); token(type of, std::int32_t value, const elna::source::position position);
token(type of, const std::string& value, const elna::source::position position); token(type of, const std::string& value, const elna::source::position position);
token(type of, value&& value, const elna::source::position position);
token(const token& that); token(const token& that);
token(token&& that); token(token&& that);
~token(); ~token();
@ -236,9 +203,8 @@ namespace elna::source
/** /**
* Splits the source text into tokens. * Splits the source text into tokens.
* *
* \param buffer Source text.
* \param path Source file location. * \param path Source file location.
* \return Tokens or error. * \return Tokens or error.
*/ */
elna::source::result<lexer> lex(const std::string& buffer, const std::filesystem::path& path); elna::source::result<lexer> tokenize(const std::filesystem::path& path);
} }

View File

@ -89,35 +89,44 @@ namespace elna::source
}; };
/** /**
* Variable declaration. * Symbol definition.
*/
class declaration : public node
{
std::string m_identifier;
std::string m_type;
public:
declaration(const std::string& identifier, const std::string& type);
virtual void accept(parser_visitor *visitor) override;
std::string& identifier() noexcept;
std::string& type() noexcept;
};
/**
* Constant definition.
*/ */
class definition : public node class definition : public node
{ {
std::string m_identifier; std::string m_identifier;
protected: protected:
/**
* Constructs a definition identified by some name.
*
* \param identifier Definition name.
*/
definition(const std::string& identifier); definition(const std::string& identifier);
public: public:
/**
* \return Definition name.
*/
std::string& identifier() noexcept; std::string& identifier() noexcept;
}; };
/**
* Variable declaration.
*/
class declaration : public definition
{
std::string m_type;
public:
declaration(const std::string& identifier, const std::string& type);
virtual void accept(parser_visitor *visitor) override;
std::string& type() noexcept;
};
/**
* Constant definition.
*/
class constant_definition : public definition class constant_definition : public definition
{ {
std::unique_ptr<integer_literal> m_body; std::unique_ptr<integer_literal> m_body;

View File

@ -1,79 +1,12 @@
#include "elna/source/lexer.hpp" #include "elna/source/lexer.hpp"
#include <cassert> #include <cassert>
#include <variant> #include <variant>
#include <sstream>
namespace elna::source namespace elna::source
{ {
using source_position = elna::source::position; using source_position = elna::source::position;
using source_error = elna::source::error; using source_error = elna::source::error;
std::pair<text_iterator, text_iterator> text_iterators(const std::string &buffer)
{
return std::make_pair<>(text_iterator(std::cbegin(buffer)),
text_iterator(std::cend(buffer), position{0, 0}));
}
/**
 * Creates an iterator located at \a buffer.
 *
 * \param buffer Location in the source text.
 * \param start_position Source position stored for that location.
 */
text_iterator::text_iterator(std::string::const_iterator buffer,
        const source_position start_position)
    : m_buffer(buffer), m_position(start_position)
{
}

/**
 * \return Source position stored in this iterator.
 */
const source_position& text_iterator::position() const noexcept
{
    return this->m_position;
}

/**
 * \return The character the iterator refers to.
 */
text_iterator::reference text_iterator::operator*() const noexcept
{
    return *m_buffer;
}

/**
 * \return Pointer to the character the iterator refers to.
 */
text_iterator::pointer text_iterator::operator->() const noexcept
{
    // Taking the address of the dereferenced iterator works with every
    // standard library; a base() member is not part of the standard
    // std::string::const_iterator interface.
    return &*m_buffer;
}

/**
 * Moves to the next character and updates the stored position: stepping over
 * a line feed starts a new line at column 1, anything else moves one column
 * to the right.
 */
text_iterator& text_iterator::operator++()
{
    if (*this->m_buffer == '\n')
    {
        this->m_position.column = 1;
        ++this->m_position.line;
    }
    else
    {
        ++this->m_position.column;
    }
    std::advance(this->m_buffer, 1);

    return *this;
}

/**
 * Advances the iterator by one character.
 *
 * NOTE(review): the return type is a reference, so the state before the
 * increment cannot be handed back; this operator therefore behaves exactly
 * like the prefix form and returns the already advanced iterator. Returning
 * the old state would require changing the return type to a value.
 */
text_iterator& text_iterator::operator++(int)
{
    return ++(*this);
}

/**
 * Returns a copy of this iterator advanced by \a step characters.
 *
 * The copy is advanced one character at a time so that the stored line and
 * column stay correct. The caller must make sure that at least \a step
 * characters remain.
 *
 * \param step Number of characters to advance.
 */
text_iterator text_iterator::operator+(std::size_t step)
{
    auto result = *this;

    for (; step > 0; --step)
    {
        ++result;
    }
    return result;
}

/**
 * Iterators are equal if they refer to the same location in the text; the
 * stored source position does not take part in the comparison.
 */
bool text_iterator::operator==(const text_iterator& that) const noexcept
{
    return this->m_buffer == that.m_buffer;
}

bool text_iterator::operator!=(const text_iterator& that) const noexcept
{
    return !(*this == that);
}
token::value::value() token::value::value()
: nil(nullptr) : nil(nullptr)
{ {
@ -103,6 +36,23 @@ namespace elna::source
{ {
} }
/**
 * Constructs a token from an already prepared value.
 *
 * Which member of \a value is read depends on the token type: the identifier
 * string for types that carry an identifier, the number for numeric types and
 * nothing otherwise.
 *
 * \param of Token type.
 * \param value Token value; its identifier string is moved from if it is used.
 * \param position Token position.
 */
token::token(type of, value&& value, const elna::source::position position)
    : m_type(of), m_position(position)
{
    if (has_identifier())
    {
        // The string member is constructed in place in the storage of m_value.
        ::new (static_cast<void *>(&m_value.identifier)) std::string(std::move(value.identifier));
        return;
    }
    if (is_numeric())
    {
        m_value.number = value.number;
        return;
    }
    m_value.nil = nullptr;
}
token::token(const type of, source_position position) token::token(const type of, source_position position)
: m_type(of), m_position(position) : m_type(of), m_position(position)
{ {
@ -278,11 +228,12 @@ namespace elna::source
std::string unexpected_character::what() const std::string unexpected_character::what() const
{ {
std::stringstream ss{ "Unexpected character '" }; std::string ss{ "Unexpected character '" };
ss << character << "'"; ss.insert(ss.cend(), character.cbegin(), character.cend());
ss.push_back('\'');
return ss.str(); return ss;
} }
unexpected_token::unexpected_token(const token& token, const std::filesystem::path& path) unexpected_token::unexpected_token(const token& token, const std::filesystem::path& path)
@ -371,173 +322,4 @@ namespace elna::source
{ {
return m_errors; return m_errors;
} }
result<lexer> lex(const std::string& buffer, const std::filesystem::path& path)
{
std::vector<token> tokens;
auto [iterator, text_end] = text_iterators(buffer);
while (iterator != text_end)
{
if (*iterator == ' ' || *iterator == '\n')
{
}
else if (std::isdigit(*iterator))
{
tokens.emplace_back(
token::type::number,
static_cast<std::int32_t>(*iterator - '0'),
iterator.position()
);
}
else if (*iterator == '=')
{
tokens.emplace_back(token::type::equals, iterator.position());
}
else if (*iterator == '(')
{
tokens.emplace_back(token::type::left_paren, iterator.position());
}
else if (*iterator == ')')
{
tokens.emplace_back(token::type::right_paren, iterator.position());
}
else if (*iterator == ';')
{
tokens.emplace_back(token::type::semicolon, iterator.position());
}
else if (*iterator == ',')
{
tokens.emplace_back(token::type::comma, iterator.position());
}
else if (*iterator == '.')
{
tokens.emplace_back(token::type::dot, iterator.position());
}
else if (std::isalpha(*iterator))
{
std::string word;
auto i = iterator;
while (i != text_end && std::isalpha(*i))
{
word.push_back(*i);
++i;
}
if (word == "const")
{
tokens.emplace_back(token::type::let, iterator.position());
}
else if (word == "var")
{
tokens.emplace_back(token::type::var, iterator.position());
}
else if (word == "begin")
{
tokens.emplace_back(token::type::begin, iterator.position());
}
else if (word == "end")
{
tokens.emplace_back(token::type::end, iterator.position());
}
else if (word == "if")
{
tokens.emplace_back(token::type::when, iterator.position());
}
else if (word == "then")
{
tokens.emplace_back(token::type::then, iterator.position());
}
else if (word == "while")
{
tokens.emplace_back(token::type::loop, iterator.position());
}
else if (word == "do")
{
tokens.emplace_back(token::type::_do, iterator.position());
}
else if (word == "True")
{
tokens.emplace_back(token::type::boolean, 1, iterator.position());
}
else if (word == "False")
{
tokens.emplace_back(token::type::boolean, 0, iterator.position());
}
else if (word == "proc")
{
tokens.emplace_back(token::type::procedure, 0, iterator.position());
}
else
{
tokens.emplace_back(token::type::identifier, word.c_str(), iterator.position());
}
iterator = i;
continue;
}
else if (*iterator == '+' || *iterator == '-')
{
std::string _operator{ *iterator };
tokens.emplace_back(token::type::term_operator, _operator.c_str(), iterator.position());
}
else if (*iterator == '/' && iterator + 1 != text_end && *(iterator + 1) == '=')
{
tokens.emplace_back(token::type::comparison_operator, "n", iterator.position());
++iterator;
}
else if (*iterator == '*' || *iterator == '/')
{
std::string _operator{ *iterator };
tokens.emplace_back(token::type::factor_operator, _operator.c_str(), iterator.position());
}
else if (*iterator == '<')
{
std::string _operator;
auto operator_position = iterator.position();
if (iterator + 1 == text_end || *(iterator + 1) != '=')
{
_operator.push_back(*iterator);
}
else
{
++iterator;
_operator.push_back('l');
}
tokens.emplace_back(token::type::comparison_operator, _operator.c_str(), operator_position);
}
else if (*iterator == '>')
{
std::string _operator;
auto operator_position = iterator.position();
if (iterator + 1 == text_end || *(iterator + 1) != '=')
{
_operator.push_back(*iterator);
}
else
{
++iterator;
_operator.push_back('g');
}
tokens.emplace_back(token::type::comparison_operator, _operator.c_str(), operator_position);
}
else if (*iterator == ':' && iterator + 1 != text_end && *(iterator + 1) == '=')
{
tokens.emplace_back(token::type::assignment, iterator.position());
++iterator;
}
else if (*iterator == ':')
{
tokens.emplace_back(token::type::colon, iterator.position());
}
else
{
return result<lexer>(unexpected_character{ std::string{ *iterator }, path, iterator.position() });
}
++iterator;
}
return result<lexer>(std::in_place, std::move(tokens), iterator.position(), path);
}
} }

View File

@ -95,7 +95,7 @@ namespace elna::source
} }
declaration::declaration(const std::string& identifier, const std::string& type) declaration::declaration(const std::string& identifier, const std::string& type)
: m_identifier(identifier), m_type(type) : definition(identifier), m_type(type)
{ {
} }
@ -104,11 +104,6 @@ namespace elna::source
visitor->visit(this); visitor->visit(this);
} }
std::string& declaration::identifier() noexcept
{
return m_identifier;
}
std::string& declaration::type() noexcept std::string& declaration::type() noexcept
{ {
return m_type; return m_type;
@ -710,7 +705,7 @@ namespace elna::source
else else
{ {
iterator.add_error(*iterator); iterator.add_error(*iterator);
break; return nullptr;
} }
} }

194
source/scanner.l Normal file
View File

@ -0,0 +1,194 @@
%{
/* Keeps the generated scanner from including <unistd.h>. */
#define YY_NO_UNISTD_H
/*
 * Code flex runs for every matched rule before the rule's action: remember
 * where the matched text starts and move the column past it.
 */
#define YY_USER_ACTION token_position = elna::source::position{ line_no, column_no }; column_no += yyleng;
#include <fstream>
#include "elna/source/lexer.hpp"
/* Value of the most recently matched token, filled in by the rule actions. */
elna::source::token::value yylval{};
/* Start position of the most recently matched text, set by YY_USER_ACTION. */
elna::source::position token_position{};
/* Column and line of the next character to be matched; both start at 1. */
static std::size_t column_no = 1;
static std::size_t line_no = 1;
%}
%option noyywrap
%option never-interactive
%%
\-\-.* {
/* Skip the comment */
}
[\ \t\r] {
/* Skip the whitespaces */
}
\n {
/* Start a new line: bump the line counter and restart the column at 1 */
++line_no;
column_no = 1;
}
if {
/* Reserved words carry no value; they are listed before the identifier rule */
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::when);
}
then {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::then);
}
while {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::loop);
}
do {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::_do);
}
proc {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::procedure);
}
begin {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::begin);
}
end {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::end);
}
const {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::let);
}
var {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::var);
}
True {
/* Boolean literals are stored as numbers: 1 for True, 0 for False */
yylval.number = 1;
return static_cast<int>(elna::source::token::type::boolean);
}
False {
yylval.number = 0;
return static_cast<int>(elna::source::token::type::boolean);
}
[A-Za-z_][A-Za-z0-9_]* {
/* The string is constructed in place (presumably because the value type is a union - confirm) */
new((void *) &yylval.identifier) std::string(yytext);
return static_cast<int>(elna::source::token::type::identifier);
}
[0-9]+ {
/* NOTE(review): strtol returns long; out-of-range literals are not reported here */
yylval.number = strtol(yytext, NULL, 10);
return static_cast<int>(elna::source::token::type::number);
}
\( {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::left_paren);
}
\) {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::right_paren);
}
\>= {
/* Two-character comparisons are stored under a one-letter spelling: 'g' for >= */
new((void *) &yylval.identifier) std::string(1, 'g');
return static_cast<int>(elna::source::token::type::comparison_operator);
}
\<= {
/* 'l' stands for <= */
new((void *) &yylval.identifier) std::string(1, 'l');
return static_cast<int>(elna::source::token::type::comparison_operator);
}
(>|<) {
new((void *) &yylval.identifier) std::string(yytext);
return static_cast<int>(elna::source::token::type::comparison_operator);
}
\/= {
/* 'n' stands for /= */
new((void *) &yylval.identifier) std::string(1, 'n');
return static_cast<int>(elna::source::token::type::comparison_operator);
}
= {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::equals);
}
; {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::semicolon);
}
\. {
/* NOTE(review): if dot is the first enumerator of token::type this returns 0, the value yylex() also yields at end of input - confirm */
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::dot);
}
, {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::comma);
}
(\+|\-) {
/* The operator character itself is kept as the token value */
new((void *) &yylval.identifier) std::string(yytext);
return static_cast<int>(elna::source::token::type::term_operator);
}
(\*|\/) {
new((void *) &yylval.identifier) std::string(yytext);
return static_cast<int>(elna::source::token::type::factor_operator);
}
:= {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::assignment);
}
: {
yylval.nil = nullptr;
return static_cast<int>(elna::source::token::type::colon);
}
. {
/* Any other character: a negative result tells the caller it is unexpected */
return -1;
}
%%
namespace elna::source
{
    /**
     * Splits the source file into tokens using the generated scanner.
     *
     * Tokens are collected until yylex() returns 0; the token belonging to
     * that final call is appended as well. A negative result from yylex()
     * stops scanning and is reported as an unexpected character.
     *
     * NOTE(review): yylex() returns 0 at end of input, and the same value is
     * presumably produced by the rule for token::type::dot if dot is the
     * first enumerator - confirm that ending at the first dot is intended.
     *
     * \param path Source file location.
     * \return Tokens or error.
     * \throws std::ios_base::failure if the file cannot be opened.
     */
    result<lexer> tokenize(const std::filesystem::path& path)
    {
        yyin = fopen(path.c_str(), "rb");
        if (yyin == nullptr)
        {
            throw std::ios_base::failure("File does not exist");
        }
        // Closes the input file on every way out of the function, including
        // the early error return and exceptions.
        struct input_closer
        {
            FILE *handle;

            ~input_closer()
            {
                fclose(handle);
            }
        } close_input{ yyin };

        std::vector<token> tokens;
        int yytoken;

        do
        {
            yytoken = yylex();
            if (yytoken < 0)
            {
                return result<lexer>(unexpected_character{ std::string{ yytext[0] }, path, token_position });
            }
            tokens.emplace_back(static_cast<token::type>(yytoken), std::move(yylval), token_position);
        }
        while (yytoken != 0);

        return result<lexer>(std::in_place, std::move(tokens), position{ line_no, column_no }, path);
    }
}

View File

@ -0,0 +1 @@
tests/missing_semicolon.eln:3:3: Unexpected token «identifier»

View File

@ -0,0 +1,4 @@
begin
writei(1)
writei(2)
end.