elna/source/lexer.cpp

447 lines
12 KiB
C++
Raw Normal View History

#include "elna/source/lexer.hpp"
#include <variant>
2024-03-09 08:36:07 +01:00
#include <sstream>
2024-03-03 13:11:39 +01:00
2024-03-07 09:15:11 +01:00
namespace elna::source
2024-03-03 13:11:39 +01:00
{
// Local shorthands for the position and error types of this namespace.
using source_position = elna::source::position;
using source_error = elna::source::error;
2024-02-22 21:29:25 +01:00
std::pair<text_iterator, text_iterator> text_iterators(const std::string &buffer)
2024-02-22 21:29:25 +01:00
{
return std::make_pair<>(text_iterator(std::cbegin(buffer)),
text_iterator(std::cend(buffer), position{0, 0}));
2024-02-22 21:29:25 +01:00
}
/**
 * Creates an iterator over a text buffer.
 *
 * \param buffer         Underlying string iterator to wrap.
 * \param start_position Source position associated with \a buffer.
 */
text_iterator::text_iterator(std::string::const_iterator buffer,
        const source_position start_position)
    : m_buffer(buffer),
      m_position(start_position)
{
}
/**
 * \return Source position tracked by this iterator.
 */
const source_position& text_iterator::position() const noexcept
{
    return m_position;
}
/**
 * \return The character the iterator currently points at.
 */
text_iterator::reference text_iterator::operator*() const noexcept
{
    return *this->m_buffer;
}
/**
 * \return Pointer to the character the iterator currently points at.
 */
text_iterator::pointer text_iterator::operator->() const noexcept
{
    // std::string::const_iterator has no standard base() member; taking the
    // address of the dereferenced element works with every standard library.
    return &*m_buffer;
}
/**
 * Moves to the next character and updates the tracked position.
 *
 * Leaving a '\n' starts a new line (column 1, line + 1); leaving any other
 * character advances the column.
 *
 * \return This iterator after advancing.
 */
text_iterator& text_iterator::operator++()
{
    const bool leaving_newline = (*m_buffer == '\n');

    if (leaving_newline)
    {
        ++m_position.line;
        m_position.column = 1;
    }
    else
    {
        ++m_position.column;
    }
    ++m_buffer;

    return *this;
}
/**
 * Postfix increment.
 *
 * Because this overload returns a reference, it cannot hand back the value
 * from before the increment (a reference to a local snapshot would dangle).
 * It therefore behaves like the prefix form and no snapshot is taken.
 *
 * \return This iterator after advancing.
 */
text_iterator& text_iterator::operator++(int)
{
    ++(*this);
    return *this;
}
2024-03-11 10:43:26 +01:00
/**
 * Returns a copy of this iterator advanced by \a step characters.
 *
 * The copy is advanced one character at a time so that its line/column
 * position stays in sync. The caller must ensure that at least \a step
 * characters remain in the buffer.
 *
 * \param step Number of characters to advance; 0 yields an unchanged copy.
 * \return The advanced iterator.
 */
text_iterator text_iterator::operator+(std::size_t step)
{
    auto result = *this;

    for (std::size_t taken = 0; taken < step; ++taken)
    {
        ++result;
    }
    return result;
}
bool text_iterator::operator==(const text_iterator& that) const noexcept
2024-02-22 21:29:25 +01:00
{
return this->m_buffer == that.m_buffer;
}
bool text_iterator::operator!=(const text_iterator& that) const noexcept
2024-02-22 21:29:25 +01:00
{
return !(*this == that);
}
token::value::value()
: nil(nullptr)
2024-02-22 21:29:25 +01:00
{
}
2024-02-28 16:18:39 +01:00
token::value::value(std::int32_t value)
: number(value)
{
}
2024-02-28 16:18:39 +01:00
token::value::value(const std::string& value)
: identifier(value)
{
2024-02-28 16:18:39 +01:00
}
/**
 * Intentionally does nothing: the value itself does not record which
 * member is stored, so the string member is destroyed by token instead.
 */
token::value::~value()
{
}
/**
 * Creates a token that carries a string.
 *
 * \param of       Token type.
 * \param value    String stored in the token.
 * \param position Position of the token in the source text.
 */
token::token(const type of, const std::string& value, const source_position position)
    : m_type(of),
      m_value(value),
      m_position(position)
{
}
/**
 * Creates a token that carries a number.
 *
 * \param of       Token type.
 * \param number   Number stored in the token.
 * \param position Position of the token in the source text.
 */
token::token(const type of, std::int32_t number, const source_position position)
    : m_type(of),
      m_value(number),
      m_position(position)
{
}
/**
 * Creates a token without an explicit value; the value member is
 * default-constructed.
 *
 * \param of       Token type.
 * \param position Position of the token in the source text.
 */
token::token(const type of, source_position position)
    : m_type(of),
      m_position(position)
{
}
/**
 * Copy constructor.
 *
 * \param that Token to copy.
 */
token::token(const token& that)
    // The assignment operator inspects the current type to decide whether a
    // string has to be destroyed first. Start from a type that carries no
    // string so it never touches the not yet constructed string member.
    : m_type(type::eof)
{
    *this = that;
}
/**
 * Move constructor.
 *
 * \param that Token to move from.
 */
token::token(token&& that)
    // The assignment operator inspects the current type to decide whether a
    // string has to be destroyed first. Start from a type that carries no
    // string so it never touches the not yet constructed string member.
    : m_type(type::eof)
{
    *this = std::move(that);
}
/**
 * Destroys the stored string when the token type says one is present.
 */
token::~token()
{
    if (!has_identifier())
    {
        return;
    }
    m_value.identifier.~basic_string();
}
/**
 * Copy assignment.
 *
 * Safe for self-assignment. Type and position are only updated after the
 * value has been copied, so a throwing string copy does not leave the type
 * claiming a string that does not exist.
 *
 * \param that Token to copy.
 * \return This token.
 */
token& token::operator=(const token& that)
{
    if (this == &that)
    {
        // Destroying our string and then copying from it would be undefined.
        return *this;
    }
    if (has_identifier() && that.has_identifier())
    {
        // Both sides hold a live string: plain string assignment suffices.
        m_value.identifier = that.identifier();
    }
    else if (that.has_identifier())
    {
        // We hold a trivial member; construct the string in place.
        new (static_cast<void *>(&m_value.identifier)) std::string(that.identifier());
    }
    else
    {
        if (has_identifier())
        {
            m_value.identifier.~basic_string();
        }
        if (that.is_numeric())
        {
            m_value.number = that.number();
        }
        else
        {
            m_value.nil = nullptr;
        }
    }
    m_type = that.of();
    m_position = that.position();

    return *this;
}
/**
 * Move assignment.
 *
 * Safe for self-assignment. The string of \a that is moved through the
 * member directly, because the identifier() accessor returns a const
 * reference and std::move on it would silently fall back to a copy.
 *
 * \param that Token to move from; it keeps its type and is left with a
 *             moved-from (valid but unspecified) string if it had one.
 * \return This token.
 */
token& token::operator=(token&& that)
{
    if (this == &that)
    {
        return *this;
    }
    if (has_identifier() && that.has_identifier())
    {
        // Both sides hold a live string: move-assign it.
        m_value.identifier = std::move(that.m_value.identifier);
    }
    else if (that.has_identifier())
    {
        // We hold a trivial member; move-construct the string in place.
        new (static_cast<void *>(&m_value.identifier)) std::string(std::move(that.m_value.identifier));
    }
    else
    {
        if (has_identifier())
        {
            m_value.identifier.~basic_string();
        }
        if (that.is_numeric())
        {
            m_value.number = that.number();
        }
        else
        {
            m_value.nil = nullptr;
        }
    }
    m_type = that.of();
    m_position = that.position();

    return *this;
}
/**
 * \return Type of this token.
 */
token::type token::of() const noexcept
{
    return this->m_type;
}
const std::string& token::identifier() const
2024-02-28 16:18:39 +01:00
{
if (!has_identifier())
{
throw std::bad_variant_access();
}
2024-02-28 16:18:39 +01:00
return m_value.identifier;
}
std::int32_t token::number() const
2024-02-22 21:29:25 +01:00
{
2024-03-17 01:00:44 +01:00
if (!is_numeric())
{
throw std::bad_variant_access();
}
2024-02-28 16:18:39 +01:00
return m_value.number;
2024-02-22 21:29:25 +01:00
}
/**
 * \return Position of this token in the source text.
 */
const source_position& token::position() const noexcept
{
    return this->m_position;
}
/**
 * \return Whether the token type is one that stores a string
 *         (identifier, term operator or factor operator).
 */
bool token::has_identifier() const noexcept
{
    switch (of())
    {
        case type::identifier:
        case type::term_operator:
        case type::factor_operator:
            return true;
        default:
            return false;
    }
}
2024-03-17 01:00:44 +01:00
bool token::is_numeric() const noexcept
{
return of() == type::number
|| of() == type::boolean;
}
2024-03-09 08:36:07 +01:00
/**
 * \param character The character that could not be lexed.
 * \param position  Where the character was found.
 */
unexpected_character::unexpected_character(const std::string& character, const source::position position)
    : error(position),
      character(character)
{
}
/**
 * \return Message of the form: Unexpected character 'X'.
 */
std::string unexpected_character::what() const
{
    // The prefix must be streamed in. A stringstream constructed from an
    // initial string starts writing at offset 0, so later insertions would
    // overwrite the prefix instead of being appended to it.
    std::stringstream ss;

    ss << "Unexpected character '" << character << "'";

    return ss.str();
}
2024-03-10 08:50:55 +01:00
/**
 * \param token The offending token; a copy is stored and its position
 *              becomes the error position.
 */
unexpected_token::unexpected_token(const token& token)
    : error(token.position()),
      m_token(token)
{
}
/**
 * \return Fixed error message.
 */
std::string unexpected_token::what() const
{
    return std::string("Unexpected token");
}
2024-03-14 08:52:45 +01:00
/**
 * Takes ownership of the lexed tokens and positions the cursor on the
 * first one.
 *
 * \param tokens        Tokens to iterate over.
 * \param last_position Position given to the end-of-file token.
 */
lexer::lexer(std::vector<token>&& tokens, const position last_position)
    : tokens(std::move(tokens)),
      // Must refer to the member: the parameter has just been moved from.
      iterator(this->tokens.cbegin()),
      eof(token::type::eof, last_position)
{
}
/**
 * Moves the cursor to the next token.
 *
 * \return This lexer.
 */
lexer& lexer::operator++()
{
    ++this->iterator;

    return *this;
}
/**
 * \return The token under the cursor. No end check is performed here.
 */
const token& lexer::operator*() const
{
    const token& under_cursor = *iterator;

    return under_cursor;
}
/**
 * \return Pointer to the token under the cursor.
 */
const token *lexer::operator->() const
{
    // Vector iterators have no standard base() member. Offsetting data() by
    // the cursor index is portable and yields the same address.
    return tokens.data() + (iterator - tokens.cbegin());
}
/**
 * \return The token under the cursor, or the end-of-file token when the
 *         cursor is past the last token.
 */
const token& lexer::current() const noexcept
{
    return iterator == tokens.cend() ? this->eof : *iterator;
}
/**
 * \param token_type Type to compare against.
 * \return Whether the current token has the given type.
 */
bool lexer::current(const token::type token_type) const noexcept
{
    const token& under_cursor = current();

    return under_cursor.of() == token_type;
}
/**
 * Records an "unexpected token" error for the given token.
 *
 * \param expected Token the error is created from.
 */
void lexer::add_error(const token& expected)
{
    auto failure = std::make_unique<unexpected_token>(expected);

    m_errors.push_back(std::move(failure));
}
/**
 * Consumes the current token if it has the requested type.
 *
 * \param token_type Type the current token must have.
 * \return Reference to the consumed token, or an empty optional (after
 *         recording an error for the current token) when it does not match.
 */
std::optional<std::reference_wrapper<const token>> lexer::advance(const token::type token_type)
{
    using optional_token = std::optional<std::reference_wrapper<const token>>;

    if (iterator == tokens.cend() || iterator->of() != token_type)
    {
        add_error(current());
        return optional_token();
    }
    const token& matched = *iterator;
    ++iterator;

    return optional_token(std::cref(matched));
}
/**
 * \return The token following the current one, or the end-of-file token
 *         when there is no such token.
 */
const token& lexer::look_ahead() const
{
    // Check before stepping: incrementing the past-the-end iterator is
    // undefined behaviour.
    if (iterator == tokens.cend())
    {
        return eof;
    }
    auto next = iterator;
    ++next;

    if (next == tokens.cend())
    {
        return eof;
    }
    return *next;
}
/**
 * \param token_type Type to compare against.
 * \return Whether the token after the current one has the given type.
 */
bool lexer::look_ahead(const token::type token_type) const
{
    const token& upcoming = look_ahead();

    return upcoming.of() == token_type;
}
/**
 * Like advance(), but only reports whether a token was consumed.
 *
 * \param token_type Type the current token must have.
 * \return true if the token matched and was consumed.
 */
bool lexer::skip(const token::type token_type)
{
    const auto consumed = advance(token_type);

    return consumed.has_value();
}
const std::list<std::unique_ptr<error>>& lexer::errors() const noexcept
{
return m_errors;
}
/**
 * Splits the source text into tokens.
 *
 * Spaces and newlines separate tokens and produce none. Runs of letters
 * become keywords, boolean literals or identifiers. Any character that is
 * not recognised aborts lexing.
 *
 * \param buffer Source text.
 * \return A lexer over the produced tokens, or an unexpected_character
 *         error for the first character that cannot be lexed.
 */
result<lexer> lex(const std::string& buffer)
{
    std::vector<token> tokens;
    auto [iterator, text_end] = text_iterators(buffer);

    while (iterator != text_end)
    {
        const char current = *iterator;
        // The <cctype> classifiers are only defined for values representable
        // as unsigned char; a negative plain char would be undefined.
        const auto code = static_cast<unsigned char>(current);

        if (current == ' ' || current == '\n')
        {
            // Whitespace: nothing to emit.
        }
        else if (std::isdigit(code))
        {
            // NOTE(review): every digit becomes its own number token, so
            // multi-digit literals are not combined here - confirm intended.
            tokens.emplace_back(
                token::type::number,
                static_cast<std::int32_t>(current - '0'),
                iterator.position()
            );
        }
        else if (std::isalpha(code))
        {
            // Collect the whole run of letters starting at the cursor.
            std::string word;
            auto word_end = iterator;

            while (word_end != text_end && std::isalpha(static_cast<unsigned char>(*word_end)))
            {
                word.push_back(*word_end);
                ++word_end;
            }
            if (word == "const")
            {
                tokens.emplace_back(token::type::let, iterator.position());
            }
            else if (word == "var")
            {
                tokens.emplace_back(token::type::var, iterator.position());
            }
            else if (word == "begin")
            {
                tokens.emplace_back(token::type::begin, iterator.position());
            }
            else if (word == "end")
            {
                tokens.emplace_back(token::type::end, iterator.position());
            }
            else if (word == "if")
            {
                tokens.emplace_back(token::type::when, iterator.position());
            }
            else if (word == "then")
            {
                tokens.emplace_back(token::type::then, iterator.position());
            }
            else if (word == "while")
            {
                tokens.emplace_back(token::type::_while, iterator.position());
            }
            else if (word == "do")
            {
                tokens.emplace_back(token::type::_do, iterator.position());
            }
            else if (word == "True")
            {
                tokens.emplace_back(token::type::boolean, 1, iterator.position());
            }
            else if (word == "False")
            {
                tokens.emplace_back(token::type::boolean, 0, iterator.position());
            }
            else if (word == "proc")
            {
                // A keyword like the others: it carries no value.
                tokens.emplace_back(token::type::procedure, iterator.position());
            }
            else
            {
                tokens.emplace_back(token::type::identifier, word, iterator.position());
            }
            // The cursor already sits behind the word; skip the common
            // increment at the bottom of the loop.
            iterator = word_end;
            continue;
        }
        else
        {
            switch (current)
            {
                case '=':
                    tokens.emplace_back(token::type::equals, iterator.position());
                    break;
                case '(':
                    tokens.emplace_back(token::type::left_paren, iterator.position());
                    break;
                case ')':
                    tokens.emplace_back(token::type::right_paren, iterator.position());
                    break;
                case ';':
                    tokens.emplace_back(token::type::semicolon, iterator.position());
                    break;
                case ',':
                    tokens.emplace_back(token::type::comma, iterator.position());
                    break;
                case '.':
                    tokens.emplace_back(token::type::dot, iterator.position());
                    break;
                case '+':
                case '-':
                    tokens.emplace_back(token::type::term_operator,
                        std::string(1, current), iterator.position());
                    break;
                case '*':
                case '/':
                    tokens.emplace_back(token::type::factor_operator,
                        std::string(1, current), iterator.position());
                    break;
                case ':':
                    if (iterator + 1 != text_end && *(iterator + 1) == '=')
                    {
                        tokens.emplace_back(token::type::assignment, iterator.position());
                        ++iterator; // Consume the '=' of ":=".
                    }
                    else
                    {
                        tokens.emplace_back(token::type::colon, iterator.position());
                    }
                    break;
                default:
                    return result<lexer>(unexpected_character{ std::string(1, current), iterator.position() });
            }
        }
        ++iterator;
    }
    // The cursor's final position becomes the position of the eof token.
    return result<lexer>(std::in_place, std::move(tokens), iterator.position());
}
}