#include <cctype>
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>
namespace librayy {
/**
 * A position inside a text buffer, stored both as a raw index and as a
 * line/column pair.
 */
struct Cursor {
  long index;   //!< Index into the text buffer.
  long line;    //!< Line number of the position.
  long column;  //!< Column number of the position within its line.

  /**
   * Builds a cursor from its three components.
   * @param i The index into the buffer.
   * @param l The line number.
   * @param c The column number.
   */
  Cursor(long i, long l, long c)
      : index{i},
        line{l},
        column{c} {}
};
}
namespace librayy {
/**
 * Turns a text buffer into a stream of tokens, handed out one per call to
 * nextToken(), so that a parser can work on tokens instead of raw characters.
 */
class Lexer {
public:
  /**
   * A single token, as handed out by Lexer::nextToken.
   */
  struct Token {
    /**
     * The categories a token can belong to.
     */
    enum class Kind {
      End,              //!< The end of the text buffer is reached.
      Unknown,          //!< Unknown token.
      Identifier,       //!< Any identifier.
      Comment,          //!< A comment.
      StringLiteral,    //!< A string literal.
      ByteLiteral,      //!< A byte literal.
      NumberLiteral,    //!< A number literal.
      BlockStart,       //!< {
      BlockEnd,         //!< }
      ParenOpen,        //!< (
      ParenClose,       //!< )
      Equal,            //!< ==
      Inequal,          //!< !=
      LooselyEqual,     //!< ~=
      LooselyInequal,   //!< ~!=
      SmallerThan,      //!< <
      GreaterThan,      //!< >
      SmallerThanOrEq,  //!< <=
      GreaterThanOrEq,  //!< >=
      Arrow,            //!< ->
    };

    /**
     * The category of this token.
     */
    Kind kind;

    /**
     * The text associated with the token. It is stored exactly as passed to
     * the constructor and defaults to the empty string.
     */
    std::string value;

    /**
     * The position in the source code that this token was created with.
     */
    Cursor location;

    /**
     * Builds a token from a kind, a value and a location. Every argument is
     * optional: the defaults are Kind::Unknown, an empty value and a cursor
     * whose index, line and column are all zero.
     * @param k The kind of the token.
     * @param v The value of the token.
     * @param l The location of the token.
     */
    Token(Kind k = Kind::Unknown,
          const std::string& v = "",
          Cursor l = Cursor(0, 0, 0))
        : kind(k),
          value(v),
          location(l) {}
  };

  /**
   * Creates a lexer that works on its own copy of the given code.
   *
   * The cursor starts at index -1, line 1, column 0, i.e. before the first
   * character, and the last character starts out as a space so that the
   * whitespace skipping in nextToken() reads the first real character.
   * @param code The text to tokenize.
   */
  Lexer(const std::string& code)
      : _text(code),
        _location(-1, 1, 0),
        _last_char(' ') {}

  /**
   * Produces the next token from the text buffer. Once the end of the buffer
   * has been reached, a token of kind Lexer::Token::Kind::End is returned.
   */
  const Token nextToken();

private:
  /**
   * Moves the cursor forward by one character and updates _last_char.
   */
  void _incrementCursor();

  /**
   * Copy of the text that was passed to the constructor.
   */
  std::string _text;

  /**
   * Where the lexer currently is inside _text.
   */
  Cursor _location;

  /**
   * The character most recently read from the buffer, or -1 once the end of
   * the buffer has been reached.
   */
  int _last_char;
};
}
namespace librayy {
void Lexer::_incrementCursor() {
try {
_text.at(_location.index + 1);
} catch(std::out_of_range) {
_last_char = -1; // End of buffer.
return;
}
++_location.index;
_last_char = _text[_location.index];
if (_last_char == '\n') {
_location.column = 1;
++_location.line;
} else {
++_location.column;
}
}
namespace {

/**
 * Returns true if c is a real character (not the -1 end-of-buffer marker)
 * that std::isspace classifies as whitespace. The value is converted to
 * unsigned char first because the <cctype> functions require that.
 */
bool isSpaceChar(int c) {
  return c != -1 && std::isspace(static_cast<unsigned char>(c)) != 0;
}

/** Returns true if c may start an identifier: an underscore or a letter. */
bool isIdentifierStart(int c) {
  return c != -1 &&
         (c == '_' || std::isalpha(static_cast<unsigned char>(c)) != 0);
}

/**
 * Returns true if c may continue an identifier: an underscore, a letter or a
 * digit.
 */
bool isIdentifierPart(int c) {
  return c != -1 &&
         (c == '_' || std::isalnum(static_cast<unsigned char>(c)) != 0);
}

/**
 * Returns true if c is one of the ASCII characters listed in set.
 * std::strchr also matches the terminating NUL of set, so NUL (and anything
 * outside the ASCII range, including the -1 end marker) is rejected up front.
 */
bool isOneOf(const char* set, int c) {
  return c > 0 && c <= 127 && std::strchr(set, c) != nullptr;
}

}  // namespace

/**
 * Returns the next token found in the text buffer.
 *
 * Whitespace is skipped first. After that, in this order:
 *  - end of buffer yields an End token;
 *  - an underscore or letter starts an Identifier, which continues over
 *    letters, digits and underscores;
 *  - a decimal digit starts a NumberLiteral, which continues over the
 *    characters "0123456789.xABCDEFabcdef";
 *  - a maximal run of the characters "=!~<>-" is collected and looked up in
 *    the list of known operators; a run that is not a known operator is
 *    returned as Unknown with the run as its value;
 *  - '{', '}', '(' and ')' yield BlockStart, BlockEnd, ParenOpen and
 *    ParenClose with the character as value;
 *  - any other character is consumed and returned as Unknown with an empty
 *    value.
 *
 * Every token except End carries the location of its first character. End
 * carries the location the cursor stopped at.
 */
const Lexer::Token Lexer::nextToken() {
  // Ignore whitespace.
  while (isSpaceChar(_last_char)) {
    _incrementCursor();
  }

  // The end check has to come after the whitespace skip: otherwise trailing
  // whitespace (or an empty buffer) would fall through to the bottom of this
  // function and produce a stray Unknown token before End.
  if (_last_char == -1) {
    return Token(Token::Kind::End, "", _location);
  }

  // Match identifiers.
  if (isIdentifierStart(_last_char)) {
    const Cursor start = _location;
    std::string text;
    do {
      text.push_back(static_cast<char>(_last_char));
      _incrementCursor();
    } while (isIdentifierPart(_last_char));
    return Token(Token::Kind::Identifier, text, start);
  }

  // Match number literals.
  if (isOneOf("0123456789", _last_char)) {
    const Cursor start = _location;
    std::string text;
    do {
      text.push_back(static_cast<char>(_last_char));
      _incrementCursor();
    } while (isOneOf("0123456789.xABCDEFabcdef", _last_char));
    return Token(Token::Kind::NumberLiteral, text, start);
  }

  // Match operators.
  if (isOneOf("=!~<>-", _last_char)) {
    const Cursor start = _location;
    std::string run;
    do {
      run.push_back(static_cast<char>(_last_char));
      _incrementCursor();
    } while (isOneOf("=!~<>-", _last_char));

    static const struct {
      const char* spelling;
      Token::Kind kind;
    } knownOperators[] = {
        {"==", Token::Kind::Equal},
        {"!=", Token::Kind::Inequal},
        {"~=", Token::Kind::LooselyEqual},
        {"~!=", Token::Kind::LooselyInequal},
        {"<", Token::Kind::SmallerThan},
        {">", Token::Kind::GreaterThan},
        {"<=", Token::Kind::SmallerThanOrEq},
        {">=", Token::Kind::GreaterThanOrEq},
        {"->", Token::Kind::Arrow},
    };

    Token::Kind kind = Token::Kind::Unknown;
    for (const auto& entry : knownOperators) {
      if (run == entry.spelling) {
        kind = entry.kind;
        break;
      }
    }
    return Token(kind, run, start);
  }

  // Match other characters.
  Token::Kind single = Token::Kind::Unknown;
  switch (_last_char) {
    case '{': single = Token::Kind::BlockStart; break;
    case '}': single = Token::Kind::BlockEnd; break;
    case '(': single = Token::Kind::ParenOpen; break;
    case ')': single = Token::Kind::ParenClose; break;
    default: break;
  }

  // Remember where the character is *before* consuming it, so the token
  // reports its own position rather than that of the following character.
  const Cursor start = _location;
  const char consumed = static_cast<char>(_last_char);
  _incrementCursor();

  if (single == Token::Kind::Unknown) {
    // No token found.
    return Token(Token::Kind::Unknown, "", start);
  }
  return Token(single, std::string(1, consumed), start);
}
}
/**
 * Writes a cursor to the stream as "line:column". The index is not printed.
 * @param stream The stream to write to.
 * @param cur The cursor to print.
 * @return The stream that was passed in, to allow chaining.
 */
std::ostream& operator<<(std::ostream& stream, librayy::Cursor cur) {
  return stream << cur.line << ":" << cur.column;
}
int main(int argc, const char* argv[]) {
librayy::Lexer lexer("class Foo {\n"
" foo() -> void {\n"
" print(5 == 2)\n"
" }\n"
"}");
librayy::Lexer::Token tok;
while ((tok = lexer.nextToken()).kind != librayy::Lexer::Token::Kind::End) {
std::cout << tok.location << ":\t" << static_cast<int>(tok.kind) << "\t"
<< tok.value << std::endl;
}
return 0;
}