fork download
  #include <cctype>
  #include <cstring>
  #include <iostream>
  #include <stdexcept>
  #include <string>
  5.  
  6. namespace librayy {
  /**
    * A position inside a text buffer, expressed both as an offset into the
    * buffer and as a line/column pair.
    */
  struct Cursor {
    long index;   //!< Offset into the buffer.
    long line;    //!< Line number.
    long column;  //!< Column number within the line.

    /**
      * Constructs a cursor from its three components.
      * @param i The buffer offset.
      * @param l The line number.
      * @param c The column number.
      */
    Cursor(long i, long l, long c)
        : index(i),
          line(l),
          column(c) {
    }
  };
  14. }
  15.  
  16. namespace librayy {
  17. /**
  18.   * This provides an interface to what turns a text buffer into a stream of
  19.   * tokens. It is used to significantly simplify parsing.
  20.   */
  21. class Lexer {
  22. public:
  23. /**
  24.   * A token that can be obtained from Lexer::nextToken.
  25.   */
  26. struct Token {
  27. enum class Kind {
  28. End, //!< The end of the text buffer is reached.
  29. Unknown, //!< Unknown token.
  30. Identifier, //!< Any identifier.
  31. Comment, //!< A comment.
  32.  
  33. StringLiteral, //!< A string literal.
  34. ByteLiteral, //!< A byte literal.
  35. NumberLiteral, //!< A number literal.
  36.  
  37. BlockStart, //!< {
  38. BlockEnd, //!< }
  39. ParenOpen, //!< (
  40. ParenClose, //!< )
  41.  
  42. Equal, //!< ==
  43. Inequal, //!< !=
  44. LooselyEqual, //!< ~=
  45. LooselyInequal, //!< ~!=
  46. SmallerThan, //!< <
  47. GreaterThan, //!< >
  48. SmallerThanOrEq, //!< <=
  49. GreaterThanOrEq, //!< >=
  50. Arrow, //!< ->
  51. } kind;
  52.  
  53. /**
  54.   * The value of the token. Empty for anything but StringLiteral,
  55.   * ByteLiteral, NumberLiteral, Identifier and Comment.
  56.   */
  57. std::string value;
  58.  
  59. /**
  60.   * The location of the beginning of the token in the source code.
  61.   */
  62. Cursor location;
  63.  
  64. /**
  65.   * Constructs the token with the given token kind, value and location.
  66.   * @param k The kind of the token.
  67.   * @param v The value of the token.
  68.   * @param l The location of the token.
  69.   */
  70. Token(Kind k = Kind::Unknown,
  71. const std::string& v = "",
  72. Cursor l = Cursor(0, 0, 0))
  73. : kind(k), value(v), location(l) {}
  74. };
  75.  
  76. /**
  77.   * Constructs the lexer with the given code.
  78.   */
  79. Lexer(const std::string& code)
  80. : _text(code), _location(-1, 1, 0), _last_char(' ') {}
  81.  
  82. /**
  83.   * Returns the next token that can be found. If the end of the text buffer
  84.   * is reached, returns Lexer::Token::Kind::End.
  85.   */
  86. const Token nextToken();
  87. private:
  88. /**
  89.   * Increments the cursor, setting _last_char too.
  90.   */
  91. void _incrementCursor();
  92.  
  93. /**
  94.   * The text buffer passed to the constructor.
  95.   */
  96. std::string _text;
  97.  
  98. /**
  99.   * The current location in the buffer.
  100.   */
  101. Cursor _location;
  102.  
  103. /**
  104.   * The last character found in the buffer.
  105.   */
  106. int _last_char;
  107. };
  108. }
  109.  
  110. namespace librayy {
  111. void Lexer::_incrementCursor() {
  112. try {
  113. _text.at(_location.index + 1);
  114. } catch(std::out_of_range) {
  115. _last_char = -1; // End of buffer.
  116. return;
  117. }
  118. ++_location.index;
  119. _last_char = _text[_location.index];
  120. if (_last_char == '\n') {
  121. _location.column = 1;
  122. ++_location.line;
  123. } else {
  124. ++_location.column;
  125. }
  126. }
  127.  
  128. const Lexer::Token Lexer::nextToken() {
  129. if (_last_char == -1) {
  130. return Token(Token::Kind::End, "", _location);
  131. }
  132.  
  133. // Ignore whitespace.
  134. while (std::isspace(_last_char)) {
  135. _incrementCursor();
  136. }
  137.  
  138. // Match identifiers.
  139. if (_last_char == '_' || std::isalpha(_last_char)) {
  140. Cursor location = _location;
  141. std::string value;
  142. value.push_back(_last_char);
  143.  
  144. _incrementCursor();
  145. while (std::isalnum(_last_char)) {
  146. value.push_back(_last_char);
  147. _incrementCursor();
  148. }
  149.  
  150. return Lexer::Token(Token::Kind::Identifier, value, location);
  151. }
  152.  
  153. // Match number literals.
  154. if (std::strchr("0123456789", _last_char)) {
  155. Cursor location = _location;
  156. std::string num;
  157. num.push_back(_last_char);
  158.  
  159. _incrementCursor();
  160. while (std::strchr("0123456789.xABCDEFabcdef", _last_char)) {
  161. num.push_back(_last_char);
  162. _incrementCursor();
  163. }
  164.  
  165. return Token(Token::Kind::NumberLiteral, num, location);
  166. }
  167.  
  168. // Match operators.
  169. if (std::strchr("=!~<>-", _last_char)) {
  170. Cursor location = _location;
  171. std::string op;
  172. op.push_back(_last_char);
  173.  
  174. _incrementCursor();
  175. while (std::strchr("=!~<>-", _last_char)) {
  176. op.push_back(_last_char);
  177. _incrementCursor();
  178. }
  179.  
  180. Token::Kind kind = Token::Kind::Unknown;
  181. if (op == "==") { kind = Token::Kind::Equal; }
  182. else if (op == "!=") { kind = Token::Kind::Inequal; }
  183. else if (op == "~=") { kind = Token::Kind::LooselyEqual; }
  184. else if (op == "~!=") { kind = Token::Kind::LooselyInequal; }
  185. else if (op == "<") { kind = Token::Kind::SmallerThan; }
  186. else if (op == ">") { kind = Token::Kind::GreaterThan; }
  187. else if (op == "<=") { kind = Token::Kind::SmallerThanOrEq; }
  188. else if (op == ">=") { kind = Token::Kind::GreaterThanOrEq; }
  189. else if (op == "->") { kind = Token::Kind::Arrow; }
  190.  
  191. return Lexer::Token(kind, op, location);
  192. }
  193.  
  194. // Match other characters.
  195. if (_last_char == '{') {
  196. _incrementCursor();
  197. return Lexer::Token(Token::Kind::BlockStart, "{", _location);
  198. } else if (_last_char == '}') {
  199. _incrementCursor();
  200. return Lexer::Token(Token::Kind::BlockEnd, "}", _location);
  201. } else if (_last_char == '(') {
  202. _incrementCursor();
  203. return Lexer::Token(Token::Kind::ParenOpen, "(", _location);
  204. } else if (_last_char == ')') {
  205. _incrementCursor();
  206. return Lexer::Token(Token::Kind::ParenClose, ")", _location);
  207. }
  208.  
  209. // No token found.
  210. Cursor location = _location;
  211. _incrementCursor();
  212. return Lexer::Token(Token::Kind::Unknown, "", location);
  213. }
  214. }
  215.  
  216. std::ostream& operator<<(std::ostream& stream, librayy::Cursor cur) {
  217. stream << cur.line << ":" << cur.column;
  218.  
  219. return stream;
  220. }
  221.  
  222. int main(int argc, const char* argv[]) {
  223. librayy::Lexer lexer("class Foo {\n"
  224. " foo() -> void {\n"
  225. " print(5 == 2)\n"
  226. " }\n"
  227. "}");
  228.  
  229. librayy::Lexer::Token tok;
  230. while ((tok = lexer.nextToken()).kind != librayy::Lexer::Token::Kind::End) {
  231. std::cout << tok.location << ":\t" << static_cast<int>(tok.kind) << "\t"
  232. << tok.value << std::endl;
  233. }
  234.  
  235. return 0;
  236. }
  237.  
Success #stdin #stdout 0s 2968KB
stdin
Standard input is empty
stdout
1:1:	2	class
1:7:	2	Foo
2:1:	7	{
2:4:	2	foo
2:8:	9	(
2:9:	10	)
2:10:	19	->
2:13:	2	void
3:1:	7	{
3:6:	2	print
3:12:	9	(
3:12:	6	5
3:14:	11	==
3:17:	6	2
4:1:	10	)
5:1:	8	}
5:2:	8	}