from __future__ import annotations  # without this, the forward references to LexerNode in its own method signatures raise NameError at class-definition time
import re

# mapping of terminal token type to regex of characters that match that type
TOKEN_TYPES = {
    name: re.compile(pattern) for name, pattern in
    {
        'INT': r'^\d+$',
        'OPEN': r'^\($',
        'CLOSE': r'^\)$',
        'PLUS': r'^\+$',
        'TIMES': r'^\*$',
    }.items()
}

# grammar that describes the accepted sequences
# current token type : {allowed next token types}
GRAMMAR = {
    'START': {'INT', 'OPEN'},
    'INT': {'INT', 'CLOSE', 'PLUS', 'TIMES', 'END'},
    'OPEN': {'INT'},
    'CLOSE': {'PLUS', 'TIMES', 'END'},
    'PLUS': {'INT', 'OPEN'},
    'TIMES': {'INT', 'OPEN'},
    'END': set(),  # END is a sink: nothing may follow it (an empty set, not an empty dict)
}
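
# Illustrative walk through the table above (an assumed example, not part of
# the original tests): "5+5" lexes to START -> INT -> PLUS -> INT -> END, and
# every adjacent pair is listed in GRAMMAR, so the sequence is accepted;
# "5+" ends with PLUS -> END, which is not allowed after PLUS, so it is rejected.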


def is_token(token_type: str, token: str, token_types=TOKEN_TYPES) -> bool:
    """check that a string token matches a given token type regex"""

    assert token_type in token_types, 'Unrecognized token type %s' % token_type
    return bool(token_types[token_type].match(token))


def test_is_token():
    assert is_token('INT', '5'), "Token type INT didn't match 5"
    assert not is_token('INT', '+'), "Token type INT matched +"
    assert is_token('OPEN', '('), "Token type OPEN didn't match ("
    assert not is_token('OPEN', ')'), "Token type OPEN matched )"
    assert is_token('PLUS', '+'), "Token type PLUS didn't match +"
    assert not is_token('PLUS', '*'), "Token type PLUS matched *"
    assert is_token('TIMES', '*'), "Token type TIMES didn't match *"
    assert not is_token('TIMES', '+'), "Token type TIMES matched +"


class InputConsumer(object):
    """A stepper to store current position in a string buffer to aid parsing"""

    def __init__(self, input_stream: str = '', start_pos=0) -> None:
        self.input_stream = input_stream
        self.current_idx = start_pos

    def at(self, idx: int = None):
        """return the character at idx (default: current position), or '' past the end"""
        idx = idx if idx is not None else self.current_idx
        if idx > len(self.input_stream) - 1:
            return ''
        return self.input_stream[idx]

    def step(self) -> None:
        self.current_idx += 1

    def back(self) -> None:
        self.current_idx -= 1

    def __str__(self):
        before = self.input_stream[:self.current_idx]
        curr = self.at()  # at() rather than direct indexing, so a position past the end doesn't raise IndexError
        after = self.input_stream[self.current_idx + 1:]
        return '{0}[{1}]{2}'.format(before, curr, after)

    def __repr__(self):
        return str(self)

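
# Quick example of the stepper (an assumed illustration, not exercised by the
# tests below):
#   stream = InputConsumer('5+5')
#   stream.at()    # '5'
#   stream.step()
#   stream.at()    # '+'
#   str(stream)    # '5[+]5'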


class LexerNode(object):
    """A linked list of lexed nodes storing node type, token str, next, and prev"""

    def __init__(self, token: str, token_type: str = None,
                 prev_node: LexerNode = None, next_node: LexerNode = None) -> None:
        self.token = token
        self.token_type = token_type
        self.prev_node = prev_node
        self.next_node = next_node

    def __str__(self) -> str:
        return '<{0}{1}> {2}'.format(
            self.token_type,
            ': {0}'.format(self.token) if self.token else '',
            self.next_node or '',
        )

    def __repr__(self) -> str:
        return str(self)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, LexerNode):
            return NotImplemented
        return (
            self.token == other.token and
            self.token_type == other.token_type and
            self.next_node == other.next_node
        )


def lex(input_stream: str) -> LexerNode:
    """Lex an input string into a LexerNode chain of tokens"""

    stream = InputConsumer(input_stream)
    lexed_chain = LexerNode('', 'START')  # sentinel head node
    current_node = lexed_chain

    while stream.at():
        current_token = stream.at()
        token_type = None

        # take the first token type whose regex matches the current character
        for name in TOKEN_TYPES.keys():
            if is_token(name, current_token):
                token_type = name
                break

        stream.step()
        new_node = LexerNode(current_token, token_type, current_node)
        current_node.next_node = new_node
        current_node = new_node

    # terminate the chain with an END sentinel
    end_node = LexerNode('', 'END', current_node)
    current_node.next_node = end_node
    current_node = end_node

    return lexed_chain
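
# For reference (derived from LexerNode.__str__ above; not asserted anywhere):
#   print(lex('5+5'))   # <START> <INT: 5> <PLUS: +> <INT: 5> <END>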


def validate_lex(lexed_chain: LexerNode, grammar=GRAMMAR) -> bool:
    """Confirm that a lexed chain follows a given grammar"""

    current_node = lexed_chain
    while current_node.next_node:
        token_type = current_node.token_type
        next_token_type = current_node.next_node.token_type

        if next_token_type not in grammar[token_type]:
            return False

        current_node = current_node.next_node

    return True
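
# Minimal usage sketch tying the pieces together (illustrative only; this
# helper is not part of the original program and is never called by the tests):
def demo():
    chain = lex('(5+5)*2')
    print(chain)                 # the lexed token chain, START through END
    print(validate_lex(chain))   # True: every adjacent pair of token types is allowed by GRAMMAR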


def test_lexing():
    true_cases = ['5', '55', '(5)',
                  '(5+5)', '5+5', '(5)+5', '5+(5)',
                  '(5)+(5)', '(5)+5+(5)',
                  '5)+5+(5']  # this case is an invalid parse, but a valid lex

    for case in true_cases:
        chain = lex(case)
        assert validate_lex(chain), 'Got invalid lex for {0} => {1}'.format(case, chain)

    false_cases = ['(', ')', '()', ')(', '((', '))',
                   '+', '5+', '+5',
                   '(+)', '(+', '+)', ')+', '+(', ')+(',
                   '5+5+', '+5+5+', '+5+5']

    for case in false_cases:
        chain = lex(case)
        assert not validate_lex(chain), 'Got valid lex for {0} => {1}'.format(case, chain)


if __name__ == '__main__':
    test_is_token()
    test_lexing()