import re
# mapping of terminal token type to regex of characters that match that type
TOKEN_TYPES = {
name: re .compile ( pattern) for name, pattern in
{
'INT' : r'^\d +$' ,
'OPEN' : r'^\( $' ,
'CLOSE' : r'^\) $' ,
'PLUS' : r'^\+ $' ,
'TIMES' : r'^\* $' ,
} .items ( )
}
# grammar that describes the accepted sequences
# current token : [possible next token types]
GRAMMAR = {
'START' : { 'INT' , 'OPEN' } ,
'INT' : { 'INT' , 'CLOSE' , 'PLUS' , 'TIMES' , 'END' } ,
'OPEN' : { 'INT' } ,
'CLOSE' : { 'PLUS' , 'TIMES' , 'END' } ,
'PLUS' : { 'INT' , 'OPEN' } ,
'TIMES' : { 'INT' , 'OPEN' } ,
'END' : { } ,
}
def is_token( token_type: str , token : str , token_types= TOKEN_TYPES) -> bool :
"""check that a string token matches a given token type regex"""
assert token_type in token_types, 'Unrecognized token type %s' % token_type
return bool ( token_types[ token_type] .match ( token ) )
def test_is_token( ) :
assert is_token( 'INT' , '5' ) , "Token type INT didn't match 5"
assert not is_token( 'INT' , '+' ) , "Token type INT matched match +"
assert is_token( 'OPEN' , '(' ) , "Token type OPEN didn't match ("
assert not is_token( 'OPEN' , ')' ) , "Token type OPEN matched )"
assert is_token( 'PLUS' , '+' ) , "Token type PLUS didn't match +"
assert not is_token( 'PLUS' , '*' ) , "Token type PLUS matched *"
assert is_token( 'TIMES' , '*' ) , "Token type TIMES didn't match *"
assert not is_token( 'TIMES' , '+' ) , "Token type TIMES matched +"
class InputConsumer( object ) :
"""A stepper to store current position in a string buffer to aid parsing"""
def __init__ ( self , input_stream: str = '' , start_pos= 0 ) -> None :
self .input_stream = input_stream
self .current_idx = start_pos
def at( self , idx: int = None ) :
idx = idx if idx is not None else self .current_idx
if idx > len ( self .input_stream ) - 1 :
return ''
return self .input_stream [ idx]
def step( self ) -> None :
self .current_idx += 1
def back( self ) -> None :
self .current_idx -= 1
def __str__ ( self ) :
before = self .input_stream [ :self .current_idx ]
curr = self .input_stream [ self .current_idx ]
after = self .input_stream [ self .current_idx + 1 :]
return '{0}[{1}]{2}' .format ( before, curr, after)
def __repr__ ( self ) :
return str ( self )
class LexerNode( object ) :
"""A linked list of lexed nodes storing node type, token str, next, and prev"""
def __init__ ( self , token : str , token_type: str = None ,
prev_node: LexerNode= None , next_node: LexerNode= None ) -> None :
self .token = token
self .token_type = token_type
self .prev_node = prev_node
self .next_node = next_node
def __str__ ( self ) -> str :
return '<{0}{1}> {2}' .format (
self .token_type ,
': {0}' .format ( self .token ) if self .token else '' ,
self .next_node or '' ,
)
def __repr__ ( self ) -> str :
return str ( self )
def __eq__ ( self : LexerNode, other: object ) -> bool :
if not isinstance ( other, LexerNode) :
return NotImplemented
return (
self .token == other.token and
self .token_type == other.token_type and
self .next_node == other.next_node
)
def lex( input_stream: str ) -> LexerNode:
"""Lex an input string into a LexerNode chain of tokens"""
stream = InputConsumer( input_stream)
lexed_chain = LexerNode( '' , 'START' )
current_node = lexed_chain
while stream.at ( ) :
current_token = stream.at ( )
token_type = None
for name in TOKEN_TYPES.keys ( ) :
if is_token( name, current_token) :
token_type = name
break
stream.step ( )
new_node = LexerNode( current_token, token_type, current_node)
current_node.next_node = new_node
current_node = new_node
end_node = LexerNode( '' , 'END' , current_node)
current_node.next_node = end_node
current_node = end_node
return lexed_chain
def validate_lex( lexed_chain: LexerNode, grammar= GRAMMAR) -> bool :
"""Confirm that a lexed chain follows a given grammar"""
current_node = lexed_chain
while current_node.next_node :
token_type = current_node.token_type
next_token_type = current_node.next_node .token_type
if next_token_type not in grammar[ token_type] :
return False
current_node = current_node.next_node
return True
def test_lexing( ) :
true_cases = [ '5' , '55' , '(5)' ,
'(5+5)' , '5+5' , '(5)+5' , '5+(5)' ,
'(5)+(5)' , '(5)+5+(5)' ,
'5)+5+(5' ] # this case is an invalid parse, but a valid lex
for case in true_cases:
chain = lex( case)
assert validate_lex( chain) , 'Got invalid lex for {0} => {1}' .format ( case, chain)
false_cases = [ '(' , ')' , '()' , ')(' , '((' , '))' ,
'+' , '5+' , '+5' ,
'(+)' , '(+' , '+)' , ')+' , '+(' , ')+(' ,
'5+5+' , '+5+5+' , '+5+5' ]
for case in false_cases:
chain = lex( case)
assert not validate_lex( chain) , 'Got valid lex for {0} => {1}' .format ( case, chain)
if __name__ == '__main__' :
test_is_token( )
test_lexing( )
aW1wb3J0IHJlCgojIG1hcHBpbmcgb2YgdGVybWluYWwgdG9rZW4gdHlwZSB0byByZWdleCBvZiBjaGFyYWN0ZXJzIHRoYXQgbWF0Y2ggdGhhdCB0eXBlClRPS0VOX1RZUEVTID0gewogICAgbmFtZTogcmUuY29tcGlsZShwYXR0ZXJuKSBmb3IgbmFtZSwgcGF0dGVybiBpbgogICAgewogICAgICAgICdJTlQnOiByJ15cZCskJywKICAgICAgICAnT1BFTic6IHInXlwoJCcsCiAgICAgICAgJ0NMT1NFJzogcideXCkkJywKICAgICAgICAnUExVUyc6IHInXlwrJCcsCiAgICAgICAgJ1RJTUVTJzogcideXCokJywKICAgIH0uaXRlbXMoKQp9CgojIGdyYW1tYXIgdGhhdCBkZXNjcmliZXMgdGhlIGFjY2VwdGVkIHNlcXVlbmNlcwojIGN1cnJlbnQgdG9rZW4gOiBbcG9zc2libGUgbmV4dCB0b2tlbiB0eXBlc10KR1JBTU1BUiA9IHsKICAgICdTVEFSVCc6IHsnSU5UJywgJ09QRU4nfSwKICAgICdJTlQnOiB7J0lOVCcsICdDTE9TRScsICdQTFVTJywgJ1RJTUVTJywgJ0VORCd9LAogICAgJ09QRU4nOiB7J0lOVCd9LAogICAgJ0NMT1NFJzogeydQTFVTJywgJ1RJTUVTJywgJ0VORCd9LAogICAgJ1BMVVMnOiB7J0lOVCcsICdPUEVOJ30sCiAgICAnVElNRVMnOiB7J0lOVCcsICdPUEVOJ30sCiAgICAnRU5EJzoge30sCn0KCgpkZWYgaXNfdG9rZW4odG9rZW5fdHlwZTogc3RyLCB0b2tlbjogc3RyLCB0b2tlbl90eXBlcz1UT0tFTl9UWVBFUykgLT4gYm9vbDoKICAgICIiImNoZWNrIHRoYXQgYSBzdHJpbmcgdG9rZW4gbWF0Y2hlcyBhIGdpdmVuIHRva2VuIHR5cGUgcmVnZXgiIiIKCiAgICBhc3NlcnQgdG9rZW5fdHlwZSBpbiB0b2tlbl90eXBlcywgJ1VucmVjb2duaXplZCB0b2tlbiB0eXBlICVzJyAlIHRva2VuX3R5cGUKICAgIHJldHVybiBib29sKHRva2VuX3R5cGVzW3Rva2VuX3R5cGVdLm1hdGNoKHRva2VuKSkKCmRlZiB0ZXN0X2lzX3Rva2VuKCk6CiAgICBhc3NlcnQgaXNfdG9rZW4oJ0lOVCcsICc1JyksICJUb2tlbiB0eXBlIElOVCBkaWRuJ3QgbWF0Y2ggNSIKICAgIGFzc2VydCBub3QgaXNfdG9rZW4oJ0lOVCcsICcrJyksICJUb2tlbiB0eXBlIElOVCBtYXRjaGVkIG1hdGNoICsiCiAgICBhc3NlcnQgaXNfdG9rZW4oJ09QRU4nLCAnKCcpLCAiVG9rZW4gdHlwZSBPUEVOIGRpZG4ndCBtYXRjaCAoIgogICAgYXNzZXJ0IG5vdCBpc190b2tlbignT1BFTicsICcpJyksICJUb2tlbiB0eXBlIE9QRU4gbWF0Y2hlZCApIgogICAgYXNzZXJ0IGlzX3Rva2VuKCdQTFVTJywgJysnKSwgIlRva2VuIHR5cGUgUExVUyBkaWRuJ3QgbWF0Y2ggKyIKICAgIGFzc2VydCBub3QgaXNfdG9rZW4oJ1BMVVMnLCAnKicpLCAiVG9rZW4gdHlwZSBQTFVTIG1hdGNoZWQgKiIKICAgIGFzc2VydCBpc190b2tlbignVElNRVMnLCAnKicpLCAiVG9rZW4gdHlwZSBUSU1FUyBkaWRuJ3QgbWF0Y2ggKiIKICAgIGFzc2VydCBub3QgaXNfdG9rZW4oJ1RJTUVTJywgJysnKSwgIlRva2VuIHR5cGUgVElNRVMgbWF0Y2hlZCArIgoKCmNsYXNzIElucHV0Q29uc3VtZXIob2JqZWN0KToKICAgICIiIkEgc3RlcHBlciB0byBzdG9yZSBjdXJyZW50IHBvc2l0aW9uIGluIGEgc3RyaW5nIGJ1ZmZlciB0byBhaWQgcGFyc2luZyIiIgoKICAgIGRlZiBfX2luaXRfXyhzZWxmLCBpbnB1dF9zdHJlYW06IHN0cj0nJywgc3RhcnRfcG9zPTApIC0+IE5vbmU6CiAgICAgICAgc2VsZi5pbnB1dF9zdHJlYW0gPSBpbnB1dF9zdHJlYW0KICAgICAgICBzZWxmLmN1cnJlbnRfaWR4ID0gc3RhcnRfcG9zCgogICAgZGVmIGF0KHNlbGYsIGlkeDogaW50PU5vbmUpOgogICAgICAgIGlkeCA9IGlkeCBpZiBpZHggaXMgbm90IE5vbmUgZWxzZSBzZWxmLmN1cnJlbnRfaWR4CiAgICAgICAgaWYgaWR4ID4gbGVuKHNlbGYuaW5wdXRfc3RyZWFtKSAtIDE6CiAgICAgICAgICAgIHJldHVybiAnJwogICAgICAgIHJldHVybiBzZWxmLmlucHV0X3N0cmVhbVtpZHhdCgogICAgZGVmIHN0ZXAoc2VsZikgLT4gTm9uZToKICAgICAgICBzZWxmLmN1cnJlbnRfaWR4ICs9IDEKCiAgICBkZWYgYmFjayhzZWxmKSAtPiBOb25lOgogICAgICAgIHNlbGYuY3VycmVudF9pZHggLT0gMQoKICAgIGRlZiBfX3N0cl9fKHNlbGYpOgogICAgICAgIGJlZm9yZSA9IHNlbGYuaW5wdXRfc3RyZWFtWzpzZWxmLmN1cnJlbnRfaWR4XQogICAgICAgIGN1cnIgPSBzZWxmLmlucHV0X3N0cmVhbVtzZWxmLmN1cnJlbnRfaWR4XQogICAgICAgIGFmdGVyID0gc2VsZi5pbnB1dF9zdHJlYW1bc2VsZi5jdXJyZW50X2lkeCArIDE6XQogICAgICAgIHJldHVybiAnezB9W3sxfV17Mn0nLmZvcm1hdChiZWZvcmUsIGN1cnIsIGFmdGVyKQoKICAgIGRlZiBfX3JlcHJfXyhzZWxmKToKICAgICAgICByZXR1cm4gc3RyKHNlbGYpCgoKY2xhc3MgTGV4ZXJOb2RlKG9iamVjdCk6CiAgICAiIiJBIGxpbmtlZCBsaXN0IG9mIGxleGVkIG5vZGVzIHN0b3Jpbmcgbm9kZSB0eXBlLCB0b2tlbiBzdHIsIG5leHQsIGFuZCBwcmV2IiIiCgogICAgZGVmIF9faW5pdF9fKHNlbGYsIHRva2VuOiBzdHIsIHRva2VuX3R5cGU6IHN0cj1Ob25lLAogICAgICAgICAgICAgICAgIHByZXZfbm9kZTogTGV4ZXJOb2RlPU5vbmUsIG5leHRfbm9kZTogTGV4ZXJOb2RlPU5vbmUpIC0+IE5vbmU6CiAgICAgICAgc2VsZi50b2tlbiA9IHRva2VuCiAgICAgICAgc2VsZi50b2tlbl90eXBlID0gdG9rZW5fdHlwZQogICAgICAgIHNlbGYucHJldl9ub2RlID0gcHJldl9ub2RlCiAgICAgICAgc2VsZi5uZXh0X25vZGUgPSBuZXh0X25vZGUKCgogICAgZGVmIF9fc3RyX18oc2VsZikgLT4gc3RyOgogICAgICAgIHJldHVybiAnPHswfXsxfT4gezJ9Jy5mb3JtYXQoCiAgICAgICAgICAgIHNlbGYudG9rZW5fdHlwZSwKICAgICAgICAgICAgJzogezB9Jy5mb3JtYXQoc2VsZi50b2tlbikgaWYgc2VsZi50b2tlbiBlbHNlICcnLAogICAgICAgICAgICBzZWxmLm5leHRfbm9kZSBvciAnJywKICAgICAgICApCgogICAgZGVmIF9fcmVwcl9fKHNlbGYpIC0+IHN0cjoKICAgICAgICByZXR1cm4gc3RyKHNlbGYpCgogICAgZGVmIF9fZXFfXyhzZWxmOiBMZXhlck5vZGUsIG90aGVyOiBvYmplY3QpIC0+IGJvb2w6CiAgICAgICAgaWYgbm90IGlzaW5zdGFuY2Uob3RoZXIsIExleGVyTm9kZSk6CiAgICAgICAgICAgIHJldHVybiBOb3RJbXBsZW1lbnRlZAogICAgICAgIHJldHVybiAoCiAgICAgICAgICAgIHNlbGYudG9rZW4gPT0gb3RoZXIudG9rZW4gYW5kCiAgICAgICAgICAgIHNlbGYudG9rZW5fdHlwZSA9PSBvdGhlci50b2tlbl90eXBlIGFuZAogICAgICAgICAgICBzZWxmLm5leHRfbm9kZSA9PSBvdGhlci5uZXh0X25vZGUKICAgICAgICApCgoKZGVmIGxleChpbnB1dF9zdHJlYW06IHN0cikgLT4gTGV4ZXJOb2RlOgogICAgIiIiTGV4IGFuIGlucHV0IHN0cmluZyBpbnRvIGEgTGV4ZXJOb2RlIGNoYWluIG9mIHRva2VucyIiIgoKICAgIHN0cmVhbSA9IElucHV0Q29uc3VtZXIoaW5wdXRfc3RyZWFtKQogICAgbGV4ZWRfY2hhaW4gPSBMZXhlck5vZGUoJycsICdTVEFSVCcpCiAgICBjdXJyZW50X25vZGUgPSBsZXhlZF9jaGFpbgoKICAgIHdoaWxlIHN0cmVhbS5hdCgpOgogICAgICAgIGN1cnJlbnRfdG9rZW4gPSBzdHJlYW0uYXQoKQogICAgICAgIHRva2VuX3R5cGUgPSBOb25lCgogICAgICAgIGZvciBuYW1lIGluIFRPS0VOX1RZUEVTLmtleXMoKToKICAgICAgICAgICAgaWYgaXNfdG9rZW4obmFtZSwgY3VycmVudF90b2tlbik6CiAgICAgICAgICAgICAgICB0b2tlbl90eXBlID0gbmFtZQogICAgICAgICAgICAgICAgYnJlYWsKCiAgICAgICAgc3RyZWFtLnN0ZXAoKQogICAgICAgIG5ld19ub2RlID0gTGV4ZXJOb2RlKGN1cnJlbnRfdG9rZW4sIHRva2VuX3R5cGUsIGN1cnJlbnRfbm9kZSkKICAgICAgICBjdXJyZW50X25vZGUubmV4dF9ub2RlID0gbmV3X25vZGUKICAgICAgICBjdXJyZW50X25vZGUgPSBuZXdfbm9kZQoKICAgIGVuZF9ub2RlID0gTGV4ZXJOb2RlKCcnLCAnRU5EJywgY3VycmVudF9ub2RlKQogICAgY3VycmVudF9ub2RlLm5leHRfbm9kZSA9IGVuZF9ub2RlCiAgICBjdXJyZW50X25vZGUgPSBlbmRfbm9kZQoKICAgIHJldHVybiBsZXhlZF9jaGFpbgoKCmRlZiB2YWxpZGF0ZV9sZXgobGV4ZWRfY2hhaW46IExleGVyTm9kZSwgZ3JhbW1hcj1HUkFNTUFSKSAtPiBib29sOgogICAgIiIiQ29uZmlybSB0aGF0IGEgbGV4ZWQgY2hhaW4gZm9sbG93cyBhIGdpdmVuIGdyYW1tYXIiIiIKCiAgICBjdXJyZW50X25vZGUgPSBsZXhlZF9jaGFpbgogICAgd2hpbGUgY3VycmVudF9ub2RlLm5leHRfbm9kZToKICAgICAgICB0b2tlbl90eXBlID0gY3VycmVudF9ub2RlLnRva2VuX3R5cGUKICAgICAgICBuZXh0X3Rva2VuX3R5cGUgPSBjdXJyZW50X25vZGUubmV4dF9ub2RlLnRva2VuX3R5cGUKCiAgICAgICAgaWYgbmV4dF90b2tlbl90eXBlIG5vdCBpbiBncmFtbWFyW3Rva2VuX3R5cGVdOgogICAgICAgICAgICByZXR1cm4gRmFsc2UKCiAgICAgICAgY3VycmVudF9ub2RlID0gY3VycmVudF9ub2RlLm5leHRfbm9kZQoKICAgIHJldHVybiBUcnVlCgpkZWYgdGVzdF9sZXhpbmcoKToKICAgIHRydWVfY2FzZXMgPSBbJzUnLCAnNTUnLCAnKDUpJywKICAgICAgICAgICAgICAgICAgJyg1KzUpJywgJzUrNScsICcoNSkrNScsICc1Kyg1KScsCiAgICAgICAgICAgICAgICAgICcoNSkrKDUpJywgJyg1KSs1Kyg1KScsCiAgICAgICAgICAgICAgICAgICc1KSs1Kyg1J10gICMgdGhpcyBjYXNlIGlzIGFuIGludmFsaWQgcGFyc2UsIGJ1dCBhIHZhbGlkIGxleAoKICAgIGZvciBjYXNlIGluIHRydWVfY2FzZXM6CiAgICAgICAgY2hhaW4gPSBsZXgoY2FzZSkKICAgICAgICBhc3NlcnQgdmFsaWRhdGVfbGV4KGNoYWluKSwgJ0dvdCBpbnZhbGlkIGxleCBmb3IgezB9ID0+IHsxfScuZm9ybWF0KGNhc2UsIGNoYWluKQoKICAgIGZhbHNlX2Nhc2VzID0gWycoJywgJyknLCAnKCknLCAnKSgnLCAnKCgnLCAnKSknLAogICAgICAgICAgICAgICAgICAgJysnLCAnNSsnLCAnKzUnLAogICAgICAgICAgICAgICAgICAgJygrKScsICcoKycsICcrKScsICcpKycsICcrKCcsICcpKygnLAogICAgICAgICAgICAgICAgICAgJzUrNSsnLCAnKzUrNSsnLCAnKzUrNSddCgogICAgZm9yIGNhc2UgaW4gZmFsc2VfY2FzZXM6CiAgICAgICAgY2hhaW4gPSBsZXgoY2FzZSkKICAgICAgICBhc3NlcnQgbm90IHZhbGlkYXRlX2xleChjaGFpbiksICdHb3QgdmFsaWQgbGV4IGZvciB7MH0gPT4gezF9Jy5mb3JtYXQoY2FzZSwgY2hhaW4pCgoKaWYgX19uYW1lX18gPT0gJ19fbWFpbl9fJzoKICAgIHRlc3RfaXNfdG9rZW4oKQogICAgdGVzdF9sZXhpbmcoKQ==