from collections import Counter
import heapq
class Node:
def __init__ ( self , value) :
self .children = [ ]
self .value = value
def __lt__ ( self , other) :
return self .value < other.value
def branch_map( self ) :
"""
returns a dictionary that describes the path from the root node to each leaf node.
for example, if the root's second child's first child's value is "X", then there will be a pair `10: "X"`
"""
ret = { "" : self .value }
for idx, child in enumerate ( self .children ) :
for k, v in child.branch_map ( ) .iteritems ( ) :
ret[ str ( idx) +k] = v
return ret
def frequency( data) :
c = Counter( data)
return { char: count/float ( len ( data) ) for char, count in c.iteritems ( ) }
def gen_huffman_tree( data) :
nodes = [ Node( ( freq, value) ) for value, freq in frequency( data) .iteritems ( ) ]
heapq .heapify ( nodes)
while len ( nodes) > 1 :
a = heapq .heappop ( nodes)
b = heapq .heappop ( nodes)
combined = Node( ( a.value [ 0 ] +b.value [ 0 ] , "*" ) )
combined.children .extend ( [ a, b] )
heapq .heappush ( nodes, combined)
return nodes[ 0 ]
def gen_huffman_coding( data) :
tree = gen_huffman_tree( data)
codes = { }
for code , ( freq, char) in tree.branch_map ( ) .iteritems ( ) :
codes[ char] = code
return codes
data = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
table = gen_huffman_coding( data)
for char in table:
print char, table[ char]
ZnJvbSBjb2xsZWN0aW9ucyBpbXBvcnQgQ291bnRlcgppbXBvcnQgaGVhcHEKCmNsYXNzIE5vZGU6CiAgICBkZWYgX19pbml0X18oc2VsZiwgdmFsdWUpOgogICAgICAgIHNlbGYuY2hpbGRyZW4gPSBbXQogICAgICAgIHNlbGYudmFsdWUgPSB2YWx1ZQogICAgZGVmIF9fbHRfXyhzZWxmLCBvdGhlcik6CiAgICAgICAgcmV0dXJuIHNlbGYudmFsdWUgPCBvdGhlci52YWx1ZQogICAgZGVmIGJyYW5jaF9tYXAoc2VsZik6CiAgICAgICAgIiIiCiAgICAgICAgICAgIHJldHVybnMgYSBkaWN0aW9uYXJ5IHRoYXQgZGVzY3JpYmVzIHRoZSBwYXRoIGZyb20gdGhlIHJvb3Qgbm9kZSB0byBlYWNoIGxlYWYgbm9kZS4KICAgICAgICAgICAgZm9yIGV4YW1wbGUsIGlmIHRoZSByb290J3Mgc2Vjb25kIGNoaWxkJ3MgZmlyc3QgY2hpbGQncyB2YWx1ZSBpcyAiWCIsIHRoZW4gdGhlcmUgd2lsbCBiZSBhIHBhaXIgYDEwOiAiWCJgCiAgICAgICAgIiIiCiAgICAgICAgcmV0ID0geyIiOiBzZWxmLnZhbHVlfQogICAgICAgIGZvciBpZHgsIGNoaWxkIGluIGVudW1lcmF0ZShzZWxmLmNoaWxkcmVuKToKICAgICAgICAgICAgZm9yIGssdiBpbiBjaGlsZC5icmFuY2hfbWFwKCkuaXRlcml0ZW1zKCk6CiAgICAgICAgICAgICAgICByZXRbc3RyKGlkeCkra10gPSB2CiAgICAgICAgcmV0dXJuIHJldAoKZGVmIGZyZXF1ZW5jeShkYXRhKToKICAgIGMgPSBDb3VudGVyKGRhdGEpCiAgICByZXR1cm4ge2NoYXI6IGNvdW50L2Zsb2F0KGxlbihkYXRhKSkgZm9yIGNoYXIsIGNvdW50IGluIGMuaXRlcml0ZW1zKCl9CgpkZWYgZ2VuX2h1ZmZtYW5fdHJlZShkYXRhKToKICAgIG5vZGVzID0gW05vZGUoKGZyZXEsIHZhbHVlKSkgZm9yIHZhbHVlLCBmcmVxIGluIGZyZXF1ZW5jeShkYXRhKS5pdGVyaXRlbXMoKV0KICAgIGhlYXBxLmhlYXBpZnkobm9kZXMpCiAgICB3aGlsZSBsZW4obm9kZXMpID4gMToKICAgICAgICBhID0gaGVhcHEuaGVhcHBvcChub2RlcykKICAgICAgICBiID0gaGVhcHEuaGVhcHBvcChub2RlcykKICAgICAgICBjb21iaW5lZCA9IE5vZGUoKGEudmFsdWVbMF0rYi52YWx1ZVswXSwgIioiKSkKICAgICAgICBjb21iaW5lZC5jaGlsZHJlbi5leHRlbmQoW2EsYl0pCiAgICAgICAgaGVhcHEuaGVhcHB1c2gobm9kZXMsIGNvbWJpbmVkKQogICAgcmV0dXJuIG5vZGVzWzBdCgpkZWYgZ2VuX2h1ZmZtYW5fY29kaW5nKGRhdGEpOgogICAgdHJlZSA9IGdlbl9odWZmbWFuX3RyZWUoZGF0YSkKICAgIGNvZGVzID0ge30KICAgIGZvciBjb2RlLCAoZnJlcSwgY2hhcikgaW4gdHJlZS5icmFuY2hfbWFwKCkuaXRlcml0ZW1zKCk6CiAgICAgICAgY29kZXNbY2hhcl0gPSBjb2RlCiAgICByZXR1cm4gY29kZXMKCmRhdGEgPSAiTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFtZXQsIGNvbnNlY3RldHVyIGFkaXBpc2NpbmcgZWxpdCwgc2VkIGRvIGVpdXNtb2QgdGVtcG9yIGluY2lkaWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWduYSBhbGlxdWEuIFV0IGVuaW0gYWQgbWluaW0gdmVuaWFtLCBxdWlzIG5vc3RydWQgZXhlcmNpdGF0aW9uIHVsbGFtY28gbGFib3JpcyBuaXNpIHV0IGFsaXF1aXAgZXggZWEgY29tbW9kbyBjb25zZXF1YXQuIER1aXMgYXV0ZSBpcnVyZSBkb2xvciBpbiByZXByZWhlbmRlcml0IGluIHZvbHVwdGF0ZSB2ZWxpdCBlc3NlIGNpbGx1bSBkb2xvcmUgZXUgZnVnaWF0IG51bGxhIHBhcmlhdHVyLiBFeGNlcHRldXIgc2ludCBvY2NhZWNhdCBjdXBpZGF0YXQgbm9uIHByb2lkZW50LCBzdW50IGluIGN1bHBhIHF1aSBvZmZpY2lhIGRlc2VydW50IG1vbGxpdCBhbmltIGlkIGVzdCBsYWJvcnVtLiIKCnRhYmxlID0gZ2VuX2h1ZmZtYW5fY29kaW5nKGRhdGEpCmZvciBjaGFyIGluIHRhYmxlOgogICAgcHJpbnQgY2hhciwgdGFibGVbY2hhcl0=