fork download
  1. #include <iostream>
  2. #include <unordered_map>
  3. #include <map>
  4. #include <vector>
  5. #include <climits>
  6. using namespace std;
  // Node of the binary tree used for Huffman coding.
  // The struct declares no destructor, so it never frees its children.
  struct Node{
    int x;    // payload: in this file leaves hold a character code and
              // internal nodes hold a combined weight (see huffman_tree)
    Node* l;  // left child, or nullptr when absent
    Node* r;  // right child, or nullptr when absent

    // Every argument is optional; a default-constructed node is a leaf with x == 0.
    Node(int X=0, Node* L=nullptr, Node* R=nullptr) : x(X), l(L), r(R) {}
  };
  13. Node* huffman_tree(const string &s){
  14. if(!s.size()) return NULL;
  15. unordered_map<char,int> h;
  16. for(char c: s)
  17. h[c]++;
  18. multimap<int, Node*> v;
  19. for(auto p: h)
  20. v.insert({p.second, new Node(p.first)});
  21. pair<int,Node*> p = *v.begin();
  22. v.erase(v.begin());
  23. while(!v.empty()){
  24. auto it = v.begin();
  25. v.insert({p.first+it->first, new Node(p.first+it->first,p.second,it->second)});
  26. v.erase(it);
  27. it = v.begin();
  28. p = *it;
  29. v.erase(it);
  30. }
  31. return p.second;
  32. }
  33. void huffman_helper(unordered_map<char,vector<bool>> &t, Node* node, vector<bool> v){
  34. if(!(node->l||node->r)){
  35. t[node->x] = v;
  36. return;
  37. }
  38. v.push_back(false);
  39. huffman_helper(t,node->l,v);
  40. v.back() = true;
  41. huffman_helper(t,node->r,v);
  42. }
  43. unordered_map<char,vector<bool>> huffman_table(Node* h){
  44. unordered_map<char,vector<bool>> t;
  45. if(!h) return t;
  46. huffman_helper(t,h,vector<bool>());
  47. return t;
  48. }
  // Writes `c` to standard output, showing whitespace characters as visible
  // escape-like markers: space -> \s, tab -> \t, newline -> \n.
  // Every other character is written unchanged.
  void print(char c){
    switch(c){
      // Compare against a real space: '\s' is not a C++ escape sequence and
      // is treated as the letter 's' by compilers that accept it.
      case ' ':  std::cout<<"\\s"; break;
      case '\t': std::cout<<"\\t"; break;
      case '\n': std::cout<<"\\n"; break;
      default:   std::cout<<c;     break;
    }
  }
  55. void print_binary_tree(Node* node, int depth=0, bool leaf_check=false){
  56. // this will print a leaf node differently if the leaf_check is set to true
  57. // print format for leaf can be defined in else block below.
  58. if(!node) return;
  59. print_binary_tree(node->r, depth+1, leaf_check);
  60. string str = "";
  61. for(int i=0; i<depth; i++)
  62. cout<<"|\t";
  63. if(!leaf_check||node->l||node->r) cout<<node->x;
  64. else print(node->x); // print in this format if it's a leaf
  65. cout<<endl;
  66. print_binary_tree(node->l, depth+1, leaf_check);
  67. return;
  68. }
  // Writes every element of `v` to standard output in order, with no
  // separators, then ends the line and flushes the stream.
  void print(const std::vector<bool> &v){
    for(std::vector<bool>::size_type i = 0; i < v.size(); ++i)
      std::cout<<v[i];
    std::cout<<std::endl;
  }
  // Encodes `s` by concatenating the bit sequence that `t` assigns to each
  // of its characters, in order.
  //
  // s - text to encode
  // t - character -> bit-sequence table; it is only read. Lookup uses find()
  //     rather than operator[] so that characters without an entry do not
  //     get an empty code inserted into the caller's table.
  //
  // Characters that have no entry in `t` contribute no bits to the result.
  std::vector<bool> encode(const std::string &s, std::unordered_map<char,std::vector<bool>> &t){
    std::vector<bool> v;
    for(char c: s){
      const auto found = t.find(c);
      if(found == t.end()) continue;
      v.insert(v.end(), found->second.begin(), found->second.end());
    }
    return v;
  }
  81. string decode(const vector<bool> &v, Node* h){
  82. string s = "";
  83. Node* node = h;
  84. for(bool b: v){
  85. node = b?node->r:node->l;
  86. if(!(node->l||node->r)){
  87. s.push_back(char(node->x));
  88. node = h;
  89. }
  90. }
  91. return s;
  92. }
  93. int main() {
  94. // CAUTION: this method is an exception for the case
  95. // when all text contains just one unique character.
  96. // E.g. ccccccccccccccccccccccccccccccc, without any whitespace etc.
  97. string s;
  98. string p="";
  99. while(getline(cin,s))
  100. p += s + "\n";
  101. Node* h = huffman_tree(p);
  102. print_binary_tree(h,0,true);
  103. cout<<endl<<endl;
  104. unordered_map<char,vector<bool>> t = huffman_table(h);
  105. int sum = 0;
  106. for(auto p: t){
  107. print(p.first);
  108. cout<<'\t';
  109. print(p.second);
  110. sum += p.second.size();
  111. }
  112. cout<<endl;
  113. cout<<"Size of text: text_length*char_size = ";
  114. cout<<p.size()<<" * "<<sizeof(char)*CHAR_BIT<<" = ";
  115. cout<<p.size()*sizeof(char)*CHAR_BIT<<endl;
  116. cout<<"Size of table: uniq_chars*(char_size + avg_encoding_size) = ";
  117. cout<<t.size()<<" * ("<<sizeof(char)*CHAR_BIT<<" + "<<sum<<"/"<<t.size()<<") = ";
  118. cout<<t.size()*sizeof(char)*CHAR_BIT+sum<<endl;
  119. vector<bool> v = encode(p,t);
  120. cout<<"Size of encoded string: "<<v.size()<<endl;
  121. print(v);
  122. cout<<endl;
  123. string text = decode(v,h);
  124. cout<<"Check if the original text and decoded text matches? ";
  125. cout<<(p==text?"Yes.":"No.")<<endl;
  126. cout<<endl<<text<<endl;
  127. return 0;
  128. }
Success #stdin #stdout 0s 4252KB
stdin
In computer science and information theory, a Huffman code is a particular type of optimal prefix code that is commonly used for lossless data compression. The process of finding or using such a code proceeds by means of Huffman coding, an algorithm developed by David A. Huffman while he was a Sc.D. student at MIT, and published in the 1952 paper "A Method for the Construction of Minimum-Redundancy Codes".

The output from Huffman's algorithm can be viewed as a variable-length code table for encoding a source symbol (such as a character in a file). The algorithm derives this table from the estimated probability or frequency of occurrence (weight) for each possible value of the source symbol. As in other entropy encoding methods, more common symbols are generally represented using fewer bits than less common symbols. Huffman's method can be efficiently implemented, finding a code in time linear to the number of input weights if these weights are sorted. However, although optimal among methods encoding symbols separately, Huffman coding is not always optimal among all compression methods - it is replaced with arithmetic coding or asymmetric numeral systems if better compression ratio is required.
stdout
|	|	|	|	|	r
|	|	|	|	109
|	|	|	|	|	|	|	|	H
|	|	|	|	|	|	|	14
|	|	|	|	|	|	|	|	,
|	|	|	|	|	|	27
|	|	|	|	|	|	|	|	v
|	|	|	|	|	|	|	13
|	|	|	|	|	|	|	|	|	\n
|	|	|	|	|	|	|	|	6
|	|	|	|	|	|	|	|	|	A
|	|	|	|	|	52
|	|	|	|	|	|	p
|	|	|	202
|	|	|	|	|	m
|	|	|	|	93
|	|	|	|	|	c
|	|	393
|	|	|	 
|	707
|	|	|	|	o
|	|	|	163
|	|	|	|	|	|	g
|	|	|	|	|	41
|	|	|	|	|	|	|	|	|	M
|	|	|	|	|	|	|	|	6
|	|	|	|	|	|	|	|	|	-
|	|	|	|	|	|	|	10
|	|	|	|	|	|	|	|	|	|	5
|	|	|	|	|	|	|	|	|	2
|	|	|	|	|	|	|	|	|	|	x
|	|	|	|	|	|	|	|	4
|	|	|	|	|	|	|	|	|	|	2
|	|	|	|	|	|	|	|	|	2
|	|	|	|	|	|	|	|	|	|	S
|	|	|	|	|	|	20
|	|	|	|	|	|	|	w
|	|	|	|	80
|	|	|	|	|	|	y
|	|	|	|	|	39
|	|	|	|	|	|	b
|	|	314
|	|	|	|	|	l
|	|	|	|	78
|	|	|	|	|	d
|	|	|	151
|	|	|	|	|	h
|	|	|	|	73
|	|	|	|	|	f
1214
|	|	|	|	i
|	|	|	137
|	|	|	|	a
|	|	270
|	|	|	|	\s
|	|	|	133
|	|	|	|	t
|	507
|	|	|	|	|	|	|	.
|	|	|	|	|	|	18
|	|	|	|	|	|	|	|	|	|	1
|	|	|	|	|	|	|	|	|	2
|	|	|	|	|	|	|	|	|	|	R
|	|	|	|	|	|	|	|	4
|	|	|	|	|	|	|	|	|	I
|	|	|	|	|	|	|	8
|	|	|	|	|	|	|	|	|	D
|	|	|	|	|	|	|	|	4
|	|	|	|	|	|	|	|	|	C
|	|	|	|	|	33
|	|	|	|	|	|	|	|	|	"
|	|	|	|	|	|	|	|	4
|	|	|	|	|	|	|	|	|	'
|	|	|	|	|	|	|	8
|	|	|	|	|	|	|	|	|	(
|	|	|	|	|	|	|	|	4
|	|	|	|	|	|	|	|	|	)
|	|	|	|	|	|	15
|	|	|	|	|	|	|	|	T
|	|	|	|	|	|	|	7
|	|	|	|	|	|	|	|	|	q
|	|	|	|	|	|	|	|	3
|	|	|	|	|	|	|	|	|	9
|	|	|	|	63
|	|	|	|	|	u
|	|	|	126
|	|	|	|	n
|	|	237
|	|	|	e


,	11110110
v	11110101
A	111101000
p	111100
m	11101
c	11100
 	110
o	1011
g	101011
M	101010111
-	101010110
S	1010101000
H	11110111
w	1010100
y	101001
b	101000
l	10011
r	11111
C	001111000
q	001110001
"	001110111
'	001110110
)	001110100
n	0010
(	001110101
T	00111001
u	00110
e	000
x	1010101010
I	001111010
R	0011110110
i	0111
1	0011110111
.	0011111
D	001111001
\s	0101
t	0100
2	1010101001
a	0110
f	10000
\n	111101001
9	001110000
h	10001
5	1010101011
d	10010

Size of text: text_length*char_size = 1214 * 8 = 9712
Size of table: uniq_chars*(char_size + avg_encoding_size) = 45 * (8 + 317/45) = 677
Size of encoded string: 5396


Check if the original text and decoded text matches? Yes.

In computer science and information theory, a Huffman code is a particular type of optimal prefix code that is commonly used for lossless data compression. The process of finding or using such a code proceeds by means of Huffman coding, an algorithm developed by David A. Huffman while he was a Sc.D. student at MIT, and published in the 1952 paper "A Method for the Construction of Minimum-Redundancy Codes".

The output from Huffman's algorithm can be viewed as a variable-length code table for encoding a source symbol (such as a character in a file). The algorithm derives this table from the estimated probability or frequency of occurrence (weight) for each possible value of the source symbol. As in other entropy encoding methods, more common symbols are generally represented using fewer bits than less common symbols. Huffman's method can be efficiently implemented, finding a code in time linear to the number of input weights if these weights are sorted. However, although optimal among methods encoding symbols separately, Huffman coding is not always optimal among all compression methods - it is replaced with arithmetic coding or asymmetric numeral systems if better compression ratio is required.