10. Playing with Huffman coding#
10.1. Introduction#
References:
from dahuffman import HuffmanCodec
codec = HuffmanCodec.from_data("A fost odată ca-n povești, A fost ca niciodată, Din rude mari împărătești, O prea frumoasă fată.")
codec.print_code_table()
Bits Code Value Symbol
3 000 0 't'
4 0010 2 'f'
4 0011 3 'r'
5 01000 8 'ș'
5 01001 9 ','
5 01010 10 'c'
5 01011 11 'd'
4 0110 6 'i'
5 01110 14 'm'
5 01111 15 'n'
4 1000 8 'o'
5 10010 18 'p'
5 10011 19 's'
4 1010 10 'ă'
6 101100 44 'î'
7 1011010 90 _EOF
7 1011011 91 '-'
7 1011100 92 '.'
7 1011101 93 'D'
6 101111 47 'A'
3 110 6 ' '
7 1110000 112 'O'
7 1110001 113 'v'
6 111001 57 'u'
5 11101 29 'e'
4 1111 15 'a'
10.2. From a large English text#
# Read a file at once
with open("data/texten.txt", "r") as f: # English text
lines = f.readlines()
contentEn = '\n'.join(lines)
codecEn = HuffmanCodec.from_data(contentEn)
codecEn.print_code_table()
Bits Code Value Symbol
4 0000 0 '\n'
6 000100 4 'w'
6 000101 5 'O'
8 00011000 24 'H'
8 00011001 25 'W'
9 000110100 52 '-'
12 000110101000 424 ')'
12 000110101001 425 'J'
11 00011010101 213 'j'
10 0001101011 107 'V'
8 00011011 27 'L'
6 000111 7 'c'
4 0010 2 't'
4 0011 3 'o'
6 010000 16 'f'
6 010001 17 '.'
7 0100100 36 'E'
7 0100101 37 'R'
6 010011 19 'I'
9 010100000 160 ';'
9 010100001 161 'F'
8 01010001 81 'C'
8 01010010 82 'G'
8 01010011 83 'N'
6 010101 21 'y'
7 0101100 44 'v'
8 01011010 90 'D'
11 01011011000 728 'q'
12 010110110010 1458 'z'
14 01011011001100 5836 _EOF
14 01011011001101 5837 '2'
13 0101101100111 2919 '0'
12 010110110100 1460 '"'
12 010110110101 1461 '1'
14 01011011011000 5848 '3'
14 01011011011001 5849 '5'
14 01011011011010 5850 '6'
14 01011011011011 5851 'X'
12 010110110111 1463 '9'
9 010110111 183 '!'
7 0101110 46 'b'
7 0101111 47 'T'
6 011000 24 'm'
8 01100100 100 "'"
8 01100101 101 'S'
7 0110011 51 'p'
6 011010 26 ','
7 0110110 54 'A'
11 01101110000 880 'x'
13 0110111000100 3524 '<'
13 0110111000101 3525 '>'
12 011011100011 1763 ':'
10 0110111001 441 'P'
9 011011101 221 '?'
8 01101111 111 'k'
4 0111 7 'e'
2 10 2 ' '
5 11000 24 'r'
5 11001 25 'i'
5 11010 26 'n'
5 11011 27 'h'
5 11100 28 's'
5 11101 29 'a'
6 111100 60 'l'
8 11110100 244 'B'
12 111101010000 3920 'K'
13 1111010100010 7842 'Z'
13 1111010100011 7843 '('
11 11110101001 1961 'U'
10 1111010101 981 'Y'
9 111101011 491 'M'
7 1111011 123 'g'
6 111110 62 'd'
6 111111 63 'u'
enc = codecEn.encode("How do you do!")
print(enc) # byte array
# Convert to binary string for display
enc_binarystr = ''.join([format(i, '08b') for i in enc])
print(enc_binarystr)
b'\x181/\x8eT\xff\xbe5\xba'
000110000011000100101111100011100101010011111111101111100011010110111010
9
72
10.3. From a large Romanian text#
# Read a file at once
with open("data/textro.txt", "r") as f: # Romanian text
lines = f.readlines()
contentRo = '\n'.join(lines)
codecRo = HuffmanCodec.from_data(contentRo)
codecRo.print_code_table()
Bits Code Value Symbol
4 0000 0 'u'
5 00010 2 ','
8 00011000 24 '!'
10 0001100100 100 'F'
10 0001100101 101 ':'
9 000110011 51 'U'
8 00011010 26 'E'
8 00011011 27 'D'
10 0001110000 112 'G'
11 00011100010 226 'k'
11 00011100011 227 '*'
9 000111001 57 'M'
8 00011101 29 'P'
7 0001111 15 'v'
3 001 1 'a'
4 0100 4 'n'
4 0101 5 'r'
5 01100 12 'd'
5 01101 13 'c'
5 01110 14 'o'
5 01111 15 's'
3 100 4 ' '
8 10100000 160 '.'
9 101000010 322 ';'
12 101000011000 2584 '1'
12 101000011001 2585 '3'
12 101000011010 2586 'Z'
13 1010000110110 5174 'x'
14 10100001101110 10350 _EOF
14 10100001101111 10351 '#'
10 1010000111 647 'j'
7 1010001 81 'C'
7 1010010 82 'b'
7 1010011 83 'f'
6 101010 42 'p'
7 1010110 86 'g'
9 101011100 348 'N'
9 101011101 349 'R'
8 10101111 175 'A'
5 10110 22 'l'
6 101110 46 '-'
9 101111000 376 'L'
10 1011110010 754 '?'
12 101111001100 3020 '"'
14 10111100110100 12084 '0'
14 10111100110101 12085 '5'
13 1011110011011 6043 '2'
14 10111100111000 12088 '8'
14 10111100111001 12089 'H'
13 1011110011101 6045 'J'
14 10111100111100 12092 'K'
14 10111100111101 12093 'Y'
14 10111100111110 12094 '['
14 10111100111111 12095 ']'
8 10111101 189 'I'
8 10111110 190 'z'
8 10111111 191 'S'
4 1100 12 'i'
4 1101 13 'e'
4 1110 14 '\n'
5 11110 30 't'
6 111110 62 'm'
11 11111100000 2016 'w'
11 11111100001 2017 'y'
10 1111110001 1009 'O'
9 111111001 505 'T'
10 1111110100 1012 'V'
10 1111110101 1013 'B'
9 111111011 507 'h'
7 1111111 127 '\t'
enc = codecRo.encode("Salutare!")
print(enc) # byte array
# Convert to binary string for display
enc_binarystr = ''.join([format(i, '08b') for i in enc])
print(enc_binarystr)
b'\xbf6\x0f\x15\xd1\x8a'
101111110011011000001111000101011101000110001010
10.4. Encoding strings#
Let’e encode:
the English text with the English code
the Romanian text with the Romanian code
the English text with the Romanian code
the Romanian text with the English code
enen = codecEn.encode(contentEn)
print(f"English text with English codec: {len(enen)} bytes")
roro = codecRo.encode(contentRo)
print(f"Romanian text with Romanian codec: {len(roro)} bytes")
# enro = codecEn.encode(contentRo)
# print(f"Romanian text with English codec: {len(enro)} bytes")
# roen = codecRo.encode(contentEn)
# print(f"English text with Romanian codec: {len(roen)} bytes")
English text with English codec: 9586 bytes
Romanian text with Romanian codec: 7222 bytes