10. Playing with Huffman coding#

10.1. Introduction#

References:

from dahuffman import HuffmanCodec

codec = HuffmanCodec.from_data("A fost odată ca-n povești, A fost ca niciodată, Din rude mari împărătești, O prea frumoasă fată.")
codec.print_code_table()
Bits Code    Value Symbol
   3 000         0 't'
   4 0010        2 'f'
   4 0011        3 'r'
   5 01000       8 'ș'
   5 01001       9 ','
   5 01010      10 'c'
   5 01011      11 'd'
   4 0110        6 'i'
   5 01110      14 'm'
   5 01111      15 'n'
   4 1000        8 'o'
   5 10010      18 'p'
   5 10011      19 's'
   4 1010       10 'ă'
   6 101100     44 'î'
   7 1011010    90 _EOF
   7 1011011    91 '-'
   7 1011100    92 '.'
   7 1011101    93 'D'
   6 101111     47 'A'
   3 110         6 ' '
   7 1110000   112 'O'
   7 1110001   113 'v'
   6 111001     57 'u'
   5 11101      29 'e'
   4 1111       15 'a'

10.2. From a large English text#

# Read a file at once
with open("data/texten.txt", "r") as f:     # English text
    lines = f.readlines()
contentEn = '\n'.join(lines)
codecEn = HuffmanCodec.from_data(contentEn)
codecEn.print_code_table()
Bits Code           Value Symbol
   4 0000               0 '\n'
   6 000100             4 'w'
   6 000101             5 'O'
   8 00011000          24 'H'
   8 00011001          25 'W'
   9 000110100         52 '-'
  12 000110101000     424 ')'
  12 000110101001     425 'J'
  11 00011010101      213 'j'
  10 0001101011       107 'V'
   8 00011011          27 'L'
   6 000111             7 'c'
   4 0010               2 't'
   4 0011               3 'o'
   6 010000            16 'f'
   6 010001            17 '.'
   7 0100100           36 'E'
   7 0100101           37 'R'
   6 010011            19 'I'
   9 010100000        160 ';'
   9 010100001        161 'F'
   8 01010001          81 'C'
   8 01010010          82 'G'
   8 01010011          83 'N'
   6 010101            21 'y'
   7 0101100           44 'v'
   8 01011010          90 'D'
  11 01011011000      728 'q'
  12 010110110010    1458 'z'
  14 01011011001100  5836 _EOF
  14 01011011001101  5837 '2'
  13 0101101100111   2919 '0'
  12 010110110100    1460 '"'
  12 010110110101    1461 '1'
  14 01011011011000  5848 '3'
  14 01011011011001  5849 '5'
  14 01011011011010  5850 '6'
  14 01011011011011  5851 'X'
  12 010110110111    1463 '9'
   9 010110111        183 '!'
   7 0101110           46 'b'
   7 0101111           47 'T'
   6 011000            24 'm'
   8 01100100         100 "'"
   8 01100101         101 'S'
   7 0110011           51 'p'
   6 011010            26 ','
   7 0110110           54 'A'
  11 01101110000      880 'x'
  13 0110111000100   3524 '<'
  13 0110111000101   3525 '>'
  12 011011100011    1763 ':'
  10 0110111001       441 'P'
   9 011011101        221 '?'
   8 01101111         111 'k'
   4 0111               7 'e'
   2 10                 2 ' '
   5 11000             24 'r'
   5 11001             25 'i'
   5 11010             26 'n'
   5 11011             27 'h'
   5 11100             28 's'
   5 11101             29 'a'
   6 111100            60 'l'
   8 11110100         244 'B'
  12 111101010000    3920 'K'
  13 1111010100010   7842 'Z'
  13 1111010100011   7843 '('
  11 11110101001     1961 'U'
  10 1111010101       981 'Y'
   9 111101011        491 'M'
   7 1111011          123 'g'
   6 111110            62 'd'
   6 111111            63 'u'
enc = codecEn.encode("How do you do!")
print(enc)  # byte array

# Convert to binary string for display
enc_binarystr = ''.join([format(i, '08b') for i in enc])
print(enc_binarystr)
b'\x181/\x8eT\xff\xbe5\xba'
000110000011000100101111100011100101010011111111101111100011010110111010
9
72

10.3. From a large Romanian text#

# Read a file at once
with open("data/textro.txt", "r") as f:     # Romanian text
    lines = f.readlines()
contentRo = '\n'.join(lines)
codecRo = HuffmanCodec.from_data(contentRo)
codecRo.print_code_table()
Bits Code           Value Symbol
   4 0000               0 'u'
   5 00010              2 ','
   8 00011000          24 '!'
  10 0001100100       100 'F'
  10 0001100101       101 ':'
   9 000110011         51 'U'
   8 00011010          26 'E'
   8 00011011          27 'D'
  10 0001110000       112 'G'
  11 00011100010      226 'k'
  11 00011100011      227 '*'
   9 000111001         57 'M'
   8 00011101          29 'P'
   7 0001111           15 'v'
   3 001                1 'a'
   4 0100               4 'n'
   4 0101               5 'r'
   5 01100             12 'd'
   5 01101             13 'c'
   5 01110             14 'o'
   5 01111             15 's'
   3 100                4 ' '
   8 10100000         160 '.'
   9 101000010        322 ';'
  12 101000011000    2584 '1'
  12 101000011001    2585 '3'
  12 101000011010    2586 'Z'
  13 1010000110110   5174 'x'
  14 10100001101110 10350 _EOF
  14 10100001101111 10351 '#'
  10 1010000111       647 'j'
   7 1010001           81 'C'
   7 1010010           82 'b'
   7 1010011           83 'f'
   6 101010            42 'p'
   7 1010110           86 'g'
   9 101011100        348 'N'
   9 101011101        349 'R'
   8 10101111         175 'A'
   5 10110             22 'l'
   6 101110            46 '-'
   9 101111000        376 'L'
  10 1011110010       754 '?'
  12 101111001100    3020 '"'
  14 10111100110100 12084 '0'
  14 10111100110101 12085 '5'
  13 1011110011011   6043 '2'
  14 10111100111000 12088 '8'
  14 10111100111001 12089 'H'
  13 1011110011101   6045 'J'
  14 10111100111100 12092 'K'
  14 10111100111101 12093 'Y'
  14 10111100111110 12094 '['
  14 10111100111111 12095 ']'
   8 10111101         189 'I'
   8 10111110         190 'z'
   8 10111111         191 'S'
   4 1100              12 'i'
   4 1101              13 'e'
   4 1110              14 '\n'
   5 11110             30 't'
   6 111110            62 'm'
  11 11111100000     2016 'w'
  11 11111100001     2017 'y'
  10 1111110001      1009 'O'
   9 111111001        505 'T'
  10 1111110100      1012 'V'
  10 1111110101      1013 'B'
   9 111111011        507 'h'
   7 1111111          127 '\t'
enc = codecRo.encode("Salutare!")
print(enc)  # byte array

# Convert to binary string for display
enc_binarystr = ''.join([format(i, '08b') for i in enc])
print(enc_binarystr)
b'\xbf6\x0f\x15\xd1\x8a'
101111110011011000001111000101011101000110001010

10.4. Encoding strings#

Let’e encode:

  • the English text with the English code

  • the Romanian text with the Romanian code

  • the English text with the Romanian code

  • the Romanian text with the English code

enen = codecEn.encode(contentEn)
print(f"English text with English codec: {len(enen)} bytes")

roro = codecRo.encode(contentRo)
print(f"Romanian text with Romanian codec: {len(roro)} bytes")

# enro = codecEn.encode(contentRo)
# print(f"Romanian text with English codec: {len(enro)} bytes")

# roen = codecRo.encode(contentEn)
# print(f"English text with Romanian codec: {len(roen)} bytes")
English text with English codec: 9586 bytes
Romanian text with Romanian codec: 7222 bytes