{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

cs1001.py , Tel Aviv University, Fall 2017-2018

\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Recitation 12\n", "\n", "We discussed two text-compression algorithms: Huffman code and Lempel-Ziv. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Code for printing several outputs in one cell (not part of the recitation):" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "from IPython.core.interactiveshell import InteractiveShell\n", "InteractiveShell.ast_node_interactivity = \"all\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Huffman compression" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generates a variable-length, prefix-free code, based on the character frequencies in corpus string.\n", "Notice that there are several degrees of freedom that could lead to different tree structures." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def ascii2bit_stream(text):\n", " \"\"\" Translates ASCII text to binary reprersentation using\n", " 7 bits per character. Assume only ASCII chars \"\"\"\n", " return \"\".join([bin(ord(c))[2:].zfill(7) for c in text])\n", "\n", "\n", "\n", "########################################################\n", "#### HUFFMAN CODE\n", "########################################################\n", "\n", "def char_count(text):\n", " \"\"\" Counts the number of each character in text.\n", " Returns a dictionary, with keys being the observed characters,\n", " values being the counts \"\"\"\n", " d = {}\n", " for ch in text:\n", " if ch in d:\n", " d[ch] += 1\n", " else:\n", " d[ch] = 1\n", " return d\n", "\n", "\n", "def build_huffman_tree(char_count_dict):\n", " \"\"\" Recieves dictionary with char:count entries\n", " Generates a LIST structure representing\n", " the binary Huffman encoding tree \"\"\"\n", " queue = [(c,cnt) for (c,cnt) in char_count_dict.items()]\n", "\n", " while len(queue) > 1:\n", " #print(queue)\n", " # combine two smallest elements\n", " A, cntA = extract_min(queue) # smallest in queue\n", " B, cntB = extract_min(queue) # next smallest\n", " chars = [A,B]\n", " weight = cntA + cntB # combined weight\n", " queue.append((chars, weight)) # insert combined node\n", "\n", " # only root node left\n", " #print(\"final queue:\", queue)\n", " root, weight_trash = extract_min(queue) # weight_trash unused\n", " return root #a LIST representing the tree structure\n", "\n", "\n", "##def extract_min(queue):\n", "## \"\"\" queue is a list of 2-tuples (x,y).\n", "## remove and return the tuple with minimal y \"\"\" \n", "## min_pair = queue[0]\n", "## for pair in queue:\n", "## if pair[1] < min_pair[1]:\n", "## min_pair = pair\n", "##\n", "## queue.remove(min_pair)\n", "## return min_pair\n", "\n", "\n", "def extract_min(queue): #the shorter, \"Pythonic\" way\n", " \"\"\" queue is a list of 2-tuples (x,y).\n", " remove and return the tuple with minimal y \"\"\" \n", " min_pair = min(queue, key = lambda pair: pair[1])\n", " queue.remove(min_pair)\n", " return min_pair\n", "\n", "\n", "\n", "def generate_code(huff_tree, prefix=\"\"):\n", " \"\"\" Receives a Huffman tree with embedded encoding,\n", " and a prefix of encodings.\n", " returns a dictionary where characters are\n", " keys and associated binary strings are values.\"\"\"\n", " if isinstance(huff_tree, str): # a leaf\n", " return {huff_tree: prefix}\n", " else:\n", " lchild, rchild = huff_tree[0], huff_tree[1]\n", " codebook = {}\n", "\n", " codebook.update(generate_code(lchild, prefix+'0'))\n", " codebook.update(generate_code(rchild, prefix+'1'))\n", " # oh, the beauty of recursion...\n", " return codebook\n", "\n", " \n", "def compress(text, encoding_dict):\n", " \"\"\" compress text using encoding dictionary \"\"\"\n", " assert isinstance(text, str)\n", " return \"\".join(encoding_dict[ch] for ch in text)\n", "\n", "\n", "def reverse_dict(d):\n", " \"\"\" build the \"reverse\" of encoding dictionary \"\"\"\n", " return {y:x for (x,y) in d.items()}\n", "\n", "\n", "def decompress(bits, decoding_dict):\n", " prefix = \"\"\n", " result = []\n", " for bit in bits:\n", " prefix += bit\n", " if prefix in decoding_dict:\n", " result.append(decoding_dict[prefix])\n", " prefix = \"\" #restart\n", " assert prefix == \"\" # must finish last codeword\n", " return \"\".join(result) # converts list of chars to a string" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### The Huffman Encoding Process" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "s = \"live and let live\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Frequency Dictionary" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{' ': 3, 'a': 1, 'd': 1, 'e': 3, 'i': 2, 'l': 3, 'n': 1, 't': 1, 'v': 2}" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "count_dict = char_count(s)\n", "count_dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Build a Huffman tree" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[[' ', ['i', 'v']], [[['a', 'n'], ['d', 't']], ['l', 'e']]]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "huff_tree = build_huffman_tree(count_dict)\n", "huff_tree" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Generate binary rep. for each char in the corpus based on the tree" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{' ': '00',\n", " 'a': '1000',\n", " 'd': '1010',\n", " 'e': '111',\n", " 'i': '010',\n", " 'l': '110',\n", " 'n': '1001',\n", " 't': '1011',\n", " 'v': '011'}" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = generate_code(huff_tree)\n", "d" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compressing some text using our code" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "text1 = \"tell\"" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1011111110110'" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "13" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compress(text1, d)\n", "len(compress(text1, d))#4+9" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "28" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unicode_conversion = ascii2bit_stream(\"tell\") #when we convert every character to 7 bits\n", "len(unicode_conversion)#4*7" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Decoding" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'00': ' ',\n", " '010': 'i',\n", " '011': 'v',\n", " '1000': 'a',\n", " '1001': 'n',\n", " '1010': 'd',\n", " '1011': 't',\n", " '110': 'l',\n", " '111': 'e'}" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "e = reverse_dict(d)\n", "e" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'tell'" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "decompress('1011111110110', e)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Handling missing characters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "asci = str.join(\"\",[chr(c) for c in range(128)])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Compression ratio" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "def compression_ratio(text, corpus):\n", " d = generate_code(build_huffman_tree(char_count(corpus)))\n", " len_compress = len(compress(text, d))\n", " len_ascii = len(ascii2bit_stream(text)) #len(text)*7\n", " return len_compress/len_ascii" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.14285714285714285" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0.14285714285714285" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compression_ratio(\"ab\", \"ab\")\n", "1/7" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.2857142857142857" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0.2857142857142857" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compression_ratio(\"ab\", \"abcd\")\n", "2/7\n", "\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.1428571428571428" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "1.1428571428571428" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compression_ratio(\"hello\", \"a\"*100 + asci)\n", "8/7" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "1.0" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "1.1428571428571428" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "\n", "compression_ratio(\"hello\", \"a\"*10 + asci)\n", "compression_ratio(\"hello\", \"a\"*40 + asci)\n", "compression_ratio(\"hello\", \"a\"*80 + asci)\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "compression_ratio(\"hello\", \"a\"*4 + asci)\n", "d = generate_code(build_huffman_tree(char_count(\"a\"*4 + asci)))\n" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'11110'" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "[('\\x00', '11111010'),\n", " ('\\x01', '11111011'),\n", " ('\\x02', '11111100'),\n", " ('\\x03', '11111101'),\n", " ('\\x04', '11111110'),\n", " ('\\x05', '11111111')]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d[\"a\"]\n", "\n", "[(key,d[key]) for key in d.keys() if len(d[key]) > 7]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'0'" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "'11101001'" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = generate_code(build_huffman_tree(char_count(\"a\"*100 + asci)))\n", "d['a']\n", "d['h']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lempel-Ziv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Repetitions are encouded by [offset,length], where offset specifies that the current string is appeared offset charcters before, and length specifies the length of the repetition.\n", "\n", "- It might be that: offset < length\n", "- Usually, offset < 4096 and length < 32" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import math\n", "\n", "def str_to_ascii(text):\n", " \"\"\" Gets rid of on ascii characters in text\"\"\"\n", " return ''.join(ch for ch in text if ord(ch)<128)\n", "\n", "\n", "\n", "def maxmatch(T, p, w=2**12-1, max_length=2**5-1):\n", " \"\"\" finds a maximum match of length k<=2**5-1 in a\n", " w long window, T[p:p+k] with T[p-m:p-m+k].\n", " Returns m (offset) and k (match length) \"\"\"\n", " assert isinstance(T,str)\n", " n = len(T)\n", " maxmatch = 0\n", " offset = 0\n", " for m in range(1, min(p+1, w)):\n", " k = 0\n", " while k < min(n-p, max_length) and T[p-m+k] == T[p+k]:\n", " k += 1\n", " # at this point, T[p-m:p-m+k]==T[p:p+k]\n", " if maxmatch < k: \n", " maxmatch = k\n", " offset = m\n", " return offset, maxmatch\n", "# returned offset is smallest one (closest to p) among\n", "# all max matches (m starts at 1)\n", "\n", "\n", "\n", "def LZW_compress(text, w=2**12-1, max_length=2**5-1):\n", " \"\"\"LZW compression of an ascii text. Produces\n", " a list comprising of either ascii characters\n", " or pairs [m,k] where m is an offset and\n", " k is a match (both are non negative integers) \"\"\"\n", " result = []\n", " n = len(text)\n", " p = 0\n", " while p3 is a match (both are non negative integers) \"\"\"\n", " result = []\n", " n = len(text)\n", " p = 0\n", " while p 2\n", " p += 1\n", " m = int(compressed[p:p+offset_width],2)\n", " p += offset_width\n", " k = int(compressed[p:p+match_width],2)\n", " p += match_width\n", " result.append([m,k])\n", " return result\n", "\n", "\n", "#########################\n", "### Various executions\n", "#########################\n", "\n", "def process1(text):\n", " \"\"\" packages the whole process using LZ_compress \"\"\"\n", " atext = str_to_ascii(text)\n", " inter = LZW_compress(atext)\n", " binn = inter_to_bin(inter)\n", " inter2 = bin_to_inter(binn)\n", " text2 = LZW_decompress(inter)\n", " return inter, binn, inter2, text2\n", "\n", "\n", "def process2(text):\n", " \"\"\" packages the whole process using LZ_compress2 \"\"\"\n", " atext = str_to_ascii(text)\n", " inter = LZW_compress2(atext)\n", " binn = inter_to_bin(inter)\n", " inter2 = bin_to_inter(binn)\n", " text2 = LZW_decompress(inter2)\n", " return inter, binn, inter2, text2\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### The LZ Encoding Process" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "inter1, bits, inter2, text2 = process2(\"abcdabc\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What is the encoded representation?" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'01100001011000100110001101100100100000000010000011'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inter1\n", "bits" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "50" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(bits)#8*4+18" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How do we really encode the chars?" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1100001', '1100010', '1100011', '1100100']" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[bits[8*i:8*(i+1)] for i in range(4)]\n", "[bin(ord(c))[2:] for c in \"abcd\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Going back" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'abcdabc'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another example" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "inter1, bits, inter2, text2 = process2(\"xyzxyzwxyzw\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['x', 'y', 'z', [3, 3], 'w', [4, 4]]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inter1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Compression ratio" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def lz_ratio(text):\n", " inter1, bits, inter2, text2 = process2(text)\n", " return len(bits)/(len(text)*7)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.086" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0.08317142857142858" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lz_ratio('a' * 1000)\n", "lz_ratio('a' * 10000)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.08294930875576037" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "18/(31*7)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.1428571428571428" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0.8285714285714286" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "0.8285714285714286" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lz_ratio('hello')\n", "lz_ratio('hello' * 2)\n", "(8*5+18) / (7*10)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 1 }