{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1><center>cs1001.py , Tel Aviv University, Fall 2017-2018</center></h1>\n",
    "<img src=\"http://www.pngall.com/wp-content/uploads/2016/05/Python-Logo-PNG-Image-180x180.png\" width=50/>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Recitation 12\n",
    "\n",
    "We discussed two text-compression algorithms: Huffman code and Lempel-Ziv. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Code for printing several outputs in one cell (not part of the recitation):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Huffman compression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generates a variable-length, prefix-free code, based on the character frequencies in corpus string.\n",
    "Notice that there are several degrees of freedom that could lead to different tree structures."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"huffman.PNG\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ascii2bit_stream(text):\n",
    "    \"\"\" Translates ASCII text to binary reprersentation using\n",
    "        7 bits per character. Assume only ASCII chars \"\"\"\n",
    "    return \"\".join([bin(ord(c))[2:].zfill(7) for c in text])\n",
    "\n",
    "\n",
    "\n",
    "########################################################\n",
    "#### HUFFMAN CODE\n",
    "########################################################\n",
    "\n",
    "def char_count(text):\n",
    "    \"\"\" Counts the number of each character in text.\n",
    "        Returns a dictionary, with keys being the observed characters,\n",
    "        values being the counts \"\"\"\n",
    "    d = {}\n",
    "    for ch in text:\n",
    "        if ch in d:\n",
    "            d[ch] += 1\n",
    "        else:\n",
    "            d[ch] = 1\n",
    "    return d\n",
    "\n",
    "\n",
    "def build_huffman_tree(char_count_dict):\n",
    "    \"\"\" Recieves dictionary with char:count entries\n",
    "        Generates a LIST structure representing\n",
    "        the binary Huffman encoding tree \"\"\"\n",
    "    queue = [(c,cnt) for (c,cnt) in char_count_dict.items()]\n",
    "\n",
    "    while len(queue) > 1:\n",
    "        #print(queue)\n",
    "        # combine two smallest elements\n",
    "        A, cntA = extract_min(queue)    # smallest in queue\n",
    "        B, cntB = extract_min(queue)    # next smallest\n",
    "        chars = [A,B]\n",
    "        weight = cntA + cntB            # combined weight\n",
    "        queue.append((chars, weight))   # insert combined node\n",
    "\n",
    "    # only root node left\n",
    "    #print(\"final queue:\", queue)\n",
    "    root, weight_trash = extract_min(queue) # weight_trash unused\n",
    "    return root                             #a LIST representing the tree structure\n",
    "\n",
    "\n",
    "##def extract_min(queue):\n",
    "##    \"\"\" queue is a list of 2-tuples (x,y).\n",
    "##        remove and return the tuple with minimal y \"\"\" \n",
    "##    min_pair = queue[0]\n",
    "##    for pair in queue:\n",
    "##        if pair[1] < min_pair[1]:\n",
    "##            min_pair = pair\n",
    "##\n",
    "##    queue.remove(min_pair)\n",
    "##    return min_pair\n",
    "\n",
    "\n",
    "def extract_min(queue): #the shorter, \"Pythonic\" way\n",
    "    \"\"\" queue is a list of 2-tuples (x,y).\n",
    "        remove and return the tuple with minimal y \"\"\" \n",
    "    min_pair = min(queue, key = lambda pair: pair[1])\n",
    "    queue.remove(min_pair)\n",
    "    return min_pair\n",
    "\n",
    "\n",
    "\n",
    "def generate_code(huff_tree, prefix=\"\"):\n",
    "    \"\"\" Receives a Huffman tree with embedded encoding,\n",
    "        and a prefix of encodings.\n",
    "        returns a dictionary where characters are\n",
    "        keys and associated binary strings are values.\"\"\"\n",
    "    if isinstance(huff_tree, str): # a leaf\n",
    "        return {huff_tree: prefix}\n",
    "    else:\n",
    "        lchild, rchild = huff_tree[0], huff_tree[1]\n",
    "        codebook = {}\n",
    "\n",
    "        codebook.update(generate_code(lchild, prefix+'0'))\n",
    "        codebook.update(generate_code(rchild, prefix+'1'))\n",
    "        #   oh, the beauty of recursion...\n",
    "        return codebook\n",
    "\n",
    "    \n",
    "def compress(text, encoding_dict):\n",
    "    \"\"\" compress text using encoding dictionary \"\"\"\n",
    "    assert isinstance(text, str)\n",
    "    return \"\".join(encoding_dict[ch] for ch in text)\n",
    "\n",
    "\n",
    "def reverse_dict(d):\n",
    "    \"\"\" build the \"reverse\" of encoding dictionary \"\"\"\n",
    "    return {y:x for (x,y) in d.items()}\n",
    "\n",
    "\n",
    "def decompress(bits, decoding_dict):\n",
    "    prefix = \"\"\n",
    "    result = []\n",
    "    for bit in bits:\n",
    "        prefix += bit\n",
    "        if prefix in decoding_dict:\n",
    "            result.append(decoding_dict[prefix])\n",
    "            prefix = \"\"  #restart\n",
    "    assert prefix == \"\" # must finish last codeword\n",
    "    return \"\".join(result)  # converts list of chars to a string"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The Huffman Encoding Process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = \"live and let live\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Frequency Dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{' ': 3, 'a': 1, 'd': 1, 'e': 3, 'i': 2, 'l': 3, 'n': 1, 't': 1, 'v': 2}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "count_dict = char_count(s)\n",
    "count_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build a Huffman tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[' ', ['i', 'v']], [[['a', 'n'], ['d', 't']], ['l', 'e']]]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "huff_tree = build_huffman_tree(count_dict)\n",
    "huff_tree"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate binary rep. for each char in the corpus based on the tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{' ': '00',\n",
       " 'a': '1000',\n",
       " 'd': '1010',\n",
       " 'e': '111',\n",
       " 'i': '010',\n",
       " 'l': '110',\n",
       " 'n': '1001',\n",
       " 't': '1011',\n",
       " 'v': '011'}"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = generate_code(huff_tree)\n",
    "d"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compressing some text using our code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "text1 = \"tell\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1011111110110'"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "13"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compress(text1, d)\n",
    "len(compress(text1, d))#4+9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unicode_conversion = ascii2bit_stream(\"tell\") #when we convert every character to 7 bits\n",
    "len(unicode_conversion)#4*7"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Decoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'00': ' ',\n",
       " '010': 'i',\n",
       " '011': 'v',\n",
       " '1000': 'a',\n",
       " '1001': 'n',\n",
       " '1010': 'd',\n",
       " '1011': 't',\n",
       " '110': 'l',\n",
       " '111': 'e'}"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "e = reverse_dict(d)\n",
    "e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'tell'"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "decompress('1011111110110', e)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Handling missing characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "asci = str.join(\"\",[chr(c) for c in range(128)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Compression ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compression_ratio(text, corpus):\n",
    "    d = generate_code(build_huffman_tree(char_count(corpus)))\n",
    "    len_compress = len(compress(text, d))\n",
    "    len_ascii = len(ascii2bit_stream(text)) #len(text)*7\n",
    "    return len_compress/len_ascii"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.14285714285714285"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0.14285714285714285"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compression_ratio(\"ab\", \"ab\")\n",
    "1/7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.2857142857142857"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0.2857142857142857"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compression_ratio(\"ab\", \"abcd\")\n",
    "2/7\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.1428571428571428"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "1.1428571428571428"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compression_ratio(\"hello\", \"a\"*100 + asci)\n",
    "8/7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "1.1428571428571428"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "\n",
    "compression_ratio(\"hello\", \"a\"*10 + asci)\n",
    "compression_ratio(\"hello\", \"a\"*40 + asci)\n",
    "compression_ratio(\"hello\", \"a\"*80 + asci)\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compression_ratio(\"hello\", \"a\"*4 + asci)\n",
    "d = generate_code(build_huffman_tree(char_count(\"a\"*4 + asci)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'11110'"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "[('\\x00', '11111010'),\n",
       " ('\\x01', '11111011'),\n",
       " ('\\x02', '11111100'),\n",
       " ('\\x03', '11111101'),\n",
       " ('\\x04', '11111110'),\n",
       " ('\\x05', '11111111')]"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d[\"a\"]\n",
    "\n",
    "[(key,d[key]) for key in d.keys() if len(d[key]) > 7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'0'"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'11101001'"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = generate_code(build_huffman_tree(char_count(\"a\"*100 + asci)))\n",
    "d['a']\n",
    "d['h']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lempel-Ziv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Repetitions are encouded by [offset,length], where offset specifies that the current string is appeared offset charcters before, and length specifies the length of the repetition.\n",
    "\n",
    "- It might be that: offset < length\n",
    "- Usually, offset < 4096 and length < 32"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"lz.PNG\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "def str_to_ascii(text):\n",
    "  \"\"\" Gets rid of on ascii characters in text\"\"\"\n",
    "  return ''.join(ch for ch in text if ord(ch)<128)\n",
    "\n",
    "\n",
    "\n",
    "def maxmatch(T, p, w=2**12-1, max_length=2**5-1):\n",
    "   \"\"\" finds a maximum match of length k<=2**5-1 in a\n",
    "   w long window, T[p:p+k] with T[p-m:p-m+k].\n",
    "   Returns m (offset) and k (match length) \"\"\"\n",
    "   assert isinstance(T,str)\n",
    "   n = len(T)\n",
    "   maxmatch = 0\n",
    "   offset = 0\n",
    "   for m in range(1, min(p+1, w)):\n",
    "      k = 0\n",
    "      while k < min(n-p, max_length) and T[p-m+k] == T[p+k]:\n",
    "        k += 1\n",
    "        # at this point, T[p-m:p-m+k]==T[p:p+k]\n",
    "      if maxmatch < k:  \n",
    "         maxmatch = k\n",
    "         offset = m\n",
    "   return offset, maxmatch\n",
    "# returned offset is smallest one (closest to p) among\n",
    "# all max matches (m starts at 1)\n",
    "\n",
    "\n",
    "\n",
    "def LZW_compress(text, w=2**12-1, max_length=2**5-1):\n",
    "    \"\"\"LZW compression of an ascii text. Produces\n",
    "       a list comprising of either ascii characters\n",
    "       or pairs [m,k] where m is an offset and\n",
    "       k is a match (both are non negative integers) \"\"\"\n",
    "    result = []\n",
    "    n = len(text)\n",
    "    p = 0\n",
    "    while p<n:\n",
    "        m,k = maxmatch(text, p, w, max_length)\n",
    "        if k<2:\n",
    "            result.append(text[p]) #  a single char\n",
    "            p += 1\n",
    "        else:\n",
    "            result.append([m,k])   # two or more chars in match\n",
    "            p += k\n",
    "    return result  # produces a list composed of chars and pairs\n",
    "            \n",
    "                          \n",
    "def LZW_decompress(compressed, w=2**12-1, max_length=2**5-1):\n",
    "   \"\"\"LZW decompression from intermediate format to ascii text\"\"\"\n",
    "   result = []\n",
    "   n = len(compressed)\n",
    "   p = 0\n",
    "   while p<n:\n",
    "      if type(compressed[p]) == str:  # char, as opposed to a pair\n",
    "         result.append(compressed[p])\n",
    "         p+=1\n",
    "      else:\n",
    "         m,k = compressed[p]\n",
    "         p += 1\n",
    "         for i in range(0,k):\n",
    "            # append k times to result;  \n",
    "            result.append(result[-m])\n",
    "            # fixed offset m \"to the left\", as result itself grows\n",
    "   return \"\".join(result)\n",
    "\n",
    "\n",
    "\n",
    "def LZW_compress2(text, w=2**12-1, max_length=2**5-1):\n",
    "    \"\"\"LZW compression of an ascii text. Produces\n",
    "       a list comprising of either ascii characters\n",
    "       or pairs [m,k] where m is an offset and\n",
    "       k>3 is a match (both are non negative integers) \"\"\"\n",
    "    result = []\n",
    "    n = len(text)\n",
    "    p = 0\n",
    "    while p<n:\n",
    "        m,k = maxmatch(text, p, w, max_length)\n",
    "        if k<3: # modified from k<2\n",
    "            result.append(text[p]) # a single char\n",
    "            p += 1 #even if k was 2 (why?)\n",
    "        else:\n",
    "            result.append([m,k])   # two or more chars in match\n",
    "            p += k\n",
    "    return result  # produces a list composed of chars and pairs\n",
    "\n",
    "          \n",
    "\n",
    "def inter_to_bin(lst, w=2**12-1, max_length=2**5-1):\n",
    "   \"\"\" converts intermediate format compressed list\n",
    "       to a string of bits\"\"\"\n",
    "   offset_width = math.ceil(math.log(w,2))\n",
    "   match_width = math.ceil(math.log(max_length, 2))\n",
    "   #print(offset_width,match_width)   # for debugging\n",
    "   result = []\n",
    "   for elem in lst:\n",
    "      if type(elem) == str:\n",
    "         result.append(\"0\")\n",
    "         result.append('{:07b}'.format(ord(elem)))\n",
    "      elif type(elem) == list:\n",
    "         result.append(\"1\")\n",
    "         m,k = elem\n",
    "         result.append(str(bin(m)[2:]).zfill(offset_width))\n",
    "         result.append(str(bin(k)[2:]).zfill(match_width))\n",
    "         \n",
    "   return \"\".join(ch for ch in result)\n",
    "   \n",
    "   \n",
    "                         \n",
    "def bin_to_inter(compressed, w=2**12-1, max_length=2**5-1):\n",
    "   \"\"\" converts a compressed string of bits\n",
    "       to intermediate compressed format \"\"\"\n",
    "   offset_width = math.ceil(math.log(w,2))\n",
    "   match_width = math.ceil(math.log(max_length,2))\n",
    "   #print(offset_width,match_width)   # for debugging\n",
    "   result = []\n",
    "   n = len(compressed)\n",
    "   p = 0\n",
    "   while p<n:\n",
    "      if compressed[p] == \"0\":  # single ascii char\n",
    "         p += 1\n",
    "         char = chr(int(compressed[p:p+7], 2))\n",
    "         result.append(char)\n",
    "         p += 7\n",
    "      elif compressed[p] == \"1\":  # repeat of length > 2\n",
    "         p += 1\n",
    "         m = int(compressed[p:p+offset_width],2)\n",
    "         p += offset_width\n",
    "         k = int(compressed[p:p+match_width],2)\n",
    "         p += match_width\n",
    "         result.append([m,k])\n",
    "   return result\n",
    "\n",
    "\n",
    "#########################\n",
    "### Various executions\n",
    "#########################\n",
    "\n",
    "def process1(text):\n",
    "  \"\"\" packages the whole process using LZ_compress \"\"\"\n",
    "  atext = str_to_ascii(text)\n",
    "  inter = LZW_compress(atext)\n",
    "  binn = inter_to_bin(inter)\n",
    "  inter2 = bin_to_inter(binn)\n",
    "  text2 = LZW_decompress(inter)\n",
    "  return inter, binn, inter2, text2\n",
    "\n",
    "\n",
    "def process2(text):\n",
    "  \"\"\" packages the whole process using LZ_compress2 \"\"\"\n",
    "  atext = str_to_ascii(text)\n",
    "  inter = LZW_compress2(atext)\n",
    "  binn = inter_to_bin(inter)\n",
    "  inter2 = bin_to_inter(binn)\n",
    "  text2 = LZW_decompress(inter2)\n",
    "  return inter, binn, inter2, text2\n",
    "   "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The LZ Encoding Process"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "inter1, bits, inter2, text2 = process2(\"abcdabc\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What is the encoded representation?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'01100001011000100110001101100100100000000010000011'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inter1\n",
    "bits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "50"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(bits)#8*4+18"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How do we really encode the chars?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['1100001', '1100010', '1100011', '1100100']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[bits[8*i:8*(i+1)] for i in range(4)]\n",
    "[bin(ord(c))[2:] for c in \"abcd\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Going back"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'abcdabc'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Another example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "inter1, bits, inter2, text2 = process2(\"xyzxyzwxyzw\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['x', 'y', 'z', [3, 3], 'w', [4, 4]]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inter1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Compression ratio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lz_ratio(text):\n",
    "  inter1, bits, inter2, text2 = process2(text)\n",
    "  return len(bits)/(len(text)*7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.086"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0.08317142857142858"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lz_ratio('a' * 1000)\n",
    "lz_ratio('a' * 10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08294930875576037"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "18/(31*7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.1428571428571428"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0.8285714285714286"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "0.8285714285714286"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lz_ratio('hello')\n",
    "lz_ratio('hello' * 2)\n",
    "(8*5+18) / (7*10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}