{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Structures\n", "\n", "* tuple\n", "* list\n", "* dict\n", "* set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## tuple" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A tuple is a one dimensional, fixed-length, immutable sequence.\n", "\n", "Create a tuple:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(1, 2, 3)" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tup = (1, 2, 3)\n", "tup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert to a tuple:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "tuple" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_1 = [1, 2, 3]\n", "type(tuple(list_1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a nested tuple:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "([1, 2, 3], (4, 5))" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_tup = ([1, 2, 3], (4, 5))\n", "nested_tup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Access a tuple's elements by index O(1):" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_tup[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Although tuples are immutable, their contents can contain mutable objects. \n", "\n", "Modify a tuple's contents:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3, 4]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_tup[0].append(4)\n", "nested_tup[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate tuples by creating a new tuple and copying objects:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(1, 3, 2, 4, 5, 6)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(1, 3, 2) + (4, 5, 6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Multiply tuples to copy references to objects (objects themselves are not copied):" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('foo', 'bar', 'foo', 'bar')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "('foo', 'bar') * 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Unpack tuples:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "([1, 2, 3, 4], (4, 5))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a, b = nested_tup\n", "a, b" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Unpack nested tuples:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(1, 2, 3, 4, 4, 5)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(a, b, c, d), (e, f) = nested_tup\n", "a, b, c, d, e, f" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A common use of variable unpacking is when iterating over sequences of tuples or lists:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 2, 3)\n", "(4, 5, 6)\n", "(7, 8, 9)\n" ] } ], "source": [ "seq = [( 1, 2, 3), (4, 5, 6), (7, 8, 9)] \n", "for a, b, c in seq: \n", " print(a, b, c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A list is a one dimensional, variable-length, mutable sequence.\n", "\n", "Create a list:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list_1 = [1, 2, 3]\n", "list_1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert to a list:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(list(tup))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a nested list:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2, 3), [4, 5]]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list = [(1, 2, 3), [4, 5]]\n", "nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Access a list's elements by index O(1):" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[4, 5]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Append an element to a list O(1):" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2, 3), [4, 5], 6]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list.append(6)\n", "nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Insert an element to a list at a specific index (note that insert is expensive as it has to shift subsequent elements O(n)):" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['start', (1, 2, 3), [4, 5], 6]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list.insert(0, 'start')\n", "nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pop is expensive as it has to shift subsequent elements O(n). The operation is O(1) if pop is used for the last element.\n", "\n", "Remove and return an element from a specified index:" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(1, 2, 3), [4, 5], 6]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list.pop(0)\n", "nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Locates the first such value and remove it O(n):" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[4, 5], 6]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list.remove((1, 2, 3))\n", "nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if a list contains a value O(n):" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "6 in nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concatenate lists by creating a new list and copying objects:" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 3, 2, 4, 5, 6]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[1, 3, 2] + [4, 5, 6]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Extend a list by appending elements (faster than concatenating lists, as it does not have to create a new list):" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[4, 5], 6, 7, 8, 9]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nested_list.extend([7, 8, 9])\n", "nested_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A dict is also known as a hash map or associative array. A dict is a mutable collection of key-value pairs.\n", "\n", "Note: Big O complexities are listed as average case, with most worst case complexities being O(n).\n", "\n", "Create a dict:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'a': 'foo', 'b': [0, 1, 2, 3]}" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_1 = { 'a' : 'foo', 'b' : [0, 1, 2, 3] }\n", "dict_1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Access a dict's elements by index O(1)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[0, 1, 2, 3]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_1['b']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Insert or set a dict's elements by index O(1):" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{5: 'bar', 'a': 'foo', 'b': [0, 1, 2, 3]}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_1[5] = 'bar'\n", "dict_1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check if a dict contains a key O(1):" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "5 in dict_1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Delete a value from a dict O(1):" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'a': 'foo', 'b': [0, 1, 2, 3]}" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_2 = dict(dict_1)\n", "del dict_2[5]\n", "dict_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remove and return an element from a specified index O(1):" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 1, 2, 3]\n", "{'a': 'foo'}\n" ] } ], "source": [ "value = dict_2.pop('b')\n", "print(value)\n", "print(dict_2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get or pop can be called with a default value if the key is not found. By default, get() will return None and pop() will throw an exception if the key is not found." ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "value = dict_1.get('z', 0)\n", "value" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Return a default value if the key is not found:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 1, 2, 3]\n", "None\n" ] } ], "source": [ "print(dict_1.setdefault('b', None))\n", "print(dict_1.setdefault('z', None))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "By contrast to setdefault(), defaultdict lets you specify the default when the container is initialized, which works well if the default is appropriate for all keys:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "defaultdict(, {'b': ['bar', 'baz'], 'f': ['foo']})" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "\n", "seq = ['foo', 'bar', 'baz']\n", "first_letter = defaultdict(list)\n", "for elem in seq:\n", " first_letter[elem[0]].append(elem)\n", "first_letter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "dict keys must be \"hashable\", i.e. they must be immutable objects like scalars (int, float, string) or tuples whose objects are all immutable. Lists are mutable and therefore are not hashable, although you can convert the list portion to a tuple as a quick fix." ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-9167918882415130555\n", "-2725224101759650258\n" ] } ], "source": [ "print(hash('string'))\n", "print(hash((1, 2, (3, 4))))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the list of keys in no particular order (although keys() outputs the keys in the same order). In Python 3, keys() returns an iterator instead of a list." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['a', 'b', 5, 'z']" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_1.keys()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the list of values in no particular order (although values() outputs the keys in the same order). In Python 3, values() returns a [view object](https://docs.python.org/3/library/stdtypes.html?highlight=dictview#dictionary-view-objects) instead of a list." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['foo', [0, 1, 2, 3], 'bar', None]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_1.values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Iterate through a dictionary's keys and values:" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a foo\n", "b [0, 1, 2, 3]\n", "5 bar\n", "z None\n" ] } ], "source": [ "for key, value in dict_1.items():\n", " print key, value" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Merge one dict into another:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{5: 'bar',\n", " 'a': 'foo',\n", " 'b': [0, 1, 2, 3],\n", " 'e': 'elephant',\n", " 'f': 'fish',\n", " 'z': None}" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dict_1.update({'e' : 'elephant', 'f' : 'fish'})\n", "dict_1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pair up two sequences element-wise in a dict:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0: 6, 1: 5, 2: 4, 3: 3, 4: 2, 5: 1, 6: 0}" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mapping = dict(zip(range(7), reversed(range(7))))\n", "mapping" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A set is an unordered sequence of unique elements. \n", "\n", "Create a set:" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0, 1, 2, 3, 4, 5}" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_1 = set([0, 1, 2, 3, 4, 5])\n", "set_1" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{1, 2, 3, 5, 8, 13}" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_2 = {1, 2, 3, 5, 8, 13}\n", "set_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sets support set operations like union, intersection, difference, and symmetric difference." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Union O(len(set_1) + len(set_2)):" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0, 1, 2, 3, 4, 5, 8, 13}" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_1 | set_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Intersection O(min(len(set_1), len(set_2)):" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{1, 2, 3, 5}" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_1 & set_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Difference O(len(set_1)):" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0, 4}" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_1 - set_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Symmetric Difference O(len(set_1)):" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0, 4, 8, 13}" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_1 ^ set_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Subset O(len(set_3)):" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_3 = {1, 2, 3}\n", "set_3.issubset(set_2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Superset O(len(set_3)):" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set_2.issuperset(set_3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Equal O(min(len(set_1), len(set_2)):" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "{1, 2, 3} == {3, 2, 1}" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }