{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Advanced NumPy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division\n", "from numpy.random import randn\n", "from pandas import Series\n", "import numpy as np\n", "np.set_printoptions(precision=4)\n", "import sys" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ndarray object internals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### NumPy dtype hierarchy" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ints = np.ones(10, dtype=np.uint16)\n", "floats = np.ones(10, dtype=np.float32)\n", "np.issubdtype(ints.dtype, np.integer)\n", "np.issubdtype(floats.dtype, np.floating)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.float64.mro()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Advanced array manipulation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reshaping arrays" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(8)\n", "arr\n", "arr.reshape((4, 2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr.reshape((4, 2)).reshape((2, 4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(15)\n", "arr.reshape((5, -1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "other_arr = np.ones((3, 5))\n", "other_arr.shape\n", "arr.reshape(other_arr.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(15).reshape((5, 
3))\n", "arr\n", "arr.ravel()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr.flatten()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### C vs. Fortran order" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(12).reshape((3, 4))\n", "arr\n", "arr.ravel()\n", "arr.ravel('F')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatenating and splitting arrays" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n", "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n", "np.concatenate([arr1, arr2], axis=0)\n", "np.concatenate([arr1, arr2], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.vstack((arr1, arr2))\n", "np.hstack((arr1, arr2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from numpy.random import randn\n", "arr = randn(5, 2)\n", "arr\n", "first, second, third = np.split(arr, [1, 3])\n", "first\n", "second\n", "third" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Stacking helpers: `r_` and `c_`" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(6)\n", "arr1 = arr.reshape((3, 2))\n", "arr2 = randn(3, 2)\n", "np.r_[arr1, arr2]\n", "np.c_[np.r_[arr1, arr2], arr]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.c_[1:6, -10:-5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Repeating elements: tile and repeat" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(3)\n",
"arr.repeat(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr.repeat([2, 3, 4])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(2, 2)\n", "arr\n", "arr.repeat(2, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr.repeat([2, 3], axis=0)\n", "arr.repeat([2, 3], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr\n", "np.tile(arr, 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr\n", "np.tile(arr, (2, 1))\n", "np.tile(arr, (3, 2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fancy indexing equivalents: take and put" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(10) * 100\n", "inds = [7, 1, 2, 6]\n", "arr[inds]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr.take(inds)\n", "arr.put(inds, 42)\n", "arr\n", "arr.put(inds, [40, 41, 42, 43])\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "inds = [2, 0, 2, 1]\n", "arr = randn(2, 4)\n", "arr\n", "arr.take(inds, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Broadcasting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(5)\n", "arr\n", "arr * 4" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(4, 3)\n", "arr.mean(0)\n", "demeaned = arr - arr.mean(0)\n", "demeaned\n", "demeaned.mean(0)" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr\n", "row_means = arr.mean(1)\n", "row_means.reshape((4, 1))\n", "demeaned = arr - row_means.reshape((4, 1))\n", "demeaned.mean(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Broadcasting over other axes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr - arr.mean(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr - arr.mean(1).reshape((4, 1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.zeros((4, 4))\n", "arr_3d = arr[:, np.newaxis, :]\n", "arr_3d.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr_1d = np.random.normal(size=3)\n", "arr_1d[:, np.newaxis]\n", "arr_1d[np.newaxis, :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(3, 4, 5)\n", "depth_means = arr.mean(2)\n", "depth_means\n", "demeaned = arr - depth_means[:, :, np.newaxis]\n", "demeaned.mean(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def demean_axis(arr, axis=0):\n", "    \"\"\"Return `arr` minus its mean taken along `axis`.\n", "\n", "    The means have one dimension fewer than `arr`, so a length-1 axis is\n", "    re-inserted at position `axis` to let the subtraction broadcast.\n", "    \"\"\"\n", "    means = arr.mean(axis)\n", "\n", "    # This generalizes things like [:, :, np.newaxis] to N dimensions\n", "    indexer = [slice(None)] * arr.ndim\n", "    indexer[axis] = np.newaxis\n", "    # Index with a tuple: a list of slices is not a valid multidimensional\n", "    # index in current NumPy\n", "    return arr - means[tuple(indexer)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Setting array values by broadcasting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.zeros((4, 3))\n", "arr[:] = 5\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": {
"collapsed": false }, "outputs": [], "source": [ "col = np.array([1.28, -0.42, 0.44, 1.6])\n", "arr[:] = col[:, np.newaxis]\n", "arr\n", "arr[:2] = [[-1.37], [0.509]]\n", "arr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Advanced ufunc usage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ufunc instance methods" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(10)\n", "np.add.reduce(arr)\n", "arr.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.random.seed(12346)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(5, 5)\n", "arr[::2].sort(1) # sort a few rows\n", "arr[:, :-1] < arr[:, 1:]\n", "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(15).reshape((3, 5))\n", "np.add.accumulate(arr, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(3).repeat([1, 2, 2])\n", "arr\n", "np.multiply.outer(arr, np.arange(5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "result = np.subtract.outer(randn(3, 4), randn(5))\n", "result.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.arange(10)\n", "np.add.reduceat(arr, [0, 5, 8])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.multiply.outer(np.arange(4), np.arange(5))\n", "arr\n", "np.add.reduceat(arr, [0, 2, 4], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Custom ufuncs" ] 
}, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def add_elements(x, y):\n", " return x + y\n", "add_them = np.frompyfunc(add_elements, 2, 1)\n", "add_them(np.arange(8), np.arange(8))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "add_them = np.vectorize(add_elements, otypes=[np.float64])\n", "add_them(np.arange(8), np.arange(8))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(10000)\n", "%timeit add_them(arr, arr)\n", "%timeit np.add(arr, arr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Structured and record arrays" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dtype = [('x', np.float64), ('y', np.int32)]\n", "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n", "sarr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sarr[0]\n", "sarr[0]['y']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sarr['x']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Nested dtypes and multidimensional fields" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dtype = [('x', np.int64, 3), ('y', np.int32)]\n", "arr = np.zeros(4, dtype=dtype)\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr[0]['x']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr['x']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dtype = [('x', [('a', 'f8'), ('b', 
'f4')]), ('y', np.int32)]\n", "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n", "data['x']\n", "data['y']\n", "data['x']['a']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Why use structured arrays?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Structured array manipulations: numpy.lib.recfunctions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## More about sorting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(6)\n", "arr.sort()\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(3, 5)\n", "arr\n", "arr[:, 0].sort() # Sort first column values in-place\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(5)\n", "arr\n", "np.sort(arr)\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(3, 5)\n", "arr\n", "arr.sort(axis=1)\n", "arr" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr[:, ::-1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indirect sorts: argsort and lexsort" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "values = np.array([5, 0, 1, 3, 2])\n", "indexer = values.argsort()\n", "indexer\n", "values[indexer]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = randn(3, 5)\n", "arr[0] = values\n", "arr\n", "arr[:, arr[0].argsort()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n", "last_name 
= np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n", "sorter = np.lexsort((first_name, last_name))\n", "# list() materializes the pairs so they display on Python 3 as well as Python 2\n", "list(zip(last_name[sorter], first_name[sorter]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Alternate sort algorithms" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])\n", "key = np.array([2, 2, 1, 1, 1])\n", "indexer = key.argsort(kind='mergesort')\n", "indexer\n", "values.take(indexer)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### numpy.searchsorted: Finding elements in a sorted array" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.array([0, 1, 7, 12, 15])\n", "arr.searchsorted(9)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr.searchsorted([0, 8, 11, 16])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr = np.array([0, 0, 0, 1, 1, 1, 1])\n", "arr.searchsorted([0, 1])\n", "arr.searchsorted([0, 1], side='right')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = np.floor(np.random.uniform(0, 10000, size=50))\n", "bins = np.array([0, 100, 1000, 5000, 10000])\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "labels = bins.searchsorted(data)\n", "labels" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Series(data).groupby(labels).mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.digitize(data, bins)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [
"## NumPy matrix class" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = np.array([[ 8.82768214, 3.82222409, -1.14276475, 2.04411587],\n", " [ 3.82222409, 6.75272284, 0.83909108, 2.08293758],\n", " [-1.14276475, 0.83909108, 5.01690521, 0.79573241],\n", " [ 2.04411587, 2.08293758, 0.79573241, 6.24095859]])\n", "X[:, 0] # one-dimensional\n", "y = X[:, :1] # two-dimensional by slicing\n", "X\n", "y" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.dot(y.T, np.dot(X, y))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Xm = np.matrix(X)\n", "ym = Xm[:, 0]\n", "Xm\n", "ym\n", "ym.T * Xm * ym" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Xm.I * X" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Advanced array input and output" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Memory-mapped files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))\n", "mmap" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "section = mmap[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "section[:] = np.random.randn(5, 10000)\n", "mmap.flush()\n", "mmap\n", "del mmap" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n", "mmap" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%xdel mmap\n", "!rm mymmap" ] 
}, { "cell_type": "markdown", "metadata": {}, "source": [ "### HDF5 and other array storage options" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Performance tips" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The importance of contiguous memory" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr_c = np.ones((1000, 1000), order='C')\n", "arr_f = np.ones((1000, 1000), order='F')\n", "arr_c.flags\n", "arr_f.flags\n", "arr_f.flags.f_contiguous" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%timeit arr_c.sum(1)\n", "%timeit arr_f.sum(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr_f.copy('C').flags" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "arr_c[:50].flags.contiguous\n", "arr_c[:, :50].flags" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%xdel arr_c\n", "%xdel arr_f\n", "%cd .." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Other speed options: Cython, f2py, C" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```cython\n", "from numpy cimport ndarray, float64_t\n", "\n", "def sum_elements(ndarray[float64_t] arr):\n", " cdef Py_ssize_t i, n = len(arr)\n", " cdef float64_t result = 0\n", "\n", " for i in range(n):\n", " result += arr[i]\n", "\n", " return result\n", "```" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }