{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "We want to group by multi-valued columns. For example, take this dataset:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
leftrightvalue
0[a, b][x, y]3
1[a, c][x, z]2
\n", "
" ], "text/plain": [ " left right value\n", "0 [a, b] [x, y] 3\n", "1 [a, c] [x, z] 2" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "data = pd.DataFrame([\n", " [['a', 'b'], ['x', 'y'], 3],\n", " [['a', 'c'], ['x', 'z'], 2],\n", " ], columns=['left', 'right', 'value'])\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`left` and `right` have arrays as values.\n", "\n", "We want the equivalent of a `.groupby([left, right])['value'].sum()` that would return:\n", "\n", " left right value\n", " a x 5 # (a, x) occurs in the first AND second row\n", " a y 3 # from first row\n", " b x 3 # from first row\n", " b y 3 # from first row\n", " a z 2 # from second row\n", " c x 2 # from second row\n", " c z 2 # from second row" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Approach 1: Python\n", "\n", "Let's write this as a series of `for` loops first:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{('a', 'x'): 5L,\n", " ('a', 'y'): 3L,\n", " ('a', 'z'): 2L,\n", " ('b', 'x'): 3L,\n", " ('b', 'y'): 3L,\n", " ('c', 'x'): 2L,\n", " ('c', 'z'): 2L}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def python_kpartite(data, left, right, value):\n", " result = {}\n", " for index, row in data.iterrows(): # Loop through every row\n", " for a in row[left]: # Loop through all values in the left cell\n", " for b in row[right]: # ... and the right cell\n", " if (a, b) in result: # Add value to the results counter\n", " result[a, b] += row[value]\n", " else:\n", " result[a, b] = row[value]\n", " return result # Return the dict\n", "\n", "python_kpartite(data, 'left', 'right', 'value')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This result is correct. But it is fairly slow even for small data." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "smalldata = pd.concat([data] * 10000)\n", "middata = pd.concat([data] * 100000)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 2.46 s per loop\n" ] } ], "source": [ "%timeit python_kpartite(smalldata, 'left', 'right', 'value')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Python with Pandas" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "result = {}\n", "def python_inner(row):\n", " for a in row['left']:\n", " for b in row['right']:\n", " if (a, b) in result:\n", " result[a, b] += row['value']\n", " else:\n", " result[a, b] = row['value']" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 1.44 s per loop\n" ] } ], "source": [ "%timeit smalldata.apply(python_inner, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Approach 2: Cython" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The Cython extension is already loaded. To reload it, use:\n", " %reload_ext Cython\n" ] } ], "source": [ "%load_ext Cython" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Naive Cython is not faster" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython\n", "\n", "# This is a one-time compilation! If you re-run it, result will still be cached.\n", "# Change this comment to recompile.\n", "result = {}\n", "def cython_inner(row):\n", " for a in row['left']:\n", " for b in row['right']:\n", " result[a, b] = result.get((a, b), 0) + row['value']" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 1.42 s per loop\n" ] } ], "source": [ "%timeit smalldata.apply(cython_inner, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Approach 3: Cython on NumPy\n", "\n", "See http://nbviewer.ipython.org/gist/tillahoffmann/296501acea231cbdf5e7 for profiling" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Load Robert Kern's line profiler\n", "%load_ext line_profiler\n", "import line_profiler\n", "\n", "#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)\n", "from Cython.Compiler.Options import directive_defaults\n", "\n", "directive_defaults['linetrace'] = True\n", "directive_defaults['binding'] = True" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", " \n", " Cython: _cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2.pyx\n", " \n", " \n", "\n", "\n", "

Generated by Cython 0.23.4

\n", "

\n", " Yellow lines hint at Python interaction.
\n", " Click on a line that starts with a \"+\" to see the C code that Cython generated for it.\n", "

\n", "
 01: 
\n", "
+02: cimport numpy as np
\n", "
  __Pyx_TraceLine(2,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_t_1);\n",
       "  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "
 03: 
\n", "
+04: def cython_kpartite(np.ndarray npdata):
\n", "
/* Python wrapper */\n",
       "static PyObject *__pyx_pw_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite(PyObject *__pyx_self, PyObject *__pyx_v_npdata); /*proto*/\n",
       "static PyMethodDef __pyx_mdef_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite = {\"cython_kpartite\", (PyCFunction)__pyx_pw_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite, METH_O, 0};\n",
       "static PyObject *__pyx_pw_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite(PyObject *__pyx_self, PyObject *__pyx_v_npdata) {\n",
       "  PyObject *__pyx_r = 0;\n",
       "  __Pyx_RefNannyDeclarations\n",
       "  __Pyx_RefNannySetupContext(\"cython_kpartite (wrapper)\", 0);\n",
       "  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_npdata), __pyx_ptype_5numpy_ndarray, 1, \"npdata\", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __pyx_r = __pyx_pf_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_cython_kpartite(__pyx_self, ((PyArrayObject *)__pyx_v_npdata));\n",
       "  CYTHON_UNUSED int __pyx_lineno = 0;\n",
       "  CYTHON_UNUSED const char *__pyx_filename = NULL;\n",
       "  CYTHON_UNUSED int __pyx_clineno = 0;\n",
       "\n",
       "  /* function exit code */\n",
       "  goto __pyx_L0;\n",
       "  __pyx_L1_error:;\n",
       "  __pyx_r = NULL;\n",
       "  __pyx_L0:;\n",
       "  __Pyx_RefNannyFinishContext();\n",
       "  return __pyx_r;\n",
       "}\n",
       "\n",
       "static PyObject *__pyx_pf_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_cython_kpartite(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_npdata) {\n",
       "  int __pyx_v_index;\n",
       "  PyArrayObject *__pyx_v_row = 0;\n",
       "  PyObject *__pyx_v_left = 0;\n",
       "  PyObject *__pyx_v_right = 0;\n",
       "  double __pyx_v_value;\n",
       "  PyObject *__pyx_v_a = 0;\n",
       "  PyObject *__pyx_v_b = 0;\n",
       "  PyObject *__pyx_v_key = 0;\n",
       "  PyObject *__pyx_v_result = 0;\n",
       "  PyObject *__pyx_r = NULL;\n",
       "  __Pyx_TraceDeclarations\n",
       "  __Pyx_TraceFrameInit(__pyx_codeobj_)\n",
       "  __Pyx_RefNannyDeclarations\n",
       "  __Pyx_RefNannySetupContext(\"cython_kpartite\", 0);\n",
       "  __Pyx_TraceCall(\"cython_kpartite\", __pyx_f[0], 4, 0, {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;});\n",
       "/* … */\n",
       "  /* function exit code */\n",
       "  __pyx_L1_error:;\n",
       "  __Pyx_XDECREF(__pyx_t_1);\n",
       "  __Pyx_XDECREF(__pyx_t_6);\n",
       "  __Pyx_XDECREF(__pyx_t_8);\n",
       "  __Pyx_XDECREF(__pyx_t_9);\n",
       "  __Pyx_XDECREF(__pyx_t_10);\n",
       "  __Pyx_AddTraceback(\"_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2.cython_kpartite\", __pyx_clineno, __pyx_lineno, __pyx_filename);\n",
       "  __pyx_r = NULL;\n",
       "  __pyx_L0:;\n",
       "  __Pyx_XDECREF((PyObject *)__pyx_v_row);\n",
       "  __Pyx_XDECREF(__pyx_v_left);\n",
       "  __Pyx_XDECREF(__pyx_v_right);\n",
       "  __Pyx_XDECREF(__pyx_v_a);\n",
       "  __Pyx_XDECREF(__pyx_v_b);\n",
       "  __Pyx_XDECREF(__pyx_v_key);\n",
       "  __Pyx_XDECREF(__pyx_v_result);\n",
       "  __Pyx_XGIVEREF(__pyx_r);\n",
       "  __Pyx_TraceReturn(__pyx_r, 0);\n",
       "  __Pyx_RefNannyFinishContext();\n",
       "  return __pyx_r;\n",
       "}\n",
       "/* … */\n",
       "  __pyx_tuple__8 = PyTuple_Pack(10, __pyx_n_s_npdata, __pyx_n_s_index, __pyx_n_s_row, __pyx_n_s_left, __pyx_n_s_right, __pyx_n_s_value, __pyx_n_s_a, __pyx_n_s_b, __pyx_n_s_key, __pyx_n_s_result); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_tuple__8);\n",
       "  __Pyx_GIVEREF(__pyx_tuple__8);\n",
       "/* … */\n",
       "  __Pyx_TraceLine(4,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite, 0, __pyx_n_s_cython_kpartite, NULL, __pyx_n_s_cython_magic_61780ddc8d4e3e1667, __pyx_d, ((PyObject *)__pyx_codeobj_)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_t_1);\n",
       "  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cython_kpartite, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "
 05:     cdef int index
\n", "
 06:     cdef np.ndarray row
\n", "
 07:     cdef list left, right
\n", "
 08:     cdef double value
\n", "
 09:     cdef str a, b
\n", "
 10:     cdef tuple key
\n", "
+11:     cdef dict result = {}
\n", "
  __Pyx_TraceLine(11,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_t_1);\n",
       "  __pyx_v_result = ((PyObject*)__pyx_t_1);\n",
       "  __pyx_t_1 = 0;\n",
       "
+12:     for index in range(npdata.shape[0]):
\n", "
  __Pyx_TraceLine(12,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_2 = (__pyx_v_npdata->dimensions[0]);\n",
       "  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {\n",
       "    __pyx_v_index = __pyx_t_3;\n",
       "
+13:         row = npdata[index]
\n", "
    __Pyx_TraceLine(13,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_npdata), __pyx_v_index, int, 1, __Pyx_PyInt_From_int, 0, 1, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;};\n",
       "    __Pyx_GOTREF(__pyx_t_1);\n",
       "    if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    __Pyx_XDECREF_SET(__pyx_v_row, ((PyArrayObject *)__pyx_t_1));\n",
       "    __pyx_t_1 = 0;\n",
       "
+14:         left = row[0]
\n", "
    __Pyx_TraceLine(14,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_row), 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;};\n",
       "    __Pyx_GOTREF(__pyx_t_1);\n",
       "    if (!(likely(PyList_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, \"Expected %.16s, got %.200s\", \"list\", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    __Pyx_XDECREF_SET(__pyx_v_left, ((PyObject*)__pyx_t_1));\n",
       "    __pyx_t_1 = 0;\n",
       "
+15:         right = row[1]
\n", "
    __Pyx_TraceLine(15,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_row), 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;};\n",
       "    __Pyx_GOTREF(__pyx_t_1);\n",
       "    if (!(likely(PyList_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, \"Expected %.16s, got %.200s\", \"list\", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    __Pyx_XDECREF_SET(__pyx_v_right, ((PyObject*)__pyx_t_1));\n",
       "    __pyx_t_1 = 0;\n",
       "
+16:         value = row[2]
\n", "
    __Pyx_TraceLine(16,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_row), 2, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;};\n",
       "    __Pyx_GOTREF(__pyx_t_1);\n",
       "    __pyx_t_4 = __pyx_PyFloat_AsDouble(__pyx_t_1); if (unlikely((__pyx_t_4 == (double)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "    __pyx_v_value = __pyx_t_4;\n",
       "
+17:         for a in left:
\n", "
    __Pyx_TraceLine(17,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    if (unlikely(__pyx_v_left == Py_None)) {\n",
       "      PyErr_SetString(PyExc_TypeError, \"'NoneType' object is not iterable\");\n",
       "      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    }\n",
       "    __pyx_t_1 = __pyx_v_left; __Pyx_INCREF(__pyx_t_1); __pyx_t_5 = 0;\n",
       "    for (;;) {\n",
       "      if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_1)) break;\n",
       "      #if CYTHON_COMPILING_IN_CPYTHON\n",
       "      __pyx_t_6 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_5); __Pyx_INCREF(__pyx_t_6); __pyx_t_5++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      #else\n",
       "      __pyx_t_6 = PySequence_ITEM(__pyx_t_1, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      __Pyx_GOTREF(__pyx_t_6);\n",
       "      #endif\n",
       "      if (!(likely(PyString_CheckExact(__pyx_t_6))||((__pyx_t_6) == Py_None)||(PyErr_Format(PyExc_TypeError, \"Expected %.16s, got %.200s\", \"str\", Py_TYPE(__pyx_t_6)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      __Pyx_XDECREF_SET(__pyx_v_a, ((PyObject*)__pyx_t_6));\n",
       "      __pyx_t_6 = 0;\n",
       "/* … */\n",
       "      __Pyx_TraceLine(17,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    }\n",
       "    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "  }\n",
       "
+18:             for b in right:
\n", "
      __Pyx_TraceLine(18,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "      if (unlikely(__pyx_v_right == Py_None)) {\n",
       "        PyErr_SetString(PyExc_TypeError, \"'NoneType' object is not iterable\");\n",
       "        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      }\n",
       "      __pyx_t_6 = __pyx_v_right; __Pyx_INCREF(__pyx_t_6); __pyx_t_7 = 0;\n",
       "      for (;;) {\n",
       "        if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_6)) break;\n",
       "        #if CYTHON_COMPILING_IN_CPYTHON\n",
       "        __pyx_t_8 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_7); __Pyx_INCREF(__pyx_t_8); __pyx_t_7++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        #else\n",
       "        __pyx_t_8 = PySequence_ITEM(__pyx_t_6, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_8);\n",
       "        #endif\n",
       "        if (!(likely(PyString_CheckExact(__pyx_t_8))||((__pyx_t_8) == Py_None)||(PyErr_Format(PyExc_TypeError, \"Expected %.16s, got %.200s\", \"str\", Py_TYPE(__pyx_t_8)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_XDECREF_SET(__pyx_v_b, ((PyObject*)__pyx_t_8));\n",
       "        __pyx_t_8 = 0;\n",
       "/* … */\n",
       "        __Pyx_TraceLine(18,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "      }\n",
       "      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;\n",
       "
+19:                 key = a, b
\n", "
        __Pyx_TraceLine(19,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "        __pyx_t_8 = PyTuple_New(2); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_8);\n",
       "        __Pyx_INCREF(__pyx_v_a);\n",
       "        __Pyx_GIVEREF(__pyx_v_a);\n",
       "        PyTuple_SET_ITEM(__pyx_t_8, 0, __pyx_v_a);\n",
       "        __Pyx_INCREF(__pyx_v_b);\n",
       "        __Pyx_GIVEREF(__pyx_v_b);\n",
       "        PyTuple_SET_ITEM(__pyx_t_8, 1, __pyx_v_b);\n",
       "        __Pyx_XDECREF_SET(__pyx_v_key, ((PyObject*)__pyx_t_8));\n",
       "        __pyx_t_8 = 0;\n",
       "
+20:                 result[key] = result.get(key, 0)  + value
\n", "
        __Pyx_TraceLine(20,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "        __pyx_t_8 = __Pyx_PyDict_GetItemDefault(__pyx_v_result, __pyx_v_key, __pyx_int_0); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_8);\n",
       "        __pyx_t_9 = PyFloat_FromDouble(__pyx_v_value); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_9);\n",
       "        __pyx_t_10 = PyNumber_Add(__pyx_t_8, __pyx_t_9); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_10);\n",
       "        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;\n",
       "        __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;\n",
       "        if (unlikely(PyDict_SetItem(__pyx_v_result, __pyx_v_key, __pyx_t_10) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;\n",
       "
+21:     return result
\n", "
  __Pyx_TraceLine(21,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __Pyx_XDECREF(__pyx_r);\n",
       "  __Pyx_INCREF(__pyx_v_result);\n",
       "  __pyx_r = __pyx_v_result;\n",
       "  goto __pyx_L0;\n",
       "
" ], "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%cython -a -f --compile-args=-DCYTHON_TRACE=1\n", "\n", "cimport numpy as np\n", "\n", "def cython_kpartite(np.ndarray npdata):\n", " cdef int index\n", " cdef np.ndarray row\n", " cdef list left, right\n", " cdef double value\n", " cdef str a, b\n", " cdef tuple key\n", " cdef dict result = {}\n", " for index in range(npdata.shape[0]):\n", " row = npdata[index]\n", " left = row[0]\n", " right = row[1]\n", " value = row[2]\n", " for a in left:\n", " for b in right:\n", " key = a, b\n", " result[key] = result.get(key, 0) + value\n", " return result" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 294 ms per loop\n" ] } ], "source": [ "%timeit cython_kpartite(middata.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Optimisation" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", " \n", " Cython: _cython_magic_16adb21d1da83629b403b2790d247d8a.pyx\n", " \n", " \n", "\n", "\n", "

Generated by Cython 0.23.4

\n", "

\n", " Yellow lines hint at Python interaction.
\n", " Click on a line that starts with a \"+\" to see the C code that Cython generated for it.\n", "

\n", "
 01: 
\n", "
+02: cimport numpy as np
\n", "
  __Pyx_TraceLine(2,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_t_1);\n",
       "  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "
 03: 
\n", "
 04: # cimport cython
\n", "
 05: # @cython.boundscheck(False)
\n", "
 06: # @cython.wraparound(False)
\n", "
 07: # @cython.overflowcheck(False)
\n", "
 08: # @cython.nonecheck(False)
\n", "
+09: def cython_kpartite2(                           # Split parameters on input (-80ms)
\n", "
/* Python wrapper */\n",
       "static PyObject *__pyx_pw_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/\n",
       "static PyMethodDef __pyx_mdef_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2 = {\"cython_kpartite2\", (PyCFunction)__pyx_pw_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2, METH_VARARGS|METH_KEYWORDS, 0};\n",
       "static PyObject *__pyx_pw_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {\n",
       "  PyArrayObject *__pyx_v_leftarray = 0;\n",
       "  PyArrayObject *__pyx_v_rightarray = 0;\n",
       "  PyArrayObject *__pyx_v_valuearray = 0;\n",
       "  PyObject *__pyx_r = 0;\n",
       "  __Pyx_RefNannyDeclarations\n",
       "  __Pyx_RefNannySetupContext(\"cython_kpartite2 (wrapper)\", 0);\n",
       "  {\n",
       "    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_leftarray,&__pyx_n_s_rightarray,&__pyx_n_s_valuearray,0};\n",
       "    PyObject* values[3] = {0,0,0};\n",
       "    if (unlikely(__pyx_kwds)) {\n",
       "      Py_ssize_t kw_args;\n",
       "      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);\n",
       "      switch (pos_args) {\n",
       "        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);\n",
       "        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);\n",
       "        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);\n",
       "        case  0: break;\n",
       "        default: goto __pyx_L5_argtuple_error;\n",
       "      }\n",
       "      kw_args = PyDict_Size(__pyx_kwds);\n",
       "      switch (pos_args) {\n",
       "        case  0:\n",
       "        if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_leftarray)) != 0)) kw_args--;\n",
       "        else goto __pyx_L5_argtuple_error;\n",
       "        case  1:\n",
       "        if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_rightarray)) != 0)) kw_args--;\n",
       "        else {\n",
       "          __Pyx_RaiseArgtupleInvalid(\"cython_kpartite2\", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}\n",
       "        }\n",
       "        case  2:\n",
       "        if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_valuearray)) != 0)) kw_args--;\n",
       "        else {\n",
       "          __Pyx_RaiseArgtupleInvalid(\"cython_kpartite2\", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}\n",
       "        }\n",
       "      }\n",
       "      if (unlikely(kw_args > 0)) {\n",
       "        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, \"cython_kpartite2\") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}\n",
       "      }\n",
       "    } else if (PyTuple_GET_SIZE(__pyx_args) != 3) {\n",
       "      goto __pyx_L5_argtuple_error;\n",
       "    } else {\n",
       "      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);\n",
       "      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);\n",
       "      values[2] = PyTuple_GET_ITEM(__pyx_args, 2);\n",
       "    }\n",
       "    __pyx_v_leftarray = ((PyArrayObject *)values[0]);\n",
       "    __pyx_v_rightarray = ((PyArrayObject *)values[1]);\n",
       "    __pyx_v_valuearray = ((PyArrayObject *)values[2]);\n",
       "  }\n",
       "  goto __pyx_L4_argument_unpacking_done;\n",
       "  __pyx_L5_argtuple_error:;\n",
       "  __Pyx_RaiseArgtupleInvalid(\"cython_kpartite2\", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}\n",
       "  __pyx_L3_error:;\n",
       "  __Pyx_AddTraceback(\"_cython_magic_16adb21d1da83629b403b2790d247d8a.cython_kpartite2\", __pyx_clineno, __pyx_lineno, __pyx_filename);\n",
       "  __Pyx_RefNannyFinishContext();\n",
       "  return NULL;\n",
       "  __pyx_L4_argument_unpacking_done:;\n",
       "  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_leftarray), __pyx_ptype_5numpy_ndarray, 1, \"leftarray\", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_rightarray), __pyx_ptype_5numpy_ndarray, 1, \"rightarray\", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_valuearray), __pyx_ptype_5numpy_ndarray, 1, \"valuearray\", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __pyx_r = __pyx_pf_46_cython_magic_16adb21d1da83629b403b2790d247d8a_cython_kpartite2(__pyx_self, __pyx_v_leftarray, __pyx_v_rightarray, __pyx_v_valuearray);\n",
       "  int __pyx_lineno = 0;\n",
       "  const char *__pyx_filename = NULL;\n",
       "  int __pyx_clineno = 0;\n",
       "\n",
       "  /* function exit code */\n",
       "  goto __pyx_L0;\n",
       "  __pyx_L1_error:;\n",
       "  __pyx_r = NULL;\n",
       "  __pyx_L0:;\n",
       "  __Pyx_RefNannyFinishContext();\n",
       "  return __pyx_r;\n",
       "}\n",
       "\n",
       "static PyObject *__pyx_pf_46_cython_magic_16adb21d1da83629b403b2790d247d8a_cython_kpartite2(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_leftarray, PyArrayObject *__pyx_v_rightarray, PyArrayObject *__pyx_v_valuearray) {\n",
       "  int __pyx_v_index;\n",
       "  int __pyx_v_n;\n",
       "  PyObject *__pyx_v_result = NULL;\n",
       "  PyObject *__pyx_v_right = NULL;\n",
       "  PyObject *__pyx_v_value = NULL;\n",
       "  PyObject *__pyx_v_a = NULL;\n",
       "  PyObject *__pyx_v_b = NULL;\n",
       "  PyObject *__pyx_v_key = NULL;\n",
       "  __Pyx_LocalBuf_ND __pyx_pybuffernd_leftarray;\n",
       "  __Pyx_Buffer __pyx_pybuffer_leftarray;\n",
       "  __Pyx_LocalBuf_ND __pyx_pybuffernd_rightarray;\n",
       "  __Pyx_Buffer __pyx_pybuffer_rightarray;\n",
       "  __Pyx_LocalBuf_ND __pyx_pybuffernd_valuearray;\n",
       "  __Pyx_Buffer __pyx_pybuffer_valuearray;\n",
       "  PyObject *__pyx_r = NULL;\n",
       "  __Pyx_TraceDeclarations\n",
       "  __Pyx_TraceFrameInit(__pyx_codeobj_)\n",
       "  __Pyx_RefNannyDeclarations\n",
       "  __Pyx_RefNannySetupContext(\"cython_kpartite2\", 0);\n",
       "  __Pyx_TraceCall(\"cython_kpartite2\", __pyx_f[0], 9, 0, {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;});\n",
       "  __pyx_pybuffer_leftarray.pybuffer.buf = NULL;\n",
       "  __pyx_pybuffer_leftarray.refcount = 0;\n",
       "  __pyx_pybuffernd_leftarray.data = NULL;\n",
       "  __pyx_pybuffernd_leftarray.rcbuffer = &__pyx_pybuffer_leftarray;\n",
       "  __pyx_pybuffer_rightarray.pybuffer.buf = NULL;\n",
       "  __pyx_pybuffer_rightarray.refcount = 0;\n",
       "  __pyx_pybuffernd_rightarray.data = NULL;\n",
       "  __pyx_pybuffernd_rightarray.rcbuffer = &__pyx_pybuffer_rightarray;\n",
       "  __pyx_pybuffer_valuearray.pybuffer.buf = NULL;\n",
       "  __pyx_pybuffer_valuearray.refcount = 0;\n",
       "  __pyx_pybuffernd_valuearray.data = NULL;\n",
       "  __pyx_pybuffernd_valuearray.rcbuffer = &__pyx_pybuffer_valuearray;\n",
       "  {\n",
       "    __Pyx_BufFmt_StackElem __pyx_stack[1];\n",
       "    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_leftarray.rcbuffer->pybuffer, (PyObject*)__pyx_v_leftarray, &__Pyx_TypeInfo_object, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  }\n",
       "  __pyx_pybuffernd_leftarray.diminfo[0].strides = __pyx_pybuffernd_leftarray.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_leftarray.diminfo[0].shape = __pyx_pybuffernd_leftarray.rcbuffer->pybuffer.shape[0];\n",
       "  {\n",
       "    __Pyx_BufFmt_StackElem __pyx_stack[1];\n",
       "    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_rightarray.rcbuffer->pybuffer, (PyObject*)__pyx_v_rightarray, &__Pyx_TypeInfo_object, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  }\n",
       "  __pyx_pybuffernd_rightarray.diminfo[0].strides = __pyx_pybuffernd_rightarray.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_rightarray.diminfo[0].shape = __pyx_pybuffernd_rightarray.rcbuffer->pybuffer.shape[0];\n",
       "  {\n",
       "    __Pyx_BufFmt_StackElem __pyx_stack[1];\n",
       "    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_valuearray.rcbuffer->pybuffer, (PyObject*)__pyx_v_valuearray, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  }\n",
       "  __pyx_pybuffernd_valuearray.diminfo[0].strides = __pyx_pybuffernd_valuearray.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_valuearray.diminfo[0].shape = __pyx_pybuffernd_valuearray.rcbuffer->pybuffer.shape[0];\n",
       "/* … */\n",
       "  /* function exit code */\n",
       "  __pyx_L1_error:;\n",
       "  __Pyx_XDECREF(__pyx_t_1);\n",
       "  __Pyx_XDECREF(__pyx_t_8);\n",
       "  __Pyx_XDECREF(__pyx_t_12);\n",
       "  __Pyx_XDECREF(__pyx_t_13);\n",
       "  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;\n",
       "    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);\n",
       "    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_leftarray.rcbuffer->pybuffer);\n",
       "    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_rightarray.rcbuffer->pybuffer);\n",
       "    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_valuearray.rcbuffer->pybuffer);\n",
       "  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}\n",
       "  __Pyx_AddTraceback(\"_cython_magic_16adb21d1da83629b403b2790d247d8a.cython_kpartite2\", __pyx_clineno, __pyx_lineno, __pyx_filename);\n",
       "  __pyx_r = NULL;\n",
       "  goto __pyx_L2;\n",
       "  __pyx_L0:;\n",
       "  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_leftarray.rcbuffer->pybuffer);\n",
       "  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_rightarray.rcbuffer->pybuffer);\n",
       "  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_valuearray.rcbuffer->pybuffer);\n",
       "  __pyx_L2:;\n",
       "  __Pyx_XDECREF(__pyx_v_result);\n",
       "  __Pyx_XDECREF(__pyx_v_right);\n",
       "  __Pyx_XDECREF(__pyx_v_value);\n",
       "  __Pyx_XDECREF(__pyx_v_a);\n",
       "  __Pyx_XDECREF(__pyx_v_b);\n",
       "  __Pyx_XDECREF(__pyx_v_key);\n",
       "  __Pyx_XGIVEREF(__pyx_r);\n",
       "  __Pyx_TraceReturn(__pyx_r, 0);\n",
       "  __Pyx_RefNannyFinishContext();\n",
       "  return __pyx_r;\n",
       "}\n",
       "/* … */\n",
       "  __pyx_tuple__8 = PyTuple_Pack(11, __pyx_n_s_leftarray, __pyx_n_s_rightarray, __pyx_n_s_valuearray, __pyx_n_s_index, __pyx_n_s_n, __pyx_n_s_result, __pyx_n_s_right, __pyx_n_s_value, __pyx_n_s_a, __pyx_n_s_b, __pyx_n_s_key); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_tuple__8);\n",
       "  __Pyx_GIVEREF(__pyx_tuple__8);\n",
       "/* … */\n",
       "  __Pyx_TraceLine(9,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2, 0, __pyx_n_s_cython_kpartite2, NULL, __pyx_n_s_cython_magic_16adb21d1da83629b4, __pyx_d, ((PyObject *)__pyx_codeobj_)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_t_1);\n",
       "  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cython_kpartite2, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "
 10:         np.ndarray[list] leftarray,
\n", "
 11:         np.ndarray[list] rightarray,
\n", "
 12:         np.ndarray[double] valuearray):
\n", "
+13:     cdef int index, n = leftarray.shape[0]
\n", "
  __Pyx_TraceLine(13,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_v_n = (__pyx_v_leftarray->dimensions[0]);\n",
       "
 14:     # cdef list left, right
\n", "
 15:     # cdef double value
\n", "
+16:     result = {}
\n", "
  __Pyx_TraceLine(16,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "  __Pyx_GOTREF(__pyx_t_1);\n",
       "  __pyx_v_result = ((PyObject*)__pyx_t_1);\n",
       "  __pyx_t_1 = 0;\n",
       "
+17:     for index in range(n):
\n", "
  __Pyx_TraceLine(17,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __pyx_t_2 = __pyx_v_n;\n",
       "  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {\n",
       "    __pyx_v_index = __pyx_t_3;\n",
       "
+18:         right = rightarray[index]
\n", "
    __Pyx_TraceLine(18,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_4 = __pyx_v_index;\n",
       "    __pyx_t_5 = -1;\n",
       "    if (__pyx_t_4 < 0) {\n",
       "      __pyx_t_4 += __pyx_pybuffernd_rightarray.diminfo[0].shape;\n",
       "      if (unlikely(__pyx_t_4 < 0)) __pyx_t_5 = 0;\n",
       "    } else if (unlikely(__pyx_t_4 >= __pyx_pybuffernd_rightarray.diminfo[0].shape)) __pyx_t_5 = 0;\n",
       "    if (unlikely(__pyx_t_5 != -1)) {\n",
       "      __Pyx_RaiseBufferIndexError(__pyx_t_5);\n",
       "      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    }\n",
       "    __pyx_t_1 = (PyObject *) *__Pyx_BufPtrStrided1d(PyObject **, __pyx_pybuffernd_rightarray.rcbuffer->pybuffer.buf, __pyx_t_4, __pyx_pybuffernd_rightarray.diminfo[0].strides);\n",
       "    __Pyx_INCREF((PyObject*)__pyx_t_1);\n",
       "    __Pyx_XDECREF_SET(__pyx_v_right, __pyx_t_1);\n",
       "    __pyx_t_1 = 0;\n",
       "
+19:         value = valuearray[index]
\n", "
    __Pyx_TraceLine(19,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_6 = __pyx_v_index;\n",
       "    __pyx_t_5 = -1;\n",
       "    if (__pyx_t_6 < 0) {\n",
       "      __pyx_t_6 += __pyx_pybuffernd_valuearray.diminfo[0].shape;\n",
       "      if (unlikely(__pyx_t_6 < 0)) __pyx_t_5 = 0;\n",
       "    } else if (unlikely(__pyx_t_6 >= __pyx_pybuffernd_valuearray.diminfo[0].shape)) __pyx_t_5 = 0;\n",
       "    if (unlikely(__pyx_t_5 != -1)) {\n",
       "      __Pyx_RaiseBufferIndexError(__pyx_t_5);\n",
       "      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    }\n",
       "    __pyx_t_1 = PyFloat_FromDouble((*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_valuearray.rcbuffer->pybuffer.buf, __pyx_t_6, __pyx_pybuffernd_valuearray.diminfo[0].strides))); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    __Pyx_GOTREF(__pyx_t_1);\n",
       "    __Pyx_XDECREF_SET(__pyx_v_value, __pyx_t_1);\n",
       "    __pyx_t_1 = 0;\n",
       "
+20:         for a in leftarray[index]:
\n", "
    __Pyx_TraceLine(20,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    __pyx_t_7 = __pyx_v_index;\n",
       "    __pyx_t_5 = -1;\n",
       "    if (__pyx_t_7 < 0) {\n",
       "      __pyx_t_7 += __pyx_pybuffernd_leftarray.diminfo[0].shape;\n",
       "      if (unlikely(__pyx_t_7 < 0)) __pyx_t_5 = 0;\n",
       "    } else if (unlikely(__pyx_t_7 >= __pyx_pybuffernd_leftarray.diminfo[0].shape)) __pyx_t_5 = 0;\n",
       "    if (unlikely(__pyx_t_5 != -1)) {\n",
       "      __Pyx_RaiseBufferIndexError(__pyx_t_5);\n",
       "      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    }\n",
       "    __pyx_t_1 = (PyObject *) *__Pyx_BufPtrStrided1d(PyObject **, __pyx_pybuffernd_leftarray.rcbuffer->pybuffer.buf, __pyx_t_7, __pyx_pybuffernd_leftarray.diminfo[0].strides);\n",
       "    __Pyx_INCREF((PyObject*)__pyx_t_1);\n",
       "    if (unlikely(__pyx_t_1 == Py_None)) {\n",
       "      PyErr_SetString(PyExc_TypeError, \"'NoneType' object is not iterable\");\n",
       "      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "    }\n",
       "    __pyx_t_8 = __pyx_t_1; __Pyx_INCREF(__pyx_t_8); __pyx_t_9 = 0;\n",
       "    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "    for (;;) {\n",
       "      if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_8)) break;\n",
       "      #if CYTHON_COMPILING_IN_CPYTHON\n",
       "      __pyx_t_1 = PyList_GET_ITEM(__pyx_t_8, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      #else\n",
       "      __pyx_t_1 = PySequence_ITEM(__pyx_t_8, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      __Pyx_GOTREF(__pyx_t_1);\n",
       "      #endif\n",
       "      __Pyx_XDECREF_SET(__pyx_v_a, __pyx_t_1);\n",
       "      __pyx_t_1 = 0;\n",
       "/* … */\n",
       "      __Pyx_TraceLine(20,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "    }\n",
       "    __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;\n",
       "  }\n",
       "
+21:             for b in right:
\n", "
      __Pyx_TraceLine(21,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "      if (likely(PyList_CheckExact(__pyx_v_right)) || PyTuple_CheckExact(__pyx_v_right)) {\n",
       "        __pyx_t_1 = __pyx_v_right; __Pyx_INCREF(__pyx_t_1); __pyx_t_10 = 0;\n",
       "        __pyx_t_11 = NULL;\n",
       "      } else {\n",
       "        __pyx_t_10 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_right); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_1);\n",
       "        __pyx_t_11 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "      }\n",
       "      for (;;) {\n",
       "        if (likely(!__pyx_t_11)) {\n",
       "          if (likely(PyList_CheckExact(__pyx_t_1))) {\n",
       "            if (__pyx_t_10 >= PyList_GET_SIZE(__pyx_t_1)) break;\n",
       "            #if CYTHON_COMPILING_IN_CPYTHON\n",
       "            __pyx_t_12 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_10); __Pyx_INCREF(__pyx_t_12); __pyx_t_10++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "            #else\n",
       "            __pyx_t_12 = PySequence_ITEM(__pyx_t_1, __pyx_t_10); __pyx_t_10++; if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "            __Pyx_GOTREF(__pyx_t_12);\n",
       "            #endif\n",
       "          } else {\n",
       "            if (__pyx_t_10 >= PyTuple_GET_SIZE(__pyx_t_1)) break;\n",
       "            #if CYTHON_COMPILING_IN_CPYTHON\n",
       "            __pyx_t_12 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_10); __Pyx_INCREF(__pyx_t_12); __pyx_t_10++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "            #else\n",
       "            __pyx_t_12 = PySequence_ITEM(__pyx_t_1, __pyx_t_10); __pyx_t_10++; if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "            __Pyx_GOTREF(__pyx_t_12);\n",
       "            #endif\n",
       "          }\n",
       "        } else {\n",
       "          __pyx_t_12 = __pyx_t_11(__pyx_t_1);\n",
       "          if (unlikely(!__pyx_t_12)) {\n",
       "            PyObject* exc_type = PyErr_Occurred();\n",
       "            if (exc_type) {\n",
       "              if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();\n",
       "              else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "            }\n",
       "            break;\n",
       "          }\n",
       "          __Pyx_GOTREF(__pyx_t_12);\n",
       "        }\n",
       "        __Pyx_XDECREF_SET(__pyx_v_b, __pyx_t_12);\n",
       "        __pyx_t_12 = 0;\n",
       "/* … */\n",
       "        __Pyx_TraceLine(21,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "      }\n",
       "      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;\n",
       "
+22:                 key = a, b                     # TODO: Use a string instead of a tuple (saves -80ms)
\n", "
        __Pyx_TraceLine(22,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "        __pyx_t_12 = PyTuple_New(2); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_12);\n",
       "        __Pyx_INCREF(__pyx_v_a);\n",
       "        __Pyx_GIVEREF(__pyx_v_a);\n",
       "        PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_v_a);\n",
       "        __Pyx_INCREF(__pyx_v_b);\n",
       "        __Pyx_GIVEREF(__pyx_v_b);\n",
       "        PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_v_b);\n",
       "        __Pyx_XDECREF_SET(__pyx_v_key, ((PyObject*)__pyx_t_12));\n",
       "        __pyx_t_12 = 0;\n",
       "
+23:                 result[key] = result.get(key, 0.0) + value
\n", "
        __Pyx_TraceLine(23,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "        __pyx_t_12 = __Pyx_PyDict_GetItemDefault(__pyx_v_result, __pyx_v_key, __pyx_float_0_0); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_12);\n",
       "        __pyx_t_13 = PyNumber_Add(__pyx_t_12, __pyx_v_value); if (unlikely(!__pyx_t_13)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_GOTREF(__pyx_t_13);\n",
       "        __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;\n",
       "        if (unlikely(PyDict_SetItem(__pyx_v_result, __pyx_v_key, __pyx_t_13) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}\n",
       "        __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;\n",
       "
+24:     return result
\n", "
  __Pyx_TraceLine(24,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;})\n",
       "  __Pyx_XDECREF(__pyx_r);\n",
       "  __Pyx_INCREF(__pyx_v_result);\n",
       "  __pyx_r = __pyx_v_result;\n",
       "  goto __pyx_L0;\n",
       "
" ], "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%cython -a -f --compile-args=-DCYTHON_TRACE=1\n", "\n", "cimport numpy as np\n", "\n", "# cimport cython\n", "# @cython.boundscheck(False)\n", "# @cython.wraparound(False)\n", "# @cython.overflowcheck(False)\n", "# @cython.nonecheck(False)\n", "def cython_kpartite2( # Split parameters on input (-80ms)\n", " np.ndarray[list] leftarray,\n", " np.ndarray[list] rightarray,\n", " np.ndarray[double] valuearray):\n", " cdef int index, n = leftarray.shape[0]\n", " # cdef list left, right\n", " # cdef double value\n", " result = {}\n", " for index in range(n):\n", " right = rightarray[index]\n", " value = valuearray[index]\n", " for a in leftarray[index]:\n", " for b in right:\n", " key = a, b # TODO: Use a string instead of a tuple (saves -80ms)\n", " result[key] = result.get(key, 0.0) + value\n", " return result" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 219 ms per loop\n" ] } ], "source": [ "%timeit cython_kpartite2(middata.left.values, middata.right.values, middata.value.astype(float).values)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Timer unit: 4.66511e-07 s\n", "\n", "Total time: 1.90599 s\n", "File: D:\\cygwin64\\home\\Anand\\.ipython\\cython\\_cython_magic_16adb21d1da83629b403b2790d247d8a.pyx\n", "Function: cython_kpartite2 at line 9\n", "\n", "Line # Hits Time Per Hit % Time Line Contents\n", "==============================================================\n", " 9 def cython_kpartite2( # Split parameters on input (-80ms)\n", " 10 np.ndarray[list] leftarray,\n", " 11 np.ndarray[list] rightarray,\n", " 12 np.ndarray[double] valuearray):\n", " 13 1 4 4.0 0.0 cdef int index, n = leftarray.shape[0]\n", " 14 # cdef list left, right\n", " 15 # cdef double value\n", " 16 1 1 1.0 0.0 result = {}\n", " 17 1 1 1.0 0.0 for index in range(n):\n", " 18 200000 187425 0.9 4.6 right = rightarray[index]\n", " 19 200000 188811 0.9 4.6 value = valuearray[index]\n", " 20 600000 570909 1.0 14.0 for a in leftarray[index]:\n", " 21 1200000 1132739 0.9 27.7 for b in right:\n", " 22 800000 794150 1.0 19.4 key = a, b # TODO: Use a string instead of a tuple (saves -80ms)\n", " 23 800000 1211571 1.5 29.7 result[key] = result.get(key, 0.0) + value\n", " 24 1 8 8.0 0.0 return result\n", "\n" ] } ], "source": [ "profile = line_profiler.LineProfiler(cython_kpartite2)\n", "profile.runcall(cython_kpartite2, middata.left.values, middata.right.values, middata.value.astype(float).values)\n", "profile.print_stats()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython\n", "\n", "cimport numpy as np\n", "\n", "def bipartite(np.ndarray[list] leftarray, np.ndarray[list] rightarray):\n", " cdef int index, n = leftarray.shape[0]\n", " cdef dict result = {}\n", " cdef list right\n", " cdef str a, b\n", " for index in range(n):\n", " right = rightarray[index]\n", " for a in leftarray[index]:\n", " for b in right:\n", " key = a, b\n", " result[key] = result.get(key, 0) + 1\n", " return result" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 196 ms per loop\n" ] } ], "source": [ "%timeit bipartite(middata.left.values, middata.right.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "# Generalised K-Partite\n", "\n", "Let's generalise the problem. If we want multi-partite counts across multiple columns, and each could be multi-valued, like this:\n", "\n", " ColA ColB ColC\n", " A1,A2 B1 C1,C2,C3\n", " A1,A3 - C1\n", " A1 B1,B2 C2,C3\n", " \n", "In that case, the link pairs we want are:\n", "\n", " ColA ColB ColC => Internal links External links\n", " A1,A2 B1 C1,C2,C3 A1-A2, C1-C2, C1-C3, C2-C3 A1-B1, A1-C1, A1-C2, A1-C3, A2-B1, A2-C1, A2-C2, A2-C3\n", " A1,A3 - C1 A1-A3 A1-C1, A3-C1\n", " A1 B1,B2 C2,C3 B1-B2, C2-C3 A1-B1, A1-B2, A1-C2, A1-C3, B1-C2, B1-C3, B2-C2, B2-C3\n", "\n", "The psuedo-code for this is:\n", "\n", " for row in data: # Handle every row independently\n", " for column1 in columns: # Loop through every column\n", " for value1 in row[column1]: # Loop through each item in the list\n", " for column2 in columns: # Loop through every column again\n", " if column2 < column1: # But ignore columns already examined\n", " continue\n", " for value2 in row[column2]: # Loop through each item in the list\n", " count[value1, value2] += 1 # Increment the counter for every pair of values\n", " \n", "Let's get this right in Python first." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = pd.DataFrame({\n", " 'ColA': [['A1', 'A2'], ['A1','A3'], ['A1']],\n", " 'ColB': [['B1'], [], ['B1', 'B2']],\n", " 'ColC': [['C1', 'C2', 'C3'], ['C1'], ['C2', 'C3']]\n", " })" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def python_kpartite(data):\n", " columns = range(len(data.columns))\n", " count = Counter()\n", " for row in data.values:\n", " for column1 in columns:\n", " for value1 in row[column1]:\n", " for column2 in columns:\n", " if column2 < column1:\n", " continue\n", " for value2 in row[column2]:\n", " count[value1, value2] += 1\n", " return pd.Series(count)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How fast is it?\n", "\n", "Do we even need to optimise it? Let's take a look at it's speed on 30,000 rows:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 1.34 s per loop\n" ] } ], "source": [ "%timeit python_kpartite(pd.concat([data]*10000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at some real-life data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "airline = pd.read_csv('airline-patents.csv')\n", "\n", "# Convert all string columns to arrays -- split by \";\" and deduplicate non-empty values\n", "# columns = [col for col in airline.columns if airline[col].dtype.kind == 'O']\n", "columns = ['PA', 'IPC', 'INV', 'keywords']\n", "for col in columns:\n", " airline[col] = airline[col].str.split(';').apply(lambda v: [x for x in set(v) if x])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PAIPCINVkeywords
0[MESSERSCHMITT BOELKOW BLOHM][G10K, B64C, F04D][BSCHORR OSKAR][angular range, control system, fuselage skin,...
1[MESSERSCHMITT BOELKOW BLOHM][F02C, B64D][JUNGCLAUS GUENTHER, KROHN ERNST OTTO][air outlet port, air passage, missile body, a...
2[CRESSWELL ARNOLD W, VOSS WALDEMAR E, ERNSTING...[A62B, B64D][CRESSWELL ARNOLD W, VOSS WALDEMAR E, ERNSTING...[air supply duct, sectional area, non-return v...
3[Unassigned][C08L][CHEN JOHN Y][gelatinous compositions, soft gelatinous elas...
4[Unassigned][B01D, C10G][CHEN JOHN Y][gel rigidity region, metal foil, synthetic re...
\n", "
" ], "text/plain": [ " PA IPC \\\n", "0 [MESSERSCHMITT BOELKOW BLOHM] [G10K, B64C, F04D] \n", "1 [MESSERSCHMITT BOELKOW BLOHM] [F02C, B64D] \n", "2 [CRESSWELL ARNOLD W, VOSS WALDEMAR E, ERNSTING... [A62B, B64D] \n", "3 [Unassigned] [C08L] \n", "4 [Unassigned] [B01D, C10G] \n", "\n", " INV \\\n", "0 [BSCHORR OSKAR] \n", "1 [JUNGCLAUS GUENTHER, KROHN ERNST OTTO] \n", "2 [CRESSWELL ARNOLD W, VOSS WALDEMAR E, ERNSTING... \n", "3 [CHEN JOHN Y] \n", "4 [CHEN JOHN Y] \n", "\n", " keywords \n", "0 [angular range, control system, fuselage skin,... \n", "1 [air outlet port, air passage, missile body, a... \n", "2 [air supply duct, sectional area, non-return v... \n", "3 [gelatinous compositions, soft gelatinous elas... \n", "4 [gel rigidity region, metal foil, synthetic re... " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "airline[columns].head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "20000" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(airline)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 770 ms per loop\n" ] } ], "source": [ "%timeit python_kpartite(airline[columns].head(1000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'd like this to be at much faster." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Naive Cython\n", "\n", "Let's just copy the same algorithm into Cython and see how fast it is." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%load_ext Cython" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython\n", "\n", "from collections import Counter\n", "\n", "def cython_kpartite(data):\n", " columns = range(len(data.columns))\n", " count = Counter()\n", " for row in data.values:\n", " for column1 in columns:\n", " for value1 in row[column1]:\n", " for column2 in columns:\n", " if column2 < column1:\n", " continue\n", " for value2 in row[column2]:\n", " count[value1, value2] += 1\n", " return count" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 3: 292 ms per loop\n" ] } ], "source": [ "%timeit cython_kpartite(airline[columns].head(1000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### It's the counting that's slow\n", "\n", "Actually, let's run *the same algorithm* but instead of incrementing a `collections.Counter`, lets increment an integer and see how much time it takes." ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%%cython\n", "\n", "def cython_kpartite(data):\n", " columns = range(len(data.columns))\n", " count = 0\n", " for row in data.values:\n", " for column1 in columns:\n", " for value1 in row[column1]:\n", " for column2 in columns:\n", " if column2 < column1:\n", " continue\n", " for value2 in row[column2]:\n", " count += 1\n", " return count" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 9.72 ms per loop\n" ] } ], "source": [ "%timeit cython_kpartite(airline[columns].head(1000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "That's much faster. Not bad, but not good enough. Plus, we need to see how to optimise `collections.Counter`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### We're creating too many links!\n", "\n", "Let's see how many links this will create:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "303,550\n", "3,722,267\n" ] } ], "source": [ "print '{:,d}'.format(cython_kpartite(airline[columns].head(1000)))\n", "print '{:,d}'.format(cython_kpartite(airline[columns].head(20000)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "That's a **HUGE** number links.\n", "\n", "Obviously, we're not interested in the long tail. We'd be primarily interested in the connections between the largest nodes. So we need to rework our approach:\n", "\n", "1. Find the N largest nodes (where N can be specified by the user)\n", "2. Find the links between the largest nodes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Find the N largest nodes\n", "\n", "Since every value is associated with every other value in a row, calculating this is simple:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython -f --compile-args=-DCYTHON_TRACE=1\n", "\n", "from collections import defaultdict\n", "\n", "def cython_count(data):\n", " cdef int ncols = len(data.columns) # ncols = number of columns in data\n", " cdef int rowsize # rowsize = number of nodes in each row\n", " count = defaultdict(int) # count[node] = number of links the node has\n", " for row in data.values: # We loop through each row\n", " rowsize = 0\n", " for col in range(ncols): # ... and calculate the number of nodes in this row\n", " rowsize += len(row[col])\n", " for col in range(ncols): # In every column...\n", " values = row[col]\n", " for index in range(len(values)): # ... for every node...\n", " count[col, values[index]] += rowsize # ... we add the number of nodes in the row to its count\n", " \n", " return count # Count is now a dict that has the number of links against each node" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10 loops, best of 3: 131 ms per loop\n" ] } ], "source": [ "%timeit cython_count(airline[columns].head(100000))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Incidentally, even here, the bulk of the time goes into `defaultdict(int)`. We need a faster counter." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Link the top N nodes\n", "\n", "Let's just create the links for the top nodes. We can skip anything that's not in that list.\n", "\n", "Here's what we could do in every row:\n", "\n", "1. Get the (column, value) for every column and value\n", "2. Map it to a node index\n", "3. Increment all combinations of every node index pairs" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython\n", "\n", "from collections import defaultdict\n", "\n", "def cython_kpartite(data, count):\n", " cdef int ncols = len(data.columns) # ncols = number of columns in data\n", "\n", " # For the top nodes, let's get the (column, value) -> node-index\n", " nodes = count.keys()\n", " index = {key: i for i, key in enumerate(count)}\n", " \n", " # Here's a slow implementation of the logic\n", " links = defaultdict(int)\n", " for row in data.values:\n", " indices = []\n", " for col in range(ncols):\n", " values = row[col]\n", " for i in range(len(values)):\n", " key = (col, values[i])\n", " if key in index:\n", " indices.append(index[key])\n", " n = len(indices)\n", " for i in range(n):\n", " for j in range(i + 1, n):\n", " links[i, j] += 1\n", " return nodes, links" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "([(0, 'AIRBUS'),\n", " (0, 'BOEING CO'),\n", " (1, 'B64D'),\n", " (1, 'G05D'),\n", " (0, 'Unassigned'),\n", " (1, 'B64F'),\n", " (1, 'B64G'),\n", " (1, 'G06F'),\n", " (1, 'G01C'),\n", " (1, 'B64C')],\n", " defaultdict(int,\n", " {(0, 1): 3565,\n", " (0, 2): 799,\n", " (0, 3): 137,\n", " (0, 4): 10,\n", " (0, 5): 2,\n", " (1, 2): 799,\n", " (1, 3): 137,\n", " (1, 4): 10,\n", " (1, 5): 2,\n", " (2, 3): 137,\n", " (2, 4): 10,\n", " (2, 5): 2,\n", " (3, 4): 10,\n", " (3, 5): 2,\n", " (4, 5): 2}))" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def top(data, top):\n", " return dict(sorted(data.items(), key=lambda v: -v[1])[:top])\n", "\n", "count = cython_count(airline[columns].head(10000))\n", "cython_kpartite(airline[columns].head(10000), top(count, 10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Memory views\n", "\n", "Functionally this works fine. Now let's optimise it. [Memory views](http://docs.cython.org/src/userguide/memoryviews.html) let us access NumPy arrays' memory directly. Let's do that." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 1: 204 ms per loop\n" ] } ], "source": [ "# Unoptimised time:\n", "count = cython_count(airline[columns])\n", "%timeit -r1 cython_kpartite(airline[columns].head(100000), top(count, 500))" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython -f --compile-args=-DCYTHON_TRACE=1\n", "\n", "from collections import defaultdict\n", "import numpy as np\n", "cimport numpy as np\n", "\n", "def cython_kpartite_memoryview(data, count):\n", " cdef int ncols = len(data.columns) # ncols = number of columns in data\n", "\n", " # For the top nodes, let's get the (column, value) -> node-index\n", " nodes = count.keys()\n", " index = {key: i for i, key in enumerate(count)}\n", "\n", " cdef int nodecount = len(nodes)\n", " cdef np.ndarray links = np.zeros([nodecount, nodecount], dtype=np.int)\n", " cdef int [:, :] link_view = links\n", " cdef int i, j, n\n", " for row in data.values:\n", " indices = []\n", " for col in range(ncols):\n", " values = row[col]\n", " for i in range(len(values)):\n", " key = (col, values[i])\n", " if key in index:\n", " indices.append(index[key])\n", " n = len(indices)\n", " for i in range(n):\n", " for j in range(i + 1, n):\n", " link_view[i, j] += 1\n", " return count, np.argwhere(links > 0)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10 loops, best of 3: 116 ms per loop\n" ] } ], "source": [ "%timeit cython_kpartite_memoryview(airline[columns].head(100000), top(count, 500))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "That's a good improvement. Now let's put it together:" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "warning: D:\\cygwin64\\home\\Anand\\.ipython\\cython\\_cython_magic_e58ac54db67eda082396b010cb77d72e.pyx:41:33: Index should be typed for more efficient access\n" ] } ], "source": [ "%%cython -f --compile-args=-DCYTHON_TRACE=1\n", "\n", "from collections import defaultdict\n", "import numpy as np\n", "cimport numpy as np\n", "\n", "def cython_kpartite_full(data, top=None):\n", " cdef int ncols = len(data.columns) # ncols = number of columns in data\n", " cdef int rowsize # rowsize = number of nodes in each row\n", " count = defaultdict(int) # count[node] = number of links the node has\n", " for row in data.values: # We loop through each row\n", " rowsize = 0\n", " for col in range(ncols): # ... and calculate the number of nodes in this row\n", " rowsize += len(row[col])\n", " for col in range(ncols): # In every column...\n", " values = row[col]\n", " for index in range(len(values)): # ... for every node...\n", " count[col, values[index]] += rowsize # ... we add the number of nodes in the row to its count\n", " \n", " if callable(top):\n", " count = top(count)\n", " \n", " # For the top nodes, let's get the (column, value) -> node-index\n", " nodes = count.keys()\n", " index = {key: i for i, key in enumerate(count)}\n", "\n", " cdef int nodecount = len(nodes)\n", " cdef np.ndarray links = np.zeros([nodecount, nodecount], dtype=np.int)\n", " cdef int [:, :] link_view = links\n", " cdef int i, j, n\n", " for row in data.values:\n", " indices = []\n", " for col in range(ncols):\n", " values = row[col]\n", " for i in range(len(values)):\n", " key = (col, values[i])\n", " if key in index:\n", " indices.append(index[key])\n", " n = len(indices)\n", " for i in range(n):\n", " for j in range(i + 1, n):\n", " link_view[indices[i], indices[j]] += 1\n", " return count, np.argwhere(links > 0)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 1: 284 ms per loop\n" ] } ], "source": [ "%timeit -r1 cython_kpartite_full(airline[columns], top=lambda count: top(count, 500))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Further optimisation\n", "\n", "This is the optimised code:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%cython\n", "\n", "import numpy as np\n", "cimport numpy as np\n", "\n", "cimport cython\n", "@cython.boundscheck(False)\n", "@cython.wraparound(False)\n", "def cython_kpartite_optimised(data, subset=None):\n", " # This block takes ~2s\n", " cdef int ncols = len(data.columns) # ncols = number of columns in data\n", " cdef int rowsize # rowsize = number of nodes in each row\n", " cdef int i, j, n\n", " count = {}\n", " for row in data.values: # We loop through each row\n", " rowsize = 0\n", " for i in range(ncols): # ... and calculate the number of values in this row\n", " rowsize += len(row[i])\n", " for i in range(ncols): # Add this rowsize against each value in each column\n", " subcount = count.setdefault(i, {})\n", " for val in row[i]:\n", " subcount[val] = rowsize + subcount.get(val, 0)\n", " \n", " # Now, count[col_number][val] has the number of nodes it is associated with.\n", " # Use count[col_number, val] instead for convenience\n", " count = {(i, val): n for i in count for val, n in count[i].iteritems()}\n", " \n", " # If a subset() function is provided, restrict the number of nodes to that many.\n", " # Otherwise, restrict to top 500 nodes\n", " if callable(subset):\n", " count = subset(count)\n", " else:\n", " subset = subset if isinstance(subset, int) else 500\n", " count = dict(sorted(count.items(), key=lambda v: -v[1])[:subset])\n", " \n", " # index[col_number, node] = index of the corresponding node\n", " nodes = count.keys()\n", " index = {key: i for i, key in enumerate(count)}\n", "\n", " # Calculate the links array\n", " cdef int nodecount = len(nodes)\n", " cdef np.ndarray links = np.zeros([nodecount, nodecount], dtype=np.int)\n", " cdef int [:, :] link_view = links\n", " cdef int indices[50000] # Limit number of distinct values possible in a row\n", " for row in data.values:\n", " # This block takes 1.3s\n", " n = 0\n", " for i in range(ncols):\n", " for val in row[i]:\n", " key = (i, val)\n", " if key in index:\n", " indices[n] = index[key]\n", " n += 1\n", " # This block takes ~0.9s\n", " for i in range(n):\n", " for j in range(i + 1, n):\n", " link_view[indices[i], indices[j]] += 1\n", " return count, np.argwhere(links > 0)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 loops, best of 1: 235 ms per loop\n" ] } ], "source": [ "%timeit -r1 cython_kpartite_optimised(airline[columns], subset=lambda count: top(count, 500))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "count, links = cython_kpartite_optimised(airline[columns], subset=lambda count: top(count, 500))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 AEROSPATIALE 1955\n", " AIRBUS 22740\n", " ALLIED SIGNAL INC 1096\n", " APPLIED ELASTOMERICS INC 1744\n", " BE AEROSPACE INC 1313\n", "dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(count).head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 35],\n", " [ 0, 55],\n", " [ 0, 62],\n", " ..., \n", " [499, 459],\n", " [499, 462],\n", " [499, 480]], dtype=int64)" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "links" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }