{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preliminaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import copy\n",
    "import itertools\n",
    "from collections import defaultdict\n",
    "from operator import itemgetter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Our dataset format\n",
    "An event is a list of strings.\n",
    "\n",
    "A sequence is a list of events.\n",
    "\n",
    "A dataset is a list of sequences.\n",
    "\n",
    "Thus, a dataset is a list of lists of lists of strings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dataset =  [\n",
    "    [[\"a\"], [\"a\", \"b\", \"c\"], [\"a\", \"c\"], [\"c\"]],\n",
    "    [[\"a\"], [\"c\"], [\"b\", \"c\"]],\n",
    "    [[\"a\", \"b\"], [\"d\"], [\"c\"], [\"b\"], [\"c\"]],\n",
    "    [[\"a\"], [\"c\"], [\"b\"], [\"c\"]]\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Foundations\n",
    "### Subsequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "This is a simple recursive method that checks if subsequence is a subSequence of mainSequence\n",
    "\"\"\"\n",
    "def isSubsequence(mainSequence, subSequence):\n",
    "    subSequenceClone = list(subSequence) # clone the sequence, because we will alter it\n",
    "    return isSubsequenceRecursive(mainSequence, subSequenceClone) #start recursion\n",
    "\n",
    "\"\"\"\n",
    "Function for the recursive call of isSubsequence, not intended for external calls\n",
    "\"\"\"\n",
    "def isSubsequenceRecursive(mainSequence, subSequenceClone, start=0):\n",
    "    # Check if empty: End of recursion, all itemsets have been found\n",
    "    if (not subSequenceClone):\n",
    "        return True\n",
    "    # retrieves element of the subsequence and removes is from subsequence \n",
    "    firstElem = set(subSequenceClone.pop(0))\n",
    "    # Search for the first itemset...\n",
    "    for i in range(start, len(mainSequence)):\n",
    "        if (set(mainSequence[i]).issuperset(firstElem)):\n",
    "            # and recurse\n",
    "            return isSubsequenceRecursive(mainSequence, subSequenceClone, i + 1)\n",
    "    return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "aSequence = [[\"a\"], [\"b\", \"c\"], [\"d\"], [\"a\", \"e\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isSubsequence(aSequence, [[\"a\"], [\"d\"], [\"e\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isSubsequence(aSequence, [[\"a\"], [\"b\", \"c\"], [\"e\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "isSubsequence(aSequence, [[\"a\"], [\"b\", \"d\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Length of an itemset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Computes the length of the sequence (sum of the length of the contained itemsets)\n",
    "\"\"\"\n",
    "def sequenceLength(sequence):\n",
    "    return sum(len(i) for i in sequence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7\n"
     ]
    }
   ],
   "source": [
    "print sequenceLength ([[\"a\"], [\"b\", \"c\"], [\"a\"], [\"b\",\"c\",\"d\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Support of a sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Computes the support of a sequence in a dataset\n",
    "\"\"\"\n",
    "def countSupport (dataset, candidateSequence):\n",
    "    return sum(1 for seq in dataset if isSubsequence(seq, candidateSequence)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],\n",
       " [['a'], ['c'], ['b', 'c']],\n",
       " [['a', 'b'], ['d'], ['c'], ['b'], ['c']],\n",
       " [['a'], ['c'], ['b'], ['c']]]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countSupport(dataset, [[\"b\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "countSupport(dataset, [[\"a\"], [\"b\", \"c\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# AprioriAll\n",
    "### 1 . Candidate Generation\n",
    "#### For a single pair:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Generates one candidate of length k from two candidates of length (k-1) as used in the AprioriAll algorithm\n",
    "\"\"\"\n",
    "def generateCandidatesForPair(cand1, cand2):\n",
    "    cand1Clone = copy.deepcopy(cand1)\n",
    "    cand2Clone = copy.deepcopy(cand2)\n",
    "    # drop the leftmost item from cand1:\n",
    "    if (len (cand1[0]) == 1):\n",
    "        cand1Clone.pop(0)\n",
    "    else:\n",
    "        cand1Clone[0] = cand1Clone[0][1:]\n",
    "    # drop the rightmost item from cand2:\n",
    "    if (len (cand2[-1]) == 1):\n",
    "        cand2Clone.pop(-1)\n",
    "    else:\n",
    "        cand2Clone[-1] = cand2Clone[-1][:-1]\n",
    "    \n",
    "    # if the result is not the same, then we dont need to join\n",
    "    if not cand1Clone == cand2Clone:\n",
    "        return []\n",
    "    else:\n",
    "        newCandidate = copy.deepcopy(cand1)\n",
    "        if (len (cand2[-1]) == 1):\n",
    "            newCandidate.append(cand2[-1])\n",
    "        else:\n",
    "            newCandidate [-1].extend(cand2[-1][-1])\n",
    "        return newCandidate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a'], ['b', 'c'], ['d', 'e']]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "candidateA = [[\"a\"], [\"b\", \"c\"], [\"d\"]]\n",
    "candidateB = [[\"b\", \"c\"], [\"d\", \"e\"]]\n",
    "generateCandidatesForPair(candidateA, candidateB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a'], ['b', 'c'], ['d'], ['e']]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "candidateA = [[\"a\"], [\"b\", \"c\"], [\"d\"]]\n",
    "candidateC = [[\"b\", \"c\"], [\"d\"], [\"e\"]]\n",
    "generateCandidatesForPair(candidateA, candidateC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "candidateA = [[\"a\"], [\"b\", \"c\"], [\"d\"]]\n",
    "candidateD = [[\"a\"], [\"b\", \"c\"], [\"e\"]]\n",
    "generateCandidatesForPair(candidateA, candidateD)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### For a set of candidates (of the last level):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Generates the set of candidates of length k from the set of frequent sequences with length (k-1)\n",
    "\"\"\"\n",
    "def generateCandidates(lastLevelCandidates):\n",
    "    k = sequenceLength(lastLevelCandidates[0]) + 1\n",
    "    if (k == 2):\n",
    "        flatShortCandidates = [item for sublist2 in lastLevelCandidates for sublist1 in sublist2 for item in sublist1]\n",
    "        result = [[[a, b]] for a in flatShortCandidates for b in flatShortCandidates if b > a]\n",
    "        result.extend([[[a], [b]] for a in flatShortCandidates for b in flatShortCandidates])\n",
    "        return result\n",
    "    else:\n",
    "        candidates = []\n",
    "        for i in range(0, len(lastLevelCandidates)):\n",
    "            for j in range(0, len(lastLevelCandidates)):\n",
    "                newCand = generateCandidatesForPair(lastLevelCandidates[i], lastLevelCandidates[j])\n",
    "                if (not newCand == []):\n",
    "                    candidates.append(newCand)\n",
    "        candidates.sort()\n",
    "        return candidates"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An example; Lets assume, we know the frequent sequences of level 2:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lastLevelFrequentPatterns = [\n",
    "    [['a', 'b']], \n",
    "    [['b', 'c']], \n",
    "    [['a'], ['b']], \n",
    "    [['a'], ['c']], \n",
    "    [['b'], ['c']],\n",
    "    [['c'], ['b']], \n",
    "    [['c'], ['c']],  \n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Then we can compute the generate candidates for level 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[['a'], ['b'], ['c']],\n",
       " [['a'], ['b', 'c']],\n",
       " [['a'], ['c'], ['b']],\n",
       " [['a'], ['c'], ['c']],\n",
       " [['a', 'b'], ['c']],\n",
       " [['a', 'b', 'c']],\n",
       " [['b'], ['c'], ['b']],\n",
       " [['b'], ['c'], ['c']],\n",
       " [['b', 'c'], ['b']],\n",
       " [['b', 'c'], ['c']],\n",
       " [['c'], ['b'], ['c']],\n",
       " [['c'], ['b', 'c']],\n",
       " [['c'], ['c'], ['b']],\n",
       " [['c'], ['c'], ['c']]]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newCandidates = generateCandidates(lastLevelFrequentPatterns)\n",
    "newCandidates"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2 . Candidate Checking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Computes all direct subsequence for a given sequence.\n",
    "A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.\n",
    "\"\"\"\n",
    "def generateDirectSubsequences(sequence):\n",
    "    result = []\n",
    "    for i, itemset in enumerate(sequence):\n",
    "        if (len(itemset) == 1):\n",
    "            sequenceClone = copy.deepcopy(sequence)\n",
    "            sequenceClone.pop(i)\n",
    "            result.append(sequenceClone)\n",
    "        else:\n",
    "            for j in range(len(itemset)):\n",
    "                sequenceClone = copy.deepcopy(sequence)\n",
    "                sequenceClone[i].pop(j)\n",
    "                result.append(sequenceClone)\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Prunes the set of candidates generated for length k given all frequent sequence of level (k-1), as done in AprioriAll\n",
    "\"\"\"\n",
    "def pruneCandidates(candidatesLastLevel, candidatesGenerated):\n",
    "    return [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We apply this on example dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[['a'], ['b'], ['c']],\n",
       " [['a'], ['b', 'c']],\n",
       " [['a'], ['c'], ['b']],\n",
       " [['a'], ['c'], ['c']],\n",
       " [['a', 'b'], ['c']],\n",
       " [['b'], ['c'], ['c']],\n",
       " [['b', 'c'], ['c']],\n",
       " [['c'], ['b'], ['c']],\n",
       " [['c'], ['b', 'c']],\n",
       " [['c'], ['c'], ['b']],\n",
       " [['c'], ['c'], ['c']]]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "candidatesPruned = pruneCandidates(lastLevelFrequentPatterns, newCandidates)\n",
    "candidatesPruned"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Count Candidates (and filter not frequent ones):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[([['a'], ['b'], ['c']], 3),\n",
       " ([['a'], ['b', 'c']], 2),\n",
       " ([['a'], ['c'], ['b']], 3),\n",
       " ([['a'], ['c'], ['c']], 4),\n",
       " ([['a', 'b'], ['c']], 2),\n",
       " ([['b'], ['c'], ['c']], 2),\n",
       " ([['c'], ['b'], ['c']], 2)]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "minSupport = 2\n",
    "candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]\n",
    "resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]\n",
    "resultLvl"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Put it all together:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "The AprioriAll algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport\n",
    "\n",
    "Args:\n",
    "    dataset: A list of sequences, for which the frequent (sub-)sequences are computed\n",
    "    minSupport: The minimum support that makes a sequence frequent\n",
    "    verbose: If true, additional information on the mining process is printed (i.e., candidates on each level)\n",
    "Returns:\n",
    "    A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence\n",
    "\"\"\"\n",
    "def apriori(dataset, minSupport, verbose=False):\n",
    "    global numberOfCountingOperations\n",
    "    numberOfCountingOperations = 0\n",
    "    Overall = []\n",
    "    itemsInDataset = sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))\n",
    "    singleItemSequences = [[[item]] for item in itemsInDataset]\n",
    "    singleItemCounts = [(i, countSupport(dataset, i)) for i in singleItemSequences if countSupport(dataset, i) >= minSupport]\n",
    "    Overall.append(singleItemCounts)\n",
    "    print \"Result, lvl 1: \" + str(Overall[0])\n",
    "    k = 1\n",
    "    while (True):\n",
    "        if not Overall [k - 1]:\n",
    "            break\n",
    "        # 1. Candidate generation\n",
    "        candidatesLastLevel = [x[0] for x in Overall[k - 1]]\n",
    "        candidatesGenerated = generateCandidates (candidatesLastLevel)\n",
    "        # 2. Candidate pruning (using a \"containsall\" subsequences)\n",
    "        candidatesPruned = [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]\n",
    "        # 3. Candidate checking\n",
    "        candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]\n",
    "        resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]\n",
    "        if verbose:\n",
    "            print \"Candidates generated, lvl \" + str(k + 1) + \": \" + str(candidatesGenerated)\n",
    "            print \"Candidates pruned, lvl \" + str(k + 1) + \": \" + str(candidatesPruned)\n",
    "            print \"Result, lvl \" + str(k + 1) + \": \" + str(resultLvl)\n",
    "        Overall.append(resultLvl)\n",
    "        k = k + 1\n",
    "    # \"flatten\" Overall\n",
    "    Overall = Overall [:-1]\n",
    "    Overall = [item for sublist in Overall for item in sublist]\n",
    "    return Overall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[([['a']], 4),\n",
       " ([['b']], 4),\n",
       " ([['c']], 4),\n",
       " ([['a', 'b']], 2),\n",
       " ([['b', 'c']], 2),\n",
       " ([['a'], ['b']], 4),\n",
       " ([['a'], ['c']], 4),\n",
       " ([['b'], ['c']], 3),\n",
       " ([['c'], ['b']], 3),\n",
       " ([['c'], ['c']], 4),\n",
       " ([['a'], ['b'], ['c']], 3),\n",
       " ([['a'], ['b', 'c']], 2),\n",
       " ([['a'], ['c'], ['b']], 3),\n",
       " ([['a'], ['c'], ['c']], 4),\n",
       " ([['a', 'b'], ['c']], 2),\n",
       " ([['b'], ['c'], ['c']], 2),\n",
       " ([['c'], ['b'], ['c']], 2),\n",
       " ([['a'], ['c'], ['b'], ['c']], 2),\n",
       " ([['a', 'b'], ['c'], ['c']], 2)]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "apriori(dataset, 2, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# PrefixSpan\n",
    "\n",
    "### Project a sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Projects a sequence according to a given prefix, as done in PrefixSpan\n",
    "\n",
    "Args:\n",
    "    sequence: the sequence the projection is built from\n",
    "    prefix: the prefix that is searched for in the sequence\n",
    "    newEvent: if set to True, the first itemset is ignored\n",
    "Returns:\n",
    "    If the sequence does not contain the prefix, then None.\n",
    "    Otherwise, a new sequence starting from the position of the prefix, including the itemset that includes the prefix\n",
    "\"\"\"\n",
    "def projectSequence(sequence, prefix, newEvent):\n",
    "    result = None\n",
    "    for i, itemset in enumerate(sequence):\n",
    "        if result is None:\n",
    "            if (not newEvent) or i > 0:\n",
    "                if (all(x in itemset for x in prefix)):\n",
    "                    result = [list(itemset)]\n",
    "        else:\n",
    "            result.append(copy.copy(itemset))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['b', 'c'], ['a', 'c'], ['c']]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seq = [[\"a\"], [\"b\", \"c\"], [\"a\", \"c\"], [\"c\"]]\n",
    "projectSequence(seq, [\"b\"], False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a', 'c'], ['c']]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "projectSequence(seq, [\"a\", \"c\"], False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a'], ['b', 'c'], ['a', 'c'], ['c']]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "projectSequence(seq, [\"a\"], False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['a', 'c'], ['c']]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "projectSequence(seq, [\"a\"], True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Project a dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Projects a dataset according to a given prefix, as done in PrefixSpan\n",
    "\n",
    "Args:\n",
    "    dataset: the dataset the projection is built from\n",
    "    prefix: the prefix that is searched for in the sequence\n",
    "    newEvent: if set to True, the first itemset is ignored\n",
    "Returns:\n",
    "    A (potentially empty) list of sequences\n",
    "\"\"\"\n",
    "def projectDatabase(dataset, prefix, newEvent):\n",
    "    projectedDB = []\n",
    "    for sequence in dataset:\n",
    "        seqProjected = projectSequence(sequence, prefix, newEvent)\n",
    "        if not seqProjected is None:\n",
    "            projectedDB.append(seqProjected)\n",
    "    return projectedDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "datasetProject = [\n",
    "            [[\"a\"], [\"a\", \"b\", \"c\"], [\"a\", \"c\"], [\"d\"], [\"c\", \"f\"]],\n",
    "            [[\"a\", \"d\"], [\"c\"], [\"b\", \"c\"], [\"a\", \"e\"]],\n",
    "            [[\"e\", \"f\"], [\"a\", \"b\"], [\"d\", \"f\"], [\"d\"], [\"b\"]],\n",
    "            [[\"e\"], [\"g\"], [\"a\", \"f\"], [\"c\"], [\"b\"], [\"c\"]]\n",
    "        ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[['a', 'b', 'c'], ['a', 'c'], ['d'], ['c', 'f']],\n",
       " [['c'], ['b', 'c'], ['a', 'e']],\n",
       " [['c'], ['b'], ['c']]]"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "projectDatabase(datasetProject, [\"c\"], False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### The main algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Some more utility functions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Generates a list of all items that are contained in a dataset\n",
    "\"\"\"\n",
    "def generateItems(dataset):\n",
    "    return sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))\n",
    "\n",
    "\"\"\"\n",
    "Computes a defaultdict that maps each item in the dataset to its support\n",
    "\"\"\"\n",
    "def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=[]):\n",
    "    result = defaultdict(int)\n",
    "    for sequence in dataset:\n",
    "        if ignoreFirstEvent:\n",
    "            sequence = sequence[1:]\n",
    "        cooccurringItems = set()\n",
    "        for itemset in sequence:\n",
    "            if all(x in itemset for x in prefix):\n",
    "                for item in itemset:\n",
    "                    if not item in prefix:\n",
    "                        cooccurringItems.add(item)\n",
    "        for item in cooccurringItems:\n",
    "            result [item] += 1\n",
    "    return sorted(result.items())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Finally, the algorithm:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "The PrefixSpan algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport\n",
    "\n",
    "Args:\n",
    "    dataset: A list of sequences, for which the frequent (sub-)sequences are computed\n",
    "    minSupport: The minimum support that makes a sequence frequent\n",
    "Returns:\n",
    "    A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence\n",
    "\"\"\"\n",
    "def prefixSpan(dataset, minSupport):\n",
    "    result = []\n",
    "    itemCounts = generateItemSupports(dataset)\n",
    "    for item, count in itemCounts:\n",
    "        if count >= minSupport:\n",
    "            newPrefix = [[item]]\n",
    "            result.append((newPrefix, count))\n",
    "            result.extend(prefixSpanInternal(projectDatabase(dataset, [item], False), minSupport, newPrefix))\n",
    "    return result\n",
    "\n",
    "def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):\n",
    "    result = []\n",
    "    \n",
    "    # Add a new item to the last element (==same time)\n",
    "    itemCountSameEvent = generateItemSupports(dataset, False, prefix=prevPrefixes[-1])\n",
    "    for item, count in itemCountSameEvent:\n",
    "        if (count >= minSupport) and item > prevPrefixes[-1][-1]:\n",
    "            newPrefix = copy.deepcopy(prevPrefixes)\n",
    "            newPrefix[-1].append(item)\n",
    "            result.append((newPrefix, count))\n",
    "            result.extend(prefixSpanInternal(projectDatabase(dataset, newPrefix[-1], False), minSupport, newPrefix))\n",
    "        \n",
    "    # Add a new event to the prefix\n",
    "    itemCountSubsequentEvents = generateItemSupports(dataset, True)\n",
    "    for item, count in itemCountSubsequentEvents:\n",
    "        if count >= minSupport:\n",
    "            newPrefix = copy.deepcopy(prevPrefixes)\n",
    "            newPrefix.append([item])\n",
    "            result.append((newPrefix, count))\n",
    "            result.extend(prefixSpanInternal(projectDatabase(dataset, [item], True), minSupport, newPrefix))\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[([['a']], 4),\n",
       " ([['a', 'b']], 2),\n",
       " ([['a', 'b'], ['c']], 2),\n",
       " ([['a', 'b'], ['c'], ['c']], 2),\n",
       " ([['a'], ['b']], 4),\n",
       " ([['a'], ['b', 'c']], 2),\n",
       " ([['a'], ['b'], ['c']], 3),\n",
       " ([['a'], ['c']], 4),\n",
       " ([['a'], ['c'], ['b']], 3),\n",
       " ([['a'], ['c'], ['b'], ['c']], 2),\n",
       " ([['a'], ['c'], ['c']], 4),\n",
       " ([['b']], 4),\n",
       " ([['b', 'c']], 2),\n",
       " ([['b'], ['c']], 3),\n",
       " ([['b'], ['c'], ['c']], 2),\n",
       " ([['c']], 4),\n",
       " ([['c'], ['b']], 3),\n",
       " ([['c'], ['b'], ['c']], 2),\n",
       " ([['c'], ['c']], 4)]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prefixSpan(dataset, 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Filter for closed and maximal patterns\n",
    "### Closed patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence (as a list)\n",
    "This is only a very simplistic (naive) implementation for demonstration purposes!\n",
    "\"\"\"\n",
    "def filterClosed(result):\n",
    "    for supersequence, countSeq in copy.deepcopy(result):\n",
    "        for subsequence, countSubSeq in copy.deepcopy(result):\n",
    "            if isSubsequence(supersequence, subsequence) and (countSeq == countSubSeq) and subsequence != supersequence:\n",
    "                result.remove((subsequence, countSubSeq))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[([['a', 'b'], ['c'], ['c']], 2),\n",
       " ([['a'], ['b']], 4),\n",
       " ([['a'], ['b', 'c']], 2),\n",
       " ([['a'], ['b'], ['c']], 3),\n",
       " ([['a'], ['c'], ['b']], 3),\n",
       " ([['a'], ['c'], ['b'], ['c']], 2),\n",
       " ([['a'], ['c'], ['c']], 4)]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = prefixSpan(dataset, 2)\n",
    "filterClosed(result)\n",
    "result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Maximal sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence (as a list)\n",
    "This is only a very naive implementation for demonstration purposes!\n",
    "\"\"\"\n",
    "def filterMaximal(result):\n",
    "    for supersequence, countSeq in copy.deepcopy(result):\n",
    "        for subsequence, countSubSeq in copy.deepcopy(result):\n",
    "            if isSubsequence (supersequence, subsequence) and subsequence != supersequence:\n",
    "                result.remove((subsequence, countSubSeq)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[([['a', 'b'], ['c'], ['c']], 2),\n",
       " ([['a'], ['b', 'c']], 2),\n",
       " ([['a'], ['c'], ['b'], ['c']], 2)]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = prefixSpan (dataset, 2)\n",
    "filterMaximal(result)\n",
    "result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Application example: Wikispeedia\n",
    "Now we try to find some sequential patterns in a real world dataset: Wikispeedia\n",
    "\n",
    "First load the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "sequences = []\n",
    "\n",
    "# from https://snap.stanford.edu/data/wikispeedia.html\n",
    "for line in csv.reader((row for row in open(\"data/paths_finished.tsv\") if not row.startswith('#')), delimiter='\\t'):\n",
    "    if len(line) == 0:\n",
    "        continue\n",
    "    # for simplicity, let us remove back clicks\n",
    "    seq = line[3].split(\";\")\n",
    "    # for simplicity, let us remove back clicks\n",
    "    seq = [x for x in seq if x != \"<\"]\n",
    "    sequences.append(seq)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert this to the list of list of lists that we use as a dataformat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "wikispeediaData=[]\n",
    "for seq in sequences:\n",
    "    newSeq = []\n",
    "    for item in seq:\n",
    "        newSeq.append([item])\n",
    "    wikispeediaData.append(newSeq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 46.3 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[([['Africa']], 2738),\n",
       " ([['Agriculture']], 1147),\n",
       " ([['Animal']], 1666),\n",
       " ([['Asia']], 1167),\n",
       " ([['Asteroid']], 1171),\n",
       " ([['Asteroid'], ['Viking']], 1043),\n",
       " ([['Atlantic_Ocean']], 1297),\n",
       " ([['Bird']], 1182),\n",
       " ([['Brain']], 1316),\n",
       " ([['Brain'], ['Telephone']], 1043),\n",
       " ([['China']], 1110),\n",
       " ([['Christianity']], 1074),\n",
       " ([['Computer']], 1528),\n",
       " ([['Earth']], 3176),\n",
       " ([['England']], 3261),\n",
       " ([['English_language']], 1414),\n",
       " ([['Europe']], 4303),\n",
       " ([['France']], 1588),\n",
       " ([['Germany']], 1738),\n",
       " ([['Human']], 1604),\n",
       " ([['India']], 1216),\n",
       " ([['Internet']], 1023),\n",
       " ([['Japan']], 1070),\n",
       " ([['Mammal']], 1568),\n",
       " ([['North_America']], 1861),\n",
       " ([['Periodic_table']], 1394),\n",
       " ([['Plant']], 1127),\n",
       " ([['Russia']], 1007),\n",
       " ([['Science']], 1479),\n",
       " ([['Telephone']], 1251),\n",
       " ([['Theatre']], 1034),\n",
       " ([['United_Kingdom']], 3807),\n",
       " ([['United_Nations']], 1050),\n",
       " ([['United_States']], 8675),\n",
       " ([['Viking']], 1192),\n",
       " ([['World_War_II']], 2267),\n",
       " ([['Zebra']], 1041)]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time prefixSpan (wikispeediaData, 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This will take a long time:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# %time apriori(wikispeediaData, 1000)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}