{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Preliminaries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import copy\n", "import itertools\n", "from collections import defaultdict\n", "from operator import itemgetter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Our dataset format\n", "An event is a list of strings.\n", "\n", "A sequence is a list of events.\n", "\n", "A dataset is a list of sequences.\n", "\n", "Thus, a dataset is a list of lists of lists of strings." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dataset = [\n", " [[\"a\"], [\"a\", \"b\", \"c\"], [\"a\", \"c\"], [\"c\"]],\n", " [[\"a\"], [\"c\"], [\"b\", \"c\"]],\n", " [[\"a\", \"b\"], [\"d\"], [\"c\"], [\"b\"], [\"c\"]],\n", " [[\"a\"], [\"c\"], [\"b\"], [\"c\"]]\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Foundations\n", "### Subsequences" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "This is a simple recursive method that checks if subsequence is a subSequence of mainSequence\n", "\"\"\"\n", "def isSubsequence(mainSequence, subSequence):\n", " subSequenceClone = list(subSequence) # clone the sequence, because we will alter it\n", " return isSubsequenceRecursive(mainSequence, subSequenceClone) #start recursion\n", "\n", "\"\"\"\n", "Function for the recursive call of isSubsequence, not intended for external calls\n", "\"\"\"\n", "def isSubsequenceRecursive(mainSequence, subSequenceClone, start=0):\n", " # Check if empty: End of recursion, all itemsets have been found\n", " if (not subSequenceClone):\n", " return True\n", " # retrieves element of the subsequence and removes is from subsequence \n", " firstElem = set(subSequenceClone.pop(0))\n", " # Search for the first itemset...\n", " for i in range(start, len(mainSequence)):\n", " if (set(mainSequence[i]).issuperset(firstElem)):\n", " # and recurse\n", " return isSubsequenceRecursive(mainSequence, subSequenceClone, i + 1)\n", " return False" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "aSequence = [[\"a\"], [\"b\", \"c\"], [\"d\"], [\"a\", \"e\"]]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "isSubsequence(aSequence, [[\"a\"], [\"d\"], [\"e\"]])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "isSubsequence(aSequence, [[\"a\"], [\"b\", \"c\"], [\"e\"]])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "isSubsequence(aSequence, [[\"a\"], [\"b\", \"d\"]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Length of an itemset" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Computes the length of the sequence (sum of the length of the contained itemsets)\n", "\"\"\"\n", "def sequenceLength(sequence):\n", " return sum(len(i) for i in sequence)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7\n" ] } ], "source": [ "print sequenceLength ([[\"a\"], [\"b\", \"c\"], [\"a\"], [\"b\",\"c\",\"d\"]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Support of a sequence" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Computes the support of a sequence in a dataset\n", "\"\"\"\n", "def countSupport (dataset, candidateSequence):\n", " return sum(1 for seq in dataset if isSubsequence(seq, candidateSequence)) " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[['a'], ['a', 'b', 'c'], ['a', 'c'], ['c']],\n", " [['a'], ['c'], ['b', 'c']],\n", " [['a', 'b'], ['d'], ['c'], ['b'], ['c']],\n", " [['a'], ['c'], ['b'], ['c']]]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countSupport(dataset, [[\"b\"]])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countSupport(dataset, [[\"a\"], [\"b\", \"c\"]])" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# AprioriAll\n", "### 1 . Candidate Generation\n", "#### For a single pair:" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Generates one candidate of length k from two candidates of length (k-1) as used in the AprioriAll algorithm\n", "\"\"\"\n", "def generateCandidatesForPair(cand1, cand2):\n", " cand1Clone = copy.deepcopy(cand1)\n", " cand2Clone = copy.deepcopy(cand2)\n", " # drop the leftmost item from cand1:\n", " if (len (cand1[0]) == 1):\n", " cand1Clone.pop(0)\n", " else:\n", " cand1Clone[0] = cand1Clone[0][1:]\n", " # drop the rightmost item from cand2:\n", " if (len (cand2[-1]) == 1):\n", " cand2Clone.pop(-1)\n", " else:\n", " cand2Clone[-1] = cand2Clone[-1][:-1]\n", " \n", " # if the result is not the same, then we dont need to join\n", " if not cand1Clone == cand2Clone:\n", " return []\n", " else:\n", " newCandidate = copy.deepcopy(cand1)\n", " if (len (cand2[-1]) == 1):\n", " newCandidate.append(cand2[-1])\n", " else:\n", " newCandidate [-1].extend(cand2[-1][-1])\n", " return newCandidate" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['a'], ['b', 'c'], ['d', 'e']]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "candidateA = [[\"a\"], [\"b\", \"c\"], [\"d\"]]\n", "candidateB = [[\"b\", \"c\"], [\"d\", \"e\"]]\n", "generateCandidatesForPair(candidateA, candidateB)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['a'], ['b', 'c'], ['d'], ['e']]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "candidateA = [[\"a\"], [\"b\", \"c\"], [\"d\"]]\n", "candidateC = [[\"b\", \"c\"], [\"d\"], [\"e\"]]\n", "generateCandidatesForPair(candidateA, candidateC)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "candidateA = [[\"a\"], [\"b\", \"c\"], [\"d\"]]\n", "candidateD = [[\"a\"], [\"b\", \"c\"], [\"e\"]]\n", "generateCandidatesForPair(candidateA, candidateD)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### For a set of candidates (of the last level):" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Generates the set of candidates of length k from the set of frequent sequences with length (k-1)\n", "\"\"\"\n", "def generateCandidates(lastLevelCandidates):\n", " k = sequenceLength(lastLevelCandidates[0]) + 1\n", " if (k == 2):\n", " flatShortCandidates = [item for sublist2 in lastLevelCandidates for sublist1 in sublist2 for item in sublist1]\n", " result = [[[a, b]] for a in flatShortCandidates for b in flatShortCandidates if b > a]\n", " result.extend([[[a], [b]] for a in flatShortCandidates for b in flatShortCandidates])\n", " return result\n", " else:\n", " candidates = []\n", " for i in range(0, len(lastLevelCandidates)):\n", " for j in range(0, len(lastLevelCandidates)):\n", " newCand = generateCandidatesForPair(lastLevelCandidates[i], lastLevelCandidates[j])\n", " if (not newCand == []):\n", " candidates.append(newCand)\n", " candidates.sort()\n", " return candidates" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An example; Lets assume, we know the frequent sequences of level 2:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "lastLevelFrequentPatterns = [\n", " [['a', 'b']], \n", " [['b', 'c']], \n", " [['a'], ['b']], \n", " [['a'], ['c']], \n", " [['b'], ['c']],\n", " [['c'], ['b']], \n", " [['c'], ['c']], \n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then we can compute the generate candidates for level 3" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[['a'], ['b'], ['c']],\n", " [['a'], ['b', 'c']],\n", " [['a'], ['c'], ['b']],\n", " [['a'], ['c'], ['c']],\n", " [['a', 'b'], ['c']],\n", " [['a', 'b', 'c']],\n", " [['b'], ['c'], ['b']],\n", " [['b'], ['c'], ['c']],\n", " [['b', 'c'], ['b']],\n", " [['b', 'c'], ['c']],\n", " [['c'], ['b'], ['c']],\n", " [['c'], ['b', 'c']],\n", " [['c'], ['c'], ['b']],\n", " [['c'], ['c'], ['c']]]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "newCandidates = generateCandidates(lastLevelFrequentPatterns)\n", "newCandidates" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2 . Candidate Checking" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Computes all direct subsequence for a given sequence.\n", "A direct subsequence is any sequence that originates from deleting exactly one item from any event in the original sequence.\n", "\"\"\"\n", "def generateDirectSubsequences(sequence):\n", " result = []\n", " for i, itemset in enumerate(sequence):\n", " if (len(itemset) == 1):\n", " sequenceClone = copy.deepcopy(sequence)\n", " sequenceClone.pop(i)\n", " result.append(sequenceClone)\n", " else:\n", " for j in range(len(itemset)):\n", " sequenceClone = copy.deepcopy(sequence)\n", " sequenceClone[i].pop(j)\n", " result.append(sequenceClone)\n", " return result" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "\"\"\"\n", "Prunes the set of candidates generated for length k given all frequent sequence of level (k-1), as done in AprioriAll\n", "\"\"\"\n", "def pruneCandidates(candidatesLastLevel, candidatesGenerated):\n", " return [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We apply this on example dataset" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[['a'], ['b'], ['c']],\n", " [['a'], ['b', 'c']],\n", " [['a'], ['c'], ['b']],\n", " [['a'], ['c'], ['c']],\n", " [['a', 'b'], ['c']],\n", " [['b'], ['c'], ['c']],\n", " [['b', 'c'], ['c']],\n", " [['c'], ['b'], ['c']],\n", " [['c'], ['b', 'c']],\n", " [['c'], ['c'], ['b']],\n", " [['c'], ['c'], ['c']]]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "candidatesPruned = pruneCandidates(lastLevelFrequentPatterns, newCandidates)\n", "candidatesPruned" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. Count Candidates (and filter not frequent ones):" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[([['a'], ['b'], ['c']], 3),\n", " ([['a'], ['b', 'c']], 2),\n", " ([['a'], ['c'], ['b']], 3),\n", " ([['a'], ['c'], ['c']], 4),\n", " ([['a', 'b'], ['c']], 2),\n", " ([['b'], ['c'], ['c']], 2),\n", " ([['c'], ['b'], ['c']], 2)]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "minSupport = 2\n", "candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]\n", "resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]\n", "resultLvl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Put it all together:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [], "source": [ "\"\"\"\n", "The AprioriAll algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport\n", "\n", "Args:\n", " dataset: A list of sequences, for which the frequent (sub-)sequences are computed\n", " minSupport: The minimum support that makes a sequence frequent\n", " verbose: If true, additional information on the mining process is printed (i.e., candidates on each level)\n", "Returns:\n", " A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence\n", "\"\"\"\n", "def apriori(dataset, minSupport, verbose=False):\n", " global numberOfCountingOperations\n", " numberOfCountingOperations = 0\n", " Overall = []\n", " itemsInDataset = sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))\n", " singleItemSequences = [[[item]] for item in itemsInDataset]\n", " singleItemCounts = [(i, countSupport(dataset, i)) for i in singleItemSequences if countSupport(dataset, i) >= minSupport]\n", " Overall.append(singleItemCounts)\n", " print \"Result, lvl 1: \" + str(Overall[0])\n", " k = 1\n", " while (True):\n", " if not Overall [k - 1]:\n", " break\n", " # 1. Candidate generation\n", " candidatesLastLevel = [x[0] for x in Overall[k - 1]]\n", " candidatesGenerated = generateCandidates (candidatesLastLevel)\n", " # 2. Candidate pruning (using a \"containsall\" subsequences)\n", " candidatesPruned = [cand for cand in candidatesGenerated if all(x in candidatesLastLevel for x in generateDirectSubsequences(cand))]\n", " # 3. Candidate checking\n", " candidatesCounts = [(i, countSupport(dataset, i)) for i in candidatesPruned]\n", " resultLvl = [(i, count) for (i, count) in candidatesCounts if (count >= minSupport)]\n", " if verbose:\n", " print \"Candidates generated, lvl \" + str(k + 1) + \": \" + str(candidatesGenerated)\n", " print \"Candidates pruned, lvl \" + str(k + 1) + \": \" + str(candidatesPruned)\n", " print \"Result, lvl \" + str(k + 1) + \": \" + str(resultLvl)\n", " Overall.append(resultLvl)\n", " k = k + 1\n", " # \"flatten\" Overall\n", " Overall = Overall [:-1]\n", " Overall = [item for sublist in Overall for item in sublist]\n", " return Overall" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Result, lvl 1: [([['a']], 4), ([['b']], 4), ([['c']], 4)]\n" ] }, { "data": { "text/plain": [ "[([['a']], 4),\n", " ([['b']], 4),\n", " ([['c']], 4),\n", " ([['a', 'b']], 2),\n", " ([['b', 'c']], 2),\n", " ([['a'], ['b']], 4),\n", " ([['a'], ['c']], 4),\n", " ([['b'], ['c']], 3),\n", " ([['c'], ['b']], 3),\n", " ([['c'], ['c']], 4),\n", " ([['a'], ['b'], ['c']], 3),\n", " ([['a'], ['b', 'c']], 2),\n", " ([['a'], ['c'], ['b']], 3),\n", " ([['a'], ['c'], ['c']], 4),\n", " ([['a', 'b'], ['c']], 2),\n", " ([['b'], ['c'], ['c']], 2),\n", " ([['c'], ['b'], ['c']], 2),\n", " ([['a'], ['c'], ['b'], ['c']], 2),\n", " ([['a', 'b'], ['c'], ['c']], 2)]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "apriori(dataset, 2, verbose=False)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# PrefixSpan\n", "\n", "### Project a sequence" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Projects a sequence according to a given prefix, as done in PrefixSpan\n", "\n", "Args:\n", " sequence: the sequence the projection is built from\n", " prefix: the prefix that is searched for in the sequence\n", " newEvent: if set to True, the first itemset is ignored\n", "Returns:\n", " If the sequence does not contain the prefix, then None.\n", " Otherwise, a new sequence starting from the position of the prefix, including the itemset that includes the prefix\n", "\"\"\"\n", "def projectSequence(sequence, prefix, newEvent):\n", " result = None\n", " for i, itemset in enumerate(sequence):\n", " if result is None:\n", " if (not newEvent) or i > 0:\n", " if (all(x in itemset for x in prefix)):\n", " result = [list(itemset)]\n", " else:\n", " result.append(copy.copy(itemset))\n", " return result" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['b', 'c'], ['a', 'c'], ['c']]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seq = [[\"a\"], [\"b\", \"c\"], [\"a\", \"c\"], [\"c\"]]\n", "projectSequence(seq, [\"b\"], False)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['a', 'c'], ['c']]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "projectSequence(seq, [\"a\", \"c\"], False)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['a'], ['b', 'c'], ['a', 'c'], ['c']]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "projectSequence(seq, [\"a\"], False)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[['a', 'c'], ['c']]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "projectSequence(seq, [\"a\"], True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Project a dataset" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Projects a dataset according to a given prefix, as done in PrefixSpan\n", "\n", "Args:\n", " dataset: the dataset the projection is built from\n", " prefix: the prefix that is searched for in the sequence\n", " newEvent: if set to True, the first itemset is ignored\n", "Returns:\n", " A (potentially empty) list of sequences\n", "\"\"\"\n", "def projectDatabase(dataset, prefix, newEvent):\n", " projectedDB = []\n", " for sequence in dataset:\n", " seqProjected = projectSequence(sequence, prefix, newEvent)\n", " if not seqProjected is None:\n", " projectedDB.append(seqProjected)\n", " return projectedDB" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "datasetProject = [\n", " [[\"a\"], [\"a\", \"b\", \"c\"], [\"a\", \"c\"], [\"d\"], [\"c\", \"f\"]],\n", " [[\"a\", \"d\"], [\"c\"], [\"b\", \"c\"], [\"a\", \"e\"]],\n", " [[\"e\", \"f\"], [\"a\", \"b\"], [\"d\", \"f\"], [\"d\"], [\"b\"]],\n", " [[\"e\"], [\"g\"], [\"a\", \"f\"], [\"c\"], [\"b\"], [\"c\"]]\n", " ]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[['a', 'b', 'c'], ['a', 'c'], ['d'], ['c', 'f']],\n", " [['c'], ['b', 'c'], ['a', 'e']],\n", " [['c'], ['b'], ['c']]]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "projectDatabase(datasetProject, [\"c\"], False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The main algorithm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Some more utility functions:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Generates a list of all items that are contained in a dataset\n", "\"\"\"\n", "def generateItems(dataset):\n", " return sorted(set ([item for sublist1 in dataset for sublist2 in sublist1 for item in sublist2]))\n", "\n", "\"\"\"\n", "Computes a defaultdict that maps each item in the dataset to its support\n", "\"\"\"\n", "def generateItemSupports(dataset, ignoreFirstEvent=False, prefix=[]):\n", " result = defaultdict(int)\n", " for sequence in dataset:\n", " if ignoreFirstEvent:\n", " sequence = sequence[1:]\n", " cooccurringItems = set()\n", " for itemset in sequence:\n", " if all(x in itemset for x in prefix):\n", " for item in itemset:\n", " if not item in prefix:\n", " cooccurringItems.add(item)\n", " for item in cooccurringItems:\n", " result [item] += 1\n", " return sorted(result.items())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Finally, the algorithm:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "The PrefixSpan algorithm. Computes the frequent sequences in a seqeunce dataset for a given minSupport\n", "\n", "Args:\n", " dataset: A list of sequences, for which the frequent (sub-)sequences are computed\n", " minSupport: The minimum support that makes a sequence frequent\n", "Returns:\n", " A list of tuples (s, c), where s is a frequent sequence, and c is the count for that sequence\n", "\"\"\"\n", "def prefixSpan(dataset, minSupport):\n", " result = []\n", " itemCounts = generateItemSupports(dataset)\n", " for item, count in itemCounts:\n", " if count >= minSupport:\n", " newPrefix = [[item]]\n", " result.append((newPrefix, count))\n", " result.extend(prefixSpanInternal(projectDatabase(dataset, [item], False), minSupport, newPrefix))\n", " return result\n", "\n", "def prefixSpanInternal(dataset, minSupport, prevPrefixes=[]):\n", " result = []\n", " \n", " # Add a new item to the last element (==same time)\n", " itemCountSameEvent = generateItemSupports(dataset, False, prefix=prevPrefixes[-1])\n", " for item, count in itemCountSameEvent:\n", " if (count >= minSupport) and item > prevPrefixes[-1][-1]:\n", " newPrefix = copy.deepcopy(prevPrefixes)\n", " newPrefix[-1].append(item)\n", " result.append((newPrefix, count))\n", " result.extend(prefixSpanInternal(projectDatabase(dataset, newPrefix[-1], False), minSupport, newPrefix))\n", " \n", " # Add a new event to the prefix\n", " itemCountSubsequentEvents = generateItemSupports(dataset, True)\n", " for item, count in itemCountSubsequentEvents:\n", " if count >= minSupport:\n", " newPrefix = copy.deepcopy(prevPrefixes)\n", " newPrefix.append([item])\n", " result.append((newPrefix, count))\n", " result.extend(prefixSpanInternal(projectDatabase(dataset, [item], True), minSupport, newPrefix))\n", " return result" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[([['a']], 4),\n", " ([['a', 'b']], 2),\n", " ([['a', 'b'], ['c']], 2),\n", " ([['a', 'b'], ['c'], ['c']], 2),\n", " ([['a'], ['b']], 4),\n", " ([['a'], ['b', 'c']], 2),\n", " ([['a'], ['b'], ['c']], 3),\n", " ([['a'], ['c']], 4),\n", " ([['a'], ['c'], ['b']], 3),\n", " ([['a'], ['c'], ['b'], ['c']], 2),\n", " ([['a'], ['c'], ['c']], 4),\n", " ([['b']], 4),\n", " ([['b', 'c']], 2),\n", " ([['b'], ['c']], 3),\n", " ([['b'], ['c'], ['c']], 2),\n", " ([['c']], 4),\n", " ([['c'], ['b']], 3),\n", " ([['c'], ['b'], ['c']], 2),\n", " ([['c'], ['c']], 4)]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "prefixSpan(dataset, 2)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Filter for closed and maximal patterns\n", "### Closed patterns" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Given a list of all frequent sequences and their counts, compute the set of closed frequent sequence (as a list)\n", "This is only a very simplistic (naive) implementation for demonstration purposes!\n", "\"\"\"\n", "def filterClosed(result):\n", " for supersequence, countSeq in copy.deepcopy(result):\n", " for subsequence, countSubSeq in copy.deepcopy(result):\n", " if isSubsequence(supersequence, subsequence) and (countSeq == countSubSeq) and subsequence != supersequence:\n", " result.remove((subsequence, countSubSeq))" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "[([['a', 'b'], ['c'], ['c']], 2),\n", " ([['a'], ['b']], 4),\n", " ([['a'], ['b', 'c']], 2),\n", " ([['a'], ['b'], ['c']], 3),\n", " ([['a'], ['c'], ['b']], 3),\n", " ([['a'], ['c'], ['b'], ['c']], 2),\n", " ([['a'], ['c'], ['c']], 4)]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = prefixSpan(dataset, 2)\n", "filterClosed(result)\n", "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Maximal sequences" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": true }, "outputs": [], "source": [ "\"\"\"\n", "Given a list of all frequent sequences and their counts, compute the set of maximal frequent sequence (as a list)\n", "This is only a very naive implementation for demonstration purposes!\n", "\"\"\"\n", "def filterMaximal(result):\n", " for supersequence, countSeq in copy.deepcopy(result):\n", " for subsequence, countSubSeq in copy.deepcopy(result):\n", " if isSubsequence (supersequence, subsequence) and subsequence != supersequence:\n", " result.remove((subsequence, countSubSeq)) " ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[([['a', 'b'], ['c'], ['c']], 2),\n", " ([['a'], ['b', 'c']], 2),\n", " ([['a'], ['c'], ['b'], ['c']], 2)]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = prefixSpan (dataset, 2)\n", "filterMaximal(result)\n", "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Application example: Wikispeedia\n", "Now we try to find some sequential patterns in a real world dataset: Wikispeedia\n", "\n", "First load the dataset." ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import csv\n", "sequences = []\n", "\n", "# from https://snap.stanford.edu/data/wikispeedia.html\n", "for line in csv.reader((row for row in open(\"data/paths_finished.tsv\") if not row.startswith('#')), delimiter='\\t'):\n", " if len(line) == 0:\n", " continue\n", " # for simplicity, let us remove back clicks\n", " seq = line[3].split(\";\")\n", " # for simplicity, let us remove back clicks\n", " seq = [x for x in seq if x != \"<\"]\n", " sequences.append(seq)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert this to the list of list of lists that we use as a dataformat" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [], "source": [ "wikispeediaData=[]\n", "for seq in sequences:\n", " newSeq = []\n", " for item in seq:\n", " newSeq.append([item])\n", " wikispeediaData.append(newSeq)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 46.3 s\n" ] }, { "data": { "text/plain": [ "[([['Africa']], 2738),\n", " ([['Agriculture']], 1147),\n", " ([['Animal']], 1666),\n", " ([['Asia']], 1167),\n", " ([['Asteroid']], 1171),\n", " ([['Asteroid'], ['Viking']], 1043),\n", " ([['Atlantic_Ocean']], 1297),\n", " ([['Bird']], 1182),\n", " ([['Brain']], 1316),\n", " ([['Brain'], ['Telephone']], 1043),\n", " ([['China']], 1110),\n", " ([['Christianity']], 1074),\n", " ([['Computer']], 1528),\n", " ([['Earth']], 3176),\n", " ([['England']], 3261),\n", " ([['English_language']], 1414),\n", " ([['Europe']], 4303),\n", " ([['France']], 1588),\n", " ([['Germany']], 1738),\n", " ([['Human']], 1604),\n", " ([['India']], 1216),\n", " ([['Internet']], 1023),\n", " ([['Japan']], 1070),\n", " ([['Mammal']], 1568),\n", " ([['North_America']], 1861),\n", " ([['Periodic_table']], 1394),\n", " ([['Plant']], 1127),\n", " ([['Russia']], 1007),\n", " ([['Science']], 1479),\n", " ([['Telephone']], 1251),\n", " ([['Theatre']], 1034),\n", " ([['United_Kingdom']], 3807),\n", " ([['United_Nations']], 1050),\n", " ([['United_States']], 8675),\n", " ([['Viking']], 1192),\n", " ([['World_War_II']], 2267),\n", " ([['Zebra']], 1041)]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time prefixSpan (wikispeediaData, 1000)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This will take a long time:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# %time apriori(wikispeediaData, 1000)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }