{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Panama Palindrome\n", "\n", "## Utilities" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import random, re, bisect, time\n", "\n", "def is_palindrome(s):\n", " \"Test if a string is a palindrome (only considering letters a-z).\"\n", " s1 = canonical(s)\n", " return s1 == reversestr(s1)\n", "\n", "def is_unique_palindrome(s):\n", " \"Test if string s is a palindrome where each comma-separated phrase is unique.\"\n", " return is_palindrome(s) and is_unique(phrases(s))\n", "\n", "def canonical(word, sub=re.compile('[^a-z]').sub):\n", " \"The canonical form for comparing: only lowercase a-z.\"\n", " return sub('', word.lower())\n", "\n", "def phrases(s):\n", " \"Break a string s into comma-separated phrases.\"\n", " return [phrase.strip() for phrase in s.split(',')]\n", "\n", "def reversestr(s):\n", " \"Reverse a string.\"\n", " return s[::-1]\n", "\n", "def is_unique(collection):\n", " \"Return true if collection has no duplicate elements.\"\n", " return len(collection) == len(set(collection))" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'test_utils passes'" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def test_utils():\n", " assert is_unique_palindrome('A man, a plan, a canal, Panama!')\n", " assert is_unique_palindrome('''A (man), a PLAN... a ``canal?'' -- Panama!''')\n", " assert not is_unique_palindrome('A man, a plan, a radar, a canal, Panama.')\n", " \n", " assert is_palindrome('A man, a plan, a canal, Panama.')\n", " assert is_palindrome('Radar. Radar? Radar!')\n", " assert not is_palindrome('radars')\n", "\n", " assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']\n", " assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'\n", " assert reversestr('foo') == 'oof'\n", " assert is_unique([1, 2, 3])\n", " assert not is_unique([1, 2, 2])\n", " return 'test_utils passes'\n", "\n", "test_utils()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The Dictionary" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "! [ -e npdict.txt ] || curl -O http://norvig.com/npdict.txt" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 126144 204928 1383045 npdict.txt\r\n" ] } ], "source": [ "! wc npdict.txt" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "################ Reading in a dictionary\n", "\n", "class PalDict:\n", " \"\"\"A dictionary with the following fields:\n", " words: a sorted list of words: ['ant', 'bee', 'sea']\n", " rwords: a sorted list of reversed words: ['aes', 'eeb', 'tna']\n", " truename: a dict of {canonical:true} pairs, e.g. {'anelk': 'an elk', 'anneelk': 'Anne Elk'}\n", " k:\n", " and the followng methods:\n", " \"\"\"\n", " \n", " def __init__(self, k=100, filename='npdict.txt'):\n", " words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}\n", " for tword in open(filename, 'r', encoding='ascii', errors='ignore').read().splitlines():\n", " word = canonical(tword)\n", " words.append(word)\n", " rwords.append(reversestr(word))\n", " truename[word] = tword\n", " words.sort()\n", " rwords.sort()\n", " self.k = k\n", " self.words = words\n", " self.rwords = rwords\n", " self.truename = truename\n", " self.rangek = range(k)\n", " self.tryharder = False\n", "\n", " def startswith(self, prefix):\n", " \"\"\"Return up to k canonical words that start with prefix.\n", " If there are more than k, choose from them at random.\"\"\"\n", " return self._k_startingwith(self.words, prefix)\n", "\n", " def endswith(self, rsuffix):\n", " \"\"\"Return up to k canonical words that end with the reversed suffix.\n", " If you want words ending in 'ing', ask for d.endswith('gni').\n", " If there are more than k, choose from them at random.\"\"\"\n", " return [reversestr(s) for s in self._k_startingwith(self.rwords, rsuffix)]\n", "\n", " def __contains__(self, word):\n", " return word in self.truename\n", "\n", " def _k_startingwith(self, words, prefix):\n", " start = bisect.bisect_left(words, prefix)\n", " end = bisect.bisect(words, prefix + 'zzzz')\n", " n = end - start\n", " if self.k >= n: # get all the words that start with prefix\n", " results = words[start:end]\n", " else: # sample from words starting with prefix \n", " indexes = random.sample(range(start, end), self.k)\n", " results = [words[i] for i in indexes]\n", " random.shuffle(results)\n", " ## Consider words that are prefixes of the prefix.\n", " ## This is very slow, so don't use it until late in the game.\n", " if self.tryharder:\n", " for i in range(3, len(prefix)):\n", " w = prefix[0:i]\n", " if ((words == self.words and w in self.truename) or\n", " (words == self.rwords and reversestr(w) in self.truename)):\n", " results.append(w)\n", " return results\n", "\n", "paldict = PalDict() \n", "\n", "def anpdictshort():\n", " \"Find the words that are valid when every phrase must start with 'a'\"\n", " def segment(word): return [s for s in word.split('a') if s]\n", " def valid(word): return all(reversestr(s) in segments for s in segment(word))\n", " words = [canonical(w) for w in open('anpdict.txt')]\n", " segments = set(s for w in words for s in segment(w))\n", " valid_words = [paldict.truename[w] for w in words if valid(w)]\n", " file('anpdict-short2.txt', 'w').write('\\n'.join(valid_words))\n", "\n", "################ Search for a palindrome\n", "\n", "class Panama:\n", " def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):\n", " ## .left and .right hold lists of canonical words\n", " ## .diff holds the number of characters that are not matched,\n", " ## positive for words on left, negative for right.\n", " ## .stack holds (action, side, arg) tuples\n", " self.left = []\n", " self.right = []\n", " self.best = 0\n", " self.seen = {}\n", " self.diff = 0\n", " self.stack = []\n", " self.starttime = time.clock()\n", " self.dict = dict\n", " self.steps = 0\n", " for word in L.split(','):\n", " self.add('left', canonical(word))\n", " for rword in reversestr(R).split(','):\n", " self.add('right', canonical(reversestr(rword)))\n", " self.consider_candidates()\n", " \n", " def search(self, steps=10*1000*1000):\n", " \"Search for palindromes.\"\n", " for self.steps in range(steps):\n", " if not self.stack:\n", " return 'done'\n", " action, dir, substr, arg = self.stack[-1]\n", " if action == 'added': # undo the last word added\n", " self.remove(dir, arg)\n", " elif action == 'trying' and arg: # try the next word if there is one\n", " self.add(dir, arg.pop()) and self.consider_candidates()\n", " elif action == 'trying' and not arg: # otherwise backtrack\n", " self.stack.pop()\n", " else:\n", " raise ValueError(action)\n", " self.report()\n", " return self\n", "\n", " def add(self, dir, word):\n", " \"add a word\"\n", " if word in self.seen:\n", " return False\n", " else:\n", " getattr(self, dir).append(word)\n", " self.diff += factor[dir] * len(word)\n", " self.seen[word] = True\n", " self.stack.append(('added', dir, '?', word))\n", " return True\n", "\n", " def remove(self, dir, word):\n", " \"remove a word\"\n", " oldword = getattr(self, dir).pop()\n", " assert word == oldword\n", " self.diff -= factor[dir] * len(word)\n", " del self.seen[word]\n", " self.stack.pop()\n", " \n", " def consider_candidates(self):\n", " \"\"\"Push a new state with a set of candidate words onto stack.\"\"\"\n", " if self.diff > 0: # Left is longer, consider adding on right\n", " dir = 'right'\n", " substr = self.left[-1][-self.diff:]\n", " candidates = self.dict.endswith(substr)\n", " elif self.diff < 0: # Right is longer, consider adding on left\n", " dir = 'left'\n", " substr = reversestr(self.right[-1][0:-self.diff])\n", " candidates = self.dict.startswith(substr)\n", " else: # Both sides are same size\n", " dir = 'left'\n", " substr = ''\n", " candidates = self.dict.startswith('')\n", " if substr == reversestr(substr):\n", " self.report()\n", " self.stack.append(('trying', dir, substr, candidates))\n", " \n", " def report(self):\n", " \"Report a new palindrome to log file (if it is sufficiently big).\"\n", " N = len(self)\n", " if N > 13333:\n", " self.dict.tryharder = True\n", " if N > self.best and (N > 13000 or N > self.best+1000):\n", " self.best = len(self)\n", " self.bestphrase = str(self)\n", " print('%5d phrases (%5d words) in %3d seconds (%6d steps)' % (\n", " self.best, self.bestphrase.count(' ')+1, time.clock() - self.starttime,\n", " self.steps))\n", " assert is_unique_palindrome(self.bestphrase)\n", "\n", " def __len__(self):\n", " return len(self.left) + len(self.right)\n", "\n", " def __str__(self):\n", " truename = self.dict.truename\n", " lefts = [truename[w] for w in self.left]\n", " rights = [truename[w] for w in self.right]\n", " return ', '.join(lefts + rights[::-1])\n", " \n", " def __repr__(self):\n", " return ''.format(len(self))\n", "\n", "factor = {'left': +1, 'right': -1}\n", "\n", "# Note that we only allow one truename per canonical name. Occasionally\n", "# this means we miss a good word (as in \"a node\" vs. \"an ode\"), but there\n", "# are only 665 of these truename collisions, and most of them are of the\n", "# form \"a mark-up\" vs. \"a markup\" so it seemed better to disallow them.\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "126144" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(paldict.words)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "all tests pass\n", " 1005 phrases ( 1239 words) in 0 seconds ( 18582 steps)\n", " 2012 phrases ( 2478 words) in 0 seconds ( 41886 steps)\n", " 3017 phrases ( 3710 words) in 0 seconds ( 64444 steps)\n", " 4020 phrases ( 4957 words) in 0 seconds ( 92989 steps)\n", " 5022 phrases ( 6184 words) in 1 seconds (128986 steps)\n", " 6024 phrases ( 7408 words) in 1 seconds (162634 steps)\n", " 7027 phrases ( 8607 words) in 1 seconds (204639 steps)\n", " 8036 phrases ( 9846 words) in 2 seconds (254992 steps)\n", " 9037 phrases (11050 words) in 2 seconds (320001 steps)\n", "10039 phrases (12257 words) in 2 seconds (417723 steps)\n", "11040 phrases (13481 words) in 3 seconds (565050 steps)\n", "12043 phrases (14711 words) in 4 seconds (887405 steps)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "################ Unit Tests\n", " \n", "def test2(p=PalDict()):\n", " d = p.dict\n", " def sameset(a, b): return set(a) == set(b)\n", " assert 'panama' in d\n", " assert d.words[0] in d\n", " assert d.words[-1] in d\n", " assert sameset(d.startswith('aword'), ['awording', 'awordbreak',\n", " 'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',\n", " 'awordlessness', 'aword', 'awordsmith'])\n", " assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',\n", " 'asubglobosity', 'anonverbosity', 'agibbosity'])\n", " d.tryharder = True\n", " assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])\n", " d.tryharder = False\n", " assert d.startswith('oklahoma') == ['oklahoma']\n", " assert d.startswith('fsfdsfdsfds') == []\n", " print('all tests pass')\n", " return p\n", "\n", "p = Panama()\n", "test2(p)\n", "p.search().report()\n", "p" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 847k 100 847k 0 0 1037k 0 --:--:-- --:--:-- --:--:-- 1037k\n" ] } ], "source": [ "! [ -e anpdict.txt ] || curl -O http://norvig.com/anpdict.txt" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [], "source": [ "anpdictshort()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 4527 9055 39467 anpdict-short.txt\r\n", " 4527 9055 39467 anpdict-short2.txt\r\n", " 69241 138489 867706 anpdict.txt\r\n", " 126144 204928 1383045 npdict.txt\r\n", " 204439 361527 2329685 total\r\n" ] } ], "source": [ "! wc *npd*" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Letter-By-Letter Approach\n", "\n", "Can we go letter-by-letter instead of word-by-word? Advantages: \n", "\n", "* We can (if need be) be exhaustive at each decision point, trying all 26 possibilities.\n", "* We can try the most likely letters first.\n", "\n", "Process\n", "\n", "* Keep left- nad right- partial phrase lists; and the current state:\n", "\n", " {left: ['aman', 'aplan'], right: ['acanal', panama'],\n", " left_word: True, right_word: True, extra_chars: +3, palindrome: True}\n", " \n", "* Now consider all ways of extending:\n", "\n", " - Add the letter `'a'` to the left, either as a new word or a continuation of the old word (perhaps going for `'a planaria'`).\n", " - Add a letter, any letter, to the right, either as a new word or a continuation of \n", " \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from collections import namedtuple\n", "\n", "\n", "def do(state, action, side, L): action(state, side, L)\n", "def add(state, side, L): getattr(state, side)[-1] += L\n", "def new(state, side, L): getattr(state, side).append(L)\n", "def undo(action, letter):\n", " if action == add:\n", " elif action == new:\n", " else:\n", " raise ValueError()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }