{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Panama Palindrome\n",
    "\n",
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import random, re, bisect, time\n",
    "\n",
    "def is_palindrome(s):\n",
    "    \"Test if a string is a palindrome (only considering letters a-z).\"\n",
    "    s1 = canonical(s)\n",
    "    return s1 == reversestr(s1)\n",
    "\n",
    "def is_unique_palindrome(s):\n",
    "    \"Test if string s is a palindrome where each comma-separated phrase is unique.\"\n",
    "    return is_palindrome(s) and is_unique(phrases(s))\n",
    "\n",
    "def canonical(word, sub=re.compile('[^a-z]').sub):\n",
    "    \"The canonical form for comparing: only lowercase a-z.\"\n",
    "    return sub('', word.lower())\n",
    "\n",
    "def phrases(s):\n",
    "    \"Break a string s into comma-separated phrases.\"\n",
    "    return [phrase.strip() for phrase in s.split(',')]\n",
    "\n",
    "def reversestr(s):\n",
    "    \"Reverse a string.\"\n",
    "    return s[::-1]\n",
    "\n",
    "def is_unique(collection):\n",
    "    \"Return true if collection has no duplicate elements.\"\n",
    "    return len(collection) == len(set(collection))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'test_utils passes'"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def test_utils():\n",
    "    assert is_unique_palindrome('A man, a plan, a canal, Panama!')\n",
    "    assert is_unique_palindrome('''A (man),     a   PLAN... a ``canal?'' -- Panama!''')\n",
    "    assert not is_unique_palindrome('A man, a plan, a radar, a canal, Panama.')\n",
    "    \n",
    "    assert is_palindrome('A man, a plan, a canal, Panama.')\n",
    "    assert is_palindrome('Radar. Radar? Radar!')\n",
    "    assert not is_palindrome('radars')\n",
    "\n",
    "    assert phrases('A man, a plan, Panama') == ['A man', 'a plan', 'Panama']\n",
    "    assert canonical('A man, a plan, a canal, Panama') == 'amanaplanacanalpanama'\n",
    "    assert reversestr('foo') == 'oof'\n",
    "    assert is_unique([1, 2, 3])\n",
    "    assert not is_unique([1, 2, 2])\n",
    "    return 'test_utils passes'\n",
    "\n",
    "test_utils()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "! [ -e npdict.txt ] || curl -O http://norvig.com/npdict.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  126144  204928 1383045 npdict.txt\r\n"
     ]
    }
   ],
   "source": [
    "! wc npdict.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "################ Reading in a dictionary\n",
    "\n",
    "class PalDict:\n",
    "    \"\"\"A dictionary with the following fields:\n",
    "    words: a sorted list of words: ['ant', 'bee', 'sea']\n",
    "    rwords: a sorted list of reversed words: ['aes', 'eeb', 'tna']\n",
    "    truename: a dict of {canonical:true} pairs, e.g. {'anelk': 'an elk', 'anneelk': 'Anne Elk'}\n",
    "    k:\n",
    "    and the followng methods:\n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self, k=100, filename='npdict.txt'):\n",
    "        words, rwords, truename = [], [], {'': '', 'panama': 'Panama!'}\n",
    "        for tword in open(filename, 'r', encoding='ascii', errors='ignore').read().splitlines():\n",
    "            word = canonical(tword)\n",
    "            words.append(word)\n",
    "            rwords.append(reversestr(word))\n",
    "            truename[word] = tword\n",
    "        words.sort()\n",
    "        rwords.sort()\n",
    "        self.k = k\n",
    "        self.words = words\n",
    "        self.rwords = rwords\n",
    "        self.truename = truename\n",
    "        self.rangek = range(k)\n",
    "        self.tryharder = False\n",
    "\n",
    "    def startswith(self, prefix):\n",
    "        \"\"\"Return up to k canonical words that start with prefix.\n",
    "        If there are more than k, choose from them at random.\"\"\"\n",
    "        return self._k_startingwith(self.words, prefix)\n",
    "\n",
    "    def endswith(self, rsuffix):\n",
    "        \"\"\"Return up to k canonical words that end with the reversed suffix.\n",
    "        If you want words ending in 'ing', ask for d.endswith('gni').\n",
    "        If there are more than k, choose from them at random.\"\"\"\n",
    "        return [reversestr(s) for s in self._k_startingwith(self.rwords, rsuffix)]\n",
    "\n",
    "    def __contains__(self, word):\n",
    "        return word in self.truename\n",
    "\n",
    "    def _k_startingwith(self, words, prefix):\n",
    "        start = bisect.bisect_left(words, prefix)\n",
    "        end = bisect.bisect(words, prefix + 'zzzz')\n",
    "        n = end - start\n",
    "        if self.k >= n: # get all the words that start with prefix\n",
    "            results = words[start:end]\n",
    "        else: # sample from words starting with prefix \n",
    "            indexes = random.sample(range(start, end), self.k)\n",
    "            results = [words[i] for i in indexes]\n",
    "        random.shuffle(results)\n",
    "        ## Consider words that are prefixes of the prefix.\n",
    "        ## This is very slow, so don't use it until late in the game.\n",
    "        if self.tryharder:\n",
    "            for i in range(3, len(prefix)):\n",
    "                w = prefix[0:i]\n",
    "                if ((words == self.words and w in self.truename) or\n",
    "                    (words == self.rwords and reversestr(w) in self.truename)):\n",
    "                    results.append(w)\n",
    "        return results\n",
    "\n",
    "paldict = PalDict() \n",
    "\n",
    "def anpdictshort():\n",
    "    \"Find the words that are valid when every phrase must start with 'a'\"\n",
    "    def segment(word):  return [s for s in word.split('a') if s]\n",
    "    def valid(word): return all(reversestr(s) in segments for s in segment(word))\n",
    "    words = [canonical(w) for w in open('anpdict.txt')]\n",
    "    segments = set(s for w in words for s in segment(w))\n",
    "    valid_words = [paldict.truename[w] for w in words if valid(w)]\n",
    "    file('anpdict-short2.txt', 'w').write('\\n'.join(valid_words))\n",
    "\n",
    "################ Search for a palindrome\n",
    "\n",
    "class Panama:\n",
    "    def __init__(self, L='A man, a plan', R='a canal, Panama', dict=paldict):\n",
    "        ## .left and .right hold lists of canonical words\n",
    "        ## .diff holds the number of characters that are not matched,\n",
    "        ##  positive for words on left, negative for right.\n",
    "        ## .stack holds (action, side, arg) tuples\n",
    "        self.left = []\n",
    "        self.right = []\n",
    "        self.best = 0\n",
    "        self.seen = {}\n",
    "        self.diff = 0\n",
    "        self.stack = []\n",
    "        self.starttime = time.process_time()\n",
    "        self.dict = dict\n",
    "        self.steps = 0\n",
    "        for word in L.split(','):\n",
    "            self.add('left', canonical(word))\n",
    "        for rword in reversestr(R).split(','):\n",
    "            self.add('right', canonical(reversestr(rword)))\n",
    "        self.consider_candidates()\n",
    "        \n",
    "    def search(self, steps=10*1000*1000):\n",
    "        \"Search for palindromes.\"\n",
    "        for self.steps in range(steps):\n",
    "            if not self.stack:\n",
    "                return 'done'\n",
    "            action, dir, substr, arg = self.stack[-1]\n",
    "            if action == 'added': # undo the last word added\n",
    "                self.remove(dir, arg)\n",
    "            elif action == 'trying' and arg: # try the next word if there is one\n",
    "                self.add(dir, arg.pop()) and self.consider_candidates()\n",
    "            elif action == 'trying' and not arg: # otherwise backtrack\n",
    "                self.stack.pop()\n",
    "            else:\n",
    "                raise ValueError(action)\n",
    "        self.report()\n",
    "        return self\n",
    "\n",
    "    def add(self, dir, word):\n",
    "        \"add a word\"\n",
    "        if word in self.seen:\n",
    "            return False\n",
    "        else:\n",
    "            getattr(self, dir).append(word)\n",
    "            self.diff += factor[dir] * len(word)\n",
    "            self.seen[word] = True\n",
    "            self.stack.append(('added', dir, '?', word))\n",
    "            return True\n",
    "\n",
    "    def remove(self, dir, word):\n",
    "        \"remove a word\"\n",
    "        oldword = getattr(self, dir).pop()\n",
    "        assert word == oldword\n",
    "        self.diff -= factor[dir] * len(word)\n",
    "        del self.seen[word]\n",
    "        self.stack.pop()\n",
    "        \n",
    "    def consider_candidates(self):\n",
    "        \"\"\"Push a new state with a set of candidate words onto stack.\"\"\"\n",
    "        if self.diff > 0: # Left is longer, consider adding on right\n",
    "            dir = 'right'\n",
    "            substr =  self.left[-1][-self.diff:]\n",
    "            candidates = self.dict.endswith(substr)\n",
    "        elif self.diff < 0: # Right is longer, consider adding on left\n",
    "            dir = 'left'\n",
    "            substr =  reversestr(self.right[-1][0:-self.diff])\n",
    "            candidates = self.dict.startswith(substr)\n",
    "        else: # Both sides are same size\n",
    "            dir = 'left'\n",
    "            substr = ''\n",
    "            candidates = self.dict.startswith('')\n",
    "        if substr == reversestr(substr):\n",
    "            self.report()\n",
    "        self.stack.append(('trying', dir, substr, candidates))\n",
    "                \n",
    "    def report(self):\n",
    "        \"Report a new palindrome to log file (if it is sufficiently big).\"\n",
    "        N = len(self)\n",
    "        if N > 13333:\n",
    "            self.dict.tryharder = True\n",
    "        if N > self.best and (N > 13000 or N > self.best+1000):\n",
    "            self.best = len(self)\n",
    "            self.bestphrase = str(self)\n",
    "            print('%5d phrases (%5d words) in %3d seconds (%6d steps)' % (\n",
    "                self.best, self.bestphrase.count(' ')+1, time.process_time() - self.starttime,\n",
    "                self.steps))\n",
    "            assert is_unique_palindrome(self.bestphrase)\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.left) + len(self.right)\n",
    "\n",
    "    def __str__(self):\n",
    "        truename = self.dict.truename\n",
    "        lefts =  [truename[w] for w in self.left]\n",
    "        rights = [truename[w] for w in self.right]\n",
    "        return ', '.join(lefts + rights[::-1])\n",
    "    \n",
    "    def __repr__(self):\n",
    "        return '<Panama with {} phrases>'.format(len(self))\n",
    "\n",
    "factor = {'left': +1, 'right': -1}\n",
    "\n",
    "# Note that we only allow one truename per canonical name.  Occasionally\n",
    "# this means we miss a good word (as in \"a node\" vs. \"an ode\"), but there\n",
    "# are only 665 of these truename collisions, and most of them are of the\n",
    "# form \"a mark-up\" vs. \"a markup\" so it seemed better to disallow them.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "126144"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(paldict.words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "all tests pass\n",
      " 1005 phrases ( 1239 words) in   0 seconds ( 18582 steps)\n",
      " 2012 phrases ( 2478 words) in   0 seconds ( 41886 steps)\n",
      " 3017 phrases ( 3710 words) in   0 seconds ( 64444 steps)\n",
      " 4020 phrases ( 4957 words) in   0 seconds ( 92989 steps)\n",
      " 5022 phrases ( 6184 words) in   1 seconds (128986 steps)\n",
      " 6024 phrases ( 7408 words) in   1 seconds (162634 steps)\n",
      " 7027 phrases ( 8607 words) in   1 seconds (204639 steps)\n",
      " 8036 phrases ( 9846 words) in   2 seconds (254992 steps)\n",
      " 9037 phrases (11050 words) in   2 seconds (320001 steps)\n",
      "10039 phrases (12257 words) in   2 seconds (417723 steps)\n",
      "11040 phrases (13481 words) in   3 seconds (565050 steps)\n",
      "12043 phrases (14711 words) in   4 seconds (887405 steps)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Panama with 12764 phrases>"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "################ Unit Tests\n",
    " \n",
    "def test2(p=PalDict()):\n",
    "    d = p.dict\n",
    "    def sameset(a, b): return set(a) == set(b)\n",
    "    assert 'panama' in d\n",
    "    assert d.words[0] in d\n",
    "    assert d.words[-1] in d\n",
    "    assert sameset(d.startswith('aword'), ['awording', 'awordbreak',\n",
    "        'awordiness', 'awordage', 'awordplay', 'awordlore', 'awordbook',\n",
    "        'awordlessness', 'aword', 'awordsmith'])\n",
    "    assert sameset(d.endswith('ytisob'), ['aglobosity', 'averbosity',\n",
    "        'asubglobosity', 'anonverbosity', 'agibbosity'])\n",
    "    d.tryharder = True\n",
    "    assert sameset(d.startswith('oklahoma'), ['oklahoma', 'okla'])\n",
    "    d.tryharder = False\n",
    "    assert d.startswith('oklahoma') == ['oklahoma']\n",
    "    assert d.startswith('fsfdsfdsfds') == []\n",
    "    print('all tests pass')\n",
    "    return p\n",
    "\n",
    "p = Panama()\n",
    "test2(p)\n",
    "p.search().report()\n",
    "p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100  847k  100  847k    0     0  1037k      0 --:--:-- --:--:-- --:--:-- 1037k\n"
     ]
    }
   ],
   "source": [
    "! [ -e anpdict.txt ] || curl -O http://norvig.com/anpdict.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "anpdictshort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    4527    9055   39467 anpdict-short.txt\r\n",
      "    4527    9055   39467 anpdict-short2.txt\r\n",
      "   69241  138489  867706 anpdict.txt\r\n",
      "  126144  204928 1383045 npdict.txt\r\n",
      "  204439  361527 2329685 total\r\n"
     ]
    }
   ],
   "source": [
    "! wc *npd*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Letter-By-Letter Approach\n",
    "\n",
    "Can we go letter-by-letter instead of word-by-word? Advantages: \n",
    "\n",
    "* We can (if need be) be exhaustive at each decision point, trying all 26 possibilities.\n",
    "* We can try the most likely letters first.\n",
    "\n",
    "Process\n",
    "\n",
    "* Keep left- nad right- partial phrase lists; and the current state:\n",
    "\n",
    "    {left: ['aman', 'aplan'],  right: ['acanal', panama'],\n",
    "     left_word: True, right_word: True, extra_chars: +3, palindrome: True}\n",
    "     \n",
    "* Now consider all ways of extending:\n",
    "\n",
    "  - Add the letter `'a'` to the left, either as a new word or a continuation of the old word (perhaps going for `'a planaria'`).\n",
    "  - Add a letter, any letter, to the right, either as a new word or a continuation of \n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import namedtuple\n",
    "\n",
    "\n",
    "def do(state, action, side, L): action(state, side, L)\n",
    "def add(state, side, L): getattr(state, side)[-1]  += L\n",
    "def new(state, side, L): getattr(state, side).append(L)\n",
    "def undo(action, letter):\n",
    "    if action == add:\n",
    "    elif action == new:\n",
    "    else:\n",
    "        raise ValueError()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}