{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Catch 22 - Character Appearances"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import re\n",
      "import itertools\n",
      "import string\n",
      "import csv\n",
      "\n",
      "# Open raw text and split over newlines\n",
      "FILE = open('/bigdrive/Documents/MSAN622_Data_Visualization/msan622/project-prototype/Catch-22.txt')\n",
      "data = FILE.read()\n",
      "data = data.split(\"\\n\")\n",
      "\n",
      "# Find chapters, skipping intro and appendix\n",
      "chapters = {}\n",
      "key = False\n",
      "for line in data:\n",
      "    # Find chapter markers and make new dictionary entry\n",
      "    if re.match(r'^[0-9]+ [A-Za-z0-9-&\\'. ]+$', line) is not None:\n",
      "        key = int(line[0:2])\n",
      "        chapters[key] = []\n",
      "    # Once inside a chapter, append each line's lowercased, punctuation-stripped words to that chapter's entry\n",
      "    elif key:\n",
      "        chapters[key].append(line.lower().translate(string.maketrans(\"\",\"\"), string.punctuation).split())\n",
      "    # Stop at the end of the book\n",
      "    if line == 'APPENDIX':\n",
      "        break\n",
      "# Clean up broken lists into one total list for each chapter\n",
      "for chapter in chapters:\n",
      "    chapters[chapter] = list(itertools.chain(*chapters[chapter]))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now look for occurrences of the main characters in the book\n",
      "char_names = {'yossarian':\"Yossarian\",\n",
      "              'chaplain':\"Chaplain Tappman\",\n",
      "              'milo':\"Milo Minderbinder\",\n",
      "              'cathcart':\"Colonel Cathcart\",\n",
      "              'korn':\"Colonel Korn\",\n",
      "              'nately':\"Nately\",\n",
      "              'orr':\"Orr\",\n",
      "              'major':\"Major Major Major Major\",\n",
      "              'dunbar':\"Dunbar\",\n",
      "              'daneeka':\"Doc Daneeka\",\n",
      "              'joe':\"Hungry Joe\",\n",
      "              'clevinger':\"Clevinger\",\n",
      "              'aarfy':\"Aarfy\",\n",
      "              'dreedle':\"General Dreedle\",\n",
      "              'danby':\"Major Danby\",\n",
      "              'mcwatt':\"McWatt\",\n",
      "              'scheisskopf':\"General Scheisskopf\",\n",
      "              'peckem':\"General Peckem\",\n",
      "              'dobbs':\"Dobbs\",\n",
      "              'whitcomb':\"Corporal Whitcomb\",\n",
      "              'black':\"Captain Black\",\n",
      "              'halfoat':\"Chief White Halfoat\",\n",
      "              'duckett':\"Nurse Duckett\",\n",
      "              'coverley':\"Major \u2014 de Coverley\",\n",
      "              'wintergreen':\"ex-P.F.C. Wintergreen\",\n",
      "              'appleby':\"Appleby\",\n",
      "              'havermeyer':\"Havermeyer\",\n",
      "              'snowden':\"Snowden\"}\n",
      "# Loop through characters and chapters, index an appearance by the percentile of a chapter,\n",
      "# i.e. a word 1% of the way through chapter 2 is encoded as 2.01\n",
      "characters = {character: [] for character in char_names}\n",
      "for character in characters:\n",
      "    for chapter in chapters:\n",
      "        length = len(chapters[chapter])\n",
      "        # Special handling for Major Major (Major Major Major Major)\n",
      "        if character == 'major':\n",
      "            b = ['major','major']\n",
      "            location = [i for i in range(len(chapters[chapter])) if chapters[chapter][i:i+len(b)] == b]\n",
      "            location.append(0)\n",
      "            location = [location[i] for i in range(len(location) - 1) if location[i] != location[i+1] - 1]\n",
      "            location = [(chapter + (float(x)/length)) for x in location]\n",
      "        # Special handling for Captain Black\n",
      "        elif character == 'black':\n",
      "            b = ['captain','black']\n",
      "            location = [(chapter + (float(i)/length)) for i in range(len(chapters[chapter])) if\n",
      "                        chapters[chapter][i:i+len(b)] == b]\n",
      "        else:\n",
      "            location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if \n",
      "                        x == character]\n",
      "        characters[character].append(location)\n",
      "    # Clean up broken lists, remove duplicates (only relevant if binning locations) and sort\n",
      "    characters[character] = sorted(list(set(list(itertools.chain(*characters[character])))))\n",
      "\n",
      "# Print a summary of the number of appearances per character (no filtering happens here; every listed character appears 50+ times)\n",
      "for char in sorted(characters):\n",
      "    print char, len(characters[char])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "aarfy 130\n",
        "appleby 55\n",
        "black 70\n",
        "cathcart 310\n",
        "chaplain 446\n",
        "clevinger 131\n",
        "coverley 59\n",
        "danby 127\n",
        "daneeka 150\n",
        "dobbs 82\n",
        "dreedle 128\n",
        "duckett 61\n",
        "dunbar 169\n",
        "halfoat 69\n",
        "havermeyer 52\n",
        "joe 141\n",
        "korn 214\n",
        "major 183\n",
        "mcwatt 116\n",
        "milo 393\n",
        "nately 205\n",
        "orr 185\n",
        "peckem 102\n",
        "scheisskopf 115\n",
        "snowden 52\n",
        "whitcomb 78\n",
        "wintergreen 56\n",
        "yossarian 1347\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now load it into a melted CSV file with characters and their appearance times\n",
      "with open('catch22.csv', 'wb') as csvfile:\n",
      "    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='\"', quoting=csv.QUOTE_MINIMAL)\n",
      "    headers = ['Character', 'Chapter']\n",
      "    csvwriter.writerow(headers)\n",
      "    for character in characters:\n",
      "        for location in characters[character]:\n",
      "            this_row = [char_names[character], location]\n",
      "            csvwriter.writerow(this_row)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Catch 22 - Locations"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now look for occurrences of the main locations visited in the book\n",
      "locations = {'pianosa':'Pianosa, Italy',\n",
      "             'rome':'Rome, Italy',\n",
      "             'smyrna':'Smyrna, Turkey',\n",
      "             'corsica':'Corsica, France',\n",
      "             'parma':'Parma, Italy',\n",
      "             'salerno':'Salerno, Italy',\n",
      "             'marrakech':'Marrakech, Morocco',\n",
      "             'malta':'Valletta, Malta',\n",
      "             'cairo':'Cairo, Egypt', \n",
      "             'sicily':'Sicily, Italy', \n",
      "             'istanbul':'Istanbul, Turkey', \n",
      "             'etna':'Mt Etna, Italy',\n",
      "             'vesuvius':'Mt Vesuvius, Italy',\n",
      "             'palermo':'Palermo, Italy', \n",
      "             'catania':'Catania, Italy', \n",
      "             'oran':'Oran, Algeria',\n",
      "             'beirut':'Beirut, Lebanon',\n",
      "             'bengasi':'Bengasi, Libya',\n",
      "             'sardinia':'Sardinia, Italy',\n",
      "             'barcelona':'Barcelona, Spain',\n",
      "             'leghorn':'Livorno, Italy',\n",
      "             'marseilles':'Marseilles, France',\n",
      "             'spezia':'Spezia, Italy',\n",
      "             'majorca':'Majorca, Spain',\n",
      "             'elba':'Elba, Italy',\n",
      "             'ferrara':'Ferrara, Italy',\n",
      "             'bologna':'Bologna, Italy',\n",
      "             'arezzo':'Arezzo, Italy',\n",
      "             'avignon':'Avignon, France'}\n",
      "# Use OpenStreetMap's Nominatim service to geo-code the cities\n",
      "from geopy.geocoders import Nominatim\n",
      "geolocator = Nominatim(timeout=10)\n",
      "loc_geo = {}\n",
      "for locale in sorted(locations):\n",
      "    address, (latitude, longitude) = geolocator.geocode(locations[locale])\n",
      "    loc_geo[locale] = (latitude, longitude)\n",
      "\n",
      "# Loop through locations and chapters, index a location mention by the percentile of a chapter,\n",
      "# i.e. a word 1% of the way through chapter 2 is encoded as 2.01\n",
      "loc_times = {locale: [] for locale in locations}\n",
      "for locale in locations:\n",
      "    for chapter in chapters:\n",
      "        length = len(chapters[chapter])\n",
      "        location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if \n",
      "                    x == locale]\n",
      "        loc_times[locale].append(location)\n",
      "    # Clean up broken lists, remove duplicates (only relevant if binning locations) and sort\n",
      "    loc_times[locale] = sorted(list(set(list(itertools.chain(*loc_times[locale])))))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Now load it into a melted CSV file with locations, their mention times, and the geo-coded location\n",
      "with open('catch22geo.csv', 'wb') as csvfile:\n",
      "    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='\"', quoting=csv.QUOTE_MINIMAL)\n",
      "    headers = ['Location', 'Time', 'Lat', 'Lon']\n",
      "    csvwriter.writerow(headers)\n",
      "    for locale in sorted(locations):\n",
      "        for t in loc_times[locale]:\n",
      "            this_line = [locale, t, loc_geo[locale][0], loc_geo[locale][1]]\n",
      "            csvwriter.writerow(this_line)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Catch 22 - Most Used Words Around Yossarian"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import nltk\n",
      "from nltk.tag.simplify import simplify_wsj_tag\n",
      "# Now look for the words surrounding our main character\n",
      "yo_words = {'words': [], 'locs': []}\n",
      "for chapter in chapters:\n",
      "    length = len(chapters[chapter])\n",
      "    location = [i for i, x in enumerate(chapters[chapter]) if x == 'yossarian']\n",
      "    # Expand range of words to 25 either side, this just gets indexes\n",
      "    locations = [range(max(0,(i-25)),min(len(chapters[chapter]),(i+26))) for i in location]\n",
      "    # Remove duplicates for overlapping ranges\n",
      "    locations = list(set(list(itertools.chain(*locations))))\n",
      "    # Grab the words and store to dictionary\n",
      "    words = [chapters[chapter][i] for i in locations]\n",
      "    locations = [(chapter + (float(x)/length)) for x in locations]\n",
      "    yo_words['words'].append(words)\n",
      "    yo_words['locs'].append(locations)\n",
      "    \n",
      "# Clean up broken lists\n",
      "yo_words['words'] = list(itertools.chain(*yo_words['words']))\n",
      "yo_words['locs'] = list(itertools.chain(*yo_words['locs']))\n",
      "yo_words['words'] = nltk.pos_tag(yo_words['words'])\n",
      "yo_words['words'] = [(word, simplify_wsj_tag(tag)) for word, tag in yo_words['words']]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from nltk.corpus import stopwords\n",
      "stop = stopwords.words('english')\n",
      "stop.extend(('said','thats','im','dont','got','get','say','youre'))\n",
      "\n",
      "# Now load it into a melted CSV file with word, POS type and their mention times\n",
      "with open('catch22pos.csv', 'wb') as csvfile:\n",
      "    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='\"', quoting=csv.QUOTE_MINIMAL)\n",
      "    headers = ['Word', 'Time', 'POS']\n",
      "    csvwriter.writerow(headers)\n",
      "    for i in range(len(yo_words['locs'])):\n",
      "        if yo_words['words'][i][0] not in stop and yo_words['words'][i][0] not in char_names:\n",
      "            this_line = [yo_words['words'][i][0], yo_words['locs'][i], yo_words['words'][i][1]]\n",
      "            csvwriter.writerow(this_line)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}