{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this excercise, the objective is to figure out what a program does and how it does it based on the methodology in the programming concepts cheat sheet. The code to be studied comes from a tutorial on [the Programming Historian](https://programminghistorian.org/). On a headline level, it takes a transcription of a [historic trial](https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33), and tries to summarize what the trial is about by printing meaningful frequent words. But, how does it do it? What processing steps and decisions have gone into it? For example, how is \"meaningful\" defined? \n",
    "\n",
    "Feel free to also add/replace print statements in the code below to figure out for example what the variables contain.\n",
    "\n",
    "**Note: load the library in the second cell (by running the cell) first before running this for it to work.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# html-to-freq-3.py\n",
    "# create sorted dictionary of word-frequency pairs\n",
    "url = 'https://www.dhi.ac.uk/api/data/oldbailey_record_single?idkey=t17800628-33'\n",
    "text = getTextFromAPI(url)\n",
    "\n",
    "fullwordlist = stripNonAlphaNum(text)\n",
    "wordlist = removeStopwords(fullwordlist, stopwords)\n",
    "dictionary = wordListToFreqDict(wordlist)\n",
    "sorteddict = sortFreqDict(dictionary)\n",
    "\n",
    "# compile dictionary into string:\n",
    "outstring = \"\"\n",
    "for key,value in sorteddict:\n",
    "    outstring += value + \": \" + str(key) + \"\\n\"\n",
    "print(outstring)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# obo.py library\n",
    "\n",
    "stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']\n",
    "stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']\n",
    "stopwords += ['already', 'also', 'although', 'always', 'am', 'among']\n",
    "stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']\n",
    "stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']\n",
    "stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']\n",
    "stopwords += ['because', 'become', 'becomes', 'becoming', 'been']\n",
    "stopwords += ['before', 'beforehand', 'behind', 'being', 'below']\n",
    "stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']\n",
    "stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']\n",
    "stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']\n",
    "stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']\n",
    "stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']\n",
    "stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']\n",
    "stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']\n",
    "stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first']\n",
    "stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']\n",
    "stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']\n",
    "stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']\n",
    "stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']\n",
    "stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']\n",
    "stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']\n",
    "stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']\n",
    "stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made']\n",
    "stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']\n",
    "stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']\n",
    "stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']\n",
    "stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']\n",
    "stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']\n",
    "stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or']\n",
    "stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']\n",
    "stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']\n",
    "stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']\n",
    "stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']\n",
    "stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']\n",
    "stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']\n",
    "stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']\n",
    "stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']\n",
    "stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']\n",
    "stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']\n",
    "stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']\n",
    "stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']\n",
    "stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve']\n",
    "stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']\n",
    "stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']\n",
    "stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']\n",
    "stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']\n",
    "stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']\n",
    "stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']\n",
    "stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']\n",
    "stopwords += ['yours', 'yourself', 'yourselves']\n",
    "\n",
    "\n",
    "def stripTags(pageContents):\n",
    "    startLoc = pageContents.find(\"<p>\")\n",
    "    endLoc = pageContents.rfind(\"<br/>\")\n",
    "\n",
    "    pageContents = pageContents[startLoc:endLoc]\n",
    "\n",
    "    inside = 0\n",
    "    text = ''\n",
    "\n",
    "    for char in pageContents:\n",
    "        if char == '<':\n",
    "            inside = 1\n",
    "        elif (inside == 1 and char == '>'):\n",
    "            inside = 0\n",
    "        elif inside == 1:\n",
    "            continue\n",
    "        else:\n",
    "            text += char\n",
    "\n",
    "    return text\n",
    "\n",
    "# Get the text from an Old Bailey Online API request\n",
    "def getTextFromAPI(url):\n",
    "    from urllib.request import urlopen\n",
    "    response = urlopen(url)\n",
    "    import json\n",
    "    text = json.loads(response.read().decode('utf-8'))['hits']['hits'][0]['_source']['text']\n",
    "    return text\n",
    "\n",
    "# Given a text string, remove all non-alphanumeric\n",
    "# characters (using Unicode definition of alphanumeric).\n",
    "\n",
    "def stripNonAlphaNum(text):\n",
    "    import re\n",
    "    return re.compile(r'\\W+', re.UNICODE).split(text)\n",
    "    \n",
    "# Given a list of words, return a dictionary of\n",
    "# word-frequency pairs.\n",
    "\n",
    "def wordListToFreqDict(wordlist):\n",
    "    wordfreq = [wordlist.count(p) for p in wordlist]\n",
    "    return dict(zip(wordlist,wordfreq))\n",
    "    \n",
    "# Sort a dictionary of word-frequency pairs in\n",
    "# order of descending frequency.\n",
    "\n",
    "def sortFreqDict(freqdict):\n",
    "    aux = [(freqdict[key], key) for key in freqdict]\n",
    "    aux.sort()\n",
    "    aux.reverse()\n",
    "    return aux\n",
    "\n",
    "# Given a list of words, remove any that are\n",
    "# in a list of stop words.\n",
    "\n",
    "def removeStopwords(wordlist, stopwords):\n",
    "    return [w for w in wordlist if w not in stopwords]\n",
    "\n",
    "# Given a URL, return string of lowercase text from page.\n",
    "\n",
    "def webPageToText(url):\n",
    "    from urllib.request import urlopen\n",
    "    response = urlopen(url)\n",
    "    html = response.read().decode('utf-8')\n",
    "    text = stripTags(html).lower()\n",
    "    return text\n",
    "\n",
    "# Given name of calling program, a url and a string to wrap,\n",
    "# output string in html body with basic metadata and open in Firefox tab.\n",
    "\n",
    "def wrapStringInHTMLMac(program, url, body):\n",
    "    import datetime\n",
    "    from webbrowser import open_new_tab\n",
    "\n",
    "    now = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
    "    filename = program + '.html'\n",
    "    f = open(filename,'w')\n",
    "\n",
    "    wrapper = \"\"\"<html>\n",
    "    <head>\n",
    "    <title>%s output - %s</title>\n",
    "    </head>\n",
    "    <body><p>URL: <a href=\\\"%s\\\">%s</a></p><p>%s</p></body>\n",
    "    </html>\"\"\"\n",
    "\n",
    "    whole = wrapper % (program, now, url, url, body)\n",
    "    f.write(whole)\n",
    "    f.close()\n",
    "\n",
    "    #Change the filepath variable below to match the location of your directory\n",
    "    filename = 'file:///Users/username/Desktop/programming-historian/' + filename\n",
    "\n",
    "    open_new_tab(filename)\n",
    "    \n",
    "# Given name of calling program, a url and a string to wrap,\n",
    "# output string in html body with basic metadata\n",
    "# and open in Firefox tab.\n",
    "\n",
    "def wrapStringInHTMLWindows(program, url, body):\n",
    "    import datetime\n",
    "    from webbrowser import open_new_tab\n",
    "\n",
    "    now = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
    "\n",
    "    filename = program + '.html'\n",
    "    f = open(filename,'w')\n",
    "\n",
    "    wrapper = \"\"\"<html>\n",
    "    <head>\n",
    "    <title>%s output - %s</title>\n",
    "    </head>\n",
    "    <body><p>URL: <a href=\\\"%s\\\">%s</a></p><p>%s</p></body>\n",
    "    </html>\"\"\"\n",
    "\n",
    "    whole = wrapper % (program, now, url, url, body)\n",
    "    f.write(whole)\n",
    "    f.close()\n",
    "\n",
    "    open_new_tab(filename)\n",
    "\n",
    "# Given a list of words and a number n, return a list\n",
    "# of n-grams.\n",
    "\n",
    "def getNGrams(wordlist, n):\n",
    "    return [wordlist[i:i+n] for i in range(len(wordlist)-(n-1))]\n",
    "    \n",
    "# Given a list of n-grams, return a dictionary of KWICs,\n",
    "# indexed by keyword.\n",
    "\n",
    "def nGramsToKWICDict(ngrams):\n",
    "    keyindex = len(ngrams[0]) // 2\n",
    "\n",
    "    kwicdict = {}\n",
    "\n",
    "    for k in ngrams:\n",
    "        if k[keyindex] not in kwicdict:\n",
    "            kwicdict[k[keyindex]] = [k]\n",
    "        else:\n",
    "            kwicdict[k[keyindex]].append(k)\n",
    "    return kwicdict\n",
    "\n",
    "# Given a KWIC, return a string that is formatted for\n",
    "# pretty printing.\n",
    "\n",
    "def prettyPrintKWIC(kwic):\n",
    "    n = len(kwic)\n",
    "    keyindex = n // 2\n",
    "    width = 10\n",
    "\n",
    "    outstring = ' '.join(kwic[:keyindex]).rjust(width*keyindex)\n",
    "    outstring += str(kwic[keyindex]).center(len(kwic[keyindex])+6)\n",
    "    outstring += ' '.join(kwic[(keyindex+1):])\n",
    "\n",
    "    return outstring"
   ]
  }
 ],
 "metadata": {
  "kernel_info": {
   "name": "python3"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "nteract": {
   "version": "0.2.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}