{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Character names and chapter lengths in two novels\n",
    "\n",
    "This notebook downloads the text of *Huckleberry Finn* and *Little Women*, splits each book into chapters, and then\n",
    "\n",
    "1. plots the cumulative number of times selected character names appear, chapter by chapter;\n",
    "2. compares the length of each chapter (in characters) with the number of periods it contains.\n",
    "\n",
    "Running it requires network access, because both texts are fetched over HTTPS."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Credit: Data8.org\n",
    "# Stuff that will appear at the top of notebooks;\n",
    "# You don't have to do anything about it.\n",
    "\n",
    "from datascience import *\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')\n",
    "import warnings\n",
    "warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n",
    "\n",
    "from urllib.request import urlopen\n",
    "import re\n",
    "\n",
    "def read_url(url):\n",
    "    \"\"\"Download `url` and return its text with whitespace normalized.\n",
    "\n",
    "    The response body is decoded with the default codec of `bytes.decode`\n",
    "    (UTF-8), and every run of whitespace (spaces, tabs, newlines) is\n",
    "    collapsed to a single space.\n",
    "    \"\"\"\n",
    "    # The `with` block closes the HTTP response once the body has been read.\n",
    "    with urlopen(url) as response:\n",
    "        return re.sub(r'\\s+', ' ', response.read().decode())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "2+3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the two books"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read two books, fast!\n",
    "\n",
    "# Each text is cut at every occurrence of 'CHAPTER '.\n",
    "# For Huckleberry Finn the first 44 pieces are discarded; presumably these are\n",
    "# front matter / table-of-contents entries rather than chapter text -- TODO confirm.\n",
    "huck_finn_url = 'https://www.inferentialthinking.com/data/huck_finn.txt'\n",
    "huck_finn_text = read_url(huck_finn_url)\n",
    "huck_finn_chapters = huck_finn_text.split('CHAPTER ')[44:]\n",
    "\n",
    "# For Little Women only the piece before the first 'CHAPTER ' marker is discarded.\n",
    "little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'\n",
    "little_women_text = read_url(little_women_url)\n",
    "little_women_chapters = little_women_text.split('CHAPTER ')[1:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Huckleberry Finn: how often each name appears"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the chapters of Huckleberry Finn\n",
    "\n",
    "Table().with_column('Chapters', huck_finn_chapters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count how many times the names Jim, Tom, and Huck appear in each chapter.\n",
    "# np.char.count counts substring occurrences, so a name is also counted\n",
    "# when it appears inside a longer word.\n",
    "\n",
    "counts = Table().with_columns(\n",
    "    'Jim', np.char.count(huck_finn_chapters, 'Jim'),\n",
    "    'Tom', np.char.count(huck_finn_chapters, 'Tom'),\n",
    "    'Huck', np.char.count(huck_finn_chapters, 'Huck')\n",
    ")\n",
    "\n",
    "# Plot the cumulative counts:\n",
    "# how many times in Chapter 1, how many times in Chapters 1 and 2, and so on.\n",
    "\n",
    "# Chapter numbers are derived from the number of chapters actually loaded,\n",
    "# so they cannot drift out of sync with the slicing done when the book was read.\n",
    "cum_counts = counts.cumsum().with_column(\n",
    "    'Chapter', np.arange(1, len(huck_finn_chapters) + 1)\n",
    ")\n",
    "cum_counts.plot(column_for_xticks='Chapter')\n",
    "plots.title('Cumulative Number of Times Name Appears');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Little Women: how often each name appears"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The chapters of Little Women\n",
    "\n",
    "Table().with_column('Chapters', little_women_chapters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Counts of names in the chapters of Little Women.\n",
    "# np.char.count counts substring occurrences, so a short name such as 'Jo'\n",
    "# is also counted inside any longer word that contains it.\n",
    "\n",
    "people = ['Amy', 'Beth', 'Jo', 'Laurie', 'Meg']\n",
    "people_counts = {pp: np.char.count(little_women_chapters, pp) for pp in people}\n",
    "\n",
    "# One column per name, in the order given by `people`.\n",
    "counts = Table()\n",
    "for pp in people:\n",
    "    counts = counts.with_column(pp, people_counts[pp])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the cumulative counts\n",
    "\n",
    "# Chapter numbers are derived from the number of chapters actually loaded.\n",
    "cum_counts = counts.cumsum().with_column(\n",
    "    'Chapter', np.arange(1, len(little_women_chapters) + 1)\n",
    ")\n",
    "cum_counts.plot(column_for_xticks='Chapter')\n",
    "plots.title('Cumulative Number of Times Name Appears');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chapter length versus number of periods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# In each chapter, count the number of all characters;\n",
    "# call this the \"length\" of the chapter.\n",
    "# Also count the number of periods.\n",
    "\n",
    "chars_periods_hf = Table().with_columns(\n",
    "    'HF Chapter Length', [len(s) for s in huck_finn_chapters],\n",
    "    'Number of Periods', np.char.count(huck_finn_chapters, '.')\n",
    ")\n",
    "chars_periods_lw = Table().with_columns(\n",
    "    'LW Chapter Length', [len(s) for s in little_women_chapters],\n",
    "    'Number of Periods', np.char.count(little_women_chapters, '.')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The counts for Huckleberry Finn\n",
    "\n",
    "chars_periods_hf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The counts for Little Women\n",
    "\n",
    "chars_periods_lw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One point per chapter; columns are looked up by label rather than by position,\n",
    "# and each book gets a legend entry so the colours can be told apart.\n",
    "\n",
    "plots.figure(figsize=(10,10))\n",
    "plots.scatter(\n",
    "    chars_periods_hf.column('Number of Periods'),\n",
    "    chars_periods_hf.column('HF Chapter Length'),\n",
    "    color='darkblue',\n",
    "    label='Huckleberry Finn'\n",
    ")\n",
    "plots.scatter(\n",
    "    chars_periods_lw.column('Number of Periods'),\n",
    "    chars_periods_lw.column('LW Chapter Length'),\n",
    "    color='gold',\n",
    "    label='Little Women'\n",
    ")\n",
    "plots.xlabel('Number of periods in chapter')\n",
    "plots.ylabel('Number of characters in chapter')\n",
    "plots.legend();"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "celltoolbar": "Raw Cell Format",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}