{ "metadata": { "name": "", "signature": "sha256:5b95a8302b15faeeb3cd06add8368865bcd635b7cc339daa35ec7ebd79bb6106" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Analysis of Pronoun Usage In Presidential Addresses\n", "\n", "This notebook is designed to look at how presidents have used first person vs. second person pronouns during their speeches." ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "import json\n",
  "\n",
  "import nltk\n",
  "import pandas as pd"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Load in Data\n", "\n", "The data used in this notebook comes from Vocativ's collection of presidential addresses, which can be found here: https://github.com/Vocativ-data/presidents_readability" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Location of the Vocativ speech dump, relative to the working directory.\n",
  "SPEECHES_PATH = \"../../vocativ_president_data/The original speeches.json\"\n",
  "\n",
  "# Read inside a `with` block so the file handle is closed once the text is loaded.\n",
  "# NOTE(review): `json_data` used to be assigned by a separate `open().read()` cell that\n",
  "# could not run (no path argument); it is assumed to be the raw text of this file.\n",
  "with open(SPEECHES_PATH) as speeches_file:\n",
  "    json_data = speeches_file.read()\n",
  "\n",
  "# The speech records are stored under the top-level \"objects\" key.\n",
  "objects = json.loads(json_data)[\"objects\"]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
  "speeches_df = pd.DataFrame(objects)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Word count: number of whitespace-separated pieces in each speech's text.\n",
  "speeches_df[\"word_count\"] = speeches_df[\"Text\"].apply(lambda text: len(text.split()))"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Tokenize each speech with NLTK; the pronoun counts below are computed from these tokens.\n",
  "speeches_df[\"tokens\"] = speeches_df[\"Text\"].apply(nltk.word_tokenize)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Find and Count All First-Person Singular 
Pronouns" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "def count_token(tokens, word):\n",
  "    \"\"\"Count the entries of `tokens` whose lowercased form equals `word`.\n",
  "\n",
  "    `word` is compared as given, so pass it in lowercase.\n",
  "    \"\"\"\n",
  "    return sum(1 for token in tokens if token.lower() == word)\n",
  "\n",
  "\n",
  "FIRST_PERSON_SINGULAR = [\"i\", \"me\", \"my\", \"mine\", \"myself\"]\n",
  "\n",
  "# One count column per pronoun, named after the pronoun itself.\n",
  "for pronoun in FIRST_PERSON_SINGULAR:\n",
  "    speeches_df[pronoun] = speeches_df[\"tokens\"].apply(count_token, args=(pronoun,))"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Per-speech total across all first-person singular pronoun columns.\n",
  "speeches_df[\"first_person_singular\"] = speeches_df[FIRST_PERSON_SINGULAR].sum(axis=1)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Find And Count All First-Person Plural Pronouns" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "FIRST_PERSON_PLURAL = [\"we\", \"our\", \"ours\", \"ourselves\", \"us\"]\n",
  "\n",
  "# Same per-pronoun counting as for the singular pronouns above.\n",
  "for pronoun in FIRST_PERSON_PLURAL:\n",
  "    speeches_df[pronoun] = speeches_df[\"tokens\"].apply(count_token, args=(pronoun,))"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 },
{ "cell_type": "code", "collapsed": false, "input": [ "speeches_df[\"first_person_plural\"] = 
speeches_df[[\"we\", \"our\", \"ours\", \"ourselves\", \"us\"]].sum(axis=1)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Total first-person usage is singular + plural. (This previously added the singular\n",
  "# count to itself, so the plural pronouns were never included in the total.)\n",
  "speeches_df[\"first_person\"] = speeches_df[\"first_person_singular\"] + speeches_df[\"first_person_plural\"]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Segment Off Necessary Data Points" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Keep only the columns the per-president analysis uses.\n",
  "speech_analysis = speeches_df[[\"word_count\", \"tokens\", \"President\", \"first_person\",\n",
  "                               \"first_person_singular\", \"first_person_plural\"]]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 11 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "We only want modern presidents (since 1929) because that's the data that's available for our news conference analysis. This is a list of all the presidents with names matching the data found in the President column of the address dataframe." ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "news_conf_presidents = [\"Richard Nixon\", \"Gerald Ford\", \"George H. W. Bush\", \"Lyndon B. Johnson\", \"Jimmy Carter\",\n",
  "                        \"Bill Clinton\", \"Harry S. Truman\", \"Ronald Reagan\", \"Barack Obama\", \"John F. Kennedy\",\n",
  "                        \"Franklin D. Roosevelt\", \"Dwight D. Eisenhower\", \"Herbert Hoover\", \"George W. 
Bush\"]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Keep only the speeches given by presidents in the news-conference list.\n",
  "modern_presidents = speech_analysis[speech_analysis[\"President\"].isin(news_conf_presidents)]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# Sum only the numeric count columns per president. Selecting them explicitly keeps the\n",
  "# list-valued \"tokens\" column out of the aggregation instead of relying on pandas to drop it.\n",
  "count_columns = [\"word_count\", \"first_person\", \"first_person_singular\", \"first_person_plural\"]\n",
  "presidents = modern_presidents.groupby(\"President\")[count_columns].sum()"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Analyze Each President's Total Corpus of Speeches" ] },
{ "cell_type": "code", "collapsed": false, "input": [
  "# First-person pronouns as a percentage of all words, rounded to 2 decimals.\n",
  "presidents[\"pct_first\"] = (100.0 * presidents[\"first_person\"] / presidents[\"word_count\"]).round(2)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# First-person singular pronouns as a percentage of all words, rounded to 2 decimals.\n",
  "presidents[\"pct_first_singular\"] = (100.0 * presidents[\"first_person_singular\"] / presidents[\"word_count\"]).round(2)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# First-person plural pronouns as a percentage of all words, rounded to 2 decimals.\n",
  "presidents[\"pct_first_plural\"] = (100.0 * presidents[\"first_person_plural\"] / presidents[\"word_count\"]).round(2)"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 },
{ "cell_type": "code", "collapsed": false, "input": [
  "# `DataFrame.sort` was removed from pandas; `sort_values` is its replacement.\n",
  "presidents.sort_values(\"pct_first_singular\", ascending=False)"
 ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | word_count | \n", "first_person | \n", "first_person_singular | \n", "first_person_plural | \n", "pct_first | \n", "pct_first_singular | \n", "pct_first_plural | \n", "
---|---|---|---|---|---|---|---|
President | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
Richard Nixon | \n", "67445 | \n", "3368 | \n", "1684 | \n", "1943 | \n", "4.99 | \n", "2.50 | \n", "2.88 | \n", "
Gerald Ford | \n", "40301 | \n", "1950 | \n", "975 | \n", "1323 | \n", "4.84 | \n", "2.42 | \n", "3.28 | \n", "
George H. W. Bush | \n", "89646 | \n", "4308 | \n", "2154 | \n", "2878 | \n", "4.81 | \n", "2.40 | \n", "3.21 | \n", "
Lyndon B. Johnson | \n", "246786 | \n", "10116 | \n", "5058 | \n", "8062 | \n", "4.10 | \n", "2.05 | \n", "3.27 | \n", "
Jimmy Carter | \n", "91936 | \n", "3642 | \n", "1821 | \n", "2997 | \n", "3.96 | \n", "1.98 | \n", "3.26 | \n", "
Bill Clinton | \n", "145846 | \n", "5234 | \n", "2617 | \n", "5694 | \n", "3.59 | \n", "1.79 | \n", "3.90 | \n", "
Harry S. Truman | \n", "31802 | \n", "1132 | \n", "566 | \n", "852 | \n", "3.56 | \n", "1.78 | \n", "2.68 | \n", "
Ronald Reagan | \n", "206217 | \n", "6592 | \n", "3296 | \n", "6679 | \n", "3.20 | \n", "1.60 | \n", "3.24 | \n", "
Barack Obama | \n", "33672 | \n", "1046 | \n", "523 | \n", "1292 | \n", "3.11 | \n", "1.55 | \n", "3.84 | \n", "
John F. Kennedy | \n", "160468 | \n", "4670 | \n", "2335 | \n", "4907 | \n", "2.91 | \n", "1.46 | \n", "3.06 | \n", "
Franklin D. Roosevelt | \n", "130024 | \n", "3034 | \n", "1517 | \n", "3222 | \n", "2.33 | \n", "1.17 | \n", "2.48 | \n", "
Dwight D. Eisenhower | \n", "17919 | \n", "354 | \n", "177 | \n", "429 | \n", "1.98 | \n", "0.99 | \n", "2.39 | \n", "
George W. Bush | \n", "45437 | \n", "808 | \n", "404 | \n", "1818 | \n", "1.78 | \n", "0.89 | \n", "4.00 | \n", "
Herbert Hoover | \n", "10718 | \n", "178 | \n", "89 | \n", "303 | \n", "1.66 | \n", "0.83 | \n", "2.83 | \n", "