{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Using Scattertext and AgeFromName to find gender-discriminating terms\n", "\n", "https://github.com/JasonKessler/scattertext\n", "\n", "https://github.com/JasonKessler/agefromname\n", "\n", "Cite as:\n", "Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.\n", "\n", "Link to preprint: https://arxiv.org/abs/1703.00565\n", "\n", "```\n", "@article{kessler2017scattertext,\n", " author = {Kessler, Jason S.},\n", " title = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},\n", " booktitle = {ACL System Demonstrations},\n", " year = {2017},\n", "}\n", "```" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import scattertext as st\n", "import re, io, itertools\n", "from pprint import pprint\n", "import pandas as pd\n", "import numpy as np\n", "import spacy\n", "import os, pkgutil, json, urllib\n", "from urllib.request import urlopen\n", "from IPython.display import IFrame\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "nlp = spacy.load('en')\n", "# If this doesn't work, please uncomment the following line and use a regex-based parser instead\n", "#nlp = st.whitespace_nlp_with_sentences" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "convention_df = st.SampleCorpora.ConventionData2012.get_data()\n", "convention_df['parsed'] = convention_df.text.apply(nlp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The `agefromname` package takes 
\n", "### - a first name,\n", "### - optional: a minimum age,\n", "### - optional: current year\n", "## and returns\n", "### - the probability that someone with that name is male or female\n", "### `pip install agefromname`" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from agefromname import AgeFromName" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "gender_imputer = AgeFromName()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.031370941932688919" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gender_imputer.prob_male('kelsey')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.83377422744681196" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gender_imputer.prob_male('kelsey', minimum_age=70)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW8AAAEPCAYAAACNyEVOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl8VNX9//HXJxBWWQJRVhEECuLCpiwqGqwsoiLuC6t+\ni1q14tcNsVX4VS3WVsWlKm64UIzLFw1YiiglxRVEFNmC0goiIoKCICAEcn5/nJswhAlZmMydSd7P\nx+M+cufeM3c+uTP55My595xjzjlERCS5pIQdgIiIlJ6St4hIElLyFhFJQkreIiJJSMlbRCQJKXmL\niCShYpO3mT1rZuvNbPEByjxsZl+a2SIz6xzbEEVEpLCS1LwnAf2L2mlmA4A2zrm2wJXA4zGKTURE\nilBs8nbOvQtsOkCRgcDzQdl5QH0zaxSb8EREJJpYtHk3A9ZEPP4GaB6D44qISBFidcHSCj1Wn3sR\nkXJUNQbHWAscHvG4ebBtH2amhC4iUgbOucIV5JjUvKcBwwDMrAew2Tm3vogAEnYZO3Zs6DEovsoX\nm+JTfMUtRSm25m1mLwGnAulmtgYYC6QGyXiic26GmQ0ws5XANuDyUib/hHf33bBgAezcuXfZtWvv\nsmcP3HILjBwZdqQiUlkUm7ydc5eWoMx1sQknMZ1yChx9NFSvvu9SrZpffvoJhg+HZcvgr3+FKlXC\njlhEKrpYtHlXCBkZGUXuO+WU4p//0Udw4YVw9tmQmQl168YuNjhwfIkgkeNL5NhA8R2syhqfHahN\nJaYvZObi9Vphyc2F66+Hd9+FN9+Eli3DjkhEkp2Z4aJcsFTyjjHn4JFH4M9/hqwsOP74sCOSZGC2\n39+mVELRcqSSd5y98Ya/gDlpEpx1VtjRSKIL/kDDDkNCVNRnoKjkrVEFy8mgQb7pZORIeOKJsKMR\nkYpGFyzLUffu8N57MGAAfP893HEH6NuxiMSCknc5a90a5s6F00+HX36Be+5RAheRg6dmkzho1Ajm\nzIGZM+Gmm/xFTZFkN3HiRP73f/83psd87rnn6NWrV4nKZmdn07t374N+zVWrVpGSkkJeXt5BH6so\n3bt3Z9myZTE9ppJ3nKSnw+zZ8P77cO21UI6fE5GYadmyJbVq1aJOnTo0btyYyy+/nG3btrFr1y7u\nuecebr31VmBvAuzSpcs+z9+4cSPVqlWjVatW5R7riBEjuOOOOwoeL126lCZNmvDAAw+U+2sX5+ab\nb+bOO++M6TGVvOMoLQ3efhsWL4Zhw3zXepFEZma8+eabbN26lYULF7JgwQLuvvtusrKyaN++PU2a\nNNmn/I4dO1i6dGnB4ylTpnDkkUfG5VZIMyt4nU8//ZTTTjuNO++8kxtvvLHcX7s4Z599NnPmzGH9\n+qjDPpWJknec1a0Ls2bBli0wcCBs2xZ2RCIl07RpU8444wyWLFnCzJkzo/YcHDp0KM8//3zB4xdf\nfJFhw4btcwvcvffeS5s2bahbty5HH300b7zxRpGvmZOTQ58+fWjYsCHt27fn1VdfPWCMzjnmz59P\n3759GT9+PL/97W8Ltue/bnp6OhdffDGbNkWfY+a5556jdevW1K1blyOPPJIpU6YU7Hv22Wfp0KED\nDRo0oH///nz99dcAXHvttdx88837HGfgwIFMmDABgBo1atC1a1feeuutA8ZfGkreIahZE6ZOhaZN\n4bTTYOPGsCMSKVp+4l2zZg0zZsygc+fOLF68mHbt2u1XdvDgwWRmZuKcY9myZfz888907959nzJt\n2rThvffeY8uWLYwdO5YhQ4ZErZFu27aNPn36MGTIEDZs2EBmZibXXHMNy5cvLzLWefPmccYZZzBh\nwgSuuOKKgu0PP/ww06ZNY+7cuaxbt460tDSuvfbaqK85atQoZs6cyZYtW/jwww/p1KkTAFlZWYwf\nP57XX3+djRs30qtXLy691A/9NGLECF566aWCc7Vx40Zmz57N4
MGDC4591FFHsWjRoiJjLy0l75BU\nrQrPPAO9e0OvXvCvf+lCphTNLDZLaTnnGDRoEGlpafTq1YuMjAxuv/12Nm/eTJ06dfYr37x5c9q1\na8fbb7/NCy+8wLBhw/Yrc8EFF9C4cWMALrroItq2bcu8efP2K/fmm2/SqlUrhg8fTkpKCp06deK8\n884rsvbtnGPevHnUr1+f/v33nXZ34sSJ3H333TRt2pTU1FTGjh3La6+9FvUiZUpKCosXL2bHjh00\natSIDh06APDEE08wZswY2rVrR0pKCmPGjOGzzz5jzZo1nHDCCdSrV4/Zs2cDkJmZSe/evTn00EML\njlunTh02b95c1KkuNSXvEJnBvffC738P110HHTvCs8/6WwpFIjkXm6W0zIysrCw2bdrEqlWrePTR\nR6lRowZpaWls2bIlavlhw4YxadIkMjMzGTp06H69Bl944QU6d+5MWloaaWlpLFmyhB9++GG/Y61e\nvZp58+YVlEtLS2PKlClFthubGddeey1du3alT58++yTKVatWce655xYcp0OHDlStWnW/Y9WuXZuX\nX36ZJ554gqZNm3LWWWexYsWKgnhGjRpVcIyGDRsCsHatn3tm2LBhTJ48GYDJkyczdOjQfY69ZcsW\n0tLSDni+S0PJOwEMGQJLl/rhZF97DY44Av7977CjEinacccdxxdffBF133nnnceMGTNo3bo1zZvv\nO53t6tWrufLKK/nb3/7Gjz/+yKZNmzjmmGOidgtv0aIFp556Kps2bSpYtm7dyt/+9rci46patSpT\npkyhRYsW9OvXj61btxYca+bMmfsca/v27ftdcAXo27cvs2bN4rvvvqN9+/aMDAbqb9GiBU8++eQ+\nx9i2bRs9evQAYMiQIWRlZbFo0SJycnIYNGjQPsddvnw5HTt2PMBZLR0l7wRhBn37wowZMGWKH172\ngw/CjkokugEDBvDvImoYtWvXZs6cOTz99NP77du2bRtmRnp6Onl5eUyaNIklS5ZEPc6ZZ57JF198\nweTJk8nNzSU3N5ePP/6YnJycqOXzZ56pWrUqr776Kunp6QwYMIDt27dz9dVXc/vttxdcYNywYQPT\npk3b7xjff/89WVlZbNu2jdTUVGrXrk2VYID+q6++mj/96U8F92v/9NNP+zThNG/enOOPP55hw4Zx\nwQUXUL169YJ9v/zyCwsXLqRPnz5RYy8LJe8E9Otfw4sv+vFR5s8POxqR/Z111lnk5OSwbt26gm2R\ntwN26dJln3u78/d16NCBm266iZ49e9K4cWOWLFnCySefvE+5/LJ16tRh1qxZZGZm0qxZM5o0acKY\nMWPYVcQ9tpHPTU1NZerUqdSoUYOBAwdy5ZVXMnDgQPr27UvdunXp2bMn8yP+uPKfl5eXx4MPPkiz\nZs1o2LAh7777Lo8//jgAgwYNYvTo0VxyySXUq1ePY489dr+7R4YPH87ixYv3azKZPn06vXv3Lmjr\nj4l4zePmX0pKY/p05w47zLlPPgk7Eilvyfj38eSTT7obbrghtNefM2eOy8jICO31o5k7d65r0aLF\nftu7d+/uli5desDnFvUZCLbvl1M1tkkCO+ssPyLhgAEwebIfH0UkUYzUpK37yM3NZcKECVHPy0cf\nfRTz11OzSYI791yfuC+/3M/Ss3172BGJJIbIZpKwLV++nLS0NNavX88NN9wQl9fUZAxJYtMmfzvh\nJ5/49vATTgg7IoklTcYgmoyhgkpLg7//Hf74R9+c8sAD6tQjUpmp5p2EVq+G88+HNm3g6afhkEPC\njkgOlmreopp3JXDEEX6Gntq1oUcPKKKvhIhUYKp5JzHn4Kmn4A9/8GOFH3ts2BFJWSXKhTcJV2lq\n3rpVMImZwZVXQo0acOmlvkNPrVphRyVloYqNlJZq3hWAczB0KNSpA0FnMBGpINTmXYGZwWOP+Uke\npk4NOxoRiQfVvCuQefP87
DwffwwtWoQdjYjEgmrelUD37nDjjTB4MOTmhh2NiJQn1bwrmLw8fw/4\nzp1+bHBdwBRJbqp5VxIpKfDKK3DoodCnj+9WLyIVj5J3BZSaCpMm+WaUU06Bb78NOyIRiTUl7woq\nJQXuvx8uuwxOPhm++irsiEQkltRJpwIzgzFjoG5dP0v9v/4FRx4ZdlQiEgtK3pXAtdf6RH7aaUrg\nIhVFsc0mZtbfzHLM7EszGx1lf7qZzTSzz8xsiZmNKJdI5aBccw2MHu1r4P/9b9jRiMjBOmDyNrMq\nwKNAf6ADcKmZHVWo2HXAp865TkAGcL+ZqUafgH77W7jtNsjIgFdf9bcVikhyKq7m3Q1Y6Zxb5ZzL\nBTKBcwqVWQfUDdbrAj8453bHNkyJld/+1o9EeP/9cMwxfoq13Xq3RJJOccm7GbAm4vE3wbZITwFH\nm9m3wCJgVOzCk/LQrx98+CE8/LBP5O3bw7PPqlemSDIprnmjJF0ibwc+c85lmFlr4G0z6+ic21q4\n4Lhx4wrWMzIyyMjIKEWoEktmfjb600+HuXP99Gp33eXvThkxAqpVCztCkcopOzub7OzsYssdsHu8\nmfUAxjnn+gePxwB5zrk/R5SZAdzjnHs/eDwbGO2cW1DoWOoen+Def98n8JwcP7lD69ZhRyQiZe0e\nvwBoa2YtzawacDEwrVCZHOD04EUaAe0A3c+QhE46CWbOhJtv9qMTbtkSdkQiUpRiB6YyszOACUAV\n4Bnn3HgzuwrAOTfRzNKBSUAL/D+D8c65KVGOo5p3knDOX9j89lt4/XWoUiXsiEQqr6Jq3hpVUKLa\ntcsPbHXiiTB+fNjRiFReGlVQSqVaNT+kbGYmTNnve5SIhE3JW4p06KGQlQWjRvlmlC++CDsiEcmn\n5C0HdNxxsHgxpKf7C5rnnONvLVQLmEi41OYtJbZtG7zwAkyYAFWrwsiRftb6hg3Djkyk4lKbtxy0\n2rV980lOjp+tfsECfy/4kCGa8EEk3pS8pdTM4NRT/bgoX33lh5jt0sW3j4tIfKjZRGLigw98Dbxv\nXz/oVe3aYUckUjGo2UTK1Yknwqef+nbxLl3g7bfDjkikYlPNW2IuKwtuugk6dPC18LZtw45IJHmp\n5i1xc845sHSpn/i4Z0+45RbYsSPsqEQqFiVvKRfVq8Ott/okvmYNnHACLFkSdlQiFYeSt5SrRo3g\npZd8M0rv3vDEE+rgIxILavOWuFmxAi65BFq2hL/+VeOFi5SE2rwldO3awUcfQefO0L07XHGFZrIX\nKSslb4mr6tXhzjth5Upo0QK6dYPLL4eFC8OOTCS5qNlEQrVpEzz+ODz5pB/86qqrfNNKnTphRyaS\nGDQZgyS0PXt8x56JE2HOHD8RxLnnwplnQr16YUcnEh4lb0kaGzbA9OkwdaoffvbXv/a188aNw45M\nJP6UvCUpbdni70x5/nl44w1/sVOkMlHylqT26qtwzTX+PvHzzw87GpH4UfKWpLdwIQwaBIMHw7Bh\n0L69H55WpCLTfd6S9Lp0gXnzYP16GDDA9948/3x4+mnIyws7OpH4Us1bktbXX8O778Ijj0CrVjBp\nEtSoEXZUIrGlZhOpsHbsgOHD/VRsb7zh7xcXqSjUbCIVVs2akJkJvXr5IWi//DLsiETKn5K3VAgp\nKTB+vB+Gtnt3uO46P1GySEWl5C0VysiR8PnnkJYGGRm+p+bMmWFHJRJ7avOWCmvnTn9/+B13+MGv\n7rhDtxZK8tEFS6m0vvvOj5HSubPvZp+aGnZEIiWn5C2V2s8/w4UX+vVXX4VDDgk3HpGS0t0mUqkd\ncghMmwbNmsEpp8A334QdkcjBUfKWSiM1FZ56Ci6+2N+RMm9e2BGJlJ2aTaRSmj7dT8P24IMwZEjY\n0YgUTW3eIoUsWQIDB/q28Lvv1oVMSUxq8xYp5JhjYP58f1/4ySf7eTVFkkWxydvM+ptZjpl
9aWaj\niyiTYWafmtkSM8uOeZQi5SQ9Hf7xDz/MbM+e8NxzoC+IkgwO2GxiZlWAFcDpwFrgY+BS59zyiDL1\ngfeBfs65b8ws3Tm3Mcqx1GwiCW3JErjsMjjqKD/MrCZBlkRQ1maTbsBK59wq51wukAmcU6jMZcD/\nOee+AYiWuEWSQX4zSr160KOHBriSxFZc8m4GrIl4/E2wLVJboIGZzTGzBWY2NJYBisRTjRrw5JMw\nahScdJJvUhFJRMUl75K0c6QCXYABQD/gDjNre7CBiYTpyishKwuuugrGjYPdu8OOSGRfVYvZvxY4\nPOLx4fjad6Q1wEbn3A5gh5nNBToC+33pHDduXMF6RkYGGRkZpY9YJE569oSPP4ahQ/3dKM8/D+3a\nhR2VVHTZ2dlkZ2cXW664C5ZV8Rcsfw18C8xn/wuW7YFH8bXu6sA84GLn3LJCx9IFS0lKeXl+QKux\nY/3IhL/7nR8/XCQeytxJx8zOACYAVYBnnHPjzewqAOfcxKDMzcDlQB7wlHPu4SjHUfKWpPbllzBi\nBFSrBs8+6+fNFClv6mEpEgN79sADD8B998E99/jJHzRGuJQnJW+RGFq61E96nJ7u7wlv3jzsiKSi\nUvd4kRg6+mj48EM48UTo1k33hEv8FXe3iYgUITUV7rwTmjSBvn3hvff8eOEi8aDkLXKQRo6ETZt8\nAp87Fxo2DDsiqQzU5i0SI7fdBnPmwDvvaFwUiR1dsBQpZ875HplffeW71VerFnZEUhEoeYvEwZ49\nfnKHmjXhxRfVmUcOnu42EYmDKlXg73+H1at9M4pIeVHyFomxmjX9TPXTp8NDD4UdjVRUuttEpBw0\naAAzZ/phZZs0gYsuCjsiqWjU5i1Sjj77DE4/3XfoaauBkqUM1OYtEoJOnXxHnqFDNSa4xJaSt0g5\nu+46qFsXxo8POxKpSNRsIhIHa9dCly7w5ptwwglhRyPJRM0mIiFq1gweftg3n2zfHnY0UhGo5i0S\nR0OG+DtRHt5vuhKR6NTDUiQB/Pijnwfz3Xehffuwo5FkoGYTkQTQoAHceiuMGRN2JJLsVPMWibNf\nfoFf/QoyM/1kDiIHopq3SIKoUQP++EdfA1d9RspKyVskBEOHwk8/+fFPRMpCyVskBFWqwL33+pEH\n1fNSykLJWyQkAwbAYYfB88+HHYkkI12wFAnR++/D8OHwxReauEGi0wVLkQR04olQqxZkZ4cdiSQb\nJW+REJn52eefeirsSCTZqNlEJGSbNkGrVrByJaSnhx2NJBo1m4gkqLQ0GDjQT1gsUlJK3iIJIL/p\nRF9OpaSUvEUSwMkn+8T9wQdhRyLJQslbJAGYwW9+owuXUnK6YCmSIDZs8JMUr1oF9euHHY0kCl2w\nFElwhx4K/frB3/8ediSSDJS8RRLI//wPvPBC2FFIMlCziUgCyc2Fxo1h8WJo2jTsaCQRqNlEJAmk\npvoBq6ZNCzsSSXTFJm8z629mOWb2pZmNPkC5E8xst5mdF9sQRSqXc86BN94IOwpJdAdsNjGzKsAK\n4HRgLfAxcKlzbnmUcm8D24FJzrn/i3IsNZuIlMDWrdCsGaxZA/XqhR2NhK2szSbdgJXOuVXOuVwg\nEzgnSrnfAa8BGw46UpFKrk4dOOUU+Oc/w45EEllxybsZsCbi8TfBtgJm1gyf0B8PNql6LXKQzjkH\nsrLCjkISWXHJuySJeAJwW9AmYsEiIgfh7LNh5kzYuTPsSCRRVS1m/1rg8IjHh+Nr35G6AplmBpAO\nnGFmuc65/a6Xjxs3rmA9IyODjIyM0kcsUgk0bgwdOvhJGvr1Czsaiafs7GyySzA7R3EXLKviL1j+\nGvgWmE+UC5YR5ScB051zU6Ps0wVLkVK47z746it4/PHiy0rFVaYLls653cB1wFvAMuBl59xyM7vK\nzK4qn1BFBGDQIH+/d15e2JFIIlIPS5EE1qEDPPccdOs
WdiQSFvWwFElCgwapw45Ep+QtksAGDYLX\nXw87CklESt4iCez442HLFlixIuxIJNEoeYsksJQU1b4lOiVvkQR37rlK3rI/3W0ikuByc6FRIz/G\nd7NmxZeXikV3m4gkqdRUOPNMjXUi+1LyFkkCajqRwtRsIpIEtm2DJk1g9WpISws7GoknNZuIJLHa\ntaF3b/jHP8KORBKFkrdIklDTiURSs4lIkvjhBzjySPjuO6hZM+xoJF7UbCKS5Bo2hK5dYdassCOR\nRKDkLZJE1HQi+dRsIpJE1q6FY4/1TSfVqoUdjcSDmk1EKoBmzeDoo9V0IkreIknnoovglVfCjkLC\npmYTkSSzbp2fYWfdOqhRI+xopLyp2USkgmjSBDp2hLfeCjsSCZOSt0gSuvhiePnlsKOQMKnZRCQJ\nrV8P7dr5phN12KnY1GwiUoE0auSnSJsxI+xIJCxK3iJJSnedVG5qNhFJUhs3QuvW8O23ftRBqZjU\nbCJSwaSnQ48eGia2slLyFkliF18ML74YdhQSBjWbiCSx7dv9MLGzZ/tu81LxqNlEpAKqVQuuvx7u\nuy/sSCTeVPMWSXKbN/sLlwsXwhFHhB2NxJpq3iIVVP368JvfwP33hx2JxJNq3iIVwLp1vs17xQo4\n9NCwo5FYUs1bpAJr0sR32nnoobAjkXhRzVukgvjPf6B7d/jvf6Fu3bCjkVhRzVukgmvdGvr0gYkT\nw45E4kE1b5EKZNEiOOMMX/vWRA0Vg2reIpVAx47QqZN6XVYGJUreZtbfzHLM7EszGx1l/2AzW2Rm\nn5vZ+2Z2XOxDFZGSuO0232lnz56wI5HyVGzyNrMqwKNAf6ADcKmZHVWo2H+BU5xzxwF3AU/GOlAR\nKZlevfztglOnhh2JlKeS1Ly7ASudc6ucc7lAJnBOZAHn3IfOuZ+Ch/OA5rENU0RKyszXvu+9F3SZ\nqeIqSfJuBqyJePxNsK0o/wNofg+REJ11FvzyC7zzTtiRSHmpWoIyJf7fbWa9gSuAk6LtHzduXMF6\nRkYGGRkZJT20iJRCSgqMHg3jx/vbByV5ZGdnk52dXWy5Ym8VNLMewDjnXP/g8Rggzzn350LljgOm\nAv2dcyujHEe3CorEUW4utGnjp0rr3j3saKSsDuZWwQVAWzNraWbVgIuBaYUO3gKfuIdES9wiEn+p\nqXDTTfDXv4YdiZSHEnXSMbMzgAlAFeAZ59x4M7sKwDk30cyeBs4Fvg6ekuuc61boGKp5i8TZli1+\nmNilS6Fp07CjkbIoquatHpYiFdzVV/vEfeedYUciZaHkLVJJLVoEZ54Jq1ZB1ZLcoiAJRd3jRSqp\njh2hRQuYPj3sSCSWlLxFKoFrroHHHw87CoklNZuIVAK//OJr3++/D23bhh2NlIaaTUQqsRo1YMQI\njfVdkajmLVJJ/Oc/0KMHfP011KwZdjRSUqp5i1RyrVtD166+x6UkP9W8RSqRWbP8xcvPP4datcKO\nRkpC93mLCACDB0PjxnD//WFHIiWh5C0iAGzcCMceC6+/7tvAJbGpzVtEAEhPh4cegiuugJ07w45G\nykrJW6QSuvBCaN8e7ror7EikrNRsIlJJrVvnu86/9RZ07hx2NFIUNZuIyD6aNIG//MU3n+Tmhh2N\nlJaSt0glNmwYNGqkCRuSkZpNRCq51at955333vPt4JJY1GwiIlEdcQSMGwe/+Q3k5YUdjZSUkreI\ncM01/udjj4Ubh5Scmk1EBIAVK+Dkk2HBAl8bl8SgZhMROaB27eCWW+C883wvTElsSt4iUuCWW6Bf\nPzjlFFi7Nuxo5EA0HamIFDCDP/0J0tKgVy8/CmGbNmFHJdEoeYvIfm65BerXh1NPhRkzfE9MSSxq\nNhGRqEaOhAkT4PTT/eTFut8gsehuExE5oBUr4LLLoHlzeOYZPyqhxI/uNhGRMmnXDj780P/s1Amy\nslQLTwSqeYtIic2
eDTfeCFWqwB/+AIMGQYqqgOVKM+mISEzk5cH06X4s8J074YYb4NxzoUGDsCOr\nmJS8RSSmnIOZM+Hpp+Gdd6BnTz/Jw4ABfrhZiQ0lbxEpNz//DP/4B7zyCsyZAzVq+JEKu3aFY46B\nli390rChv5dcSk7JW0Tiwjk/zOwnn/glJwe++gpWrYLdu+Hww6FZM2ja1P887DDf5JKWtvdnvXr+\nPvNatZTslbxFJHSbN8OaNfDtt35ZuxY2bIAff4RNm/zPzZv3Lrm5cMghULu2T+TRlho1oFo1qF59\n78/CS7Vqe/dVqeIvsuYvVapA1ap+SU31j/O35a9HPo72s/ASy4u4St4iknR27fJNMtu3+2XbNtix\nY9/HO3f6ZdeuveuRS/72/J979vhvB3l5fj0vz/+T2L3b/9yzZ++ye3f0x9F+5j9/927/bSHaP4DI\nJf8fR0pK9HL5++fPV/IWEYmLvLy9CT0y+ecv+f848tcLl4vc37OnkreISNIpcw9LM+tvZjlm9qWZ\njS6izMPB/kVm1jkWAYuISNEOmLzNrArwKNAf6ABcamZHFSozAGjjnGsLXAk8Xk6xlqvs7OywQzgg\nxVd2iRwbKL6DVVnjK67m3Q1Y6Zxb5ZzLBTKBcwqVGQg8D+CcmwfUN7NGMY+0nFXWD0CsJHJ8iRwb\nKL6DVVnjKy55NwPWRDz+JthWXJnmBx+aiIgUpbjkXdIrjIUb03VlUkSkHB3wbhMz6wGMc871Dx6P\nAfKcc3+OKPMEkO2cywwe5wCnOufWFzqWErqISBlEu9ukuGnQFgBtzawl8C1wMXBpoTLTgOuAzCDZ\nby6cuIt6cRERKZsDJm/n3G4zuw54C6gCPOOcW25mVwX7JzrnZpjZADNbCWwDLi/3qEVEKrm4ddIR\nEZHYKfPwKWb2rJmtN7PFEds6mtmHZva5mU0zszrB9m5m9mmwfG5mF0c8p6uZLQ46+Tx0cL9O2eKL\n2N/CzH42s5sSKT4za2lmOyLO4WOJFF+w77hg35Jgf7VEic/MBkecu0/NbI+ZHVde8ZUythpm9lKw\nfZmZ3RbxnEQ4d9XMbFKw/TMzOzUO8R1uZnPMbGnwebo+2N7AzN42sy/MbJaZ1Y94zpggjhwz61ue\nMZY2vmD7HDPbamaPFDpW2eNzzpVpAXoBnYHFEds+BnoF65cDfwzWawIpwXpjYCNQJXg8H+gWrM8A\n+pc1prLGF7H/NeBl4KaIbaHHB7SMLFfoOIkQX1VgEXBs8Dgt4v0OPb5CzzsG33eh3M5fKc/dCOCl\niL+Tr4AnhOkOAAAJrUlEQVQWiXLugGvxzaUAhwIL4vDZawx0CtYPAVYARwH3AbcG20cD9wbrHYDP\ngNTgb2Ule1sVyuP9LW18tYCTgKuARwodq8zxHewv0bLQB2BzxPrhwNIoz2kF/CdYbwIsj9h3CfBE\nLD4ApY0PGBSc/LEEyTtR4itcLqJMosQ3AHgxUeMr9Jw/AXeVd3ylOHf98Bf9qwDpQSKonyjnDt/D\nekjEvneAE8o7vkKxvgGcDuQAjYJtjYGcYH0MMDqi/EygR7xiLC6+iHIjiEjeBxtfrKcOXWpm+T0w\nL8R/CICCppOlwFLgxmBzM3ynnnxr2b8TULnHZ2aHALcC4wqVT4j4Aq2Cr/zZZnZygsX3K8CZ2Uwz\n+8TMbkmw+CJdBLwUrMczvqixOefeArYA64BVwF+cc5vjHFuR8eG/UQ00sypm1groiu+EF5f4zN/p\n1hmYh0+M+XeyrQfye3I3LRRLfmfCwttjHmMJ48tX+ALjQZ3DWCfvK4BrzGwB/uvErvwdzrn5zrmj\ngS7AQ2ZWL8avfTDxjQMedM5tZ/8OR/FUVHzfAoc75zrj//FNsULt9SHHVxU4Gbgs+HmumZ1G/Dtr\nFfn5AzCz7sB259yyOMdVZGxmNgTfXNIE/6305iBJJkR8wLP4BLMAeBD4ANhDHN7bo
FL1f8Ao59zW\nyH3OV1VDvdsi7PiKu8+7VJxzK/BfAzGzXwFnRimTY2b/Adqwf1f65vj/PuUiSnwDgl3dgPPN7D78\nV9Y8M9sBTA05vjOD7bsI/piccwuD89c2iCX0+PDDI8x1zv0Y7JuB/yc9OUHiy3cJMCXicdzO3wE+\neycCrzvn9gAbzOx9fO32vXjFVkR8+Z+9Pez9pkwQ3xfAT+UZn5ml4hPji865N4LN682ssXPuOzNr\nAnwfbF/Lvt+ymuNzS7m9v6WMrygHFV9Ma95mdmjwMwX4A8EIg+bvlqgarB+BTzxfOue+A7aYWXcz\nM2Aovv2oXESJ7wkA59wpzrlWzrlWwATgHufcYwkQX/75Szc/wiNmdiT+/P3XObcuEeLD9wM41sxq\nBu/zqfg204Q4fxHbLsQPrgZAPM9fUZ89fDvpacG+2vi22pxEOXfBe1o7WO8D5Drncsrz3AXHewZY\n5pybELFrGjA8WB8e8XrTgEvM3xnTCv/3Mb+8zmEZ4it4auSDgz6HB9FI/xL+6/wufM3rCuB6/AWX\nFcCfIsoOAZYAn+KvrvaP2NcVWIy/QvxwDC8ilDi+Qs8bC9yYSPEB50Wcv0+AMxMpvqD84CDGxQRX\n2RMsvgzggyjHiXl8pXxvq+O/oSzGXw+6qTxjK0N8LfH/YJYBs/DNd+Ud38lAHv4Okk+DpT/QAH/B\n9IsglvoRz7k9iCMH6FfO729Z4lsF/ABsDc55+4ONT510RESSUKwvWIqISBwoeYuIJCElbxGRJKTk\nLSKShJS8RUSSkJK3iEgSUvKuRMx718z6R2y70Mz+GWZcycjMbjCzmnF4nQwzmx7x+G4z+6cFQ+5G\nKd/SIoZ6lYpLybsScf6m/quBB8ysejA2wz3ANWU5Xn6v2UpqFH6ozxILei+WmZn9AegJDHJ+yASp\nxJS8Kxnn3FJgOnAbcCe+d98fzGyemS00s4FQUIOba36UwE/MrGewPSOovWfhewQWMLPLzezBiMcj\nzeyBYP1G84POLzazURGvETkhwM1mNrZwzGZ2qJm9Zmbzg+XEYPs48xMLzDGz/5jZ7yKeM8zMFpmf\nQOCFiNf7V7D9HTPLH1XyOTM7P+K5P0f8rtlm9qqZLTezycH26/Ej1s0xs9nBtr5m9kFwrl6J6FK+\nyszuNbNPgAsK/V5nm9lHwXl/28wOi/aWBWVvwo89crZzbqf5Uf7+EpyPRWZ2ZZTzdnTwvn4alGkd\nbB8Ssf0JM0sxsyuKeu8kQcWqS6uW5FnwNcYc4HP8+NaDg+318d2ja+FHuqsebG8LfBysZwA/A0dE\nOW5tfDff/Ik23geOxncB/jw4Zm18N/pO7D+m9E3A2CjHnQKcFKy3wI8pAX40yPfwg/A3JJjkI3jN\nFUCD/N8r+DkdGBqsX44fEApgEnB+xOttjfhdN+MTteFH1Dsx2PdVxPHTgX8DNYPHo4E7IsrdXMT7\nENl9+jfAX6OUyQA24btcHxKx/Urg98F6dfxkCi0jzynwCHBZsF4VqIGfNGBaxHv0GH5MjajvXdif\nVS1FL5X5a2+l5ZzbbmYv45PwRcDZZnZzsLs6foS274BHzawjfgjQthGHmO+cWx3luNvM7F/B8XKA\nVOfc0qCmPdU5twPAzKbiZ3OZVsKQTweO8mP3AFAnqNk64B/OuVzgBzP7Hj8I/mnAKy4Y5dD58bHB\nD/o0KFifjJ98ozjznXPfBnF/hk+OHxQq0wM/m8sHQYzVCpV5uYhjH25mrwQxV8Mn+sIc8CX+H2tf\n/EiXBOvHmll+bb4ufqTOlRHP/QD4vZk1x5//lWb2a/w/0wVBrDWB74p674qIWxKAknfllRcsBpzn\nnPsycqeZjQPWOeeGmh/R8JeI3dsOcNyngd8Dy/FjQYNPQJEjqlmwbTf7Nt0VdQHQgO6uUDtvkHwi\nt+3Bf6YLv17hYxVWEEfQLh15MXBnlONH87Zz7
rIi9hV1vh7B17bfND835Lgi4l2PH/hrtpn96JzL\nDvZd55x7e5/CfnIAAJxzL5nZR8BZwAwzuyrY9bxz7vYorxXtvZMEpTZveQs/ohwAZtY5WK2Lr30D\nDMM3RxTLOTcfPy7xZeydseZdYJDtHV50ULDte+Aw8xO0VscnmWhmFYqx44FCAP4FXGhmDYLyacG+\nD/BjeoNPhnOD9VX42ijAQHwzTHG24s8R+FlUTopoU65tZm2LfOZedfGj+4GfIqtIwT/X84DJwe//\nFn7yhPyhln9lZvtcQDWzI51zXznnHgGygGOB2cAFtncI2AZm1iJ4jWjvnSQoJe/KzQF3AanmZwdf\nAvy/YN9jwPCgqaAdvokl8nkH8grwnnPuJwDn3KfAc/jhgD8CnnLOLQqaO/4YbJ+FH3Y02rGvB44P\nLrotxU/kWmQszs+Ucw/w7yD++4NdvwMuN7NF+OQ9Ktj+FHBqULZHCX/XJ4GZZjbbObeBYCLh4Ngf\n4M9ZccYBr5qfvWZDEa9VMCOLc24Bvq1+Gn7o0WXAwuCi7+Ps/Qebf5yLzM9u/in+OsALzrnl+PG6\nZwWxzsI32+Tb572TxKUhYSXmzN+X/IBzbk7YsUjp6L1LHqp5S8yYWX0zW4GfJ1J//ElE713yUc1b\nRCQJqeYtIpKElLxFRJKQkreISBJS8hYRSUJK3iIiSUjJW0QkCf1/YaQBhn5DI9QAAAAASUVORK5C\nYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "(pd.DataFrame([{'Year you encounter a Kelsey': year, \n", " 'P(Male|Kelsey)': gender_imputer.prob_male('kelsey', current_year=year)}\n", " for year in range(1930, 2015)])\n", " .set_index('Year you encounter a Kelsey')\n", " .plot())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's assign speakers a gender if, assuming they're at least 35, \n", "## there's a 90% someone with their name is male or female.\n", "## Otherwise, drop their speech from the dataset." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "convention_df['speaker_gender'] = (convention_df.speaker\n", " .apply(lambda x: (gender_imputer.prob_male(x.split()[0], minimum_age = 35) if x else 0.5))\n", " .apply(lambda x: 'Male' if x > 0.9 else 'Female' if x < 0.1 else '?')\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
partyspeakertextparsedspeaker_gender
0democratBARACK OBAMAThank you. Thank you. Thank you. Thank you so ...(Thank, you, ., Thank, you, ., Thank, you, ., ...?
1democratMICHELLE OBAMAThank you so much. Tonight, I am so thrilled a...(Thank, you, so, much, ., Tonight, ,, I, am, s...Female
2democratRICHARD DURBINThank you. It is a singular honor to be here t...(Thank, you, ., It, is, a, singular, honor, to...Male
\n", "
" ], "text/plain": [ " party speaker \\\n", "0 democrat BARACK OBAMA \n", "1 democrat MICHELLE OBAMA \n", "2 democrat RICHARD DURBIN \n", "\n", " text \\\n", "0 Thank you. Thank you. Thank you. Thank you so ... \n", "1 Thank you so much. Tonight, I am so thrilled a... \n", "2 Thank you. It is a singular honor to be here t... \n", "\n", " parsed speaker_gender \n", "0 (Thank, you, ., Thank, you, ., Thank, you, ., ... ? \n", "1 (Thank, you, so, much, ., Tonight, ,, I, am, s... Female \n", "2 (Thank, you, ., It, is, a, singular, honor, to... Male " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "convention_df.iloc[:3]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Male 105\n", "Female 65\n", "? 19\n", "Name: speaker_gender, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "convention_df.speaker_gender.value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "convention_df_gender = convention_df[convention_df.speaker_gender.isin(['Male', 'Female'])]\n", "convention_df_gender['speaker_party'] = (convention_df_gender['speaker'] \n", " + ' (' + convention_df_gender['party'].apply(lambda x: x.upper()[0]) + ')')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's plot the differences in language use by gender" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corpus_gender = st.CorpusFromParsedDocuments(convention_df_gender, \n", " category_col='speaker_gender', \n", " parsed_col='parsed').build()\n", "html = st.produce_scattertext_explorer(corpus_gender,\n", " category='Female',\n", " category_name='Female',\n", " 
not_category_name='Male',\n", " minimum_term_frequency=5,\n", " width_in_pixels=1000,\n", " metadata=convention_df_gender['speaker_party'])\n", "file_name = 'output/Conventions2012Gender.html'\n", "open(file_name, 'wb').write(html.encode('utf-8'))\n", "IFrame(src=file_name, width = 1200, height=700)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's see how gender and party-associated terms differ" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compute gender and party scaled f-scores" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "female_scores = corpus_gender.get_scaled_f_scores('Female')\n", "democratic_scores = (st.CorpusFromParsedDocuments(convention_df_gender, \n", " category_col='party', \n", " parsed_col='parsed')\n", " .build()\n", " .get_scaled_f_scores('democrat'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Use custom coordinates to plot the gender scaled f-score vs. 
the party scaled f-score" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "html = st.produce_scattertext_explorer(corpus_gender,\n", " category='Female',\n", " category_name='Female',\n", " not_category_name='Male',\n", " minimum_term_frequency=5,\n", " pmi_filter_thresold=4,\n", " width_in_pixels=1000,\n", " scores=female_scores,\n", " sort_by_dist=False,\n", " x_coords=democratic_scores,\n", " y_coords=female_scores,\n", " show_characteristic=False,\n", " metadata=(convention_df_gender['speaker'] \n", " + ' (' \n", " + convention_df_gender['party'].apply(lambda x: x.upper()[0]) \n", " + ')'),\n", " x_label='More Democratic',\n", " y_label='More Female')\n", "file_name = 'output/Conventions2012GenderAndParty.html'\n", "open(file_name, 'wb').write(html.encode('utf-8'))\n", "IFrame(src=file_name, width = 1200, height=700)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Women and men appear to discuss different topics \n", "## The package Empath uses a crowd-sourced topic model to categorize a given document\n", "\n", "Fast, Ethan, Binbin Chen, and Michael S. Bernstein. \"Empath: Understanding topic signals in large-scale text.\" Proceedings of the 2016 CHI Conference on Human Factors in Computing Systems. ACM, 2016." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "empath_corpus = st.CorpusFromParsedDocuments(convention_df_gender,\n", " category_col='speaker_gender',\n", " feats_from_spacy_doc=st.FeatsFromOnlyEmpath(),\n", " parsed_col='text').build()\n", "html = st.produce_scattertext_explorer(empath_corpus,\n", " category='Female',\n", " category_name='Female',\n", " not_category_name='Male',\n", " width_in_pixels=1000,\n", " metadata=convention_df_gender['speaker_party'],\n", " use_non_text_features=True,\n", " use_full_doc=True)\n", "file_name = 'output/EmpathGender.html'\n", "open(file_name, 'wb').write(html.encode('utf-8'))\n", "IFrame(src=file_name, width = 1200, height=700)\n" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }