{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "The data is from: \n", "\n", "Mahesh Joshi, Dipanjan Das, Kevin Gimpel, and Noah A. Smith. Movie Reviews and Revenues: An Experiment in Text Regression. In Proceedings of the North American Chapter of the Association for Computational Linguistics Human Language Technologies Conference, Los Angeles, CA, June 2010." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "import scattertext as st\n", "import re, io\n", "from pprint import pprint\n", "import pandas as pd\n", "import numpy as np\n", "import spacy.en\n", "import seaborn as sns\n", "from urllib.request import urlopen\n", "from IPython.display import IFrame\n", "from IPython.core.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "nlp = spacy.en.English()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None,\n", "                singleScoreMode=False, minimum_term_frequency=2, **explorer_kwargs):\n", "    \"\"\"Render a Scattertext explorer for `corpus`, save it as HTML and embed it.\n", "\n", "    `category` is plotted against `other_category`. Both names are lower-cased for the\n", "    axis labels and for the output file `<category>-<other_category><extra>.html`, which\n", "    is written relative to the current working directory. `df['metadata']` is passed as\n", "    the explorer's `metadata` argument. `category_col` is accepted so the signature\n", "    mirrors `draw_plot`, but it is not used here. Any additional keyword arguments\n", "    (for example `term_ranker`) are forwarded unchanged to\n", "    `st.produce_scattertext_explorer`.\n", "\n", "    Returns an `IFrame` pointing at the saved file.\n", "    \"\"\"\n", "    html = st.produce_scattertext_explorer(corpus,\n", "                                           category=category,\n", "                                           category_name=category.lower() +' Term',\n", "                                           not_category_name=other_category.lower() + ' Term',\n", "                                           pmi_filter_thresold=6,\n", "                                           minimum_term_frequency=minimum_term_frequency,\n", "                                           metadata=df['metadata'],\n", "                                           scores=scores,\n", "                                           width_in_pixels=1000,\n", "                                           singleScoreMode=singleScoreMode,\n", "                                           use_full_doc = False,\n", "                                           **explorer_kwargs)\n", "    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'\n", "    # Use a context manager so the handle is flushed and closed before the IFrame loads the file.\n", "    with open(file_name, 'wb') as html_file:\n", "        html_file.write(html.encode('utf-8'))\n", "    return IFrame(src=file_name, width = 1200, height=1000)\n", "\n", "def 
draw_plot(df, category, other_category, category_col, extra='', minimum_term_frequency=3, **explorer_kwargs):\n", "    \"\"\"Build a corpus from the rows of `df` in the two categories and plot it.\n", "\n", "    Keeps the rows whose `category_col` equals `category` or `other_category`, builds a\n", "    Scattertext corpus from their `text` column using the notebook-level `nlp` object,\n", "    and hands the result to `draw_corpus`. Additional keyword arguments (for example\n", "    `term_ranker`) are passed through to `draw_corpus`.\n", "\n", "    Returns the `IFrame` produced by `draw_corpus`.\n", "    \"\"\"\n", "    category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]\n", "    corpus = st.CorpusFromPandas(category_vs_other_df,\n", "                                 category_col = category_col,\n", "                                 text_col = 'text',\n", "                                 nlp = nlp).build()\n", "    return draw_corpus(category_vs_other_df, corpus, category, other_category, category_col,\n", "                       extra=extra, minimum_term_frequency=minimum_term_frequency, **explorer_kwargs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's read the data and only look at the training set, and see that PG-13 movies have a large variance in revenue" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.read_csv('movies_and_revenue.csv')\n", "df['revenue_level'] = df['revenue percentile'].apply(lambda x: ('High' if x >= 2./3 else ('Low' if x <= 1./3 else 'Mid')) + ' Revenue')\n", "df['metadata'] = df['name'] + ' Rated: ' + df['rating'] + ' Made: ' + df['revenue'].apply(lambda x:'${:,.0f}'.format(x))\n", "df = df[df['split'] == 'train']" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of review 4779\n", "Number of movies 1147\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA3cAAAFOCAYAAAAl2eRaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X90VPWd//HXTCYTIBMgQLTGDRHTUHpqi8Jardu0RyWK\nW39WkwYw8YjtsdU9nBbJyo/wI9vERIKui8iulQOFAKamgti02i0FbSsV2dBE4w/0y7r8PNUQAmQm\nMZNk7vcPzJhASIYfN/fOnefjrztzM5NX8Ep4zfvez3UZhmEIAAAAABDV3FYHAAAAAACcP8odAAAA\nADgA5Q4AAAAAHIByBwAAAAAOQLkDAAAAAAeg3AEAAACAA0Rluauvr1d+fn6/X1NeXq6cnBzl5eVp\n9+7dg5QMAAAAAKzhsTrA2Vq1apW2bNmixMTEM37NBx98oLq6OlVXV2vfvn362c9+pk2bNg1iSgAA\nAAAYXFE3uUtPT9czzzwTfrxnzx4VFBSooKBAs2bNkt/v18UXX6whQ4YoGAyqpaVFXq/XwsQAAAAA\nYL6oK3fZ2dmKi4sLP160aJEWL16sdevW6Tvf+Y6ee+45eTweuVwuTZ06VQ888IBmzpxpYWIAAAAA\nMF/UnZZ5qr1796q4uFiS1NnZqfT0dL300ktKSUnRmjVr5Pf7NW3aNE2cOFEXX3yxxWkBAAAAwBxR\nX+4uv/xyLV26VF/60pe0e/duHTlyRJ999pmGDRsmSRo6dKi8Xq/a2tosTgoAAAAA5jG93NXX12vZ\nsmWqrKzs9fzbb7+txx9/XJI0ZswYVVRUnNO1cYsXL1ZhYaG6urrkdrtVWlqqtLQ01dbWKi8vT4Zh\n6LbbbtNll112IX4cAAAAALAll2EYhllv3nNly6qqql777rzzTj399NNKS0vTr3/9a/3jP/4jBQwA\nAAAAzpGpC6qcurJlt48//lgjR47UmjVrlJ+fr+PHj1PsAAAAAOA8mFruTl3Zsltzc7Pq6uqUn5+v\nNWvWaMeOHdq5c6eZUQAAAADA0Sy5FcLIkSM1duxYjRs3Th6PR1lZWWpoaBjwdZ2dXYOQDgAAAACi\nz6CslnnqZX1paWlqbW3VgQMHwouf3HPPPQO+T3Nzq1kRAQAAAMD2UlKSzrhvUMqdy+WSJNXU1Kit\nrU05OTkqLS3V7NmzJUlXXXWVvvvd7w5GFAAAAABwJFNXy7zQGhtbrI4AAAAAAJbpb3JnyTV3AAAA\nAIALi3IHAAAAAA5AuQMAAAAAB6DcAQAAAIADUO4AAAAAwAEodwAAAADgAJQ7AAAAAHAAyh0AWyks\nnKXCwllWxwAAAIg6HqsDAEBPTU1HrI4AAAAQlZjcAbCNnhM7pncAAABnh3IHwDZ6Tu2Y4AEAAJwd\nyh0AAAAAOADlDgAAAAAcgHIHwDZSUy/tcxsAAAADcxmGYVgdIlKNjS1WRwBgspkzp0uSVq/eaHES\nAAAA+0lJSTrjPm6FAMBWmNgBAACcGyZ3AAAAABAl+pvccc0dAAAAADgA5Q4AAAAAHIByBwAAAAAO\nQLkDAAAAAAeg3AEAAACAA1DuAAAAAMABKHcAAAAA4ACUOwAAAABwAModAAAAADgA5Q4AAAAAHMD0\ncldfX6/8/Pwz7l+0aJGefPJJs2MAiBKlpYtVWrrY6hgAAABRx2Pmm69atUpbtmxRYmJin/urqqr0\n4Ycf6pvf/KaZMQBEkb17P7I6AgAAQFQydXKXnp6uZ555ps99f/vb3/TOO+8oLy/PzAgAokjPiR3T\nOwAAgLNjarnLzs5WXFzcac83NjZqxYoVWrRokQzDMDMCgCjSc2rHBA8AAODsmHpa5pm8+uqrOnbs\nmH70ox+psbFR7e3tuvzyy3XnnXf2+7rk5GHyeE4viwCcKSUlyeoIAAAAUWNQyt2p07n8/PzwIiub\nN2/Wxx9/PGCxk6Tm5lZT8gGwh4yMzPDELiMjU42NLRYnAgAAs
Jf+PvwelFshuFwuSVJNTY2qq6sH\n41sCiEILFhT3uQ0AAICBuYwouuiNT/EB53vwwfskSc8+u9biJAAAAPbT3+TOkmvuAOBMOjo6rI4A\nAAAQlQbltEwAiERRUWGf2wAAABgY5Q6AbRw+fKjPbQAAAAyMcgcAAAAADkC5A2AbqamX9rkNAACA\ngbFaJgBbmTlzuiRp9eqNFicBAACwH1bLBBA1mNgBAACcGyZ3AAAAABAl+pvccc0dAAAAADgA5Q4A\nAAAAHIByBwAAAAAOQLkDAAAAAAeg3AEAAACAA1DuANhKUVGhiooKrY4BAAAQdbjPHQBbOXz4kNUR\nAAAAohKTOwC20XNix/QOAADg7FDuANhGz6kdEzwAAICzQ7kDAAAAAAeg3AGwjdTUS/vcBgAAwMBc\nhmEYVoeIVGNji9URAJhs5szpkqTVqzdanAQAAMB+UlKSzriP1TIB2AoTOwAAgHPD5A4AAAAAokR/\nkzuuuQMAIEJFRYXcpgMAYFuclgkAQIS4RQcAwM6Y3AEAEIGeEzumdwAAO6LcAQAQgZ5TOyZ4AAA7\nMr3c1dfXKz8//7Tna2pqlJubq+nTp2vJkiVmxwAAAAAARzO13K1atUpFRUXq6Ojo9Xx7e7uWL1+u\n9evXa+PGjWppadH27dvNjAIAwHnpeZsObtkBALAjU8tdenq6nnnmmdOe93q9qqqqktfrlSR1dnYq\nISHBzCgAAJyXkpKKPrcBALALU1fLzM7O1qFDp1+X4HK5NGrUKElSZWWl2tradN1115kZBQCA88bE\nDgBgZ5bdCsEwDC1dulT79u3TihUrrIoBAEDEmNgBAOxsUMqdYRinPbdw4UINGTJEK1eujPh9kpOH\nyeOJu5DRAAAAAMARBqXcuVwuSSdXyGxra9PXvvY1bdq0SZMnT1Z+fr5cLpcKCgo0ZcqUft+nubl1\nMOICAAAAgC2lpCSdcZ/L6GusZlONjS1WRwAAAAAAy/RX7riJOQAAAAA4AOUOAAAAAByAcgcAAAAA\nDkC5AwAAAAAHoNwBAAAAgANQ7gAAAADAASh3AAAAAOAAlDsAAAAAcADKHQAAAAA4AOUOAAAAAByA\ncgcAAAAADkC5AwAAAAAHoNwBAAAAgANQ7gAAAADAASh3AAAAAOAAlDsAAAAAcADKHQAAAAA4AOUO\nAAAAAByAcgcAAAAADkC5AwAAAAAHoNwBAAAAgANQ7gAAAADAASh3AAAAAOAAlDsAtvLDH96rH/7w\nXqtjAAAARB2P1QEAoKdQKGR1BAAAgKjE5A6AbfSc2DG9AwAAODuUOwC20XNqxwQPAADg7Jhe7urr\n65Wfn3/a89u2bdM999yjvLw8VVdXmx0DAAAAABzN1GvuVq1apS1btigxMbHX852dnSovL9emTZuU\nkJCgadOm6cYbb9SoUaPMjAMAAAAAjmXq5C49PV3PPPPMac/v3btX6enp8vl8io+P1+TJk7Vr1y4z\nowAAAACAo5la7rKzsxUXF3fa836/X0lJSeHHiYmJamlpMTMKAAAAADiaJbdC8Pl88vv94ceBQEDD\nhw8f8HXJycPk8ZxeFgE4w9ixY7V///7wdkpK0gCvAAAAQLdBKXeGYfR6nJGRoX379unEiRMaMmSI\ndu3apQceeGDA92lubjUrIgAbWLKkXDNnTg9vNzYy0QcAAOipvw+/B6XcuVwuSVJNTY3a2tqUk5Oj\nefPmaebMmTIMQzk5ObrooosGIwoAm0tNvdTqCAAAAFHJZZw6VrMxPsUHAAAAEMv6m9xxE3MAAAAA\ncADKHQAAAAA4AOUOAAAAAByAcgcAAAAADkC5AwAAAAAHoNwBAAAAgANQ7gAAAADAASh3AAAAAOAA\nlDsAAAAAcADKHQAAAAA4AOUOAAAAAByAcgcAAAAADkC5AwAAAAAHoNwBAAAAgANEXO4OHjyo1157\nTV1dXTpw4ICZmQAAAAAAZ
ymicve73/1OP/nJT1RSUqJjx44pLy9PW7ZsMTsbAAAAACBCEZW75557\nTs8//7x8Pp9Gjx6tzZs36xe/+IXZ2QAAAAAAEYqo3Lndbvl8vvDjiy66SG43l+sBAAAAgF14Ivmi\nzMxMrV+/Xp2dnXr//fe1ceNGTZgwwexsAAAAAIAIuQzDMAb6otbWVv3nf/6nduzYoVAopGuvvVYP\nP/xwr2neYGhsbBnU7wdg8JWWLpYkLVhQbHESAAAA+0lJSTrjvojKnV1Q7gDnmzlzuiRp9eqNFicB\nAACwn/MudxMmTJDL5TrlTVP0pz/96fzTnQXKHeBspaWLtXfvR5KkjIxMpncAAACn6K/cRXTN3Qcf\nfBDe7ujo0NatW1VXV3f+yQCgh+5id+o2AAAABnbWS17Gx8frlltu0ZtvvmlGHgAAAADAOYhocvfS\nSy+Ftw3D0EcffaT4+HjTQgGITRkZmb1OywQAAEDkIrrmbt68eb0eJycna9q0aUpLSzMtWF+45g5w\nPhZUAQAAOLPzvuaurKzsgoUBgP4wsQMAADg3EU3u/vznP+upp57S8ePH1fPL//jHP/b7OsMwtGTJ\nEu3Zs0der1elpaW9pn0vv/yyfvnLXyouLk7f//73NW3atH7fj8kdAAAAgFh23pO7kpISzZ07V5mZ\nmafdEqE/W7duVTAYVFVVlerr61VWVqaVK1eG9y9dulSvvPKKhgwZou9973u69dZblZR05rAAAAAA\ngL5FVO6Sk5N1/fXXn/Wb19bWKisrS5I0ceJENTQ09No/YcIEHT9+PFwYz6Y4AgAAAAC+EFG5mzx5\nssrKypSVlaWEhITw81dffXW/r/P7/b0mcR6PR6FQSG73yTswZGZm6u6779awYcOUnZ0tn893Lj8D\nAAAAAMS8iMrd22+/LUl67733ws+5XC6tW7eu39f5fD4FAoHw457Fbs+ePXrttde0bds2DRs2THPm\nzNHvf/973XzzzWd8v+TkYfJ44iKJDAAAAAAxJaJyV1lZeU5vPmnSJG3fvl1Tp05VXV2dxo8fH96X\nlJSkoUOHyuv1yuVyadSoUTpx4kS/79fc3HpOOQBEjwcfvE+S9Oyzay1OAgAAYD/9LajijuQNDh06\npPvvv1833XSTGhsbVVBQoIMHDw74uuzsbHm9XuXl5am8vFzz5s1TTU2NqqurlZqaqtzcXE2fPl0z\nZsyQ3+/XXXfdFflPBcCROjo61NHRYXUMAACAqBPRrRAeeOAB3X///Vq2bJk2b96s6upqbdmyRRs2\nbBiMjGHcCgFwtgcfvC9c7OLj45neAQAAnOK8J3fNzc369re/LenktXa5ubny+/0XJh0AfK7nxI7p\nHQAAwNmJqNwNGTJEf//738O3Kvif//kfeb1eU4MBAAAAACIX0YIqc+fO1YMPPqj9+/frjjvu0PHj\nx/Uf//EfZmcDEGPi4+N7nZYJAACAyEV0zd3Ro0eVlJSk//u//1NXV5cuv/xySyZ3XHMHON/MmdMl\nSatXb7Q4CQAAgP2c9zV3d911l/7lX/5FH374ocaNG8cpmQAAAABgMxGVu+3bt+u+++7TG2+8oVtu\nuUVz587VX//6V7OzAYgxpaWL+9wGAADAwCI6LbOnnTt36vHHH9e+fftUW1trVq4+cVom4Gzdp2R2\n49RMAACA3vo7LTOiBVXeffdd1dTUaOvWrbrssst0//33Kzs7+4IFBAAAAACcn4jK3cKFC3XHHXfo\n+eef15gxY8zOhEFQVFQoSSopqbA4CQAAAIALIaJr7jZt2qQbb7xRDQ0N6urq0oEDB8zOBZMdPnxI\nhw8fsjoGAAAAgAskonL3u9/9Tj/5yU9UUlKiY8eOKS8vT1u2bDE7G0zSPbU7dRuwWmrqpX1uAwAA\nYGARlbvnnntOzz//vHw+n0aPHq3NmzfrF7/4hdnZYJKeEzumd7CTnqcJc8owAADA2Ymo3Ln
dbvl8\nvvDjiy66SG53RC8FAAAAAAyCiBpaZmam1q9fr87OTr3//vtauHChJkyYYHY2mIRT32BXhYWz+twG\n7KKoqJDT2QEAthVRuWttbdUnn3yihIQEzZ8/Xz6fT4sXc4PhaMWpb7CrpqYjfW4DdsFiVAAAO4vo\nVgiHDh3SY489pkceecTsPBgkTOwA4OycuhgVH44BAOwmonLndrt1ww03aNy4cUpISAg/v27dOtOC\nwVz8owR2NHr0mPDEbvRo7qkJe2ExKgCA3UVU7goLub4AgPkqKpZr5szp4W0AAABELqJy981vftPs\nHACg0tLFvbYXLCi2MA3QW2rqpeGJHae2AwDsiPsZALCNvXs/6nMbsIP29vY+twEAsAvKHQAAEWA1\nVwCA3VHuANhGRkZmn9sAAAAYGOUOgG30vMaO6+1gNz1XcGU1VwCAHVHuANhKRkYmUzvYUs8VXFnN\nFQBgRxGtlgkAg4WJHeyMiR0AwM5chmEYVoeIVGNji9URAAAAAMAyKSlJZ9zHaZkAAAAA4ACmnpZp\nGIaWLFmiPXv2yOv1qrS0VGlpaeH9b7/9th5//HFJ0pgxY1RRUSGv12tmJAAAAMcpKiqUJJWUVFic\nBICVTC13W7duVTAYVFVVlerr61VWVqaVK1eG9y9atEhPP/200tLS9Otf/1qHDx/WZZddZmYkAAAA\nxzl8+JDVEQDYgKmnZdbW1iorK0uSNHHiRDU0NIT3ffzxxxo5cqTWrFmj/Px8HT9+nGIHAABwlrqn\ndqduA4g9ppY7v9+vpKQvLvjzeDwKhUKSpObmZtXV1Sk/P19r1qzRjh07tHPnTjPjAAAAOE7PqR0T\nPCC2mXpaps/nUyAQCD8OhUJyu0/2yZEjR2rs2LEaN26cJCkrK0sNDQ265pprzvh+ycnD5PHEmRkZ\nAIAzKikpkSQVFRVZnAQ4s/5W0gPgbKaWu0mTJmn79u2aOnWq6urqNH78+PC+tLQ0tba26sCBA0pL\nS1Ntba3uueeeft+vubnVzLgAAPSr+wwTbs0DO0lNvTQ8sUtNvZTjE3C4/j7AMfU+dz1Xy5SksrIy\nvfvuu2pra1NOTo527typZcuWSZKuuuoqzZ8/v9/34y8rAIBVli9fprq63ZKkK6+cpFmz5licCPjC\nzJnTJUmrV2+0OAkAs/VX7kyd3LlcLhUXF/d6rvs0TEm65pprVF1dbWYEAFGmsHCWJKmiYrnFSYDe\nuovdqduAHaSmXmp1BAA2YGq5A4Cz1dR0xOoIABB1uL8dAMnk1TIB4Gx0T+1O3QbsID4+vs9tAADs\ngnIHwDZ6Tu2Y4MFuOjo6+twGAMAuKHcAAAAA4ACUOwC2MXr0mD63ATvg+AQA2J2pt0K40LgVAuB8\nLOcNO+P4BABYzbJbIQDA2WIiAjvj+AQA2BmTOwAAAACIEv1N7rjmDgAAAAAcgHIHAAAAAA5AuQMA\nAAAAB6DcAQAAAIADsFomgLAXXtigXbt2WpohEAhIkhITEy3NIUlXX32NcnNnWB0DAAAgIkzuANhK\nMNiuYLDd6hgAAABRh1shALCVwsJZkqSKiuUWJ4Gd2GGqLNlnssxUGQBiF7dCAADgAmCyDLsqLJwV\n/nAMQOzimjsAgO3l5s6wxaSKyTLsqqnpiNURANgAkzsAAIAo1nNix/QOiG2UOwAAgCjWc2rHBA+I\nbZQ7AAAAAHAAyh0AAEAUGz16TJ/bAGIP5S5GsaoWAADO0HOBHxb7AWIbq2XGKM7JBwDAOZjYAZCY\n3MUkVtUCAMBZKiqWM7UDQLmLRayqBQAAADgP5Q4AAAAAHIByF4NYVQsAAABwHlPLnWEYWrx4sfLy\n8lRQUKADBw70+XWLFi3Sk08+aWYU9MCqWgAAOEtRUaGKigqtjgHAYqaWu61btyoYDKqqqkqPPPKI\nysrKTvuaqqoqffjhh2bGQB9Gjx7D1A4AAIc4fPiQDh8
+ZHUMABYztdzV1tYqKytLkjRx4kQ1NDT0\n2v+3v/1N77zzjvLy8syMgT6wqhYAAM7Qc2LH9A6IbaaWO7/fr6SkpPBjj8ejUCgkSWpsbNSKFSu0\naNEiGYZhZgwAAADH6jmxY3oHxDZTb2Lu8/kUCATCj0OhkNzuk33y1Vdf1bFjx/SjH/1IjY2Nam9v\n1+WXX64777zzjO+XnDxMHk+cmZEBWCwu7uTfESkpSQN8JTD4OD4RDTg+gdhlarmbNGmStm/frqlT\np6qurk7jx48P78vPz1d+fr4kafPmzfr444/7LXaS1NzcamZcADbQ1dU93W+xOAlwOo5P2FFq6qXh\niV1q6qUcn4DD9fcBjqmnZWZnZ8vr9SovL0/l5eWaN2+eampqVF1dbea3RQRKSxertHSx1TEAAMB5\nKimp6HMbQOwxdXLncrlUXFzc67lx48ad9nV33XWXmTHQh717P7I6AgAAuEBSUy+1OgIAG+Am5jGo\n58SO6R0AANGvpKSCqR0Ayl0s6jm1Y4IHAAAAOAPlDgAAAAAcgHIXgzIyMvvcBgAAABC9KHcxaMGC\n4j63AQAAAEQvyl0M+ulPf9znNgAAAIDoRbmLQSdOnOhzGwAAAED0otwBAAAAgANQ7gAAAADAASh3\nAAAAAOAAlDsAAAAAcADKHQAAAAA4AOUOAAAAAByAcgcAAAAADkC5AwAAAAAHoNwBAAAAgANQ7gAA\nAADAASh3AAAAAOAAlDsAAAAAcADKHQAAAAA4AOUOAAAAABzAY3UAAACAaPbCCxu0a9dOSzMEAgFJ\nUmJioqU5JOnqq69Rbu4Mq2MAMYnJXQxyuVx9bgMAgOgUDLYrGGy3OgYAizG5i0Hx8fEKBoPhbQAA\ncO5yc2dYPqkqLJwlSaqoWG5pDgDWYnIXg7qL3anbAAAAAKIX5Q4AAAAAHIByBwAAAAAOYOo1d4Zh\naMmSJdqzZ4+8Xq9KS0uVlpYW3l9TU6N169bJ4/Fo/PjxWrJkiZlxANt67LElam4+anUMW+j+c+i+\nfiTWJSeP0vz5S6yOAQAAooCp5W7r1q0KBoOqqqpSfX29ysrKtHLlSklSe3u7li9frpqaGnm9Xj3y\nyCPavn27rr/+ejMjAbbU3HxUTUePyD2UNY5CbkOS1Nx2zOIk1gu1dVodQRIfPvTEhw+98eEDANiL\nqf+SrK2tVVZWliRp4sSJamhoCO/zer2qqqqS1+uVJHV2diohIcHMOICtuYd6lDx1rNUxYCPNr+63\nOoKkk4XmaNMR+dycyR8XCkmSgpRd+T//swAA2Iep5c7v9yspKemLb+bxKBQKye12y+VyadSoUZKk\nyspKtbW16brrrjMzDgDgHPncbt07YpTVMWAj649TcAHAbkwtdz6fT4FAIPy4u9h1MwxDS5cu1b59\n+7RixYoB3y85eZg8njhTssaylJSkgb8IpoqLYyKCvsXFuS3/f5TjE2di9fH5r//6r2pqarLs+9tJ\n9ynDc+f+1OIk9jB69GgtXbrU6hjAoDO13E2aNEnbt2/X1KlTVVdXp/Hjx/fav3DhQg0ZMiR8Hd5A\nmptbzYgZ8xobW6yOEPO6uji9CX3r6gpZ/v8oxyfOxOrj89NPG9XU1KSE+GGWZbALl05++H28OTDA\nVzpfe0er5ccmYKb+PlQztdxlZ2frjTfeUF5eniSprKxMNTU1amtr09e+9jVt2rRJkydPVn5+vlwu\nlwoKCjRlyhQzIwEAAAdJiB+mSV+92+oYsJHd779odQTAMqaWO5fLpeLi4l7PjRs3Lrz93nvvmfnt\nAQAAACBmcCEFAAAAADgA5Q4AAAAAHIA7Jg+yF17YoF27dlqawe12K/T5/YncbrelN+O9+uprlJs7\nw7LvDwAAADgFk7sYlJw8qs9tAAAAANGLyd0gy82dYYtJ1Q9/eK8kqaJiucVJAAAAAFwIlLsYxcQO\nAAAAcBZOywQAAAA
AB6DcAQAAAIADUO4AAAAAwAEodwAAAADgACyoAgAAolIgEFB7x2fa/f6LVkeB\njbR3tMoVCFkdA7AEkzsAAAAAcAAmd4ANBAIBhdo71fzqfqujwEZCbZ0KhAJWxzg5HQmFtP74Uauj\nwEb8oZASAtYen4mJiTK63Jr01bstzQF72f3+i0pMHGp1DMASTO4AAAAAwAGY3AE2kJiYqKC7Q8lT\nx1odBTbS/Op+JQ5NtDqGEhMTFR9s170jRlkdBTay/vhReROtPz4BAF9gcgcAAAAADkC5AwAAAAAH\noNwBAAAAgAPE1DV3jz22RM3NrPYmKfznUFg4y+Ik9pCcPErz5y+xOgYAAABwzmKq3DU3H1VTU5Nc\n8SyPa3w+tD16otXiJNYzOtqsjgAAOEftHa3cxFxSZ1dQkuSJ81qcxHrtHa3yiX/rITbFVLmTJFf8\nUPm+fLvVMWAj/v/3stURAADnIDmZFVy7NTef/KDSN5xS49NQjg3ErJgrdwAAwBk4nf4L3ZdZVFQs\ntzgJACuxoAoAAAAAOADlDgAAAAAcgHIHAAAAAA5AuQMAAAAAB2BBFQDAgPyhkNYf5z6hn4VCkqQh\nbj4b9YdCYj1CALAXU8udYRhasmSJ9uzZI6/Xq9LSUqWlpYX3b9u2TStXrpTH49Hdd9+tnJwcM+MA\nAM4BS4p/IdB8suB6+TPRKHFsAIDdmFrutm7dqmAwqKqqKtXX16usrEwrV66UJHV2dqq8vFybNm1S\nQkKCpk2bphtvvFGjRvGLAgDshOXmv8By8wAAOzO13NXW1iorK0uSNHHiRDU0NIT37d27V+np6fL5\nfJKkyZMna9euXbr55ptNyxMIBGR0fMZNq9GL0dGmQMCwOoZCbZ1qfnW/1TEsFwp2SZLc3jiLk1gv\n1NYpcT9iAAAQIVPLnd/vV1JS0hffzONRKBSS2+0+bV9iYqJaWlrMjPM5Q0ZH2yB8nzN/f5zKZfH3\nt/6/iV1ObQoEAgoG260NEfr8v8fn1zZZyetNUGJionUBhtrn2LDaCy9s0K5dO62OoebPT8vsnuBZ\n5eqrr1Fu7gxLM+ALdjg+7XJsShyf3V54YYP++79fsTRDyAa/S+3GbfE10zfddIup/3+YWu58Pp8C\ngUD4cXex697n9/vD+wKBgIYPH97v+6WkJPW7fyDV1S+c1+sBs/z7vz9hdQTA1h5++MeSfmx1DKBP\nHJ/oy8MP//jzYwMYPKZW10mTJun111+XJNXV1Wn8+PHhfRkZGdq3b59OnDihYDCoXbt26corrzQz\nDgAAAADUhdR7AAAILUlEQVQ4lsswDNPOSeu5WqYklZWV6d1331VbW5tycnL02muvacWKFTIMQ/fc\nc4+mTZtmVhQAAAAAcDRTyx0AAAAAYHBwF1YAAAAAcADKHQAAAAA4AOUOAAAAABzA1FshwH7eeust\n/fSnP9WXv/xlSSfvRTh27FgtW7ZMHg+HAy68U4+5YDCo2267TTNmzNBvf/tbbdy4US6XS263WxMm\nTFBhYaHi4+P7fK9f/vKXOnr0qGbPni1J+v3vf6/nnntObrdbt956qwoKCgbt54KzXcjjFjDbgQMH\nVFFRoU8//VQJCQkaOnSo5syZEz5+gQvprbfe0kMPPaTf/va3uvjiiyVJTzzxhDIyMnTDDTfo8ccf\n1/79+9XZ2anU1FQVFxfL5/P1+V49f68fOXJEP/vZz+RyuWQYhj744APNmTNHP/jBDwbzx4t6/Gs+\nBn3rW9/SE098cV+1Rx55RNu2bdNNN91kYSo4Wc9jLhgMaurUqRoxYoRefPFFPfvss+G/9MvLy/XS\nSy8pJyen1+vb29u1YMECvfPOO7r55pslnbxv5pNPPqlNmzZp6NCh+ud//mfdfvvtGjly5OD+cHCs\n8z1ugcHw2Wef6aGHHlJpaam+8Y1vSJLeeecd/du//ZvWrVtncTo4ldfr1bx587R69
erwc4ZhaPbs\n2crLy9OUKVMknSxvixcv7vXvTqnv3+tjxoxRZWWlpJO3UHvqqaeUm5s7SD+Rc1DuYlDPBVKDwaAa\nGxsHvIE8cD56HnN+v19xcXGqrq7W3Llze32aN3fu3D5f397eru9///v69re/rf/93/+VJLndbr3y\nyityu91qamqSYRhMTnBBne9xCwyGbdu26dprrw0XO0n6+te/TrGDqa699loZhqENGzZoxowZkqRD\nhw6pqakpXOwk6b777lMgEDjt9X39Xu/p5z//uZ588km5XC7zfgiHotzFoDfffFMFBQVqamqS2+3W\nD37wA1177bVWx4KDdR9zLpdL8fHxWrhwocrLyzV27FhJJz+he+KJJ8KncJz6Cd/w4cN13XXXafPm\nzb2ed7vd+sMf/qDi4mJdf/31GjZs2KD9THC+8z1ugcFw8OBBpaenhx8/9NBDamlpUWNjo9auXRs+\nbQ64kFwulxYvXqycnBxlZWVJOjnN+4d/+IfTvq6vUzLP9HtdOvmBxfjx43sd14gcC6rEoG9961ta\nt26dNmzYoPj4+NP+RwQutO5jbu3atVq1apW+853v6JJLLtHBgwclSVdeeaUqKytVWlqqxsZG7d69\nW/n5+SooKNDrr7/e73tnZ2frL3/5i4LBoF566aXB+HEQI872uAWscMkll+jAgQPhxytXrlRlZaVG\njBihrq4uC5PB6UaMGKF58+bp0UcflWEYCoVC+vvf/97razo7O/Wb3/xGtbW1Ef9ef/nllzkd8zxQ\n7mLYyJEjVVFRoQULFujIkSNWx0GMmTFjhpYuXSq/3x9+bufOnXK5XJo0aZIqKyu1bt06ffe73+3z\n9X6/X/n5+QoGg5KkoUOHcvoGTNffcQtY4cYbb9Rf//pXvf322+Hn9u3bp08++YTjEqa7/vrrNW7c\nOG3atElf+tKXlJycrD/+8Y/h/WvXrtW2bds0efLkAX+vd2toaNBVV11ldnTH4rTMGJeRkaGCggKV\nlJToqaeesjoOYsgNN9ygrq4uPfTQQ3K5XPL7/crMzNTPf/7ziF7v8/l0++23695771V8fLy+8pWv\n6I477jA5NWLd+R63wIU2bNgw/dd//ZeWLVumxsZGdXZ2yuPxaP78+brkkkusjocYMH/+fL355puS\npKVLl6q4uFhr1qxRR0eH0tLSzurvx6NHjyopKcmsqDHBZfS8YhwAAAAAEJU4LRMAAAAAHIByBwAA\nAAAOQLkDAAAAAAeg3AEAAACAA1DuAAAAAMABKHcAAAAA4ACUOwAA+nHw4EEtWLBA0smb6y5cuNDi\nRAAA9I2bmAMA0I9Dhw7pwIEDkqQrrrhCV1xxhcWJAADoGzcxBwDErLfeeksVFRUKhUIaMWKE3G63\nWlpa1NjYqFtvvVWzZ8/W7bffroMHD+quu+7SzTffrKefflqVlZXKz8/XN77xDdXW1qq5uVlFRUXK\nysrSJ598ojlz5ujEiRPKzMzUrl279Prrr1v9owIAYgCnZQIAYtq+ffu0du1aZWVl6dZbb9WvfvUr\nvfzyy9qwYYOOHTumoqIiXXHFFeHTMV0uV/i1nZ2dqqqq0ty5c/XUU09JkkpLS/W9731PW7Zs0dSp\nU/Xpp59a8nMBAGIPp2UCAGLauHHj5PP5dP/992vnzp1avXq1PvroI3V2dqqtra3f12ZlZUmSMjMz\ndfz4cUnSG2+8ofLycknSlClTNHz4cHN/AAAAPke5AwDEtISEBElSeXm5Dh06pNtuu01TpkzRjh07\nNNCVC92vdblc4a+Ni4tTKBQKfw1XPwAABgunZQIAIGnHjh164IEHdNNNN+nw4cP69NNP1dXVpbi4\nOHV1dUX8Pv/0T/+k3/zmN5Kk119/XS0tLWZFBgCgFyZ3AABIevDBB1VYWKjhw4drzJgxuuKKK3Tw\n4EF99atf1YkTJ/Too4/q7rvvDn99z2vvepo3b
54effRRVVdX6ytf+QqnZQIABg2rZQIAcAFVVlbq\nuuuuU0ZGht577z0tXLhQL774otWxAAAxgMkdAAAXUHp6umbPni23262EhASVlJRYHQkAECOY3AEA\nAACAA7CgCgAAAAA4AOUOAAAAAByAcgcAAAAADkC5AwAAAAAHoNwBAAAAgANQ7gAAAADAAf4/4dqq\nenVheN4AAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "print('Number of review', len(df))\n", "print('Number of movies', len(df['id'].unique()))\n", "import matplotlib.pyplot as plt\n", "plt.figure(figsize=(15, 5))\n", "sns.boxplot(x=\"rating\", y=\"revenue\", data=df[df['rating'].isin({'R', 'G', 'PG', 'PG-13', 'NC-17'})])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's limit to pg-13, re-percentile revenue, and take a peek at the review structure\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1383\n" ] }, { "data": { "text/plain": [ "id 1408\n", "revenue 2.06177e+07\n", "split train\n", "text Based on a Stephen King short story that bears...\n", "revenue percentile 0.854363\n", "label True\n", "name 1408\n", "number_of_screens 2678\n", "origin USA\n", "rating PG-13\n", "metadata 1408 Rated: PG-13 Made: $20,617,667\n", "revenue_level High Revenue\n", "Name: 7, dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_pg13 = df[df['rating'] == 'PG-13']\n", "print(len(df_pg13))\n", "# Replace revenue-level through a wonky one-liner\n", "df_pg13 = df_pg13[[c for c in df_pg13.columns if c != 'revenue_level']].join(pd.DataFrame(\n", " {'revenue_level': df.ix[df_pg13['id'].drop_duplicates().index].set_index('id')['revenue']\n", " .rank(pct=True)\n", " .apply(lambda x: 'High Revenue' if x > 2./3 else ('Low Revenue' if x < 1./3 else 'Mid Revenue'))}), on='id')\n", "df_pg13.iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "# Let's plot it... something's wrong. The names of high and low revenue movies, directors, actors, etc. are showing up." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "draw_plot(df_pg13, 'High Revenue', 'Low Revenue', 'revenue_level', extra='naive', minimum_term_frequency=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Let's limit our term frequency count to once per document, and let's consider all reviews for a single movie as one big document" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "id 1408\n", "revenue 2.06177e+07\n", "revenue_level High Revenue\n", "rating PG-13\n", "name 1408\n", "metadata 1408 Rated: PG-13 Made: $20,617,667\n", "text Based on a Stephen King short story that bears...\n", "Name: 0, dtype: object" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "one_movie_per_doc_df = (df_pg13.groupby(['id', 'revenue', 'revenue_level', 'rating', 'name', 'metadata'])\n", " .apply(lambda x: pd.Series({'text':'\\n\\n\\n'.join(x['text'])}))\n", " .reset_index())\n", "one_movie_per_doc_df.iloc[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# A PG-13 movie will probably make a lot of money if:\n", "* Sequel or part of a franchise\n", "* Superhero should be manly, and should kick nasty, evil bad guys\n", "* Lots of suspenseful B-movie action sequences\n", "* Someone should wear a memorable t-shirt\n", "* Someone involved should be named Jerry\n", "* Avoid strong language, but have some slapstick comedy\n", "* Loud and dumb are probably okay, obvious may not be" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": 
false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "draw_plot(one_movie_per_doc_df, 'High Revenue', 'Low Revenue', 'revenue_level', \n", " extra='onemoveperdoc', \n", " term_ranker = st.termranking.OncePerDocFrequencyRanker, # only count one mention per document in frequency ranking\n", " minimum_term_frequency=8)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Compare only the high- and low-revenue reviews. The filtered frame is kept under its own\n", "# name so the metadata handed to the explorer in the next cell comes from exactly the rows\n", "# the corpus was built from (the same pairing draw_plot/draw_corpus use).\n", "high_low_df = df[df['revenue_level'].isin(('High Revenue', 'Low Revenue'))]\n", "corpus = st.CorpusFromPandas(high_low_df,\n", "                             category_col = 'revenue_level',\n", "                             text_col = 'text',\n", "                             nlp = nlp).build()" ] }, 
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Explore which terms are used similarly to `target_term` in high- vs. low-revenue reviews.\n", "category, other_category = 'High Revenue', 'Low Revenue'\n", "target_term = 'war'\n", "html = st.word_similarity_explorer(corpus,\n", "                                   category=category,\n", "                                   category_name=category.lower() +' Term',\n", "                                   not_category_name=other_category.lower() + ' Term',\n", "                                   pmi_filter_thresold=6,\n", "                                   minimum_term_frequency=10,\n", "                                   metadata=high_low_df['metadata'],\n", "                                   target_term=target_term,\n", "                                   width_in_pixels=1200,\n", "                                   singleScoreMode=True,\n", "                                   use_full_doc = False,\n", "                                   alpha=0.01,\n", "                                   max_p_val=0.05,\n", "                                   term_ranker = st.termranking.OncePerDocFrequencyRanker)\n", "file_name = category.lower() + '-' + other_category.lower() + '-similar-to-' + target_term + '.html'\n", "# Close the file before the IFrame tries to load it.\n", "with open(file_name, 'wb') as html_file:\n", "    html_file.write(html.encode('utf-8'))\n", "# A bare last expression (not `return`, which is illegal outside a function) renders the frame.\n", "IFrame(src=file_name, width = 1400, height=1000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [Root]", "language": "python", "name": "Python [Root]" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }