{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2024-06-13T14:05:55.052373800Z", "start_time": "2024-06-13T14:05:53.599538500Z" } }, "outputs": [], "source": [ "import psycopg2\n", "import pandas as pd\n", "import numpy as np\n", "import warnings\n", "import google.generativeai as genai\n", "from sklearn.base import BaseEstimator\n", "from sklearn.model_selection import train_test_split\n", "warnings.filterwarnings(\"ignore\")\n" ] }, { "cell_type": "code", "execution_count": 3, "outputs": [], "source": [ "conn = psycopg2.connect(dbname='bechdel_test', user='postgres', password='guest')\n", "cur = conn.cursor()\n", "\n", "cur.execute('SELECT * FROM imsdb_scripts JOIN bechdel_ratings ON imsdb_scripts.imdb_id = bechdel_ratings.imdb_id JOIN tmdb_data ON tmdb_data.imdb_id = imsdb_scripts.imdb_id;')\n", "data = pd.DataFrame(cur.fetchall())\n", "df = data.copy()\n", "df.set_index(0, inplace=True)\n", "\n", "cur.execute('SELECT genre.imdb_id, genre FROM genre JOIN imsdb_scripts ON imsdb_scripts.imdb_id = genre.imdb_id;')\n", "genre = pd.DataFrame(cur.fetchall())\n", "cur.close()\n", "conn.close()" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:55.878641100Z", "start_time": "2024-06-13T14:05:55.057773700Z" } }, "id": "14e136167bccac0" }, { "cell_type": "code", "execution_count": 4, "outputs": [], "source": [ "for genre_ in genre[1].unique():\n", " df[genre_] = pd.Series()" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:55.903387700Z", "start_time": "2024-06-13T14:05:55.880911400Z" } }, "id": "8e360700252225e0" }, { "cell_type": "code", "execution_count": 5, "outputs": [], "source": [ "for row in genre.iterrows():\n", " df[row[1][1]][row[1][0]] = 1" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:56.237431200Z", "start_time": "2024-06-13T14:05:55.898809600Z" } }, "id": 
"fb80dad8b86a94a6" }, { "cell_type": "code", "execution_count": 6, "outputs": [], "source": [ "df.rename(columns={0:'imdb_id',\n", " 1:'script_date',\n", " 2:'script',\n", " 3:'bechdel_id',\n", " 5:'title',\n", " 6:'release_year',\n", " 7:'bechdel_rating',\n", " 11:'language',\n", " 13:'popularity',\n", " 14:'vote_average',\n", " 15:'vote_count',\n", " 16:'overview'\n", " }, \n", " inplace=True)\n", "df.drop(columns=[4, 8, 9, 10, 12], inplace=True)\n", "df.fillna(0, inplace=True)\n", "df.replace('none', np.nan, inplace=True)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:56.258892400Z", "start_time": "2024-06-13T14:05:56.239236400Z" } }, "id": "c6652299813ffa8c" }, { "cell_type": "code", "execution_count": 7, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['bechdel_rating']), df['bechdel_rating'], test_size=0.234, random_state=42)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:56.287930500Z", "start_time": "2024-06-13T14:05:56.255756800Z" } }, "id": "1e3c4066671fbf5d" }, { "cell_type": "code", "execution_count": 8, "outputs": [], "source": [ "def y_transform(y):\n", " y = pd.DataFrame(y)\n", " y['pass_fail'] = y['bechdel_rating'].map({0:0, 1:0, 2:0, 3:1})\n", " return y\n", "y_train, y_test = y_transform(y_train), y_transform(y_test)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:56.290267600Z", "start_time": "2024-06-13T14:05:56.273297300Z" } }, "id": "1ede4a695018215c" }, { "cell_type": "code", "execution_count": 9, "outputs": [ { "data": { "text/plain": " script_date script \\\n0 \n472033 NaN \\n\\n ... \n120780 March 1998 \"Out of Sight\"\\r\\n\\r\\n\\r\\n ... \n1706593 NaN CHRONICLE\\r\\n\\r\\n\\r\\... \n2911666 NaN JOHN WICK\\r\\n\\r\\n\\... \n61722 March 1967 \\t\\t\\t\\t\"THE GRADUATE\"\\r\\n\\r\\n\\r\\n\\t\\t\\t\\tScre... \n... ... ... \n100814 June 1988 \\n\\nS. S. 
Wilson & Brent Maddock's \"Tremors\"\\n... \n109506 September 1992 The CROW\\r\\n\\r\\n\\tby\\r\\n\\r\\n\\tDavis Schow\\r\\n\\... \n765443 NaN EASTERN PROMISES\\r\\... \n816462 October 2009 NaN \n110148 NaN Interview with the Vampire\\r\\n\\r\\n\\tScreenplay... \n\n bechdel_id title release_year language \\\n0 \n472033 494 9 2009 en \n120780 2247 Out of Sight 1998 en \n1706593 3037 Chronicle 2012 en \n2911666 5897 John Wick 2014 en \n61722 616 Graduate, The 1967 en \n... ... ... ... ... \n100814 1663 Tremors 1990 en \n109506 3820 Crow, The 1994 en \n765443 3069 Eastern Promises 2007 en \n816462 2636 Conan the Barbarian 2011 en \n110148 120 Interview with the Vampire 1994 en \n\n popularity vote_average vote_count \\\n0 \n472033 71.590 6.921 3407 \n120780 24.781 6.682 1203 \n1706593 40.036 6.816 5119 \n2911666 105.961 7.430 18679 \n61722 30.980 7.700 3206 \n... ... ... ... \n100814 77.463 6.896 3105 \n109506 54.672 7.527 3786 \n765443 35.119 7.362 3194 \n816462 35.716 5.299 1792 \n110148 83.427 7.387 5627 \n\n overview ... Thriller \\\n0 ... \n472033 When 9 first comes to life, he finds himself i... ... 1 \n120780 Meet Jack Foley, a smooth criminal who bends t... ... 0 \n1706593 Three high school students make an incredible ... ... 1 \n2911666 Ex-hitman John Wick comes out of retirement to... ... 1 \n61722 Benjamin, a recent college graduate very worri... ... 0 \n... ... ... ... \n100814 Val McKee and Earl Bassett are in a fight for ... ... 0 \n109506 Exactly one year after young rock guitarist Er... ... 1 \n765443 A Russian teenager living in London dies durin... ... 1 \n816462 A quest that begins as a personal vendetta for... ... 0 \n110148 A vampire relates his epic life story of love,... ... 0 \n\n War Comedy Music Western Horror Science Fiction Action \\\n0 \n472033 0 0 0 0 0 1 1 \n120780 0 1 0 0 0 0 0 \n1706593 0 0 0 0 0 1 0 \n2911666 0 0 0 0 0 0 1 \n61722 0 1 0 0 0 0 0 \n... ... ... ... ... ... ... ... 
\n100814 0 1 0 0 1 1 1 \n109506 0 0 0 0 0 0 1 \n765443 0 0 0 0 0 0 0 \n816462 0 0 0 0 0 0 1 \n110148 0 0 0 0 1 0 0 \n\n Animation History \n0 \n472033 1 0 \n120780 0 0 \n1706593 0 0 \n2911666 0 0 \n61722 0 0 \n... ... ... \n100814 0 0 \n109506 0 0 \n765443 0 0 \n816462 0 0 \n110148 0 0 \n\n[326 rows x 27 columns]", "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>script_date</th>\n <th>script</th>\n <th>bechdel_id</th>\n <th>title</th>\n <th>release_year</th>\n <th>language</th>\n <th>popularity</th>\n <th>vote_average</th>\n <th>vote_count</th>\n <th>overview</th>\n <th>...</th>\n <th>Thriller</th>\n <th>War</th>\n <th>Comedy</th>\n <th>Music</th>\n <th>Western</th>\n <th>Horror</th>\n <th>Science Fiction</th>\n <th>Action</th>\n <th>Animation</th>\n <th>History</th>\n </tr>\n <tr>\n <th>0</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>472033</th>\n <td>NaN</td>\n <td>\\n\\n ...</td>\n <td>494</td>\n <td>9</td>\n <td>2009</td>\n <td>en</td>\n <td>71.590</td>\n <td>6.921</td>\n <td>3407</td>\n <td>When 9 first comes to life, he finds himself i...</td>\n <td>...</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>120780</th>\n <td>March 1998</td>\n <td>\"Out of Sight\"\\r\\n\\r\\n\\r\\n ...</td>\n <td>2247</td>\n <td>Out of Sight</td>\n <td>1998</td>\n <td>en</td>\n <td>24.781</td>\n <td>6.682</td>\n <td>1203</td>\n <td>Meet Jack Foley, a 
smooth criminal who bends t...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1706593</th>\n <td>NaN</td>\n <td>CHRONICLE\\r\\n\\r\\n\\r\\...</td>\n <td>3037</td>\n <td>Chronicle</td>\n <td>2012</td>\n <td>en</td>\n <td>40.036</td>\n <td>6.816</td>\n <td>5119</td>\n <td>Three high school students make an incredible ...</td>\n <td>...</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>2911666</th>\n <td>NaN</td>\n <td>JOHN WICK\\r\\n\\r\\n\\...</td>\n <td>5897</td>\n <td>John Wick</td>\n <td>2014</td>\n <td>en</td>\n <td>105.961</td>\n <td>7.430</td>\n <td>18679</td>\n <td>Ex-hitman John Wick comes out of retirement to...</td>\n <td>...</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>61722</th>\n <td>March 1967</td>\n <td>\\t\\t\\t\\t\"THE GRADUATE\"\\r\\n\\r\\n\\r\\n\\t\\t\\t\\tScre...</td>\n <td>616</td>\n <td>Graduate, The</td>\n <td>1967</td>\n <td>en</td>\n <td>30.980</td>\n <td>7.700</td>\n <td>3206</td>\n <td>Benjamin, a recent college graduate very worri...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>100814</th>\n <td>June 1988</td>\n <td>\\n\\nS. S. 
Wilson & Brent Maddock's \"Tremors\"\\n...</td>\n <td>1663</td>\n <td>Tremors</td>\n <td>1990</td>\n <td>en</td>\n <td>77.463</td>\n <td>6.896</td>\n <td>3105</td>\n <td>Val McKee and Earl Bassett are in a fight for ...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>109506</th>\n <td>September 1992</td>\n <td>The CROW\\r\\n\\r\\n\\tby\\r\\n\\r\\n\\tDavis Schow\\r\\n\\...</td>\n <td>3820</td>\n <td>Crow, The</td>\n <td>1994</td>\n <td>en</td>\n <td>54.672</td>\n <td>7.527</td>\n <td>3786</td>\n <td>Exactly one year after young rock guitarist Er...</td>\n <td>...</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>765443</th>\n <td>NaN</td>\n <td>EASTERN PROMISES\\r\\...</td>\n <td>3069</td>\n <td>Eastern Promises</td>\n <td>2007</td>\n <td>en</td>\n <td>35.119</td>\n <td>7.362</td>\n <td>3194</td>\n <td>A Russian teenager living in London dies durin...</td>\n <td>...</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>816462</th>\n <td>October 2009</td>\n <td>NaN</td>\n <td>2636</td>\n <td>Conan the Barbarian</td>\n <td>2011</td>\n <td>en</td>\n <td>35.716</td>\n <td>5.299</td>\n <td>1792</td>\n <td>A quest that begins as a personal vendetta for...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n </tr>\n <tr>\n <th>110148</th>\n <td>NaN</td>\n <td>Interview with the Vampire\\r\\n\\r\\n\\tScreenplay...</td>\n <td>120</td>\n <td>Interview with the Vampire</td>\n <td>1994</td>\n <td>en</td>\n <td>83.427</td>\n <td>7.387</td>\n <td>5627</td>\n <td>A vampire relates his epic life story of love,...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n 
<td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>326 rows × 27 columns</p>\n</div>" }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:56.359117Z", "start_time": "2024-06-13T14:05:56.285786400Z" } }, "id": "22f7b01462f775b3" }, { "cell_type": "markdown", "source": [ "## Modeling without LLMs" ], "metadata": { "collapsed": false }, "id": "d5171d3049101429" }, { "cell_type": "code", "execution_count": 10, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import GridSearchCV" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:05:56.594270800Z", "start_time": "2024-06-13T14:05:56.339615400Z" } }, "id": "580e8a5c27fe9e5d" }, { "cell_type": "code", "execution_count": 11, "outputs": [ { "data": { "text/plain": "{'max_depth': 7,\n 'min_samples_leaf': 4,\n 'min_samples_split': 15,\n 'n_estimators': 1000}" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "param_grid = {'n_estimators': [1000],\n", " 'max_depth': [7],\n", " 'min_samples_split': [15],\n", " 'min_samples_leaf': [4]}\n", "grid = GridSearchCV(RandomForestClassifier(random_state=42),\n", " param_grid=param_grid,\n", " cv=3)\n", "grid.fit(X_train[['release_year', 'popularity', 'vote_average', 'vote_count', 'Drama', 'Romance', 'Adventure', 'Animation', 'Fantasy', 'Science Fiction', 'Family', 'Mystery', 'Crime', 'Thriller', 'War', 'Western', 'Comedy', 'Music', 'Horror', 'Action', 'History']], y_train['pass_fail'])\n", "grid.best_params_" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:06:02.681325900Z", "start_time": "2024-06-13T14:05:56.541616Z" } }, "id": "f29f1842c880fedf" }, { "cell_type": "code", "execution_count": 12, 
"outputs": [ { "data": { "text/plain": "0\n6644200 True\n100477 True\n124315 False\n78748 False\n480687 False\n ... \n349903 True\n481499 False\n905372 True\n43014 False\n86510 True\nName: pass_fail, Length: 100, dtype: bool" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "random_forest_clf = grid.best_estimator_\n", "y_test['pass_fail'] == random_forest_clf.predict(X_test[['release_year', 'popularity', 'vote_average', 'vote_count', 'Drama', 'Romance', 'Adventure', 'Animation', 'Fantasy', 'Science Fiction', 'Family', 'Mystery', 'Crime', 'Thriller', 'War', 'Western', 'Comedy', 'Music', 'Horror', 'Action', 'History']])" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:06:02.794011300Z", "start_time": "2024-06-13T14:06:02.668488700Z" } }, "id": "9a590e8f9b6d33d0" }, { "cell_type": "code", "execution_count": 12, "outputs": [], "source": [], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:06:02.796176600Z", "start_time": "2024-06-13T14:06:02.777075900Z" } }, "id": "be455db130ffc68b" }, { "cell_type": "code", "execution_count": 13, "outputs": [], "source": [ "apikey = open('apikey.txt').read()\n", "genai.configure(api_key=apikey)\n", "model = 'models/embedding-001'\n", "from google.api_core import retry\n", "from tqdm.auto import tqdm\n", "tqdm.pandas()\n", "\n", "def make_embed_text_fn(model):\n", " @retry.Retry(timeout=300.0)\n", " def embed_fn(text: str) -> list[float]:\n", " embedding = genai.embed_content(model=model,\n", " content=text,\n", " task_type='classification')\n", " return embedding['embedding']\n", " return embed_fn\n", "\n", "def create_embeddings(model, df):\n", " df['embeddings'] = df['overview'].progress_apply(make_embed_text_fn(model))\n", " return df\n" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:06:02.798289700Z", "start_time": "2024-06-13T14:06:02.780760900Z" } }, "id": "5b3855b9d8987239" }, { "cell_type": 
"code", "execution_count": 14, "outputs": [ { "data": { "text/plain": " 0%| | 0/326 [00:00<?, ?it/s]", "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "fdcd789312524a2cad326b7cf23a2080" } }, "metadata": {}, "output_type": "display_data" } ], "source": [ "X_train_embedded = create_embeddings(model, X_train)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:10.957807Z", "start_time": "2024-06-13T14:06:02.795100700Z" } }, "id": "ae3fb74aff6aa825" }, { "cell_type": "code", "execution_count": 15, "outputs": [ { "data": { "text/plain": " script_date script \\\n0 \n472033 NaN \\n\\n ... \n120780 March 1998 \"Out of Sight\"\\r\\n\\r\\n\\r\\n ... \n1706593 NaN CHRONICLE\\r\\n\\r\\n\\r\\... \n2911666 NaN JOHN WICK\\r\\n\\r\\n\\... \n61722 March 1967 \\t\\t\\t\\t\"THE GRADUATE\"\\r\\n\\r\\n\\r\\n\\t\\t\\t\\tScre... \n\n bechdel_id title release_year language popularity \\\n0 \n472033 494 9 2009 en 71.590 \n120780 2247 Out of Sight 1998 en 24.781 \n1706593 3037 Chronicle 2012 en 40.036 \n2911666 5897 John Wick 2014 en 105.961 \n61722 616 Graduate, The 1967 en 30.980 \n\n vote_average vote_count \\\n0 \n472033 6.921 3407 \n120780 6.682 1203 \n1706593 6.816 5119 \n2911666 7.430 18679 \n61722 7.700 3206 \n\n overview ... War Comedy \\\n0 ... \n472033 When 9 first comes to life, he finds himself i... ... 0 0 \n120780 Meet Jack Foley, a smooth criminal who bends t... ... 0 1 \n1706593 Three high school students make an incredible ... ... 0 0 \n2911666 Ex-hitman John Wick comes out of retirement to... ... 0 0 \n61722 Benjamin, a recent college graduate very worri... ... 0 1 \n\n Music Western Horror Science Fiction Action Animation History \\\n0 \n472033 0 0 0 1 1 1 0 \n120780 0 0 0 0 0 0 0 \n1706593 0 0 0 1 0 0 0 \n2911666 0 0 0 0 1 0 0 \n61722 0 0 0 0 0 0 0 \n\n embeddings \n0 \n472033 [0.009933546, 0.028054273, -0.027433202, 0.011... \n120780 [0.054206412, 0.025492756, 0.036431577, -0.048... 
\n1706593 [0.0023177029, -0.011539809, -0.0100571215, 0.... \n2911666 [0.048869684, 0.029862285, -0.0062307036, -0.0... \n61722 [0.04377451, -0.034744043, 0.011599346, -0.005... \n\n[5 rows x 28 columns]", "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>script_date</th>\n <th>script</th>\n <th>bechdel_id</th>\n <th>title</th>\n <th>release_year</th>\n <th>language</th>\n <th>popularity</th>\n <th>vote_average</th>\n <th>vote_count</th>\n <th>overview</th>\n <th>...</th>\n <th>War</th>\n <th>Comedy</th>\n <th>Music</th>\n <th>Western</th>\n <th>Horror</th>\n <th>Science Fiction</th>\n <th>Action</th>\n <th>Animation</th>\n <th>History</th>\n <th>embeddings</th>\n </tr>\n <tr>\n <th>0</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>472033</th>\n <td>NaN</td>\n <td>\\n\\n ...</td>\n <td>494</td>\n <td>9</td>\n <td>2009</td>\n <td>en</td>\n <td>71.590</td>\n <td>6.921</td>\n <td>3407</td>\n <td>When 9 first comes to life, he finds himself i...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>[0.009933546, 0.028054273, -0.027433202, 0.011...</td>\n </tr>\n <tr>\n <th>120780</th>\n <td>March 1998</td>\n <td>\"Out of Sight\"\\r\\n\\r\\n\\r\\n ...</td>\n <td>2247</td>\n <td>Out of Sight</td>\n <td>1998</td>\n <td>en</td>\n <td>24.781</td>\n <td>6.682</td>\n <td>1203</td>\n <td>Meet Jack Foley, a smooth criminal who bends t...</td>\n <td>...</td>\n 
<td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>[0.054206412, 0.025492756, 0.036431577, -0.048...</td>\n </tr>\n <tr>\n <th>1706593</th>\n <td>NaN</td>\n <td>CHRONICLE\\r\\n\\r\\n\\r\\...</td>\n <td>3037</td>\n <td>Chronicle</td>\n <td>2012</td>\n <td>en</td>\n <td>40.036</td>\n <td>6.816</td>\n <td>5119</td>\n <td>Three high school students make an incredible ...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>[0.0023177029, -0.011539809, -0.0100571215, 0....</td>\n </tr>\n <tr>\n <th>2911666</th>\n <td>NaN</td>\n <td>JOHN WICK\\r\\n\\r\\n\\...</td>\n <td>5897</td>\n <td>John Wick</td>\n <td>2014</td>\n <td>en</td>\n <td>105.961</td>\n <td>7.430</td>\n <td>18679</td>\n <td>Ex-hitman John Wick comes out of retirement to...</td>\n <td>...</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>[0.048869684, 0.029862285, -0.0062307036, -0.0...</td>\n </tr>\n <tr>\n <th>61722</th>\n <td>March 1967</td>\n <td>\\t\\t\\t\\t\"THE GRADUATE\"\\r\\n\\r\\n\\r\\n\\t\\t\\t\\tScre...</td>\n <td>616</td>\n <td>Graduate, The</td>\n <td>1967</td>\n <td>en</td>\n <td>30.980</td>\n <td>7.700</td>\n <td>3206</td>\n <td>Benjamin, a recent college graduate very worri...</td>\n <td>...</td>\n <td>0</td>\n <td>1</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>0</td>\n <td>[0.04377451, -0.034744043, 0.011599346, -0.005...</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 28 columns</p>\n</div>" }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train_embedded.head()" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:11.258588400Z", "start_time": "2024-06-13T14:07:11.069302400Z" } }, "id": "c4b8ccd2dd978cd8" }, { "cell_type": "code", 
"execution_count": 16, "outputs": [], "source": [ "from sklearn.neural_network import MLPClassifier, MLPRegressor\n", "\n", "mlp_clf = MLPClassifier()\n" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:11.259715300Z", "start_time": "2024-06-13T14:07:11.129664Z" } }, "id": "981b7b51da8fefc9" }, { "cell_type": "code", "execution_count": 17, "outputs": [], "source": [ "def emb_arr(col=X_train_embedded['embeddings']):\n", " embeddings = np.ndarray((len(col),768))\n", " j = 0\n", " for i in col.index:\n", " try:\n", " embeddings[j] = col[i]\n", " j+=1\n", " except: j+=1\n", " return embeddings" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:11.261949300Z", "start_time": "2024-06-13T14:07:11.150054100Z" } }, "id": "2089a40b08c711ba" }, { "cell_type": "code", "execution_count": 18, "outputs": [], "source": [ "embeddings = emb_arr(X_train_embedded['embeddings'])" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:11.323453700Z", "start_time": "2024-06-13T14:07:11.156411500Z" } }, "id": "7ddc5132c8a89398" }, { "cell_type": "code", "execution_count": 18, "outputs": [], "source": [], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:11.325599100Z", "start_time": "2024-06-13T14:07:11.176392Z" } }, "id": "2877eee86d1673de" }, { "cell_type": "code", "execution_count": 19, "outputs": [ { "data": { "text/plain": "MLPClassifier()", "text/html": "<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 
label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 
div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>MLPClassifier()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. 
<br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">MLPClassifier</label><div class=\"sk-toggleable__content\"><pre>MLPClassifier()</pre></div></div></div></div></div>" }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mlp = MLPClassifier()\n", "mlp.fit(embeddings, y_train['pass_fail'].reset_index().drop(0, axis=1))\n" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:12.909485500Z", "start_time": "2024-06-13T14:07:11.182412600Z" } }, "id": "aba42d46e9ffccf" }, { "cell_type": "code", "execution_count": 20, "outputs": [ { "data": { "text/plain": "array([0.56060606, 0.70769231, 0.64615385, 0.64615385, 0.61538462])" }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import cross_val_score, cross_val_predict\n", "\n", "mlp = MLPClassifier()\n", "cross_val_score(mlp, embeddings, y_train['pass_fail'].reset_index().drop(0, axis=1), cv=5)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:17.961681700Z", "start_time": "2024-06-13T14:07:12.900586Z" } }, "id": "34bdb3cd355ff348" }, { "cell_type": "code", "execution_count": 21, "outputs": [ { "data": { "text/plain": "{'hidden_layer_sizes': (50,), 'solver': 'lbfgs'}" }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "param_grid = {\n", " 'hidden_layer_sizes':[(32,),(50,),(64,),(32,32,)],\n", " 'solver':['lbfgs']\n", "}\n", "grid = GridSearchCV(MLPClassifier(random_state=0), \n", " param_grid=param_grid,\n", " cv=3\n", " )\n", "grid.fit(embeddings, 
y_train['pass_fail'].reset_index().drop(0, axis=1))\n", "\n", "grid.best_params_" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:21.874591400Z", "start_time": "2024-06-13T14:07:17.962802700Z" } }, "id": "c45ef4f2a7105e13" }, { "cell_type": "code", "execution_count": 22, "outputs": [ { "data": { "text/plain": "array([0.51515152, 0.72307692, 0.64615385, 0.63076923, 0.64615385])" }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Cross-validated score of the grid-searched estimator on `embeddings`.\n",
"# The target is passed as a 1-D array, the shape scikit-learn expects for y;\n",
"# a single-column DataFrame only works through an implicit conversion.\n",
"cross_val_score(grid.best_estimator_, embeddings, y_train['pass_fail'].to_numpy(), cv=5)"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:23.446309600Z", "start_time": "2024-06-13T14:07:21.864137600Z" } }, "id": "59da71109d3a3610" }, { "cell_type": "code", "execution_count": 23, "outputs": [], "source": [
"# Out-of-fold predictions: every row is predicted by a model that never saw its label,\n",
"# so the stacked feature does not leak the target into the downstream models.\n",
"# random_state pins the weight initialisation so the feature is reproducible.\n",
"X_train_embedded['neural_net_preds'] = cross_val_predict(\n",
"    MLPClassifier(hidden_layer_sizes=(50,), solver='lbfgs', random_state=42),\n",
"    embeddings,\n",
"    y_train['pass_fail'].to_numpy(),\n",
"    cv=6,\n",
")"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:25.566075Z", "start_time": "2024-06-13T14:07:23.446309600Z" } }, "id": "a6dcd03b13eafd1a" }, { "cell_type": "code", "execution_count": 24, "outputs": [ { "data": { "text/plain": "array([0.54545455, 0.70769231, 0.73846154, 0.58461538, 0.63076923])" }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Columns fed to the stacked models: film metadata, genre indicator columns and the\n",
"# MLP prediction. Defined once so training, plotting and test-time prediction all\n",
"# use the same columns in the same order.\n",
"STACK_FEATURES = ['release_year', 'popularity', 'vote_average', 'vote_count', 'Drama', 'Romance', 'Adventure', 'Animation', 'Fantasy', 'Science Fiction', 'Family', 'Mystery', 'Crime', 'Thriller', 'War', 'Western', 'Comedy', 'Music', 'Horror', 'Action', 'History', 'neural_net_preds']\n",
"\n",
"# random_state fixes the bootstrap samples and feature sub-sampling, so the scores are repeatable.\n",
"cross_val_score(\n",
"    RandomForestClassifier(max_depth=7,\n",
"                           min_samples_leaf=4,\n",
"                           min_samples_split=15,\n",
"                           n_estimators=1000,\n",
"                           random_state=42),\n",
"    X_train_embedded[STACK_FEATURES],\n",
"    y_train['pass_fail'],\n",
")"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:07:31.850987600Z", "start_time": "2024-06-13T14:07:25.570330900Z" } }, "id": "b25658587ce8d473" }, { "cell_type": "code", "execution_count": 25, "outputs": [ { "data": { "text/plain": "{'max_depth': 3,\n 'max_features': 12,\n 'min_samples_leaf': 3,\n 'min_samples_split': 50}" }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"tree_clf = GridSearchCV(\n",
"    # random_state makes the random feature selection (max_features < n_features) repeatable.\n",
"    DecisionTreeClassifier(random_state=42),\n",
"    param_grid={\n",
"        'max_depth': [1, 2, 3, 4, 5],\n",
"        'min_samples_leaf': [1, 2, 3, 4, 5, 40, 50],\n",
"        'min_samples_split': [2, 3, 4, 40, 50],\n",
"        # None means every column; an integer above the 22 entries of STACK_FEATURES\n",
"        # is not a meaningful setting, so the upper end of the grid is expressed as None.\n",
"        'max_features': [1, 2, 3, 12, None],\n",
"    },\n",
"    cv=10,\n",
")\n",
"tree_clf.fit(X_train_embedded[STACK_FEATURES], y_train['pass_fail'])\n",
"tree_clf.best_params_"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:16.241447Z", "start_time": "2024-06-13T14:07:31.838016400Z" } }, "id": "2bf52517b393ff75" }, { "cell_type": "code", "execution_count": 26, "outputs": [ { "data": { "text/plain": "<Figure size 1800x500 with 1 Axes>", "image/png": "" }, "metadata": {}, "output_type": "display_data" } ], "source": [
"from sklearn.tree import plot_tree\n",
"import matplotlib.pyplot as plt\n",
"\n",
"fig, ax = plt.subplots(figsize=(18,5))\n",
"plot_tree(tree_clf.best_estimator_, fontsize=11, feature_names=STACK_FEATURES, ax=ax, rounded=True)\n",
"ax.set_title('Best decision tree found by the grid search')\n",
"plt.show()"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:16.702287400Z", "start_time": "2024-06-13T14:08:16.232372600Z" } }, "id": "7da14d284462b2c6" }, { "cell_type": "code", "execution_count": 27, "outputs": [ { "data": { "text/plain": " 0%| | 0/100 [00:00<?, ?it/s]", "application/vnd.jupyter.widget-view+json": { "version_major": 2, "version_minor": 0, "model_id": "a98b4ca334f244fa92a80a2f1807c3b8" } }, "metadata": {}, "output_type": "display_data" } ], "source": [ "X_test_embedded = create_embeddings(model, X_test)" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:37.664685Z", "start_time": "2024-06-13T14:08:16.695634Z" } }, "id": "17f1c03a01b74e2f" }, { "cell_type": "code", "execution_count": 28, "outputs": [], "source": [
"test_embeddings = emb_arr(col=X_test_embedded['embeddings'])\n",
"\n",
"# Refit the MLP on the full training set to produce the stacked feature for the test rows.\n",
"# y is passed 1-D, as scikit-learn expects; random_state pins the weight initialisation.\n",
"mlp_trained = MLPClassifier(hidden_layer_sizes=(50,), solver='lbfgs', random_state=42)\n",
"mlp_trained.fit(embeddings, y_train['pass_fail'].to_numpy())\n",
"X_test_embedded['neural_net_preds'] = mlp_trained.predict(test_embeddings)\n",
"\n",
"# Same columns, in the same order, as the ones the tree was fitted on.\n",
"test_preds = tree_clf.best_estimator_.predict(X_test_embedded[STACK_FEATURES])"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:38.182393300Z", "start_time": "2024-06-13T14:08:37.832999200Z" } }, "id": "5db2ccb2bd662394" }, { "cell_type": "code", "execution_count": 29, "outputs": [ { "data": { "text/plain": "0\n6644200 True\n100477 False\n124315 True\n78748 True\n480687 False\n ...
\n349903 True\n481499 True\n905372 True\n43014 True\n86510 True\nLength: 100, dtype: bool" }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_embedded['neural_net_preds'] == y_test['pass_fail']" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:38.202413900Z", "start_time": "2024-06-13T14:08:38.188874200Z" } }, "id": "31be544145c03c33" }, { "cell_type": "code", "execution_count": 30, "outputs": [ { "data": { "text/plain": "0\n6644200 True\n100477 True\n124315 True\n78748 False\n480687 False\n ... \n349903 True\n481499 True\n905372 True\n43014 False\n86510 True\nName: pass_fail, Length: 100, dtype: bool" }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_preds == y_test['pass_fail']" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:38.242058600Z", "start_time": "2024-06-13T14:08:38.198208500Z" } }, "id": "5105c83744b885fb" }, { "cell_type": "code", "execution_count": 31, "outputs": [ { "data": { "text/plain": "0\n472033 True\n120780 True\n1706593 False\n2911666 True\n61722 False\n ... 
\n100814 True\n109506 False\n765443 True\n816462 True\n110148 True\nLength: 326, dtype: bool" }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train_embedded['neural_net_preds'] == y_train['pass_fail']" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:38.243125100Z", "start_time": "2024-06-13T14:08:38.222298600Z" } }, "id": "44d9e4ba042fa5ee" }, { "cell_type": "code", "execution_count": 32, "outputs": [], "source": [
"import os\n",
"from contextlib import closing\n",
"\n",
"# Prefer the standard libpq PGPASSWORD variable; the literal fallback keeps the\n",
"# notebook working against the local development database.\n",
"db_password = os.environ.get('PGPASSWORD', 'guest')\n",
"\n",
"# closing() guarantees the cursor and connection are released even when the\n",
"# query raises (psycopg2's own `with conn:` block only scopes a transaction).\n",
"with closing(psycopg2.connect(dbname='bechdel_test', user='postgres', password=db_password)) as conn:\n",
"    with closing(conn.cursor()) as cur:\n",
"        cur.execute('SELECT * FROM bechdel_ratings JOIN tmdb_data ON tmdb_data.imdb_id = bechdel_ratings.imdb_id;')\n",
"        data = pd.DataFrame(cur.fetchall())\n",
"\n",
"# NOTE: this rebinds `data` and `df`, replacing the script-level frame loaded at the top of the notebook.\n",
"df = data.copy()\n",
"df.set_index(0, inplace=True)"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:38.376569600Z", "start_time": "2024-06-13T14:08:38.235291900Z" } }, "id": "e0ca6a6a46b8fca8" }, { "cell_type": "code", "execution_count": 33, "outputs": [ { "data": { "text/plain": " 1 2 3 4 5 6 \\\n0 \n9804 14495706 La Rosace Magique 1877 0 766094 14495706 \n9806 12592084 Le singe musicien 1878 0 751212 12592084 \n9832 8588366 L'homme machine 1885 0 585297 8588366 \n9614 2075247 Man Walking Around the Corner 1887 0 159897 2075247 \n9841 7754902 Man Riding Jumping Horse 1887 0 1191584 7754902 \n... ... ... ... .. ... ...
\n11302 21235248 Ghostbusters: Frozen Empire 2024 3 967847 21235248 \n11303 3359350 Road House 2024 3 359410 3359350 \n11317 14539740 Godzilla x Kong: New Empire 2024 3 823464 14539740 \n11318 19356262 Drive-Away Dolls 2024 3 957304 19356262 \n11322 26658104 Imaginary 2024 1 1125311 26658104 \n\n 7 8 9 10 11 12 \\\n0 \n9804 The Magic Rosette xx 1878-05-07 2.194 5.800 19 \n9806 The Musician Monkey xx 1878-05-07 2.560 5.900 25 \n9832 L'Homme Machine xx 1885-01-01 1.149 4.629 31 \n9614 Man Walking Around a Corner xx 1887-08-18 5.529 4.900 80 \n9841 Man Riding Jumping Horse en None 0.187 4.000 5 \n... ... .. ... ... ... ... \n11302 Ghostbusters: Frozen Empire en 2024-03-20 603.739 6.671 873 \n11303 Road House en 2024-03-08 483.627 7.024 1810 \n11317 Godzilla x Kong: The New Empire en 2024-03-27 3853.790 7.278 2211 \n11318 Drive-Away Dolls en 2024-02-22 81.501 5.531 208 \n11322 Imaginary en 2024-03-06 154.811 6.210 312 \n\n 13 \n0 \n9804 Praxinoscope strip of a shifting rosette. Seri... \n9806 A pre-cinematograph colour animation of the mo... \n9832 Animated stick drawings representing a man wal... \n9614 The last remaining production of Le Prince's L... \n9841 A man riding a horse jumps over an obstacle. \n... ... \n11302 When the discovery of an ancient artifact unle... \n11303 Ex-UFC fighter Dalton takes a job as a bouncer... \n11317 Following their explosive showdown, Godzilla a... \n11318 Jamie, an uninhibited free spirit bemoaning ye... \n11322 When Jessica moves back into her childhood hom... 
\n\n[10133 rows x 13 columns]", "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n <th>9</th>\n <th>10</th>\n <th>11</th>\n <th>12</th>\n <th>13</th>\n </tr>\n <tr>\n <th>0</th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>9804</th>\n <td>14495706</td>\n <td>La Rosace Magique</td>\n <td>1877</td>\n <td>0</td>\n <td>766094</td>\n <td>14495706</td>\n <td>The Magic Rosette</td>\n <td>xx</td>\n <td>1878-05-07</td>\n <td>2.194</td>\n <td>5.800</td>\n <td>19</td>\n <td>Praxinoscope strip of a shifting rosette. 
Seri...</td>\n </tr>\n <tr>\n <th>9806</th>\n <td>12592084</td>\n <td>Le singe musicien</td>\n <td>1878</td>\n <td>0</td>\n <td>751212</td>\n <td>12592084</td>\n <td>The Musician Monkey</td>\n <td>xx</td>\n <td>1878-05-07</td>\n <td>2.560</td>\n <td>5.900</td>\n <td>25</td>\n <td>A pre-cinematograph colour animation of the mo...</td>\n </tr>\n <tr>\n <th>9832</th>\n <td>8588366</td>\n <td>L&#39;homme machine</td>\n <td>1885</td>\n <td>0</td>\n <td>585297</td>\n <td>8588366</td>\n <td>L'Homme Machine</td>\n <td>xx</td>\n <td>1885-01-01</td>\n <td>1.149</td>\n <td>4.629</td>\n <td>31</td>\n <td>Animated stick drawings representing a man wal...</td>\n </tr>\n <tr>\n <th>9614</th>\n <td>2075247</td>\n <td>Man Walking Around the Corner</td>\n <td>1887</td>\n <td>0</td>\n <td>159897</td>\n <td>2075247</td>\n <td>Man Walking Around a Corner</td>\n <td>xx</td>\n <td>1887-08-18</td>\n <td>5.529</td>\n <td>4.900</td>\n <td>80</td>\n <td>The last remaining production of Le Prince's L...</td>\n </tr>\n <tr>\n <th>9841</th>\n <td>7754902</td>\n <td>Man Riding Jumping Horse</td>\n <td>1887</td>\n <td>0</td>\n <td>1191584</td>\n <td>7754902</td>\n <td>Man Riding Jumping Horse</td>\n <td>en</td>\n <td>None</td>\n <td>0.187</td>\n <td>4.000</td>\n <td>5</td>\n <td>A man riding a horse jumps over an obstacle.</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>11302</th>\n <td>21235248</td>\n <td>Ghostbusters: Frozen Empire</td>\n <td>2024</td>\n <td>3</td>\n <td>967847</td>\n <td>21235248</td>\n <td>Ghostbusters: Frozen Empire</td>\n <td>en</td>\n <td>2024-03-20</td>\n <td>603.739</td>\n <td>6.671</td>\n <td>873</td>\n <td>When the discovery of an ancient artifact unle...</td>\n </tr>\n <tr>\n <th>11303</th>\n <td>3359350</td>\n <td>Road House</td>\n <td>2024</td>\n <td>3</td>\n 
<td>359410</td>\n <td>3359350</td>\n <td>Road House</td>\n <td>en</td>\n <td>2024-03-08</td>\n <td>483.627</td>\n <td>7.024</td>\n <td>1810</td>\n <td>Ex-UFC fighter Dalton takes a job as a bouncer...</td>\n </tr>\n <tr>\n <th>11317</th>\n <td>14539740</td>\n <td>Godzilla x Kong: New Empire</td>\n <td>2024</td>\n <td>3</td>\n <td>823464</td>\n <td>14539740</td>\n <td>Godzilla x Kong: The New Empire</td>\n <td>en</td>\n <td>2024-03-27</td>\n <td>3853.790</td>\n <td>7.278</td>\n <td>2211</td>\n <td>Following their explosive showdown, Godzilla a...</td>\n </tr>\n <tr>\n <th>11318</th>\n <td>19356262</td>\n <td>Drive-Away Dolls</td>\n <td>2024</td>\n <td>3</td>\n <td>957304</td>\n <td>19356262</td>\n <td>Drive-Away Dolls</td>\n <td>en</td>\n <td>2024-02-22</td>\n <td>81.501</td>\n <td>5.531</td>\n <td>208</td>\n <td>Jamie, an uninhibited free spirit bemoaning ye...</td>\n </tr>\n <tr>\n <th>11322</th>\n <td>26658104</td>\n <td>Imaginary</td>\n <td>2024</td>\n <td>1</td>\n <td>1125311</td>\n <td>26658104</td>\n <td>Imaginary</td>\n <td>en</td>\n <td>2024-03-06</td>\n <td>154.811</td>\n <td>6.210</td>\n <td>312</td>\n <td>When Jessica moves back into her childhood hom...</td>\n </tr>\n </tbody>\n</table>\n<p>10133 rows × 13 columns</p>\n</div>" }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:08:38.399932400Z", "start_time": "2024-06-13T14:08:38.378708700Z" } }, "id": "b9b26041b7a79037" }, { "cell_type": "code", "execution_count": 48, "outputs": [], "source": [ "model = genai.GenerativeModel('gemini-1.5-flash', safety_settings=[\n", " {\n", " \"category\": \"HARM_CATEGORY_HARASSMENT\",\n", " \"threshold\": \"BLOCK_NONE\"\n", " },\n", " {\n", " \"category\": \"HARM_CATEGORY_HATE_SPEECH\",\n", " \"threshold\": \"BLOCK_NONE\"\n", " },\n", " {\n", " \"category\": \"HARM_CATEGORY_SEXUALLY_EXPLICIT\",\n", " \"threshold\": \"BLOCK_NONE\"\n", " },\n", " 
{\n", " \"category\": \"HARM_CATEGORY_DANGEROUS_CONTENT\",\n", " \"threshold\": \"BLOCK_NONE\"\n", " },\n", "])\n",
"import time\n",
"\n",
"MIN_SECONDS_BETWEEN_CALLS = 1  # crude client-side rate limit\n",
"timer = time.time()\n",
"gemini_flash_guesses = []\n",
"\n",
"for i in X_train.index[:10]:\n",
"    script = X_train.loc[i, 'script']\n",
"    if not isinstance(script, str):\n",
"        # A missing script (NaN) cannot be concatenated into the prompt;\n",
"        # record the gap instead of sending a request.\n",
"        gemini_flash_guesses.append((i, None))\n",
"        continue\n",
"\n",
"    # Compute the remaining wait once so sleep() can never receive a negative value.\n",
"    wait = MIN_SECONDS_BETWEEN_CALLS - (time.time() - timer)\n",
"    if wait > 0:\n",
"        time.sleep(wait)\n",
"    timer = time.time()\n",
"\n",
"    # Reset per film so a failed request never reports the previous film's feedback.\n",
"    response = None\n",
"    try:\n",
"        # NOTE: adjacent string literals concatenate without a separator, so the\n",
"        # prompt sent is '...contain?Script:' immediately followed by the script.\n",
"        response = model.generate_content('How many female characters does the following script contain?'\n",
"                                          'Script:'\n",
"                                          ''\n",
"                                          '' + script)\n",
"        r_text = response.text\n",
"    except Exception as exc:\n",
"        if response is not None:\n",
"            # A response arrived but has no usable text: keep the feedback\n",
"            # (the stored output shows entries such as block_reason: OTHER).\n",
"            r_text = response.prompt_feedback\n",
"        else:\n",
"            # The request itself failed, so there is no response to inspect.\n",
"            r_text = repr(exc)\n",
"    gemini_flash_guesses.append((i, r_text))"
], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:44:25.971515900Z", "start_time": "2024-06-13T14:43:07.917993600Z" } }, "id": "5b0e291093a90fb7" }, { "cell_type": "code", "execution_count": 49, "outputs": [ { "data": { "text/plain": "[(472033,\n 'This script features **one** named female character: **7**. \\n\\nWhile the scientist is referred to as \"he\" and is presumably male, there are no other female characters mentioned in the script. \\n'),\n (120780,\n 'The script \"Out of Sight\" contains the following female characters:\\n\\n* **Loretta:** A bank teller who is robbed by Foley.\\n* **Lulu:** Chino\\'s \"wife\" and accomplice in his escape plan. \\n* **Adele:** Foley\\'s ex-wife.\\n* **Karen Sisco:** A federal marshal who becomes involved in Foley\\'s escape and later attempts to apprehend him.\\n* **Moselle:** Maurice Miller\\'s girlfriend.\\n* **Midge:** Richard Ripley\\'s maid. \\n* **Yonelle:** A transsexual who is murdered at Eddie Solomon\\'s house.
\\n* **Regina Mary Bragg:** Buddy\\'s sister, a born-again Christian who calls the FBI to report Buddy and Foley.\\n* **Celeste:** A waitress at the Westin Hotel in Detroit.\\n\\nIt\\'s important to note that some of these characters are only briefly mentioned and do not have any lines in the script. \\n'),\n (1706593,\n 'The script \"Chronicle\" features **two** female characters:\\n\\n1. **Sandra Detmer:** Andrew\\'s mother, who is ill and confined to bed.\\n2. **Casey Letter:** Matt\\'s girlfriend, who is a videoblogger and later becomes a love interest for Matt. \\n'),\n (2911666,\n \"This script contains only one female character: **Norma Wick**, John Wick's deceased wife. \\n\\nWhile there are female characters mentioned, like the Delivery Woman and the Waitress at the Red Circle, they don't have speaking roles or significant actions in the script. \\n\"),\n (61722,\n 'The script \"The Graduate\" features **three** female characters:\\n\\n1. **Mrs. Robinson:** The older woman who has an affair with Benjamin.\\n2. **Elaine Robinson:** Mrs. Robinson\\'s daughter, who Benjamin falls in love with.\\n3. **Mrs. Braddock:** Benjamin\\'s mother. \\n'),\n (1311071,\n \"This script features 7 female characters: \\n\\n1. **Naomi Ginsberg:** Allen's mother, struggling with a mental health condition.\\n2. **Edie Parker:** Jack Kerouac's girlfriend, an art student.\\n3. **Permissions Librarian:** A librarian at Columbia University.\\n4. **Gwendolyn:** A page at Columbia University's library.\\n5. **Edith Cohen:** A woman accompanying Louis Ginsberg.\\n6. **Marion Carr:** Lucien Carr's mother.\\n7. **Grandma Frankie:** Jack Kerouac's grandmother. \\n\"),\n (115632,\n \"The script you provided has **two** female characters:\\n\\n1. **Matilde:** Jean Michel Basquiat's mother, who appears in a dream sequence and later in a mental hospital.\\n2. **Gina Cardinale:** Jean Michel's girlfriend, who plays a significant role throughout the script. 
\\n\"),\n (499549,\n \"This script contains **2** female characters:\\n\\n* **Dr. Grace Augustine:** The head of the Avatar Program and a renowned Pandoran botanist.\\n* **Neytiri:** A fierce and beautiful Na'vi warrior who becomes Jake's teacher and love interest. \\n\"),\n (113101, block_reason: OTHER),\n (64665, block_reason: OTHER)]" }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gemini_flash_guesses" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:44:26.015457500Z", "start_time": "2024-06-13T14:44:25.989596Z" } }, "id": "9c18c98dceacb7f7" }, { "cell_type": "code", "execution_count": 47, "outputs": [ { "data": { "text/plain": "script_date April 1985\nscript NaN\nbechdel_id 2576\ntitle Commando\nrelease_year 1985\nlanguage en\npopularity 46.667\nvote_average 6.678\nvote_count 2677\noverview John Matrix, the former leader of a special co...\nDrama 0\nRomance 0\nAdventure 1\nFantasy 0\nFamily 0\nMystery 0\nCrime 0\nThriller 1\nWar 0\nComedy 0\nMusic 0\nWestern 0\nHorror 0\nScience Fiction 0\nAction 1\nAnimation 0\nHistory 0\nembeddings [0.03867363, 0.038765118, -0.029283423, -0.043...\nneural_net_preds 0\nName: 88944, dtype: object" }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.loc[88944]" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2024-06-13T14:41:57.141371100Z", "start_time": "2024-06-13T14:41:57.094035800Z" } }, "id": "741aa215c9e1ac5f" }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false }, "id": "cdd22dad0e02671c" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3" } }, "nbformat": 4, "nbformat_minor": 5 }