{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "65c6136d", "metadata": {}, "outputs": [], "source": [ "#main\n", "import pandas as pd\n", "import numpy as np\n", "import math\n", "import sklearn\n", "import re\n", "\n", "#graphic\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns \n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "id": "68553d29", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_rows', None)" ] }, { "cell_type": "code", "execution_count": 3, "id": "054011ff", "metadata": {}, "outputs": [], "source": [ "movie_overview_2022 = pd.read_csv('./movies_2022.csv', index_col='Unnamed: 0')\n", "movie_detail_2022 = pd.read_csv('./movie_details_2022.csv', index_col='Unnamed: 0')\n", "movie_award_2022 = pd.read_csv('./awards_2022.csv', index_col='Unnamed: 0')\n", "top_1000_movies = pd.read_csv('./imdb_top_1000.csv')\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "f43f1bf2", "metadata": {}, "outputs": [], "source": [ "movie_overview_2022 = movie_overview_2022.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 5, "id": "ef9573a7", "metadata": {}, "outputs": [], "source": [ "movie_detail_2022.drop(['title','movie_id','movie_imdb_link' ], axis=1, inplace=True) # drop duplicate columns" ] }, { "cell_type": "code", "execution_count": 6, "id": "60002f92", "metadata": {}, "outputs": [], "source": [ "movie_award_2022.drop(['title','movie_id'], axis=1, inplace=True) # drop duplicate columns" ] }, { "cell_type": "code", "execution_count": 7, "id": "f70eca32", "metadata": {}, "outputs": [], "source": [ "movies_df = pd.concat([movie_overview_2022, movie_award_2022, movie_detail_2022 ],axis = 1)\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "086eb0c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(477, 26)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df.shape" ] }, { "cell_type": "code", "execution_count": 9, "id": "732abe68", "metadata": {}, "outputs": [], "source": [ "def separete_awards(df):\n", " df['total_award_nominations'] = '' #creating empty columns\n", " df['total_award_wins'] = ''\n", "\n", " for i in df.index:\n", " if df['awards_total'][i] != '0':\n", " pattern= '\\d+'\n", " numbers = re.findall(pattern,df['awards_total'][i])\n", " df['total_award_wins'][i] = numbers[0] \n", " df['total_award_nominations'][i] = numbers[1] \n", " \n", " else:\n", " df['total_award_wins'][i] = df['awards_total'][i]\n", " df['total_award_nominations'][i] = df['awards_total'][i]\n", " \n", " return df" ] }, { "cell_type": "code", "execution_count": 10, "id": "0c400730", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2812189874.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['total_award_wins'][i] = numbers[0]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2812189874.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['total_award_nominations'][i] = numbers[1]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2812189874.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['total_award_wins'][i] = df['awards_total'][i]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2812189874.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['total_award_nominations'][i] = df['awards_total'][i]\n" ] } ], "source": [ "movies_df_v1 = separete_awards(movies_df)" ] }, { "cell_type": "code", "execution_count": 11, "id": "4391f073", "metadata": {}, "outputs": [], "source": [ "def separete_genre(df):\n", " df['primary_genre'] = '' #creating empty columns\n", " df['secondary_genre'] = ''\n", " \n", " pattern = \"[\\w']+\"\n", " \n", " for i in df.index:\n", " if df['genre'][i] != '[]':\n", " g = re.findall(pattern,df['genre'][i])\n", " \n", " if len(g)>1:\n", " df['primary_genre'][i] = g[0] \n", " df['secondary_genre'][i] = g[1]\n", " else:\n", " df['primary_genre'][i] = df['genre'][i]\n", " df['secondary_genre'][i] = 'Other'\n", " else:\n", " df['primary_genre'][i] = ''\n", " df['secondary_genre'][i] = ''\n", " \n", " return df\n", " " ] }, { "cell_type": "code", "execution_count": 12, "id": "9f7b7559", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/4184901931.py:12: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['primary_genre'][i] = g[0]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/4184901931.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['secondary_genre'][i] = g[1]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/4184901931.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['primary_genre'][i] = df['genre'][i]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/4184901931.py:16: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['secondary_genre'][i] = 'Other'\n" ] } ], "source": [ "movies_df_v2 = separete_genre(movies_df_v1)" ] }, { "cell_type": "code", "execution_count": 13, "id": "7c71fbf9", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['certificate']= movies_df_v2['certificate'].replace({\n", " 'R':'Adult',\n", " 'PG-13':'+13/14', \n", " 'TV-13':'+13/14',\n", " 'TV-MA':'Adult',\n", " 'PG': 'Parental Guidance',\n", " 'TV-14': '+13/14',\n", " 'Unrated': 'Not Rated',\n", " 'Approved': 'Not Rated',\n", " 'TV-PG': 'Parental Guidance',\n", " 'TV-G': 'Kids',\n", " 'G': 'Suitable for all',\n", " 'TV-Y7': 'Kids',\n", " '18': 'Adult',\n", " 'TV-Y': 'Kids',\n", " 'TV-Y7-FV': 'Kids',\n", " 'T':'Adult',\n", " 'M': 'Adult'\n", "})" ] }, { "cell_type": "code", "execution_count": 14, "id": "ede6be51", "metadata": {}, "outputs": [], "source": [ "def drop_k(df):\n", " \n", " pattern = \"K\"\n", " \n", " for i in df.index:\n", " if df['num_user_reviews'][i] != '[]':\n", " review = re.findall(pattern,df['num_user_reviews'][i]) \n", " if len(review)>0:\n", " a = df['num_user_reviews'][i].replace('K','') \n", " df['num_user_reviews'][i] = float(a)*1000\n", " \n", " else:\n", " df['num_user_reviews'][i] = df['num_user_reviews'][i]\n", " else:\n", " df['num_user_reviews'][i] = 0\n", " \n", " df['num_user_reviews'] = pd.to_numeric(df['num_user_reviews'], errors='coerce')\n", " \n", " return df\n", " " ] }, { "cell_type": "code", "execution_count": 15, "id": "549a8a4a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/3710906870.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['num_user_reviews'][i] = float(a)*1000\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/3710906870.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['num_user_reviews'][i] = df['num_user_reviews'][i]\n" ] } ], "source": [ "movies_df_v2 = drop_k(movies_df_v2)" ] }, { "cell_type": "code", "execution_count": 16, "id": "2543ee87", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['num_critic_reviews'] = pd.to_numeric(movies_df_v2['num_critic_reviews'], errors='coerce')" ] }, { "cell_type": "code", "execution_count": null, "id": "f1897392", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['release_date'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 18, "id": "b802f866", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['release_date']= movies_df_v2['release_date'].replace({\n", " '2022':'',\n", " '2023':''\n", "})" ] }, { "cell_type": "code", "execution_count": 19, "id": "6623ce1b", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['release_date']=pd.to_datetime(movies_df_v2['release_date'], errors='coerce')" ] }, { "cell_type": "code", "execution_count": null, "id": "84460bf9", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['release_date'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 21, "id": "f0846511", "metadata": {}, "outputs": [], "source": [ "import datetime as dt\n", "\n", "movies_df_v2['release_weekday'] = movies_df_v2['release_date'].dt.isocalendar().day # weekday 5 is a friday\n" ] }, { "cell_type": "code", "execution_count": 22, "id": "c2e636c1", "metadata": {}, "outputs": [], "source": [ "movies_df_v2 = movies_df_v2.drop(['release_month'], axis=1) #dropping the original month column and \n", "#creating one from datetime. The original release_month column has more nonsensical values\n" ] }, { "cell_type": "code", "execution_count": 23, "id": "74f4336e", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['release_month'] = pd.DatetimeIndex(movies_df_v2['release_date']).month\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "a4241d70", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9.0 57\n", "8.0 54\n", "10.0 54\n", "2.0 49\n", "3.0 45\n", "6.0 41\n", "7.0 40\n", "4.0 40\n", "1.0 39\n", "5.0 32\n", "11.0 14\n", "12.0 7\n", "NaN 5\n", "Name: release_month, dtype: int64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2['release_month'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 25, "id": "10eb3ce8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2022 477\n", "Name: release_year, dtype: int64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2['release_year'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 26, "id": "37a53771", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "United States 426\n", "United Kingdom 19\n", "Canada 4\n", "Ireland 4\n", "Australia 3\n", "Japan 2\n", "China 2\n", "Italy 2\n", "Spain 2\n", "Germany 2\n", "Mexico 2\n", "United Arab Emirates 1\n", "Puerto Rico 1\n", "Portugal 1\n", "France 1\n", "South Korea 1\n", "Poland 1\n", "Morocco 1\n", "Hungary 1\n", "Switzerland 1\n", "Name: country_of_origin, dtype: int64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2['country_of_origin'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "5f0ad2a5", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['metascore'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 28, "id": "26360c9f", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['metascore']= movies_df_v2['metascore'].replace({\n", " '[]':'Not scored' \n", "})\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "9e9e0181", "metadata": {}, "outputs": [], "source": [ "def remove_currency(row): \n", " if type(row) != float:\n", " row = row.replace('$', '')\n", " if '€' in row:\n", " row = row.replace('€', '').strip() # no cunversion needed as of date\n", " if 'CA' in row:\n", " row = row.replace('CA', '').strip()\n", " row = str(int(row)*0.73) \n", " if '₹' in row:\n", " row = row.replace('₹', '').strip()\n", " row = str(int(row)*0.012) \n", " if '£' in row:\n", " row = row.replace('£', '').strip()\n", " row = str(int(row)*1.14)\n", " if 'CN¥' in row:\n", " row = row.replace('CN¥', '').strip()\n", " row = str(int(row)*0.0067)\n", " if 'RUR' in row:\n", " row = row.replace('RUR', '').strip()\n", " row = str(int(row)*0.016)\n", " \n", " return row\n", "\n", "movies_df_v2['budget_in_usd'] = movies_df_v2['budget_in_usd'].apply(remove_currency)" ] }, { "cell_type": "code", "execution_count": 30, "id": "3aaedee0", "metadata": {}, "outputs": [], "source": [ "def remove_nonsensical(row): \n", " if type(row) != float:\n", " if ':' in row:\n", " row = np.nan\n", " \n", " return row" ] }, { "cell_type": "code", "execution_count": 31, "id": "c6efe99c", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['opening_weekend_us_can_in_usd'] = movies_df_v2['opening_weekend_us_can_in_usd'].apply(remove_nonsensical)" ] }, { "cell_type": "code", "execution_count": 32, "id": "e73c6f7e", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['gross_us_can_in_usd'] = movies_df_v2['gross_us_can_in_usd'].apply(remove_nonsensical)" ] }, { "cell_type": "code", "execution_count": 33, "id": "9cc752b8", "metadata": {}, "outputs": [], "source": [ "movies_df_v2['gross_worldwide_in_usd'] = movies_df_v2['gross_worldwide_in_usd'].apply(remove_nonsensical)\n" ] }, { "cell_type": "code", "execution_count": 34, "id": "d8320a8a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "movie_imdb_link 0\n", "certificate 0\n", "runtime_in_mins 0\n", "genre 0\n", "imdb_rating 0\n", "number_of_votes 0\n", "metascore 0\n", "top_director 0\n", "release_year 0\n", "awards_link 0\n", "awards_total 0\n", "top_writer 0\n", "top_star_1 0\n", "top_star_2 0\n", "top_star_3 0\n", "num_user_reviews 0\n", "num_critic_reviews 20\n", "release_date 5\n", "country_of_origin 0\n", "top_production_company 0\n", "budget_in_usd 0\n", "opening_weekend_us_can_in_usd 136\n", "gross_us_can_in_usd 4\n", "gross_worldwide_in_usd 97\n", "total_award_nominations 0\n", "total_award_wins 0\n", "primary_genre 0\n", "secondary_genre 0\n", "release_weekday 5\n", "release_month 5\n", "dtype: int64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2.isna().sum()" ] }, { "cell_type": "code", "execution_count": 35, "id": "a0d3ae7c", "metadata": {}, "outputs": [], "source": [ "movies_df_v2.replace('',np.nan, inplace=True)\n", "movies_df_v2.replace('[]',np.nan, inplace=True)" ] }, { "cell_type": "code", "execution_count": 36, "id": "c7b42308", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "movie_imdb_link 0\n", "certificate 124\n", "runtime_in_mins 2\n", "genre 0\n", "imdb_rating 0\n", "number_of_votes 0\n", "metascore 0\n", "top_director 0\n", "release_year 0\n", "awards_link 0\n", "awards_total 0\n", "top_writer 1\n", "top_star_1 3\n", "top_star_2 3\n", "top_star_3 3\n", "num_user_reviews 0\n", "num_critic_reviews 20\n", "release_date 5\n", "country_of_origin 0\n", "top_production_company 270\n", "budget_in_usd 258\n", "opening_weekend_us_can_in_usd 370\n", "gross_us_can_in_usd 359\n", "gross_worldwide_in_usd 278\n", "total_award_nominations 0\n", "total_award_wins 0\n", "primary_genre 0\n", "secondary_genre 0\n", "release_weekday 5\n", "release_month 5\n", "dtype: int64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2.isna().sum()" ] }, { "cell_type": "code", "execution_count": 37, "id": "9f1f2f8b", "metadata": {}, "outputs": [], "source": [ "cols_numeric = ['runtime_in_mins','num_user_reviews', 'metascore','budget_in_usd', 'opening_weekend_us_can_in_usd',\n", " 'gross_us_can_in_usd', 'gross_worldwide_in_usd','total_award_nominations',\n", " 'total_award_wins']\n", "\n", "movies_df_v2[cols_numeric] = movies_df_v2[cols_numeric].apply(pd.to_numeric, errors='coerce', axis=1)\n" ] }, { "cell_type": "code", "execution_count": 38, "id": "a6c10a55", "metadata": {}, "outputs": [], "source": [ "cols_object = ['release_year','release_month', 'release_weekday']\n", "\n", "movies_df_v2[cols_object] = movies_df_v2[cols_object].astype(object)" ] }, { "cell_type": "code", "execution_count": 39, "id": "d2cfb3e8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "movie_imdb_link 0\n", "certificate 124\n", "runtime_in_mins 2\n", "genre 0\n", "imdb_rating 0\n", "number_of_votes 0\n", "metascore 223\n", "top_director 0\n", "release_year 0\n", "awards_link 0\n", "awards_total 0\n", "top_writer 1\n", "top_star_1 3\n", "top_star_2 3\n", "top_star_3 3\n", "num_user_reviews 0\n", "num_critic_reviews 20\n", "release_date 5\n", "country_of_origin 0\n", "top_production_company 270\n", "budget_in_usd 258\n", "opening_weekend_us_can_in_usd 370\n", "gross_us_can_in_usd 359\n", "gross_worldwide_in_usd 425\n", "total_award_nominations 0\n", "total_award_wins 0\n", "primary_genre 0\n", "secondary_genre 0\n", "release_weekday 5\n", "release_month 5\n", "dtype: int64" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2.isna().sum()" ] }, { "cell_type": "code", "execution_count": 40, "id": "991a52b5", "metadata": {}, "outputs": [], "source": [ "# Feature engineering" ] }, { "cell_type": "code", "execution_count": 41, "id": "2e1917a5", "metadata": {}, "outputs": [], "source": [ "## Oscars proved to be useless for the model." ] }, { "cell_type": "code", "execution_count": 42, "id": "b42e90d1", "metadata": {}, "outputs": [], "source": [ "## Director" ] }, { "cell_type": "code", "execution_count": 43, "id": "737e949d", "metadata": {}, "outputs": [], "source": [ "top_1000_movies_copy = top_1000_movies.copy()\n", "best_directors =top_1000_movies_copy.pivot_table(index= ['Director'], aggfunc = ['count'])\n", "best_directors.columns = best_directors.columns.droplevel(0)\n", "best_directors = best_directors.reset_index().rename_axis(None, axis=1)\n" ] }, { "cell_type": "code", "execution_count": 44, "id": "960e61a2", "metadata": {}, "outputs": [], "source": [ "best_directors = best_directors.drop(['Genre', 'Gross', 'IMDB_Rating',\n", " 'Meta_score', 'No_of_Votes', 'Overview', 'Poster_Link', 'Released_Year',\n", " 'Runtime', 'Series_Title', 'Star1', 'Star2', 'Star3', 'Star4'], axis=1)\n", "\n", "best_directors = best_directors.rename(columns={'Director':'top_director',\n", " 'Certificate':'is_among_best_director'})\n", "\n", "best_directors['is_among_best_director'] = 'Y'" ] }, { "cell_type": "code", "execution_count": 45, "id": "54907d9c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(548, 2)" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "best_directors.shape\n" ] }, { "cell_type": "code", "execution_count": 46, "id": "900227e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(477, 31)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v2.shape" ] }, { "cell_type": "code", "execution_count": 47, "id": "296f8e01", "metadata": {}, "outputs": [], "source": [ "movies_df_v10 = pd.merge(movies_df_v2, best_directors, how='left', on='top_director')" ] }, { "cell_type": "code", "execution_count": 48, "id": "dad81a79", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idmovie_imdb_linkcertificateruntime_in_minsgenreimdb_ratingnumber_of_votesmetascoretop_directorrelease_yearawards_linkawards_totaltop_writertop_star_1top_star_2top_star_3num_user_reviewsnum_critic_reviewsrelease_datecountry_of_origintop_production_companybudget_in_usdopening_weekend_us_can_in_usdgross_us_can_in_usdgross_worldwide_in_usdtotal_award_nominationstotal_award_winsprimary_genresecondary_genrerelease_weekdayrelease_monthis_among_best_director
467tt13429928https://www.imdb.com/title/tt13429928/?ref_=ad...Not Rated87.0Action, Adventure, History3.2210.0NaNSteven Luke2022https://www.imdb.com/title/tt13429928/awards/?...0Steven LukeHiram A. MurrayAndrew SteckerApostolos Gliarmis11.05.02022-10-06United StatesNaNNaNNaNNaNNaN0.00.0ActionAdventure410.0NaN
468tt11939970https://www.imdb.com/title/tt11939970/?ref_=ad...NaN80.0Drama, Horror, Mystery2.4209.0NaNKameron Hale2022https://www.imdb.com/title/tt11939970/awards/?...Showing all 1 win and 3 nominationsScott HaleMiranda NiemanHayley SunshineScott Hale22.07.02022-01-18United StatesNaNNaNNaNNaNNaN3.01.0DramaHorror21.0NaN
469tt12907932https://www.imdb.com/title/tt12907932/?ref_=ad...NaN82.0Horror, Thriller2.5208.0NaNKipp Tribble2022https://www.imdb.com/title/tt12907932/awards/?...0Kipp TribbleAndi Sweeney BlancoRichard SiegelmanKipp Tribble10.011.02022-01-21United StatesNaNNaNNaNNaNNaN0.00.0HorrorThriller51.0NaN
470tt9093076https://www.imdb.com/title/tt9093076/?ref_=adv...NaN72.0Horror3.7207.0NaNKurtis Spieler2022https://www.imdb.com/title/tt9093076/awards/?r...0Kurtis SpielerLaura DoolingAdrienne KingFrank Wihbey13.015.02022-08-09United StatesNaNNaNNaNNaNNaN0.00.0HorrorOther28.0NaN
471tt14555908https://www.imdb.com/title/tt14555908/?ref_=ad...NaN86.0Comedy5.3205.0NaNAndrew Nackman2022https://www.imdb.com/title/tt14555908/awards/?...Showing all 0 wins and 1 nominationJake GreeneEthan DizonMadison WolfeBernard White9.010.02022-05-24United StatesNaNNaNNaNNaNNaN1.00.0ComedyOther25.0NaN
472tt19511880https://www.imdb.com/title/tt19511880/?ref_=ad...NaNNaNReality-TV9.1204.0NaNDiane Paloma Eskenazi2022https://www.imdb.com/title/tt19511880/awards/?...0Nino DalakishviliNaNNaNNaN0.0NaN2022-05-01United StatesNaN50000.0NaNNaNNaN0.00.0RealityTV75.0NaN
473tt10696116https://www.imdb.com/title/tt10696116/?ref_=ad...NaN92.0Horror7.9204.0NaNJohn Ainslie2022https://www.imdb.com/title/tt10696116/awards/?...0John AinslieKimberly LaferriereRogan ChristopherJanet Porter3.026.02022-08-19United StatesNaNNaNNaNNaNNaN0.00.0HorrorOther58.0NaN
474tt3447590https://www.imdb.com/title/tt3447590/?ref_=adv...Parental Guidance117.0Comedy, Drama, Family6.1201.067.0Matthew Warchus2022https://www.imdb.com/title/tt3447590/awards/?r...0Roald DahlStephen GrahamEmma ThompsonAndrea Riseborough15.015.02022-12-25United KingdomWorking Title FilmsNaNNaNNaNNaN0.00.0ComedyDrama712.0Y
475tt20861742https://www.imdb.com/title/tt20861742/?ref_=ad...NaN78.0Horror4.5200.0NaNBrendan Rudnicki2022https://www.imdb.com/title/tt20861742/awards/?...Showing all 1 win and 4 nominationsBrendan RudnickiWalter BraithwaiteDylan DeVaneBrent Downs101.04.02022-07-29United StatesNaNNaNNaNNaNNaN4.01.0HorrorOther57.0NaN
476tt12885770https://www.imdb.com/title/tt12885770/?ref_=ad...Adult90.0Horror, Sci-Fi, Thriller2.2200.0NaNLance Kawas2022https://www.imdb.com/title/tt12885770/awards/?...0Lance KawasBrande RoderickDonald CerroneKelly Lynn Reiter9.03.02022-10-11United StatesNaNNaNNaNNaNNaN0.00.0HorrorSci210.0NaN
\n", "
" ], "text/plain": [ " movie_id movie_imdb_link \\\n", "467 tt13429928 https://www.imdb.com/title/tt13429928/?ref_=ad... \n", "468 tt11939970 https://www.imdb.com/title/tt11939970/?ref_=ad... \n", "469 tt12907932 https://www.imdb.com/title/tt12907932/?ref_=ad... \n", "470 tt9093076 https://www.imdb.com/title/tt9093076/?ref_=adv... \n", "471 tt14555908 https://www.imdb.com/title/tt14555908/?ref_=ad... \n", "472 tt19511880 https://www.imdb.com/title/tt19511880/?ref_=ad... \n", "473 tt10696116 https://www.imdb.com/title/tt10696116/?ref_=ad... \n", "474 tt3447590 https://www.imdb.com/title/tt3447590/?ref_=adv... \n", "475 tt20861742 https://www.imdb.com/title/tt20861742/?ref_=ad... \n", "476 tt12885770 https://www.imdb.com/title/tt12885770/?ref_=ad... \n", "\n", " certificate runtime_in_mins genre \\\n", "467 Not Rated 87.0 Action, Adventure, History \n", "468 NaN 80.0 Drama, Horror, Mystery \n", "469 NaN 82.0 Horror, Thriller \n", "470 NaN 72.0 Horror \n", "471 NaN 86.0 Comedy \n", "472 NaN NaN Reality-TV \n", "473 NaN 92.0 Horror \n", "474 Parental Guidance 117.0 Comedy, Drama, Family \n", "475 NaN 78.0 Horror \n", "476 Adult 90.0 Horror, Sci-Fi, Thriller \n", "\n", " imdb_rating number_of_votes metascore top_director \\\n", "467 3.2 210.0 NaN Steven Luke \n", "468 2.4 209.0 NaN Kameron Hale \n", "469 2.5 208.0 NaN Kipp Tribble \n", "470 3.7 207.0 NaN Kurtis Spieler \n", "471 5.3 205.0 NaN Andrew Nackman \n", "472 9.1 204.0 NaN Diane Paloma Eskenazi \n", "473 7.9 204.0 NaN John Ainslie \n", "474 6.1 201.0 67.0 Matthew Warchus \n", "475 4.5 200.0 NaN Brendan Rudnicki \n", "476 2.2 200.0 NaN Lance Kawas \n", "\n", " release_year awards_link \\\n", "467 2022 https://www.imdb.com/title/tt13429928/awards/?... \n", "468 2022 https://www.imdb.com/title/tt11939970/awards/?... \n", "469 2022 https://www.imdb.com/title/tt12907932/awards/?... \n", "470 2022 https://www.imdb.com/title/tt9093076/awards/?r... \n", "471 2022 https://www.imdb.com/title/tt14555908/awards/?... \n", "472 2022 https://www.imdb.com/title/tt19511880/awards/?... \n", "473 2022 https://www.imdb.com/title/tt10696116/awards/?... \n", "474 2022 https://www.imdb.com/title/tt3447590/awards/?r... \n", "475 2022 https://www.imdb.com/title/tt20861742/awards/?... \n", "476 2022 https://www.imdb.com/title/tt12885770/awards/?... \n", "\n", " awards_total top_writer \\\n", "467 0 Steven Luke \n", "468 Showing all 1 win and 3 nominations Scott Hale \n", "469 0 Kipp Tribble \n", "470 0 Kurtis Spieler \n", "471 Showing all 0 wins and 1 nomination Jake Greene \n", "472 0 Nino Dalakishvili \n", "473 0 John Ainslie \n", "474 0 Roald Dahl \n", "475 Showing all 1 win and 4 nominations Brendan Rudnicki \n", "476 0 Lance Kawas \n", "\n", " top_star_1 top_star_2 top_star_3 \\\n", "467 Hiram A. Murray Andrew Stecker Apostolos Gliarmis \n", "468 Miranda Nieman Hayley Sunshine Scott Hale \n", "469 Andi Sweeney Blanco Richard Siegelman Kipp Tribble \n", "470 Laura Dooling Adrienne King Frank Wihbey \n", "471 Ethan Dizon Madison Wolfe Bernard White \n", "472 NaN NaN NaN \n", "473 Kimberly Laferriere Rogan Christopher Janet Porter \n", "474 Stephen Graham Emma Thompson Andrea Riseborough \n", "475 Walter Braithwaite Dylan DeVane Brent Downs \n", "476 Brande Roderick Donald Cerrone Kelly Lynn Reiter \n", "\n", " num_user_reviews num_critic_reviews release_date country_of_origin \\\n", "467 11.0 5.0 2022-10-06 United States \n", "468 22.0 7.0 2022-01-18 United States \n", "469 10.0 11.0 2022-01-21 United States \n", "470 13.0 15.0 2022-08-09 United States \n", "471 9.0 10.0 2022-05-24 United States \n", "472 0.0 NaN 2022-05-01 United States \n", "473 3.0 26.0 2022-08-19 United States \n", "474 15.0 15.0 2022-12-25 United Kingdom \n", "475 101.0 4.0 2022-07-29 United States \n", "476 9.0 3.0 2022-10-11 United States \n", "\n", " top_production_company budget_in_usd opening_weekend_us_can_in_usd \\\n", "467 NaN NaN NaN \n", "468 NaN NaN NaN \n", "469 NaN NaN NaN \n", "470 NaN NaN NaN \n", "471 NaN NaN NaN \n", "472 NaN 50000.0 NaN \n", "473 NaN NaN NaN \n", "474 Working Title Films NaN NaN \n", "475 NaN NaN NaN \n", "476 NaN NaN NaN \n", "\n", " gross_us_can_in_usd gross_worldwide_in_usd total_award_nominations \\\n", "467 NaN NaN 0.0 \n", "468 NaN NaN 3.0 \n", "469 NaN NaN 0.0 \n", "470 NaN NaN 0.0 \n", "471 NaN NaN 1.0 \n", "472 NaN NaN 0.0 \n", "473 NaN NaN 0.0 \n", "474 NaN NaN 0.0 \n", "475 NaN NaN 4.0 \n", "476 NaN NaN 0.0 \n", "\n", " total_award_wins primary_genre secondary_genre release_weekday \\\n", "467 0.0 Action Adventure 4 \n", "468 1.0 Drama Horror 2 \n", "469 0.0 Horror Thriller 5 \n", "470 0.0 Horror Other 2 \n", "471 0.0 Comedy Other 2 \n", "472 0.0 Reality TV 7 \n", "473 0.0 Horror Other 5 \n", "474 0.0 Comedy Drama 7 \n", "475 1.0 Horror Other 5 \n", "476 0.0 Horror Sci 2 \n", "\n", " release_month is_among_best_director \n", "467 10.0 NaN \n", "468 1.0 NaN \n", "469 1.0 NaN \n", "470 8.0 NaN \n", "471 5.0 NaN \n", "472 5.0 NaN \n", "473 8.0 NaN \n", "474 12.0 Y \n", "475 7.0 NaN \n", "476 10.0 NaN " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v10.tail(10)" ] }, { "cell_type": "code", "execution_count": 49, "id": "f8471dd2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2676440861.py:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", " movies_df_v10_gross_uscan = movies_df_v10_bud[~(\n" ] } ], "source": [ "movies_df_v10_bud = movies_df_v10[~(\n", " movies_df_v10['budget_in_usd'] < 10000)]\n", "\n", "movies_df_v10_gross_uscan = movies_df_v10_bud[~(\n", " movies_df_v10['gross_us_can_in_usd'] < 2000)]" ] }, { "cell_type": "code", "execution_count": 50, "id": "d1b25af5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "movie_imdb_link 0\n", "certificate 122\n", "runtime_in_mins 2\n", "genre 0\n", "imdb_rating 0\n", "number_of_votes 0\n", "metascore 219\n", "top_director 0\n", "release_year 0\n", "awards_link 0\n", "awards_total 0\n", "top_writer 1\n", "top_star_1 3\n", "top_star_2 3\n", "top_star_3 3\n", "num_user_reviews 0\n", "num_critic_reviews 18\n", "release_date 5\n", "country_of_origin 0\n", "top_production_company 263\n", "budget_in_usd 258\n", "opening_weekend_us_can_in_usd 363\n", "gross_us_can_in_usd 352\n", "gross_worldwide_in_usd 418\n", "total_award_nominations 0\n", "total_award_wins 0\n", "primary_genre 0\n", "secondary_genre 0\n", "release_weekday 5\n", "release_month 5\n", "is_among_best_director 436\n", "dtype: int64" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_df_v10_gross_uscan.isna().sum()" ] }, { "cell_type": "code", "execution_count": 51, "id": "5c479a89", "metadata": {}, "outputs": [], "source": [ "# I will continue with a filtered data where the financial columns are not null.\n", "\n", "mask1 = movies_df_v10_gross_uscan['gross_worldwide_in_usd'].notnull()\n", "mask2 = movies_df_v10_gross_uscan['opening_weekend_us_can_in_usd'].notnull()\n", "mask3 = movies_df_v10_gross_uscan['budget_in_usd'].notnull()\n", "\n", "movies_filtered = movies_df_v10_gross_uscan[mask1 & mask2 & mask3].copy()" ] }, { "cell_type": "code", "execution_count": 52, "id": "2448dde7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(52, 32)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered.shape" ] }, { "cell_type": "code", "execution_count": 53, "id": "75d0fa7e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "movie_imdb_link 0\n", "certificate 0\n", "runtime_in_mins 0\n", "genre 0\n", "imdb_rating 0\n", "number_of_votes 0\n", "metascore 0\n", "top_director 0\n", "release_year 0\n", "awards_link 0\n", "awards_total 0\n", "top_writer 0\n", "top_star_1 0\n", "top_star_2 0\n", "top_star_3 0\n", "num_user_reviews 0\n", "num_critic_reviews 0\n", "release_date 1\n", "country_of_origin 0\n", "top_production_company 5\n", "budget_in_usd 0\n", "opening_weekend_us_can_in_usd 0\n", "gross_us_can_in_usd 0\n", "gross_worldwide_in_usd 0\n", "total_award_nominations 0\n", "total_award_wins 0\n", "primary_genre 0\n", "secondary_genre 0\n", "release_weekday 1\n", "release_month 1\n", "is_among_best_director 41\n", "dtype: int64" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered.isna().sum()" ] }, { "cell_type": "code", "execution_count": 54, "id": "4b8b834c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5 49\n", "4 2\n", "Name: release_weekday, dtype: int64" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered['release_weekday'].value_counts()" ] }, { "cell_type": "code", "execution_count": 55, "id": "3b2d7b1f", "metadata": {}, "outputs": [], "source": [ "movies_filtered['release_weekday'] = movies_filtered['release_weekday'].fillna(5).astype(object)\n" ] }, { "cell_type": "code", "execution_count": 56, "id": "91fca13a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4.0 10\n", "2.0 7\n", "9.0 7\n", "3.0 4\n", "7.0 4\n", "8.0 4\n", "6.0 4\n", "10.0 4\n", "5.0 3\n", "1.0 2\n", "11.0 2\n", "Name: release_month, dtype: int64" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered['release_month'].value_counts()" ] }, { "cell_type": "code", "execution_count": 57, "id": "c0a412ac", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4.0 11\n", "2.0 7\n", "9.0 7\n", "3.0 4\n", "7.0 4\n", "8.0 4\n", "6.0 4\n", "10.0 4\n", "5.0 3\n", "1.0 2\n", "11.0 2\n", "Name: release_month, dtype: int64" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered['release_month'] = movies_filtered['release_month'].fillna(4.0).astype(object)\n", "movies_filtered['release_month'].value_counts()\n" ] }, { "cell_type": "code", "execution_count": 58, "id": "b0d4529b", "metadata": {}, "outputs": [], "source": [ "def group_prod_company(df):\n", "\n", " major_prod_company = ['Universal Pictures', 'Columbia Pictures', 'Warner Bros.', 'Paramount Pictures']\n", " df['top_production_company_grouped'] = '' #creating an empty column\n", " \n", " for i in df.index:\n", " if df['top_production_company'][i] not in major_prod_company:\n", " df['top_production_company_grouped'][i] = 'Other' \n", " \n", " else:\n", " df['top_production_company_grouped'][i] = df['top_production_company'][i]\n", " \n", " return df\n", "\n" ] }, { "cell_type": "code", "execution_count": 59, "id": "4b1ed944", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2722901165.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['top_production_company_grouped'][i] = df['top_production_company'][i]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2722901165.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['top_production_company_grouped'][i] = 'Other'\n" ] } ], "source": [ "movies_filtered_v1 = group_prod_company(movies_filtered)" ] }, { "cell_type": "code", "execution_count": 60, "id": "e1a7636c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Other 37\n", "Universal Pictures 6\n", "Paramount Pictures 4\n", "Warner Bros. 3\n", "Columbia Pictures 2\n", "Name: top_production_company_grouped, dtype: int64" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v1['top_production_company_grouped'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 61, "id": "1c4d29fb", "metadata": {}, "outputs": [], "source": [ "movies_filtered_v1['top_production_company']= movies_filtered_v1['top_production_company'].fillna('Other')" ] }, { "cell_type": "code", "execution_count": 62, "id": "96eb0fc3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v1['top_production_company'].isna().sum()" ] }, { "cell_type": "code", "execution_count": 63, "id": "98348dcb", "metadata": {}, "outputs": [], "source": [ "def group_country(df):\n", "\n", " df['country_of_origin_grouped'] = '' #creating an empty column\n", " \n", " for i in df.index:\n", " if df['country_of_origin'][i] != 'United States':\n", " df['country_of_origin_grouped'][i] = 'Other' \n", " \n", " else:\n", " df['country_of_origin_grouped'][i] = df['country_of_origin'][i]\n", " \n", " return df\n", "\n" ] }, { "cell_type": "code", "execution_count": 64, "id": "421e0518", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2431420963.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['country_of_origin_grouped'][i] = df['country_of_origin'][i]\n", "/var/folders/33/p_3l01b14g96rn22vzwly2g00000gn/T/ipykernel_64253/2431420963.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df['country_of_origin_grouped'][i] = 'Other'\n" ] } ], "source": [ "movies_filtered_v2 = group_country(movies_filtered_v1)" ] }, { "cell_type": "code", "execution_count": 65, "id": "3774dc7b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Action 22\n", "Animation 6\n", "Comedy 6\n", "Horror 5\n", "Drama 4\n", "Crime 3\n", "Biography 2\n", "Adventure 2\n", "Other 2\n", "Name: primary_genre, dtype: int64" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v2['primary_genre']= movies_filtered_v2['primary_genre'].replace({'Fantasy': 'Other',\n", " 'Thriller': 'Other','Romance': 'Other', 'Music': 'Other', 'Mystery': 'Other', 'Musical': 'Other',\n", " 'Sci':'Other' })\n", "\n", "movies_filtered_v2['primary_genre'].value_counts(dropna=False)\n" ] }, { "cell_type": "code", "execution_count": 66, "id": "57d0f11b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Adventure 15\n", "Drama 11\n", "Other 11\n", "Thriller 6\n", "Crime 3\n", "Comedy 3\n", "Romance 2\n", "Horror 1\n", "Name: secondary_genre, dtype: int64" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v2['secondary_genre']= movies_filtered_v2['secondary_genre'].replace({'Mystery': 'Other',\n", " 'Fantasy': 'Other','Family': 'Other', 'Biography': 'Other', 'Action': 'Other', 'Sci': 'Other',\n", " 'History': 'Other','Music': 'Other', 'Sport': 'Other', 'War': 'Other', 'Musical': 'Other', \n", " 'Western': 'Other', 'Fi': 'Other'})\n", "\n", "movies_filtered_v2['secondary_genre'].value_counts(dropna=False)\n" ] }, { "cell_type": "code", "execution_count": 67, "id": "b79335ee", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+13/14 24\n", "Adult 20\n", "Parental Guidance 8\n", "Name: certificate, dtype: int64" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v2['certificate']=movies_filtered_v2['certificate'].fillna('Not Rated')\n", "movies_filtered_v2['certificate'].value_counts(dropna=False)" ] }, { "cell_type": "code", "execution_count": 68, "id": "3413b683", "metadata": {}, "outputs": [], "source": [ "movies_filtered_v2['num_critic_reviews']=movies_filtered_v2['num_critic_reviews'].fillna(0)" ] }, { "cell_type": "code", "execution_count": 69, "id": "6ebeb13d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "42" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v2.isna().sum().sum()" ] }, { "cell_type": "code", "execution_count": 70, "id": "18665127", "metadata": {}, "outputs": [], "source": [ "movies_filtered_v2['num_critic_reviews']=movies_filtered_v2['num_critic_reviews'].fillna(0)" ] }, { "cell_type": "code", "execution_count": 71, "id": "7b1a92fc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['movie_id', 'movie_imdb_link', 'certificate', 'runtime_in_mins',\n", " 'genre', 'imdb_rating', 'number_of_votes', 'metascore', 'top_director',\n", " 'release_year', 'awards_link', 'awards_total', 'top_writer',\n", " 'top_star_1', 'top_star_2', 'top_star_3', 'num_user_reviews',\n", " 'num_critic_reviews', 'release_date', 'country_of_origin',\n", " 'top_production_company', 'budget_in_usd',\n", " 'opening_weekend_us_can_in_usd', 'gross_us_can_in_usd',\n", " 'gross_worldwide_in_usd', 'total_award_nominations', 'total_award_wins',\n", " 'primary_genre', 'secondary_genre', 'release_weekday', 'release_month',\n", " 'is_among_best_director', 'top_production_company_grouped',\n", " 'country_of_origin_grouped'],\n", " dtype='object')" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v2.columns" ] }, { "cell_type": "code", "execution_count": 72, "id": "1b43cdf7", "metadata": {}, "outputs": [], "source": [ "movies_filtered_v2['is_among_best_director']= movies_filtered_v2['is_among_best_director'].fillna('N')" ] }, { "cell_type": "code", "execution_count": 73, "id": "a1145715", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_filtered_v2.isna().sum().sum()" ] }, { "cell_type": "code", "execution_count": 74, "id": "a057483b", "metadata": {}, "outputs": [], "source": [ "movies_filtered_v2.to_csv('movies_2022_treated.csv')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "35c8c025", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }