{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "Y9Tga5BDkra8"
},
"source": [
"# CS5481 - Tutorial 3\n",
"## Data Preprocessing and Regular Expressions\n",
"\n",
"\n",
"## Preparation\n",
"- Python\n",
"- Python Libraries\n",
"  - pandas\n",
"  - re (part of the Python standard library; no installation needed)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"start_time": "2023-09-18T03:28:17.230Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GFTw5qL4kra9",
"outputId": "27f4f5fb-cba9-4b5a-bb01-c576ab6edafa"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.1.4)\n",
"Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"\u001b[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)\u001b[0m\u001b[31m\n",
"\u001b[0m\u001b[31mERROR: No matching distribution found for re\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
],
"source": [
"# Only pandas needs installing: `re` ships with the Python standard library.\n",
"# %pip (instead of !pip) installs into the environment of the running kernel.\n",
"%pip install pandas"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sVwmV7dokra9"
},
"source": [
"## 1. Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:28:53.632817Z",
"start_time": "2023-09-18T03:28:53.034836Z"
},
"id": "ubkMahTRkra9"
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h5DXLw8ckra9"
},
"source": [
"## 2. Data Preprocessing"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IuFSmRA3kra-"
},
"source": [
"### 2.1 Data Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:33:58.665755Z",
"start_time": "2023-09-18T03:33:58.598859Z"
},
"id": "e3sCxYlfkra-"
},
"outputs": [],
"source": [
"# Load the movie metadata table; the path is relative to the working directory.\n",
"data = pd.read_csv(\"movie_metadata.csv\", encoding=\"utf-8\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "lS_CYbgXkra-"
},
"source": [
"- Basic Operations"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:34:02.565803Z",
"start_time": "2023-09-18T03:34:02.510710Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 307
},
"id": "wUUU2_--kra-",
"outputId": "13ce574c-ba60-4bae-88d1-4fb9588beb76"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.0 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009.0 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.0 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007.0 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
" 602.0 | \n",
" 148.0 | \n",
" 0.0 | \n",
" 161.0 | \n",
" Rory Kinnear | \n",
" 11000.0 | \n",
" 200074175.0 | \n",
" Action|Adventure|Thriller | \n",
" ... | \n",
" 994.0 | \n",
" English | \n",
" UK | \n",
" PG-13 | \n",
" 245000000.0 | \n",
" 2015.0 | \n",
" 393.0 | \n",
" 6.8 | \n",
" 2.35 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.0 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012.0 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 4 | \n",
" NaN | \n",
" Doug Walker | \n",
" NaN | \n",
" NaN | \n",
" 131.0 | \n",
" NaN | \n",
" Rob Walker | \n",
" 131.0 | \n",
" NaN | \n",
" Documentary | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 12.0 | \n",
" 7.1 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.0 \n",
"1 Color Gore Verbinski 302.0 169.0 \n",
"2 Color Sam Mendes 602.0 148.0 \n",
"3 Color Christopher Nolan 813.0 164.0 \n",
"4 NaN Doug Walker NaN NaN \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"2 0.0 161.0 Rory Kinnear \n",
"3 22000.0 23000.0 Christian Bale \n",
"4 131.0 NaN Rob Walker \n",
"\n",
" actor_1_facebook_likes gross genres ... \\\n",
"0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... \n",
"1 40000.0 309404152.0 Action|Adventure|Fantasy ... \n",
"2 11000.0 200074175.0 Action|Adventure|Thriller ... \n",
"3 27000.0 448130642.0 Action|Thriller ... \n",
"4 131.0 NaN Documentary ... \n",
"\n",
" num_user_for_reviews language country content_rating budget \\\n",
"0 3054.0 English USA PG-13 237000000.0 \n",
"1 1238.0 English USA PG-13 300000000.0 \n",
"2 994.0 English UK PG-13 245000000.0 \n",
"3 2701.0 English USA PG-13 250000000.0 \n",
"4 NaN NaN NaN NaN NaN \n",
"\n",
" title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n",
"0 2009.0 936.0 7.9 1.78 \n",
"1 2007.0 5000.0 7.1 2.35 \n",
"2 2015.0 393.0 6.8 2.35 \n",
"3 2012.0 23000.0 8.5 2.35 \n",
"4 NaN 12.0 7.1 NaN \n",
"\n",
" movie_facebook_likes \n",
"0 33000 \n",
"1 0 \n",
"2 85000 \n",
"3 164000 \n",
"4 0 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first 5 rows of the DataFrame\n",
"data.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:34:04.255414Z",
"start_time": "2023-09-18T03:34:04.206760Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 325
},
"id": "YU__Sh7pkra-",
"outputId": "ff46f456-82ee-459b-8c37-a0d600fd1f75"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 5038 | \n",
" Color | \n",
" Scott Smith | \n",
" 1.0 | \n",
" 87.0 | \n",
" 2.0 | \n",
" 318.0 | \n",
" Daphne Zuniga | \n",
" 637.0 | \n",
" NaN | \n",
" Comedy|Drama | \n",
" ... | \n",
" 6.0 | \n",
" English | \n",
" Canada | \n",
" NaN | \n",
" NaN | \n",
" 2013.0 | \n",
" 470.0 | \n",
" 7.7 | \n",
" NaN | \n",
" 84 | \n",
"
\n",
" \n",
" | 5039 | \n",
" Color | \n",
" NaN | \n",
" 43.0 | \n",
" 43.0 | \n",
" NaN | \n",
" 319.0 | \n",
" Valorie Curry | \n",
" 841.0 | \n",
" NaN | \n",
" Crime|Drama|Mystery|Thriller | \n",
" ... | \n",
" 359.0 | \n",
" English | \n",
" USA | \n",
" TV-14 | \n",
" NaN | \n",
" NaN | \n",
" 593.0 | \n",
" 7.5 | \n",
" 16.00 | \n",
" 32000 | \n",
"
\n",
" \n",
" | 5040 | \n",
" Color | \n",
" Benjamin Roberds | \n",
" 13.0 | \n",
" 76.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" Maxwell Moody | \n",
" 0.0 | \n",
" NaN | \n",
" Drama|Horror|Thriller | \n",
" ... | \n",
" 3.0 | \n",
" English | \n",
" USA | \n",
" NaN | \n",
" 1400.0 | \n",
" 2013.0 | \n",
" 0.0 | \n",
" 6.3 | \n",
" NaN | \n",
" 16 | \n",
"
\n",
" \n",
" | 5041 | \n",
" Color | \n",
" Daniel Hsia | \n",
" 14.0 | \n",
" 100.0 | \n",
" 0.0 | \n",
" 489.0 | \n",
" Daniel Henney | \n",
" 946.0 | \n",
" 10443.0 | \n",
" Comedy|Drama|Romance | \n",
" ... | \n",
" 9.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" NaN | \n",
" 2012.0 | \n",
" 719.0 | \n",
" 6.3 | \n",
" 2.35 | \n",
" 660 | \n",
"
\n",
" \n",
" | 5042 | \n",
" Color | \n",
" Jon Gunn | \n",
" 43.0 | \n",
" 90.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" Brian Herzlinger | \n",
" 86.0 | \n",
" 85222.0 | \n",
" Documentary | \n",
" ... | \n",
" 84.0 | \n",
" English | \n",
" USA | \n",
" PG | \n",
" 1100.0 | \n",
" 2004.0 | \n",
" 23.0 | \n",
" 6.6 | \n",
" 1.85 | \n",
" 456 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"5038 Color Scott Smith 1.0 87.0 \n",
"5039 Color NaN 43.0 43.0 \n",
"5040 Color Benjamin Roberds 13.0 76.0 \n",
"5041 Color Daniel Hsia 14.0 100.0 \n",
"5042 Color Jon Gunn 43.0 90.0 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"5038 2.0 318.0 Daphne Zuniga \n",
"5039 NaN 319.0 Valorie Curry \n",
"5040 0.0 0.0 Maxwell Moody \n",
"5041 0.0 489.0 Daniel Henney \n",
"5042 16.0 16.0 Brian Herzlinger \n",
"\n",
" actor_1_facebook_likes gross genres ... \\\n",
"5038 637.0 NaN Comedy|Drama ... \n",
"5039 841.0 NaN Crime|Drama|Mystery|Thriller ... \n",
"5040 0.0 NaN Drama|Horror|Thriller ... \n",
"5041 946.0 10443.0 Comedy|Drama|Romance ... \n",
"5042 86.0 85222.0 Documentary ... \n",
"\n",
" num_user_for_reviews language country content_rating budget \\\n",
"5038 6.0 English Canada NaN NaN \n",
"5039 359.0 English USA TV-14 NaN \n",
"5040 3.0 English USA NaN 1400.0 \n",
"5041 9.0 English USA PG-13 NaN \n",
"5042 84.0 English USA PG 1100.0 \n",
"\n",
" title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n",
"5038 2013.0 470.0 7.7 NaN \n",
"5039 NaN 593.0 7.5 16.00 \n",
"5040 2013.0 0.0 6.3 NaN \n",
"5041 2012.0 719.0 6.3 2.35 \n",
"5042 2004.0 23.0 6.6 1.85 \n",
"\n",
" movie_facebook_likes \n",
"5038 84 \n",
"5039 32000 \n",
"5040 16 \n",
"5041 660 \n",
"5042 456 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the last 5 rows of the DataFrame\n",
"data.tail(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:34:07.023407Z",
"start_time": "2023-09-18T03:34:06.998476Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 335
},
"id": "nUV3nBfPkra-",
"outputId": "dda7fe83-3080-4e28-d2a4-17c58cb9273e"
},
"outputs": [
{
"data": {
"text/plain": [
"count 5028.000000\n",
"mean 107.201074\n",
"std 25.197441\n",
"min 7.000000\n",
"25% 93.000000\n",
"50% 103.000000\n",
"75% 118.000000\n",
"max 511.000000\n",
"Name: duration, dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of a single column: data[\"columnname\"].describe()\n",
"data[\"duration\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:38:51.325376Z",
"start_time": "2023-09-18T03:38:51.311134Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "w4jXV9cAkra-",
"outputId": "566a8516-fa0a-4ed1-dfa1-9f77fd7c903d"
},
"outputs": [
{
"data": {
"text/plain": [
"0 Color\n",
"1 Color\n",
"2 Color\n",
"3 Color\n",
"4 NaN\n",
" ... \n",
"5038 Color\n",
"5039 Color\n",
"5040 Color\n",
"5041 Color\n",
"5042 Color\n",
"Name: color, Length: 5043, dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select one column as a Series: data[\"columnname\"]\n",
"data[\"color\"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:38:54.516852Z",
"start_time": "2023-09-18T03:38:54.492521Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 397
},
"id": "w1CDAQkckra-",
"outputId": "ed4116a5-de00-40dc-f5f7-b3e4d494a5b1"
},
"outputs": [
{
"data": {
"text/plain": [
"0 Color\n",
"1 Color\n",
"2 Color\n",
"3 Color\n",
"4 NaN\n",
"5 Color\n",
"6 Color\n",
"7 Color\n",
"8 Color\n",
"9 Color\n",
"Name: color, dtype: object"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First K rows of a column (positional slice): data[\"columnname\"].iloc[:K]\n",
"K = 10\n",
"data[\"color\"].iloc[:K]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:39:08.441758Z",
"start_time": "2023-09-18T03:39:08.415829Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 423
},
"id": "dG-J0eWokra-",
"outputId": "08fc90f9-44f6-4cb0-d947-9565d2ec31f2"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
"
\n",
" \n",
" | 4 | \n",
" NaN | \n",
" Doug Walker | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5038 | \n",
" Color | \n",
" Scott Smith | \n",
"
\n",
" \n",
" | 5039 | \n",
" Color | \n",
" NaN | \n",
"
\n",
" \n",
" | 5040 | \n",
" Color | \n",
" Benjamin Roberds | \n",
"
\n",
" \n",
" | 5041 | \n",
" Color | \n",
" Daniel Hsia | \n",
"
\n",
" \n",
" | 5042 | \n",
" Color | \n",
" Jon Gunn | \n",
"
\n",
" \n",
"
\n",
"
5043 rows × 2 columns
\n",
"
"
],
"text/plain": [
" color director_name\n",
"0 Color James Cameron\n",
"1 Color Gore Verbinski\n",
"2 Color Sam Mendes\n",
"3 Color Christopher Nolan\n",
"4 NaN Doug Walker\n",
"... ... ...\n",
"5038 Color Scott Smith\n",
"5039 Color NaN\n",
"5040 Color Benjamin Roberds\n",
"5041 Color Daniel Hsia\n",
"5042 Color Jon Gunn\n",
"\n",
"[5043 rows x 2 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select several columns at once by passing a list of names\n",
"data.loc[:, [\"color\", \"director_name\"]]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:40:36.171107Z",
"start_time": "2023-09-18T03:40:36.105984Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 617
},
"id": "ahas9hYCkra_",
"outputId": "ce8c3fd2-20a7-4b6e-8b37-d3ec15047e6b"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.0 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009.0 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.0 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007.0 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.0 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012.0 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 6 | \n",
" Color | \n",
" Sam Raimi | \n",
" 392.0 | \n",
" 156.0 | \n",
" 0.0 | \n",
" 4000.0 | \n",
" James Franco | \n",
" 24000.0 | \n",
" 336530303.0 | \n",
" Action|Adventure|Romance | \n",
" ... | \n",
" 1902.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 258000000.0 | \n",
" 2007.0 | \n",
" 11000.0 | \n",
" 6.2 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 9 | \n",
" Color | \n",
" David Yates | \n",
" 375.0 | \n",
" 153.0 | \n",
" 282.0 | \n",
" 10000.0 | \n",
" Daniel Radcliffe | \n",
" 25000.0 | \n",
" 301956980.0 | \n",
" Adventure|Family|Fantasy|Mystery | \n",
" ... | \n",
" 973.0 | \n",
" English | \n",
" UK | \n",
" PG | \n",
" 250000000.0 | \n",
" 2009.0 | \n",
" 11000.0 | \n",
" 7.5 | \n",
" 2.35 | \n",
" 10000 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 4688 | \n",
" Color | \n",
" Steve James | \n",
" 53.0 | \n",
" 170.0 | \n",
" 23.0 | \n",
" 2.0 | \n",
" Arthur Agee | \n",
" 7.0 | \n",
" 7830611.0 | \n",
" Documentary|Drama|Sport | \n",
" ... | \n",
" 74.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 700000.0 | \n",
" 1994.0 | \n",
" 6.0 | \n",
" 8.3 | \n",
" 1.33 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4694 | \n",
" Color | \n",
" Peter Jackson | \n",
" 446.0 | \n",
" 201.0 | \n",
" 0.0 | \n",
" 84.0 | \n",
" Thomas Kretschmann | \n",
" 6000.0 | \n",
" 218051260.0 | \n",
" Action|Adventure|Drama|Romance | \n",
" ... | \n",
" 2618.0 | \n",
" English | \n",
" New Zealand | \n",
" PG-13 | \n",
" 207000000.0 | \n",
" 2005.0 | \n",
" 918.0 | \n",
" 7.2 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4708 | \n",
" Color | \n",
" Michael Wadleigh | \n",
" 53.0 | \n",
" 215.0 | \n",
" 14.0 | \n",
" 136.0 | \n",
" Jimi Hendrix | \n",
" 262.0 | \n",
" 13300000.0 | \n",
" Documentary|History|Music | \n",
" ... | \n",
" 63.0 | \n",
" English | \n",
" USA | \n",
" R | \n",
" 600000.0 | \n",
" 1970.0 | \n",
" 227.0 | \n",
" 8.1 | \n",
" 2.20 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4747 | \n",
" Black and White | \n",
" Akira Kurosawa | \n",
" 153.0 | \n",
" 202.0 | \n",
" 0.0 | \n",
" 4.0 | \n",
" Minoru Chiaki | \n",
" 304.0 | \n",
" 269061.0 | \n",
" Action|Adventure|Drama | \n",
" ... | \n",
" 596.0 | \n",
" Japanese | \n",
" Japan | \n",
" Unrated | \n",
" 2000000.0 | \n",
" 1954.0 | \n",
" 8.0 | \n",
" 8.7 | \n",
" 1.37 | \n",
" 11000 | \n",
"
\n",
" \n",
" | 4885 | \n",
" Black and White | \n",
" King Vidor | \n",
" 48.0 | \n",
" 151.0 | \n",
" 54.0 | \n",
" 6.0 | \n",
" Renée Adorée | \n",
" 81.0 | \n",
" NaN | \n",
" Drama|Romance|War | \n",
" ... | \n",
" 45.0 | \n",
" NaN | \n",
" USA | \n",
" Not Rated | \n",
" 245000.0 | \n",
" 1925.0 | \n",
" 12.0 | \n",
" 8.3 | \n",
" 1.33 | \n",
" 226 | \n",
"
\n",
" \n",
"
\n",
"
205 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.0 \n",
"1 Color Gore Verbinski 302.0 169.0 \n",
"3 Color Christopher Nolan 813.0 164.0 \n",
"6 Color Sam Raimi 392.0 156.0 \n",
"9 Color David Yates 375.0 153.0 \n",
"... ... ... ... ... \n",
"4688 Color Steve James 53.0 170.0 \n",
"4694 Color Peter Jackson 446.0 201.0 \n",
"4708 Color Michael Wadleigh 53.0 215.0 \n",
"4747 Black and White Akira Kurosawa 153.0 202.0 \n",
"4885 Black and White King Vidor 48.0 151.0 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"3 22000.0 23000.0 Christian Bale \n",
"6 0.0 4000.0 James Franco \n",
"9 282.0 10000.0 Daniel Radcliffe \n",
"... ... ... ... \n",
"4688 23.0 2.0 Arthur Agee \n",
"4694 0.0 84.0 Thomas Kretschmann \n",
"4708 14.0 136.0 Jimi Hendrix \n",
"4747 0.0 4.0 Minoru Chiaki \n",
"4885 54.0 6.0 Renée Adorée \n",
"\n",
" actor_1_facebook_likes gross genres \\\n",
"0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n",
"1 40000.0 309404152.0 Action|Adventure|Fantasy \n",
"3 27000.0 448130642.0 Action|Thriller \n",
"6 24000.0 336530303.0 Action|Adventure|Romance \n",
"9 25000.0 301956980.0 Adventure|Family|Fantasy|Mystery \n",
"... ... ... ... \n",
"4688 7.0 7830611.0 Documentary|Drama|Sport \n",
"4694 6000.0 218051260.0 Action|Adventure|Drama|Romance \n",
"4708 262.0 13300000.0 Documentary|History|Music \n",
"4747 304.0 269061.0 Action|Adventure|Drama \n",
"4885 81.0 NaN Drama|Romance|War \n",
"\n",
" ... num_user_for_reviews language country content_rating \\\n",
"0 ... 3054.0 English USA PG-13 \n",
"1 ... 1238.0 English USA PG-13 \n",
"3 ... 2701.0 English USA PG-13 \n",
"6 ... 1902.0 English USA PG-13 \n",
"9 ... 973.0 English UK PG \n",
"... ... ... ... ... ... \n",
"4688 ... 74.0 English USA PG-13 \n",
"4694 ... 2618.0 English New Zealand PG-13 \n",
"4708 ... 63.0 English USA R \n",
"4747 ... 596.0 Japanese Japan Unrated \n",
"4885 ... 45.0 NaN USA Not Rated \n",
"\n",
" budget title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n",
"0 237000000.0 2009.0 936.0 7.9 1.78 \n",
"1 300000000.0 2007.0 5000.0 7.1 2.35 \n",
"3 250000000.0 2012.0 23000.0 8.5 2.35 \n",
"6 258000000.0 2007.0 11000.0 6.2 2.35 \n",
"9 250000000.0 2009.0 11000.0 7.5 2.35 \n",
"... ... ... ... ... ... \n",
"4688 700000.0 1994.0 6.0 8.3 1.33 \n",
"4694 207000000.0 2005.0 918.0 7.2 2.35 \n",
"4708 600000.0 1970.0 227.0 8.1 2.20 \n",
"4747 2000000.0 1954.0 8.0 8.7 1.37 \n",
"4885 245000.0 1925.0 12.0 8.3 1.33 \n",
"\n",
" movie_facebook_likes \n",
"0 33000 \n",
"1 0 \n",
"3 164000 \n",
"6 0 \n",
"9 10000 \n",
"... ... \n",
"4688 0 \n",
"4694 0 \n",
"4708 0 \n",
"4747 11000 \n",
"4885 226 \n",
"\n",
"[205 rows x 28 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row filtering with a boolean mask: data.loc[<condition on a column>]\n",
"# Keep only the films whose duration is longer than 150 minutes\n",
"data.loc[data[\"duration\"].gt(150)]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ScdvgzeQkra_"
},
"source": [
"- Handling NaN (missing) data\n",
"1. Fill missing values with a suitable replacement\n",
"2. Remove the rows that contain missing values\n",
"3. Remove columns in which many values are NaN"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:40:39.165689Z",
"start_time": "2023-09-18T03:40:39.113922Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 443
},
"id": "R1gY1jMKkra_",
"outputId": "4a8bcfcc-3e43-40dc-cc68-9efe0084e226"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 1 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 2 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 3 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 4 | \n",
" True | \n",
" False | \n",
" True | \n",
" True | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" ... | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5038 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" True | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" | 5039 | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" True | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 5040 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
"
\n",
" \n",
" | 5041 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" True | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | 5042 | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" ... | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
5043 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 False False False False \n",
"1 False False False False \n",
"2 False False False False \n",
"3 False False False False \n",
"4 True False True True \n",
"... ... ... ... ... \n",
"5038 False False False False \n",
"5039 False True False False \n",
"5040 False False False False \n",
"5041 False False False False \n",
"5042 False False False False \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 False False False \n",
"1 False False False \n",
"2 False False False \n",
"3 False False False \n",
"4 False True False \n",
"... ... ... ... \n",
"5038 False False False \n",
"5039 True False False \n",
"5040 False False False \n",
"5041 False False False \n",
"5042 False False False \n",
"\n",
" actor_1_facebook_likes gross genres ... num_user_for_reviews \\\n",
"0 False False False ... False \n",
"1 False False False ... False \n",
"2 False False False ... False \n",
"3 False False False ... False \n",
"4 False True False ... True \n",
"... ... ... ... ... ... \n",
"5038 False True False ... False \n",
"5039 False True False ... False \n",
"5040 False True False ... False \n",
"5041 False False False ... False \n",
"5042 False False False ... False \n",
"\n",
" language country content_rating budget title_year \\\n",
"0 False False False False False \n",
"1 False False False False False \n",
"2 False False False False False \n",
"3 False False False False False \n",
"4 True True True True True \n",
"... ... ... ... ... ... \n",
"5038 False False True True False \n",
"5039 False False False True True \n",
"5040 False False True False False \n",
"5041 False False False True False \n",
"5042 False False False False False \n",
"\n",
" actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \n",
"0 False False False False \n",
"1 False False False False \n",
"2 False False False False \n",
"3 False False False False \n",
"4 False False True False \n",
"... ... ... ... ... \n",
"5038 False False True False \n",
"5039 False False False False \n",
"5040 False False True False \n",
"5041 False False False False \n",
"5042 False False False False \n",
"\n",
"[5043 rows x 28 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Boolean mask with the same shape as data: True wherever a value is NaN\n",
"data.isna()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:40:43.044017Z",
"start_time": "2023-09-18T03:40:43.023342Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "frBhEjjwkra_",
"outputId": "4384542b-9360-49cc-b9e9-3b879c966b9d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 USA\n",
"1 USA\n",
"2 UK\n",
"3 USA\n",
"4 cs5481\n",
"Name: country, dtype: object\n",
"0 178.000000\n",
"1 169.000000\n",
"2 148.000000\n",
"3 164.000000\n",
"4 107.201074\n",
" ... \n",
"5038 87.000000\n",
"5039 43.000000\n",
"5040 76.000000\n",
"5041 100.000000\n",
"5042 90.000000\n",
"Name: duration, Length: 5043, dtype: float64\n"
]
}
],
"source": [
"# Replace missing values with a suitable substitute.\n",
"# Example 1: fill NaN entries of the \"country\" column with the placeholder \"cs5481\"\n",
"data[\"country\"] = data[\"country\"].fillna(\"cs5481\")\n",
"print(data[\"country\"].head())\n",
"# Example 2: fill NaN entries of the \"duration\" column with the column mean\n",
"data[\"duration\"] = data[\"duration\"].fillna(data[\"duration\"].mean())\n",
"print(data[\"duration\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:00.088219Z",
"start_time": "2023-09-18T03:41:00.019817Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 565
},
"id": "5PhEMJxrkra_",
"outputId": "6f3727ed-4876-46c6-8faf-d5178efd73b6"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.0 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009.0 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.0 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007.0 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
" 602.0 | \n",
" 148.0 | \n",
" 0.0 | \n",
" 161.0 | \n",
" Rory Kinnear | \n",
" 11000.0 | \n",
" 200074175.0 | \n",
" Action|Adventure|Thriller | \n",
" ... | \n",
" 994.0 | \n",
" English | \n",
" UK | \n",
" PG-13 | \n",
" 245000000.0 | \n",
" 2015.0 | \n",
" 393.0 | \n",
" 6.8 | \n",
" 2.35 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.0 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012.0 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 5 | \n",
" Color | \n",
" Andrew Stanton | \n",
" 462.0 | \n",
" 132.0 | \n",
" 475.0 | \n",
" 530.0 | \n",
" Samantha Morton | \n",
" 640.0 | \n",
" 73058679.0 | \n",
" Action|Adventure|Sci-Fi | \n",
" ... | \n",
" 738.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 263700000.0 | \n",
" 2012.0 | \n",
" 632.0 | \n",
" 6.6 | \n",
" 2.35 | \n",
" 24000 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5026 | \n",
" Color | \n",
" Olivier Assayas | \n",
" 81.0 | \n",
" 110.0 | \n",
" 107.0 | \n",
" 45.0 | \n",
" Béatrice Dalle | \n",
" 576.0 | \n",
" 136007.0 | \n",
" Drama|Music|Romance | \n",
" ... | \n",
" 39.0 | \n",
" French | \n",
" France | \n",
" R | \n",
" 4500.0 | \n",
" 2004.0 | \n",
" 133.0 | \n",
" 6.9 | \n",
" 2.35 | \n",
" 171 | \n",
"
\n",
" \n",
" | 5027 | \n",
" Color | \n",
" Jafar Panahi | \n",
" 64.0 | \n",
" 90.0 | \n",
" 397.0 | \n",
" 0.0 | \n",
" Nargess Mamizadeh | \n",
" 5.0 | \n",
" 673780.0 | \n",
" Drama | \n",
" ... | \n",
" 26.0 | \n",
" Persian | \n",
" Iran | \n",
" Not Rated | \n",
" 10000.0 | \n",
" 2000.0 | \n",
" 0.0 | \n",
" 7.5 | \n",
" 1.85 | \n",
" 697 | \n",
"
\n",
" \n",
" | 5033 | \n",
" Color | \n",
" Shane Carruth | \n",
" 143.0 | \n",
" 77.0 | \n",
" 291.0 | \n",
" 8.0 | \n",
" David Sullivan | \n",
" 291.0 | \n",
" 424760.0 | \n",
" Drama|Sci-Fi|Thriller | \n",
" ... | \n",
" 371.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 7000.0 | \n",
" 2004.0 | \n",
" 45.0 | \n",
" 7.0 | \n",
" 1.85 | \n",
" 19000 | \n",
"
\n",
" \n",
" | 5035 | \n",
" Color | \n",
" Robert Rodriguez | \n",
" 56.0 | \n",
" 81.0 | \n",
" 0.0 | \n",
" 6.0 | \n",
" Peter Marquardt | \n",
" 121.0 | \n",
" 2040920.0 | \n",
" Action|Crime|Drama|Romance|Thriller | \n",
" ... | \n",
" 130.0 | \n",
" Spanish | \n",
" USA | \n",
" R | \n",
" 7000.0 | \n",
" 1992.0 | \n",
" 20.0 | \n",
" 6.9 | \n",
" 1.37 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5042 | \n",
" Color | \n",
" Jon Gunn | \n",
" 43.0 | \n",
" 90.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" Brian Herzlinger | \n",
" 86.0 | \n",
" 85222.0 | \n",
" Documentary | \n",
" ... | \n",
" 84.0 | \n",
" English | \n",
" USA | \n",
" PG | \n",
" 1100.0 | \n",
" 2004.0 | \n",
" 23.0 | \n",
" 6.6 | \n",
" 1.85 | \n",
" 456 | \n",
"
\n",
" \n",
"
\n",
"
3756 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.0 \n",
"1 Color Gore Verbinski 302.0 169.0 \n",
"2 Color Sam Mendes 602.0 148.0 \n",
"3 Color Christopher Nolan 813.0 164.0 \n",
"5 Color Andrew Stanton 462.0 132.0 \n",
"... ... ... ... ... \n",
"5026 Color Olivier Assayas 81.0 110.0 \n",
"5027 Color Jafar Panahi 64.0 90.0 \n",
"5033 Color Shane Carruth 143.0 77.0 \n",
"5035 Color Robert Rodriguez 56.0 81.0 \n",
"5042 Color Jon Gunn 43.0 90.0 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"2 0.0 161.0 Rory Kinnear \n",
"3 22000.0 23000.0 Christian Bale \n",
"5 475.0 530.0 Samantha Morton \n",
"... ... ... ... \n",
"5026 107.0 45.0 Béatrice Dalle \n",
"5027 397.0 0.0 Nargess Mamizadeh \n",
"5033 291.0 8.0 David Sullivan \n",
"5035 0.0 6.0 Peter Marquardt \n",
"5042 16.0 16.0 Brian Herzlinger \n",
"\n",
" actor_1_facebook_likes gross \\\n",
"0 1000.0 760505847.0 \n",
"1 40000.0 309404152.0 \n",
"2 11000.0 200074175.0 \n",
"3 27000.0 448130642.0 \n",
"5 640.0 73058679.0 \n",
"... ... ... \n",
"5026 576.0 136007.0 \n",
"5027 5.0 673780.0 \n",
"5033 291.0 424760.0 \n",
"5035 121.0 2040920.0 \n",
"5042 86.0 85222.0 \n",
"\n",
" genres ... num_user_for_reviews language \\\n",
"0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English \n",
"1 Action|Adventure|Fantasy ... 1238.0 English \n",
"2 Action|Adventure|Thriller ... 994.0 English \n",
"3 Action|Thriller ... 2701.0 English \n",
"5 Action|Adventure|Sci-Fi ... 738.0 English \n",
"... ... ... ... ... \n",
"5026 Drama|Music|Romance ... 39.0 French \n",
"5027 Drama ... 26.0 Persian \n",
"5033 Drama|Sci-Fi|Thriller ... 371.0 English \n",
"5035 Action|Crime|Drama|Romance|Thriller ... 130.0 Spanish \n",
"5042 Documentary ... 84.0 English \n",
"\n",
" country content_rating budget title_year actor_2_facebook_likes \\\n",
"0 USA PG-13 237000000.0 2009.0 936.0 \n",
"1 USA PG-13 300000000.0 2007.0 5000.0 \n",
"2 UK PG-13 245000000.0 2015.0 393.0 \n",
"3 USA PG-13 250000000.0 2012.0 23000.0 \n",
"5 USA PG-13 263700000.0 2012.0 632.0 \n",
"... ... ... ... ... ... \n",
"5026 France R 4500.0 2004.0 133.0 \n",
"5027 Iran Not Rated 10000.0 2000.0 0.0 \n",
"5033 USA PG-13 7000.0 2004.0 45.0 \n",
"5035 USA R 7000.0 1992.0 20.0 \n",
"5042 USA PG 1100.0 2004.0 23.0 \n",
"\n",
" imdb_score aspect_ratio movie_facebook_likes \n",
"0 7.9 1.78 33000 \n",
"1 7.1 2.35 0 \n",
"2 6.8 2.35 85000 \n",
"3 8.5 2.35 164000 \n",
"5 6.6 2.35 24000 \n",
"... ... ... ... \n",
"5026 6.9 2.35 171 \n",
"5027 7.5 1.85 697 \n",
"5033 7.0 1.85 19000 \n",
"5035 6.9 1.37 0 \n",
"5042 6.6 1.85 456 \n",
"\n",
"[3756 rows x 28 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Drop every row that contains at least one NaN. axis=0 and how=\"any\" are the\n",
"# defaults, written out here for clarity. dropna returns a new DataFrame, so\n",
"# `data` itself is not modified by this cell.\n",
"data.dropna(axis=0, how=\"any\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:02.580068Z",
"start_time": "2023-09-18T03:41:02.459971Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 565
},
"id": "f0EIh-sskra_",
"outputId": "f5ab0441-5819-4aa9-a10b-34d21a3dc472"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.000000 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009.0 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.000000 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007.0 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
" 602.0 | \n",
" 148.000000 | \n",
" 0.0 | \n",
" 161.0 | \n",
" Rory Kinnear | \n",
" 11000.0 | \n",
" 200074175.0 | \n",
" Action|Adventure|Thriller | \n",
" ... | \n",
" 994.0 | \n",
" English | \n",
" UK | \n",
" PG-13 | \n",
" 245000000.0 | \n",
" 2015.0 | \n",
" 393.0 | \n",
" 6.8 | \n",
" 2.35 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.000000 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012.0 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 4 | \n",
" NaN | \n",
" Doug Walker | \n",
" NaN | \n",
" 107.201074 | \n",
" 131.0 | \n",
" NaN | \n",
" Rob Walker | \n",
" 131.0 | \n",
" NaN | \n",
" Documentary | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" cs5481 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 12.0 | \n",
" 7.1 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5038 | \n",
" Color | \n",
" Scott Smith | \n",
" 1.0 | \n",
" 87.000000 | \n",
" 2.0 | \n",
" 318.0 | \n",
" Daphne Zuniga | \n",
" 637.0 | \n",
" NaN | \n",
" Comedy|Drama | \n",
" ... | \n",
" 6.0 | \n",
" English | \n",
" Canada | \n",
" NaN | \n",
" NaN | \n",
" 2013.0 | \n",
" 470.0 | \n",
" 7.7 | \n",
" NaN | \n",
" 84 | \n",
"
\n",
" \n",
" | 5039 | \n",
" Color | \n",
" NaN | \n",
" 43.0 | \n",
" 43.000000 | \n",
" NaN | \n",
" 319.0 | \n",
" Valorie Curry | \n",
" 841.0 | \n",
" NaN | \n",
" Crime|Drama|Mystery|Thriller | \n",
" ... | \n",
" 359.0 | \n",
" English | \n",
" USA | \n",
" TV-14 | \n",
" NaN | \n",
" NaN | \n",
" 593.0 | \n",
" 7.5 | \n",
" 16.00 | \n",
" 32000 | \n",
"
\n",
" \n",
" | 5040 | \n",
" Color | \n",
" Benjamin Roberds | \n",
" 13.0 | \n",
" 76.000000 | \n",
" 0.0 | \n",
" 0.0 | \n",
" Maxwell Moody | \n",
" 0.0 | \n",
" NaN | \n",
" Drama|Horror|Thriller | \n",
" ... | \n",
" 3.0 | \n",
" English | \n",
" USA | \n",
" NaN | \n",
" 1400.0 | \n",
" 2013.0 | \n",
" 0.0 | \n",
" 6.3 | \n",
" NaN | \n",
" 16 | \n",
"
\n",
" \n",
" | 5041 | \n",
" Color | \n",
" Daniel Hsia | \n",
" 14.0 | \n",
" 100.000000 | \n",
" 0.0 | \n",
" 489.0 | \n",
" Daniel Henney | \n",
" 946.0 | \n",
" 10443.0 | \n",
" Comedy|Drama|Romance | \n",
" ... | \n",
" 9.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" NaN | \n",
" 2012.0 | \n",
" 719.0 | \n",
" 6.3 | \n",
" 2.35 | \n",
" 660 | \n",
"
\n",
" \n",
" | 5042 | \n",
" Color | \n",
" Jon Gunn | \n",
" 43.0 | \n",
" 90.000000 | \n",
" 16.0 | \n",
" 16.0 | \n",
" Brian Herzlinger | \n",
" 86.0 | \n",
" 85222.0 | \n",
" Documentary | \n",
" ... | \n",
" 84.0 | \n",
" English | \n",
" USA | \n",
" PG | \n",
" 1100.0 | \n",
" 2004.0 | \n",
" 23.0 | \n",
" 6.6 | \n",
" 1.85 | \n",
" 456 | \n",
"
\n",
" \n",
"
\n",
"
5043 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.000000 \n",
"1 Color Gore Verbinski 302.0 169.000000 \n",
"2 Color Sam Mendes 602.0 148.000000 \n",
"3 Color Christopher Nolan 813.0 164.000000 \n",
"4 NaN Doug Walker NaN 107.201074 \n",
"... ... ... ... ... \n",
"5038 Color Scott Smith 1.0 87.000000 \n",
"5039 Color NaN 43.0 43.000000 \n",
"5040 Color Benjamin Roberds 13.0 76.000000 \n",
"5041 Color Daniel Hsia 14.0 100.000000 \n",
"5042 Color Jon Gunn 43.0 90.000000 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"2 0.0 161.0 Rory Kinnear \n",
"3 22000.0 23000.0 Christian Bale \n",
"4 131.0 NaN Rob Walker \n",
"... ... ... ... \n",
"5038 2.0 318.0 Daphne Zuniga \n",
"5039 NaN 319.0 Valorie Curry \n",
"5040 0.0 0.0 Maxwell Moody \n",
"5041 0.0 489.0 Daniel Henney \n",
"5042 16.0 16.0 Brian Herzlinger \n",
"\n",
" actor_1_facebook_likes gross genres \\\n",
"0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n",
"1 40000.0 309404152.0 Action|Adventure|Fantasy \n",
"2 11000.0 200074175.0 Action|Adventure|Thriller \n",
"3 27000.0 448130642.0 Action|Thriller \n",
"4 131.0 NaN Documentary \n",
"... ... ... ... \n",
"5038 637.0 NaN Comedy|Drama \n",
"5039 841.0 NaN Crime|Drama|Mystery|Thriller \n",
"5040 0.0 NaN Drama|Horror|Thriller \n",
"5041 946.0 10443.0 Comedy|Drama|Romance \n",
"5042 86.0 85222.0 Documentary \n",
"\n",
" ... num_user_for_reviews language country content_rating budget \\\n",
"0 ... 3054.0 English USA PG-13 237000000.0 \n",
"1 ... 1238.0 English USA PG-13 300000000.0 \n",
"2 ... 994.0 English UK PG-13 245000000.0 \n",
"3 ... 2701.0 English USA PG-13 250000000.0 \n",
"4 ... NaN NaN cs5481 NaN NaN \n",
"... ... ... ... ... ... ... \n",
"5038 ... 6.0 English Canada NaN NaN \n",
"5039 ... 359.0 English USA TV-14 NaN \n",
"5040 ... 3.0 English USA NaN 1400.0 \n",
"5041 ... 9.0 English USA PG-13 NaN \n",
"5042 ... 84.0 English USA PG 1100.0 \n",
"\n",
" title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n",
"0 2009.0 936.0 7.9 1.78 \n",
"1 2007.0 5000.0 7.1 2.35 \n",
"2 2015.0 393.0 6.8 2.35 \n",
"3 2012.0 23000.0 8.5 2.35 \n",
"4 NaN 12.0 7.1 NaN \n",
"... ... ... ... ... \n",
"5038 2013.0 470.0 7.7 NaN \n",
"5039 NaN 593.0 7.5 16.00 \n",
"5040 2013.0 0.0 6.3 NaN \n",
"5041 2012.0 719.0 6.3 2.35 \n",
"5042 2004.0 23.0 6.6 1.85 \n",
"\n",
" movie_facebook_likes \n",
"0 33000 \n",
"1 0 \n",
"2 85000 \n",
"3 164000 \n",
"4 0 \n",
"... ... \n",
"5038 84 \n",
"5039 32000 \n",
"5040 16 \n",
"5041 660 \n",
"5042 456 \n",
"\n",
"[5043 rows x 28 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dropping a row because of a single NaN is aggressive; a gentler option is to\n",
"# drop only the rows in which every value is NaN.\n",
"data.dropna(axis=0, how=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:04.380118Z",
"start_time": "2023-09-18T03:41:04.220510Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 600
},
"id": "xJDx_kmAkra_",
"outputId": "8e22425d-cca8-405c-eb5a-05dd28d558b5"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.0 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009.0 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.0 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007.0 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
" 602.0 | \n",
" 148.0 | \n",
" 0.0 | \n",
" 161.0 | \n",
" Rory Kinnear | \n",
" 11000.0 | \n",
" 200074175.0 | \n",
" Action|Adventure|Thriller | \n",
" ... | \n",
" 994.0 | \n",
" English | \n",
" UK | \n",
" PG-13 | \n",
" 245000000.0 | \n",
" 2015.0 | \n",
" 393.0 | \n",
" 6.8 | \n",
" 2.35 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.0 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012.0 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 5 | \n",
" Color | \n",
" Andrew Stanton | \n",
" 462.0 | \n",
" 132.0 | \n",
" 475.0 | \n",
" 530.0 | \n",
" Samantha Morton | \n",
" 640.0 | \n",
" 73058679.0 | \n",
" Action|Adventure|Sci-Fi | \n",
" ... | \n",
" 738.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 263700000.0 | \n",
" 2012.0 | \n",
" 632.0 | \n",
" 6.6 | \n",
" 2.35 | \n",
" 24000 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5035 | \n",
" Color | \n",
" Robert Rodriguez | \n",
" 56.0 | \n",
" 81.0 | \n",
" 0.0 | \n",
" 6.0 | \n",
" Peter Marquardt | \n",
" 121.0 | \n",
" 2040920.0 | \n",
" Action|Crime|Drama|Romance|Thriller | \n",
" ... | \n",
" 130.0 | \n",
" Spanish | \n",
" USA | \n",
" R | \n",
" 7000.0 | \n",
" 1992.0 | \n",
" 20.0 | \n",
" 6.9 | \n",
" 1.37 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5036 | \n",
" Color | \n",
" Anthony Vallone | \n",
" NaN | \n",
" 84.0 | \n",
" 2.0 | \n",
" 2.0 | \n",
" John Considine | \n",
" 45.0 | \n",
" NaN | \n",
" Crime|Drama | \n",
" ... | \n",
" 1.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 3250.0 | \n",
" 2005.0 | \n",
" 44.0 | \n",
" 7.8 | \n",
" NaN | \n",
" 4 | \n",
"
\n",
" \n",
" | 5037 | \n",
" Color | \n",
" Edward Burns | \n",
" 14.0 | \n",
" 95.0 | \n",
" 0.0 | \n",
" 133.0 | \n",
" Caitlin FitzGerald | \n",
" 296.0 | \n",
" 4584.0 | \n",
" Comedy|Drama | \n",
" ... | \n",
" 14.0 | \n",
" English | \n",
" USA | \n",
" Not Rated | \n",
" 9000.0 | \n",
" 2011.0 | \n",
" 205.0 | \n",
" 6.4 | \n",
" NaN | \n",
" 413 | \n",
"
\n",
" \n",
" | 5041 | \n",
" Color | \n",
" Daniel Hsia | \n",
" 14.0 | \n",
" 100.0 | \n",
" 0.0 | \n",
" 489.0 | \n",
" Daniel Henney | \n",
" 946.0 | \n",
" 10443.0 | \n",
" Comedy|Drama|Romance | \n",
" ... | \n",
" 9.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" NaN | \n",
" 2012.0 | \n",
" 719.0 | \n",
" 6.3 | \n",
" 2.35 | \n",
" 660 | \n",
"
\n",
" \n",
" | 5042 | \n",
" Color | \n",
" Jon Gunn | \n",
" 43.0 | \n",
" 90.0 | \n",
" 16.0 | \n",
" 16.0 | \n",
" Brian Herzlinger | \n",
" 86.0 | \n",
" 85222.0 | \n",
" Documentary | \n",
" ... | \n",
" 84.0 | \n",
" English | \n",
" USA | \n",
" PG | \n",
" 1100.0 | \n",
" 2004.0 | \n",
" 23.0 | \n",
" 6.6 | \n",
" 1.85 | \n",
" 456 | \n",
"
\n",
" \n",
"
\n",
"
4848 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.0 \n",
"1 Color Gore Verbinski 302.0 169.0 \n",
"2 Color Sam Mendes 602.0 148.0 \n",
"3 Color Christopher Nolan 813.0 164.0 \n",
"5 Color Andrew Stanton 462.0 132.0 \n",
"... ... ... ... ... \n",
"5035 Color Robert Rodriguez 56.0 81.0 \n",
"5036 Color Anthony Vallone NaN 84.0 \n",
"5037 Color Edward Burns 14.0 95.0 \n",
"5041 Color Daniel Hsia 14.0 100.0 \n",
"5042 Color Jon Gunn 43.0 90.0 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"2 0.0 161.0 Rory Kinnear \n",
"3 22000.0 23000.0 Christian Bale \n",
"5 475.0 530.0 Samantha Morton \n",
"... ... ... ... \n",
"5035 0.0 6.0 Peter Marquardt \n",
"5036 2.0 2.0 John Considine \n",
"5037 0.0 133.0 Caitlin FitzGerald \n",
"5041 0.0 489.0 Daniel Henney \n",
"5042 16.0 16.0 Brian Herzlinger \n",
"\n",
" actor_1_facebook_likes gross \\\n",
"0 1000.0 760505847.0 \n",
"1 40000.0 309404152.0 \n",
"2 11000.0 200074175.0 \n",
"3 27000.0 448130642.0 \n",
"5 640.0 73058679.0 \n",
"... ... ... \n",
"5035 121.0 2040920.0 \n",
"5036 45.0 NaN \n",
"5037 296.0 4584.0 \n",
"5041 946.0 10443.0 \n",
"5042 86.0 85222.0 \n",
"\n",
" genres ... num_user_for_reviews language \\\n",
"0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English \n",
"1 Action|Adventure|Fantasy ... 1238.0 English \n",
"2 Action|Adventure|Thriller ... 994.0 English \n",
"3 Action|Thriller ... 2701.0 English \n",
"5 Action|Adventure|Sci-Fi ... 738.0 English \n",
"... ... ... ... ... \n",
"5035 Action|Crime|Drama|Romance|Thriller ... 130.0 Spanish \n",
"5036 Crime|Drama ... 1.0 English \n",
"5037 Comedy|Drama ... 14.0 English \n",
"5041 Comedy|Drama|Romance ... 9.0 English \n",
"5042 Documentary ... 84.0 English \n",
"\n",
" country content_rating budget title_year actor_2_facebook_likes \\\n",
"0 USA PG-13 237000000.0 2009.0 936.0 \n",
"1 USA PG-13 300000000.0 2007.0 5000.0 \n",
"2 UK PG-13 245000000.0 2015.0 393.0 \n",
"3 USA PG-13 250000000.0 2012.0 23000.0 \n",
"5 USA PG-13 263700000.0 2012.0 632.0 \n",
"... ... ... ... ... ... \n",
"5035 USA R 7000.0 1992.0 20.0 \n",
"5036 USA PG-13 3250.0 2005.0 44.0 \n",
"5037 USA Not Rated 9000.0 2011.0 205.0 \n",
"5041 USA PG-13 NaN 2012.0 719.0 \n",
"5042 USA PG 1100.0 2004.0 23.0 \n",
"\n",
" imdb_score aspect_ratio movie_facebook_likes \n",
"0 7.9 1.78 33000 \n",
"1 7.1 2.35 0 \n",
"2 6.8 2.35 85000 \n",
"3 8.5 2.35 164000 \n",
"5 6.6 2.35 24000 \n",
"... ... ... ... \n",
"5035 6.9 1.37 0 \n",
"5036 7.8 NaN 4 \n",
"5037 6.4 NaN 413 \n",
"5041 6.3 2.35 660 \n",
"5042 6.6 1.85 456 \n",
"\n",
"[4848 rows x 28 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A middle ground between \"any\" and \"all\": thresh=N keeps a row only if it has\n",
"# at least N non-NaN values. With 28 columns, thresh=25 therefore keeps rows\n",
"# that are missing at most 3 values.\n",
"data.dropna(axis=0, thresh=25)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:05.383611Z",
"start_time": "2023-09-18T03:41:05.229722Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 565
},
"id": "KlzHN2cgkra_",
"outputId": "b3fe5937-d68a-47c0-8896-055b0935c3aa"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.000000 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009.0 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.000000 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007.0 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
" 602.0 | \n",
" 148.000000 | \n",
" 0.0 | \n",
" 161.0 | \n",
" Rory Kinnear | \n",
" 11000.0 | \n",
" 200074175.0 | \n",
" Action|Adventure|Thriller | \n",
" ... | \n",
" 994.0 | \n",
" English | \n",
" UK | \n",
" PG-13 | \n",
" 245000000.0 | \n",
" 2015.0 | \n",
" 393.0 | \n",
" 6.8 | \n",
" 2.35 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.000000 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012.0 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 4 | \n",
" NaN | \n",
" Doug Walker | \n",
" NaN | \n",
" 107.201074 | \n",
" 131.0 | \n",
" NaN | \n",
" Rob Walker | \n",
" 131.0 | \n",
" NaN | \n",
" Documentary | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" cs5481 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 12.0 | \n",
" 7.1 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5038 | \n",
" Color | \n",
" Scott Smith | \n",
" 1.0 | \n",
" 87.000000 | \n",
" 2.0 | \n",
" 318.0 | \n",
" Daphne Zuniga | \n",
" 637.0 | \n",
" NaN | \n",
" Comedy|Drama | \n",
" ... | \n",
" 6.0 | \n",
" English | \n",
" Canada | \n",
" NaN | \n",
" NaN | \n",
" 2013.0 | \n",
" 470.0 | \n",
" 7.7 | \n",
" NaN | \n",
" 84 | \n",
"
\n",
" \n",
" | 5039 | \n",
" Color | \n",
" NaN | \n",
" 43.0 | \n",
" 43.000000 | \n",
" NaN | \n",
" 319.0 | \n",
" Valorie Curry | \n",
" 841.0 | \n",
" NaN | \n",
" Crime|Drama|Mystery|Thriller | \n",
" ... | \n",
" 359.0 | \n",
" English | \n",
" USA | \n",
" TV-14 | \n",
" NaN | \n",
" NaN | \n",
" 593.0 | \n",
" 7.5 | \n",
" 16.00 | \n",
" 32000 | \n",
"
\n",
" \n",
" | 5040 | \n",
" Color | \n",
" Benjamin Roberds | \n",
" 13.0 | \n",
" 76.000000 | \n",
" 0.0 | \n",
" 0.0 | \n",
" Maxwell Moody | \n",
" 0.0 | \n",
" NaN | \n",
" Drama|Horror|Thriller | \n",
" ... | \n",
" 3.0 | \n",
" English | \n",
" USA | \n",
" NaN | \n",
" 1400.0 | \n",
" 2013.0 | \n",
" 0.0 | \n",
" 6.3 | \n",
" NaN | \n",
" 16 | \n",
"
\n",
" \n",
" | 5041 | \n",
" Color | \n",
" Daniel Hsia | \n",
" 14.0 | \n",
" 100.000000 | \n",
" 0.0 | \n",
" 489.0 | \n",
" Daniel Henney | \n",
" 946.0 | \n",
" 10443.0 | \n",
" Comedy|Drama|Romance | \n",
" ... | \n",
" 9.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" NaN | \n",
" 2012.0 | \n",
" 719.0 | \n",
" 6.3 | \n",
" 2.35 | \n",
" 660 | \n",
"
\n",
" \n",
" | 5042 | \n",
" Color | \n",
" Jon Gunn | \n",
" 43.0 | \n",
" 90.000000 | \n",
" 16.0 | \n",
" 16.0 | \n",
" Brian Herzlinger | \n",
" 86.0 | \n",
" 85222.0 | \n",
" Documentary | \n",
" ... | \n",
" 84.0 | \n",
" English | \n",
" USA | \n",
" PG | \n",
" 1100.0 | \n",
" 2004.0 | \n",
" 23.0 | \n",
" 6.6 | \n",
" 1.85 | \n",
" 456 | \n",
"
\n",
" \n",
"
\n",
"
5043 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.000000 \n",
"1 Color Gore Verbinski 302.0 169.000000 \n",
"2 Color Sam Mendes 602.0 148.000000 \n",
"3 Color Christopher Nolan 813.0 164.000000 \n",
"4 NaN Doug Walker NaN 107.201074 \n",
"... ... ... ... ... \n",
"5038 Color Scott Smith 1.0 87.000000 \n",
"5039 Color NaN 43.0 43.000000 \n",
"5040 Color Benjamin Roberds 13.0 76.000000 \n",
"5041 Color Daniel Hsia 14.0 100.000000 \n",
"5042 Color Jon Gunn 43.0 90.000000 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"2 0.0 161.0 Rory Kinnear \n",
"3 22000.0 23000.0 Christian Bale \n",
"4 131.0 NaN Rob Walker \n",
"... ... ... ... \n",
"5038 2.0 318.0 Daphne Zuniga \n",
"5039 NaN 319.0 Valorie Curry \n",
"5040 0.0 0.0 Maxwell Moody \n",
"5041 0.0 489.0 Daniel Henney \n",
"5042 16.0 16.0 Brian Herzlinger \n",
"\n",
" actor_1_facebook_likes gross genres \\\n",
"0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n",
"1 40000.0 309404152.0 Action|Adventure|Fantasy \n",
"2 11000.0 200074175.0 Action|Adventure|Thriller \n",
"3 27000.0 448130642.0 Action|Thriller \n",
"4 131.0 NaN Documentary \n",
"... ... ... ... \n",
"5038 637.0 NaN Comedy|Drama \n",
"5039 841.0 NaN Crime|Drama|Mystery|Thriller \n",
"5040 0.0 NaN Drama|Horror|Thriller \n",
"5041 946.0 10443.0 Comedy|Drama|Romance \n",
"5042 86.0 85222.0 Documentary \n",
"\n",
" ... num_user_for_reviews language country content_rating budget \\\n",
"0 ... 3054.0 English USA PG-13 237000000.0 \n",
"1 ... 1238.0 English USA PG-13 300000000.0 \n",
"2 ... 994.0 English UK PG-13 245000000.0 \n",
"3 ... 2701.0 English USA PG-13 250000000.0 \n",
"4 ... NaN NaN cs5481 NaN NaN \n",
"... ... ... ... ... ... ... \n",
"5038 ... 6.0 English Canada NaN NaN \n",
"5039 ... 359.0 English USA TV-14 NaN \n",
"5040 ... 3.0 English USA NaN 1400.0 \n",
"5041 ... 9.0 English USA PG-13 NaN \n",
"5042 ... 84.0 English USA PG 1100.0 \n",
"\n",
" title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n",
"0 2009.0 936.0 7.9 1.78 \n",
"1 2007.0 5000.0 7.1 2.35 \n",
"2 2015.0 393.0 6.8 2.35 \n",
"3 2012.0 23000.0 8.5 2.35 \n",
"4 NaN 12.0 7.1 NaN \n",
"... ... ... ... ... \n",
"5038 2013.0 470.0 7.7 NaN \n",
"5039 NaN 593.0 7.5 16.00 \n",
"5040 2013.0 0.0 6.3 NaN \n",
"5041 2012.0 719.0 6.3 2.35 \n",
"5042 2004.0 23.0 6.6 1.85 \n",
"\n",
" movie_facebook_likes \n",
"0 33000 \n",
"1 0 \n",
"2 85000 \n",
"3 164000 \n",
"4 0 \n",
"... ... \n",
"5038 84 \n",
"5039 32000 \n",
"5040 16 \n",
"5041 660 \n",
"5042 456 \n",
"\n",
"[5043 rows x 28 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The same filter can be applied to columns instead of rows: with\n",
"# axis=\"columns\", a column is dropped only when every one of its values is NaN.\n",
"data.dropna(axis=\"columns\", how=\"all\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:06.252799Z",
"start_time": "2023-09-18T03:41:06.167587Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 597
},
"id": "8vZW2dxfkra_",
"outputId": "5e7fba9c-f722-4b35-c43e-bbfd1f45658c"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" duration | \n",
" genres | \n",
" movie_title | \n",
" num_voted_users | \n",
" cast_total_facebook_likes | \n",
" movie_imdb_link | \n",
" country | \n",
" imdb_score | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 178.000000 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" Avatar | \n",
" 886204 | \n",
" 4834 | \n",
" http://www.imdb.com/title/tt0499549/?ref_=fn_t... | \n",
" USA | \n",
" 7.9 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" 169.000000 | \n",
" Action|Adventure|Fantasy | \n",
" Pirates of the Caribbean: At World's End | \n",
" 471220 | \n",
" 48350 | \n",
" http://www.imdb.com/title/tt0449088/?ref_=fn_t... | \n",
" USA | \n",
" 7.1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 148.000000 | \n",
" Action|Adventure|Thriller | \n",
" Spectre | \n",
" 275868 | \n",
" 11700 | \n",
" http://www.imdb.com/title/tt2379713/?ref_=fn_t... | \n",
" UK | \n",
" 6.8 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" 164.000000 | \n",
" Action|Thriller | \n",
" The Dark Knight Rises | \n",
" 1144337 | \n",
" 106759 | \n",
" http://www.imdb.com/title/tt1345836/?ref_=fn_t... | \n",
" USA | \n",
" 8.5 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 4 | \n",
" 107.201074 | \n",
" Documentary | \n",
" Star Wars: Episode VII - The Force Awakens ... | \n",
" 8 | \n",
" 143 | \n",
" http://www.imdb.com/title/tt5289954/?ref_=fn_t... | \n",
" cs5481 | \n",
" 7.1 | \n",
" 0 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 5038 | \n",
" 87.000000 | \n",
" Comedy|Drama | \n",
" Signed Sealed Delivered | \n",
" 629 | \n",
" 2283 | \n",
" http://www.imdb.com/title/tt3000844/?ref_=fn_t... | \n",
" Canada | \n",
" 7.7 | \n",
" 84 | \n",
"
\n",
" \n",
" | 5039 | \n",
" 43.000000 | \n",
" Crime|Drama|Mystery|Thriller | \n",
" The Following | \n",
" 73839 | \n",
" 1753 | \n",
" http://www.imdb.com/title/tt2071645/?ref_=fn_t... | \n",
" USA | \n",
" 7.5 | \n",
" 32000 | \n",
"
\n",
" \n",
" | 5040 | \n",
" 76.000000 | \n",
" Drama|Horror|Thriller | \n",
" A Plague So Pleasant | \n",
" 38 | \n",
" 0 | \n",
" http://www.imdb.com/title/tt2107644/?ref_=fn_t... | \n",
" USA | \n",
" 6.3 | \n",
" 16 | \n",
"
\n",
" \n",
" | 5041 | \n",
" 100.000000 | \n",
" Comedy|Drama|Romance | \n",
" Shanghai Calling | \n",
" 1255 | \n",
" 2386 | \n",
" http://www.imdb.com/title/tt2070597/?ref_=fn_t... | \n",
" USA | \n",
" 6.3 | \n",
" 660 | \n",
"
\n",
" \n",
" | 5042 | \n",
" 90.000000 | \n",
" Documentary | \n",
" My Date with Drew | \n",
" 4285 | \n",
" 163 | \n",
" http://www.imdb.com/title/tt0378407/?ref_=fn_t... | \n",
" USA | \n",
" 6.6 | \n",
" 456 | \n",
"
\n",
" \n",
"
\n",
"
5043 rows × 9 columns
\n",
"
"
],
"text/plain": [
" duration genres \\\n",
"0 178.000000 Action|Adventure|Fantasy|Sci-Fi \n",
"1 169.000000 Action|Adventure|Fantasy \n",
"2 148.000000 Action|Adventure|Thriller \n",
"3 164.000000 Action|Thriller \n",
"4 107.201074 Documentary \n",
"... ... ... \n",
"5038 87.000000 Comedy|Drama \n",
"5039 43.000000 Crime|Drama|Mystery|Thriller \n",
"5040 76.000000 Drama|Horror|Thriller \n",
"5041 100.000000 Comedy|Drama|Romance \n",
"5042 90.000000 Documentary \n",
"\n",
" movie_title num_voted_users \\\n",
"0 Avatar 886204 \n",
"1 Pirates of the Caribbean: At World's End 471220 \n",
"2 Spectre 275868 \n",
"3 The Dark Knight Rises 1144337 \n",
"4 Star Wars: Episode VII - The Force Awakens ... 8 \n",
"... ... ... \n",
"5038 Signed Sealed Delivered 629 \n",
"5039 The Following 73839 \n",
"5040 A Plague So Pleasant 38 \n",
"5041 Shanghai Calling 1255 \n",
"5042 My Date with Drew 4285 \n",
"\n",
" cast_total_facebook_likes \\\n",
"0 4834 \n",
"1 48350 \n",
"2 11700 \n",
"3 106759 \n",
"4 143 \n",
"... ... \n",
"5038 2283 \n",
"5039 1753 \n",
"5040 0 \n",
"5041 2386 \n",
"5042 163 \n",
"\n",
" movie_imdb_link country imdb_score \\\n",
"0 http://www.imdb.com/title/tt0499549/?ref_=fn_t... USA 7.9 \n",
"1 http://www.imdb.com/title/tt0449088/?ref_=fn_t... USA 7.1 \n",
"2 http://www.imdb.com/title/tt2379713/?ref_=fn_t... UK 6.8 \n",
"3 http://www.imdb.com/title/tt1345836/?ref_=fn_t... USA 8.5 \n",
"4 http://www.imdb.com/title/tt5289954/?ref_=fn_t... cs5481 7.1 \n",
"... ... ... ... \n",
"5038 http://www.imdb.com/title/tt3000844/?ref_=fn_t... Canada 7.7 \n",
"5039 http://www.imdb.com/title/tt2071645/?ref_=fn_t... USA 7.5 \n",
"5040 http://www.imdb.com/title/tt2107644/?ref_=fn_t... USA 6.3 \n",
"5041 http://www.imdb.com/title/tt2070597/?ref_=fn_t... USA 6.3 \n",
"5042 http://www.imdb.com/title/tt0378407/?ref_=fn_t... USA 6.6 \n",
"\n",
" movie_facebook_likes \n",
"0 33000 \n",
"1 0 \n",
"2 85000 \n",
"3 164000 \n",
"4 0 \n",
"... ... \n",
"5038 84 \n",
"5039 32000 \n",
"5040 16 \n",
"5041 660 \n",
"5042 456 \n",
"\n",
"[5043 rows x 9 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# or remove columns that contain at least one NaN value\n",
"data.dropna(axis=1, how=\"any\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "M1RT1evZkra_"
},
"source": [
"- Check Unreasonable Data\n",
"1. Time\n",
"2. Values with a range"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:36.472688Z",
"start_time": "2023-09-18T03:41:36.319594Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 565
},
"id": "iPgfWcvTkrbA",
"outputId": "d1aa676a-13c9-4e4b-af4e-f8202b9b7acd"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 10 | \n",
" Color | \n",
" Zack Snyder | \n",
" 673.0 | \n",
" 183.0 | \n",
" 0.0 | \n",
" 2000.0 | \n",
" Lauren Cohan | \n",
" 15000.0 | \n",
" 330249062.0 | \n",
" Action|Adventure|Sci-Fi | \n",
" ... | \n",
" 3018.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2016.0 | \n",
" 4000.0 | \n",
" 6.9 | \n",
" 2.35 | \n",
" 197000 | \n",
"
\n",
" \n",
" | 27 | \n",
" Color | \n",
" Anthony Russo | \n",
" 516.0 | \n",
" 147.0 | \n",
" 94.0 | \n",
" 11000.0 | \n",
" Scarlett Johansson | \n",
" 21000.0 | \n",
" 407197282.0 | \n",
" Action|Adventure|Sci-Fi | \n",
" ... | \n",
" 1022.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2016.0 | \n",
" 19000.0 | \n",
" 8.2 | \n",
" 2.35 | \n",
" 72000 | \n",
"
\n",
" \n",
" | 57 | \n",
" Color | \n",
" Justin Lin | \n",
" 322.0 | \n",
" 122.0 | \n",
" 681.0 | \n",
" 105.0 | \n",
" Melissa Roxburgh | \n",
" 998.0 | \n",
" 130468626.0 | \n",
" Action|Adventure|Sci-Fi|Thriller | \n",
" ... | \n",
" 432.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 185000000.0 | \n",
" 2016.0 | \n",
" 119.0 | \n",
" 7.5 | \n",
" 2.35 | \n",
" 30000 | \n",
"
\n",
" \n",
" | 63 | \n",
" Color | \n",
" David Yates | \n",
" 248.0 | \n",
" 110.0 | \n",
" 282.0 | \n",
" 103.0 | \n",
" Alexander Skarsgård | \n",
" 11000.0 | \n",
" 124051759.0 | \n",
" Action|Adventure|Drama|Romance | \n",
" ... | \n",
" 239.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 180000000.0 | \n",
" 2016.0 | \n",
" 10000.0 | \n",
" 6.6 | \n",
" 2.35 | \n",
" 29000 | \n",
"
\n",
" \n",
" | 65 | \n",
" Color | \n",
" Bryan Singer | \n",
" 396.0 | \n",
" 144.0 | \n",
" 0.0 | \n",
" 1000.0 | \n",
" Michael Fassbender | \n",
" 34000.0 | \n",
" 154985087.0 | \n",
" Action|Adventure|Sci-Fi | \n",
" ... | \n",
" 622.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 178000000.0 | \n",
" 2016.0 | \n",
" 13000.0 | \n",
" 7.3 | \n",
" 2.35 | \n",
" 54000 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 4772 | \n",
" Color | \n",
" Warren Sheppard | \n",
" 3.0 | \n",
" 94.0 | \n",
" 0.0 | \n",
" 212.0 | \n",
" Randy Jay Burrell | \n",
" 918.0 | \n",
" NaN | \n",
" Action|Romance|Sport | \n",
" ... | \n",
" 2.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 150000.0 | \n",
" 2016.0 | \n",
" 402.0 | \n",
" 4.0 | \n",
" NaN | \n",
" 381 | \n",
"
\n",
" \n",
" | 4773 | \n",
" Color | \n",
" Darren Lynn Bousman | \n",
" 10.0 | \n",
" 97.0 | \n",
" 163.0 | \n",
" 303.0 | \n",
" Barry Bostwick | \n",
" 636.0 | \n",
" NaN | \n",
" Horror|Musical | \n",
" ... | \n",
" 20.0 | \n",
" English | \n",
" USA | \n",
" NaN | \n",
" 500000.0 | \n",
" 2016.0 | \n",
" 456.0 | \n",
" 7.4 | \n",
" 1.78 | \n",
" 707 | \n",
"
\n",
" \n",
" | 4775 | \n",
" Color | \n",
" Joel Paul Reisig | \n",
" 1.0 | \n",
" 108.0 | \n",
" 431.0 | \n",
" 317.0 | \n",
" Joel Paul Reisig | \n",
" 466.0 | \n",
" NaN | \n",
" Family | \n",
" ... | \n",
" 4.0 | \n",
" English | \n",
" USA | \n",
" PG | \n",
" 500000.0 | \n",
" 2016.0 | \n",
" 431.0 | \n",
" 5.7 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" | 4777 | \n",
" Color | \n",
" Luke Dye | \n",
" 1.0 | \n",
" 84.0 | \n",
" 0.0 | \n",
" 53.0 | \n",
" Jeff Delaney | \n",
" 385.0 | \n",
" NaN | \n",
" Family | \n",
" ... | \n",
" 1.0 | \n",
" English | \n",
" USA | \n",
" NaN | \n",
" 500000.0 | \n",
" 2016.0 | \n",
" 169.0 | \n",
" 5.2 | \n",
" 16.00 | \n",
" 9 | \n",
"
\n",
" \n",
" | 4953 | \n",
" Color | \n",
" Nate Parker | \n",
" 21.0 | \n",
" 120.0 | \n",
" 664.0 | \n",
" 400.0 | \n",
" Nate Parker | \n",
" 990.0 | \n",
" NaN | \n",
" Biography|Drama | \n",
" ... | \n",
" 8.0 | \n",
" English | \n",
" USA | \n",
" R | \n",
" 10000000.0 | \n",
" 2016.0 | \n",
" 664.0 | \n",
" 5.4 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
106 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"10 Color Zack Snyder 673.0 183.0 \n",
"27 Color Anthony Russo 516.0 147.0 \n",
"57 Color Justin Lin 322.0 122.0 \n",
"63 Color David Yates 248.0 110.0 \n",
"65 Color Bryan Singer 396.0 144.0 \n",
"... ... ... ... ... \n",
"4772 Color Warren Sheppard 3.0 94.0 \n",
"4773 Color Darren Lynn Bousman 10.0 97.0 \n",
"4775 Color Joel Paul Reisig 1.0 108.0 \n",
"4777 Color Luke Dye 1.0 84.0 \n",
"4953 Color Nate Parker 21.0 120.0 \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"10 0.0 2000.0 Lauren Cohan \n",
"27 94.0 11000.0 Scarlett Johansson \n",
"57 681.0 105.0 Melissa Roxburgh \n",
"63 282.0 103.0 Alexander Skarsgård \n",
"65 0.0 1000.0 Michael Fassbender \n",
"... ... ... ... \n",
"4772 0.0 212.0 Randy Jay Burrell \n",
"4773 163.0 303.0 Barry Bostwick \n",
"4775 431.0 317.0 Joel Paul Reisig \n",
"4777 0.0 53.0 Jeff Delaney \n",
"4953 664.0 400.0 Nate Parker \n",
"\n",
" actor_1_facebook_likes gross genres \\\n",
"10 15000.0 330249062.0 Action|Adventure|Sci-Fi \n",
"27 21000.0 407197282.0 Action|Adventure|Sci-Fi \n",
"57 998.0 130468626.0 Action|Adventure|Sci-Fi|Thriller \n",
"63 11000.0 124051759.0 Action|Adventure|Drama|Romance \n",
"65 34000.0 154985087.0 Action|Adventure|Sci-Fi \n",
"... ... ... ... \n",
"4772 918.0 NaN Action|Romance|Sport \n",
"4773 636.0 NaN Horror|Musical \n",
"4775 466.0 NaN Family \n",
"4777 385.0 NaN Family \n",
"4953 990.0 NaN Biography|Drama \n",
"\n",
" ... num_user_for_reviews language country content_rating budget \\\n",
"10 ... 3018.0 English USA PG-13 250000000.0 \n",
"27 ... 1022.0 English USA PG-13 250000000.0 \n",
"57 ... 432.0 English USA PG-13 185000000.0 \n",
"63 ... 239.0 English USA PG-13 180000000.0 \n",
"65 ... 622.0 English USA PG-13 178000000.0 \n",
"... ... ... ... ... ... ... \n",
"4772 ... 2.0 English USA PG-13 150000.0 \n",
"4773 ... 20.0 English USA NaN 500000.0 \n",
"4775 ... 4.0 English USA PG 500000.0 \n",
"4777 ... 1.0 English USA NaN 500000.0 \n",
"4953 ... 8.0 English USA R 10000000.0 \n",
"\n",
" title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n",
"10 2016.0 4000.0 6.9 2.35 \n",
"27 2016.0 19000.0 8.2 2.35 \n",
"57 2016.0 119.0 7.5 2.35 \n",
"63 2016.0 10000.0 6.6 2.35 \n",
"65 2016.0 13000.0 7.3 2.35 \n",
"... ... ... ... ... \n",
"4772 2016.0 402.0 4.0 NaN \n",
"4773 2016.0 456.0 7.4 1.78 \n",
"4775 2016.0 431.0 5.7 NaN \n",
"4777 2016.0 169.0 5.2 16.00 \n",
"4953 2016.0 664.0 5.4 2.35 \n",
"\n",
" movie_facebook_likes \n",
"10 197000 \n",
"27 72000 \n",
"57 30000 \n",
"63 29000 \n",
"65 54000 \n",
"... ... \n",
"4772 381 \n",
"4773 707 \n",
"4775 0 \n",
"4777 9 \n",
"4953 0 \n",
"\n",
"[106 rows x 28 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check title_year: list the rows whose year is later than 2015 so we can inspect them\n",
"data[data[\"title_year\"] > 2015]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:45.541677Z",
"start_time": "2023-09-18T03:41:45.510770Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 99
},
"id": "CNjfgDiikrbA",
"outputId": "ef658e8f-ac60-40ca-c92a-f3454834e995"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" title_year | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" movie_facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 28 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [color, director_name, num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_2_name, actor_1_facebook_likes, gross, genres, actor_1_name, movie_title, num_voted_users, cast_total_facebook_likes, actor_3_name, facenumber_in_poster, plot_keywords, movie_imdb_link, num_user_for_reviews, language, country, content_rating, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes]\n",
"Index: []\n",
"\n",
"[0 rows x 28 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check imdb_score: scores cannot exceed 10, so any row returned here would be invalid\n",
"data[data[\"imdb_score\"] > 10]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Mh2xVJ5YkrbA"
},
"source": [
"- Check Duplicated Data"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:41:53.447219Z",
"start_time": "2023-09-18T03:41:53.391774Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "oPSvXaVJkrbA",
"outputId": "afd6ed09-7884-4619-8708-2e7c812fbfa3"
},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
" ... \n",
"5038 False\n",
"5039 False\n",
"5040 False\n",
"5041 False\n",
"5042 False\n",
"Length: 5043, dtype: bool"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check duplicated data\n",
"data.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:42:01.062354Z",
"start_time": "2023-09-18T03:42:01.036206Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "iDki8e0IkrbA",
"outputId": "f2c942cd-19c2-4b71-d44a-0dba8ac125e4"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" brand | \n",
" style | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Yum Yum | \n",
" cup | \n",
" 4.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" Yum Yum | \n",
" cup | \n",
" 4.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Indomie | \n",
" cup | \n",
" 3.5 | \n",
"
\n",
" \n",
" | 3 | \n",
" Indomie | \n",
" pack | \n",
" 15.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" Indomie | \n",
" pack | \n",
" 5.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" brand style rating\n",
"0 Yum Yum cup 4.0\n",
"1 Yum Yum cup 4.0\n",
"2 Indomie cup 3.5\n",
"3 Indomie pack 15.0\n",
"4 Indomie pack 5.0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# There are no duplicated rows in this file, so we use a small demo DataFrame to show how to handle them.\n",
"df = pd.DataFrame({\n",
" 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],\n",
" 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],\n",
" 'rating': [4, 4, 3.5, 15, 5]\n",
"})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:42:02.016556Z",
"start_time": "2023-09-18T03:42:02.000715Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"id": "plbGsuAakrbA",
"outputId": "76b7ec6c-9940-45db-bf39-f336f5aa4ea7"
},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 True\n",
"2 False\n",
"3 False\n",
"4 False\n",
"dtype: bool"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# By default, for each set of duplicated rows, the first occurrence is marked False and all others are marked True.\n",
"df.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:42:05.815667Z",
"start_time": "2023-09-18T03:42:05.796669Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"id": "PG37RNbykrbB",
"outputId": "5bda2afc-05a9-40d3-bfd5-5f968ad89b23"
},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
"dtype: bool"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# With keep='last', the last occurrence of each set of duplicated rows is marked False and all others are marked True.\n",
"df.duplicated(keep='last')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:42:09.517275Z",
"start_time": "2023-09-18T03:42:09.485835Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"id": "aw27GjYAkrbB",
"outputId": "ebbfc7da-2161-457e-e606-f61d7e9c62cc"
},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 False\n",
"3 False\n",
"4 False\n",
"dtype: bool"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# With keep=False, every row that has a duplicate is marked True.\n",
"df.duplicated(keep=False)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:42:13.522460Z",
"start_time": "2023-09-18T03:42:13.507489Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 241
},
"id": "D1bcvb4nkrbB",
"outputId": "947839cc-7d89-4d72-b5d3-d464b2247c4b"
},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 True\n",
"2 False\n",
"3 True\n",
"4 True\n",
"dtype: bool"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To find duplicates on specific column(s), use subset.\n",
"df.duplicated(subset=['brand'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5K3PzCr8krbB"
},
"source": [
"- Constrain Data Type"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:44:34.776087Z",
"start_time": "2023-09-18T03:44:34.629436Z"
},
"id": "01adGUHbkrbB"
},
"outputs": [],
"source": [
"# if we already know the types of some columns, we can declare them when reading the data\n",
"data = pd.read_csv(r'movie_metadata.csv', dtype={'num_voted_users': int, \"title_year\": str})"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:44:41.134748Z",
"start_time": "2023-09-18T03:44:41.089632Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 307
},
"id": "IURKu35XkrbB",
"outputId": "e688c65c-e07b-4d1c-c79b-008e554187c1"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" color | \n",
" director_name | \n",
" num_critic_for_reviews | \n",
" duration | \n",
" director_facebook_likes | \n",
" actor_3_facebook_likes | \n",
" actor_2_name | \n",
" actor_1_facebook_likes | \n",
" gross | \n",
" genres | \n",
" ... | \n",
" num_user_for_reviews | \n",
" language | \n",
" country | \n",
" content_rating | \n",
" budget | \n",
" release_date | \n",
" actor_2_facebook_likes | \n",
" imdb_score | \n",
" aspect_ratio | \n",
" facebook_likes | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Color | \n",
" James Cameron | \n",
" 723.0 | \n",
" 178.0 | \n",
" 0.0 | \n",
" 855.0 | \n",
" Joel David Moore | \n",
" 1000.0 | \n",
" 760505847.0 | \n",
" Action|Adventure|Fantasy|Sci-Fi | \n",
" ... | \n",
" 3054.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 237000000.0 | \n",
" 2009 | \n",
" 936.0 | \n",
" 7.9 | \n",
" 1.78 | \n",
" 33000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Color | \n",
" Gore Verbinski | \n",
" 302.0 | \n",
" 169.0 | \n",
" 563.0 | \n",
" 1000.0 | \n",
" Orlando Bloom | \n",
" 40000.0 | \n",
" 309404152.0 | \n",
" Action|Adventure|Fantasy | \n",
" ... | \n",
" 1238.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 300000000.0 | \n",
" 2007 | \n",
" 5000.0 | \n",
" 7.1 | \n",
" 2.35 | \n",
" 0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Color | \n",
" Sam Mendes | \n",
" 602.0 | \n",
" 148.0 | \n",
" 0.0 | \n",
" 161.0 | \n",
" Rory Kinnear | \n",
" 11000.0 | \n",
" 200074175.0 | \n",
" Action|Adventure|Thriller | \n",
" ... | \n",
" 994.0 | \n",
" English | \n",
" UK | \n",
" PG-13 | \n",
" 245000000.0 | \n",
" 2015 | \n",
" 393.0 | \n",
" 6.8 | \n",
" 2.35 | \n",
" 85000 | \n",
"
\n",
" \n",
" | 3 | \n",
" Color | \n",
" Christopher Nolan | \n",
" 813.0 | \n",
" 164.0 | \n",
" 22000.0 | \n",
" 23000.0 | \n",
" Christian Bale | \n",
" 27000.0 | \n",
" 448130642.0 | \n",
" Action|Thriller | \n",
" ... | \n",
" 2701.0 | \n",
" English | \n",
" USA | \n",
" PG-13 | \n",
" 250000000.0 | \n",
" 2012 | \n",
" 23000.0 | \n",
" 8.5 | \n",
" 2.35 | \n",
" 164000 | \n",
"
\n",
" \n",
" | 4 | \n",
" NaN | \n",
" Doug Walker | \n",
" NaN | \n",
" NaN | \n",
" 131.0 | \n",
" NaN | \n",
" Rob Walker | \n",
" 131.0 | \n",
" NaN | \n",
" Documentary | \n",
" ... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 12.0 | \n",
" 7.1 | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 28 columns
\n",
"
"
],
"text/plain": [
" color director_name num_critic_for_reviews duration \\\n",
"0 Color James Cameron 723.0 178.0 \n",
"1 Color Gore Verbinski 302.0 169.0 \n",
"2 Color Sam Mendes 602.0 148.0 \n",
"3 Color Christopher Nolan 813.0 164.0 \n",
"4 NaN Doug Walker NaN NaN \n",
"\n",
" director_facebook_likes actor_3_facebook_likes actor_2_name \\\n",
"0 0.0 855.0 Joel David Moore \n",
"1 563.0 1000.0 Orlando Bloom \n",
"2 0.0 161.0 Rory Kinnear \n",
"3 22000.0 23000.0 Christian Bale \n",
"4 131.0 NaN Rob Walker \n",
"\n",
" actor_1_facebook_likes gross genres ... \\\n",
"0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... \n",
"1 40000.0 309404152.0 Action|Adventure|Fantasy ... \n",
"2 11000.0 200074175.0 Action|Adventure|Thriller ... \n",
"3 27000.0 448130642.0 Action|Thriller ... \n",
"4 131.0 NaN Documentary ... \n",
"\n",
" num_user_for_reviews language country content_rating budget \\\n",
"0 3054.0 English USA PG-13 237000000.0 \n",
"1 1238.0 English USA PG-13 300000000.0 \n",
"2 994.0 English UK PG-13 245000000.0 \n",
"3 2701.0 English USA PG-13 250000000.0 \n",
"4 NaN NaN NaN NaN NaN \n",
"\n",
" release_date actor_2_facebook_likes imdb_score aspect_ratio facebook_likes \n",
"0 2009 936.0 7.9 1.78 33000 \n",
"1 2007 5000.0 7.1 2.35 0 \n",
"2 2015 393.0 6.8 2.35 85000 \n",
"3 2012 23000.0 8.5 2.35 164000 \n",
"4 NaN 12.0 7.1 NaN 0 \n",
"\n",
"[5 rows x 28 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we can also rename columns to make them easier to understand\n",
"data = data.rename(columns = {'title_year':'release_date', 'movie_facebook_likes':'facebook_likes'})\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:45:23.357452Z",
"start_time": "2023-09-18T03:45:23.214830Z"
},
"id": "kWqh8VbNkrbB"
},
"outputs": [],
"source": [
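"# after cleaning the data, we usually save the cleaned data to a new file\n",
"# note: `data` was re-read from movie_metadata.csv above, so this file holds the renamed columns but not the earlier cleaning steps\n",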
"data.to_csv('cleanfile.csv', encoding='utf-8')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RIajsttNkrbB"
},
"source": [
"### 2.2 Data Integration"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:46:39.515799Z",
"start_time": "2023-09-18T03:46:39.448042Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 269
},
"id": "61KOsZx0krbB",
"outputId": "97d1998d-b047-45d5-e5bf-ba7ab1c9b8e1"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" data1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" a | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" c | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" a | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" a | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" b | \n",
" 6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key data1\n",
"0 b 0\n",
"1 b 1\n",
"2 a 2\n",
"3 c 3\n",
"4 a 4\n",
"5 a 5\n",
"6 b 6"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1=pd.DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})\n",
"df1\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:46:48.916997Z",
"start_time": "2023-09-18T03:46:48.851700Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "0uajPTV2krbC",
"outputId": "c0f5cbe0-fc21-4bba-96b5-684fcd5ba7f5"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" d | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key data2\n",
"0 a 0\n",
"1 b 1\n",
"2 d 2"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2=pd.DataFrame({'key':['a','b','d'],'data2':range(3)})\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:46:59.724261Z",
"start_time": "2023-09-18T03:46:59.663746Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 237
},
"id": "DFIbTEoBkrbC",
"outputId": "05094f6e-5f9d-4a4e-dd43-af69ec73e359"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" data1 | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" b | \n",
" 6 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" a | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" a | \n",
" 4 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" a | \n",
" 5 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key data1 data2\n",
"0 b 0 1\n",
"1 b 1 1\n",
"2 b 6 1\n",
"3 a 2 0\n",
"4 a 4 0\n",
"5 a 5 0"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we can merge two datasets with pd.merge(); by default it joins on the column(s) the two frames have in common\n",
"pd.merge(df1, df2)\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:13.071316Z",
"start_time": "2023-09-18T03:47:13.047754Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 158
},
"id": "3Ed-uKU0krbC",
"outputId": "6dc5b39f-9d61-452a-dd85-46c3b6863ecf"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" data1 | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" b | \n",
" 6 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" a | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" a | \n",
" 4 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" a | \n",
" 5 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key data1 data2\n",
"0 b 0 1\n",
"1 b 1 1\n",
"2 b 6 1\n",
"3 a 2 0\n",
"4 a 4 0\n",
"5 a 5 0"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# of course, we can also specify the column to merge on explicitly\n",
"pd.merge(df1,df2,on='key')\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:14.013049Z",
"start_time": "2023-09-18T03:47:13.992690Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 269
},
"id": "LHd3iz2PkrbC",
"outputId": "0f8040fd-96bc-4f08-8ce5-ca6710b21746"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1key | \n",
" data1 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" a | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" c | \n",
" 3 | \n",
"
\n",
" \n",
" | 4 | \n",
" a | \n",
" 4 | \n",
"
\n",
" \n",
" | 5 | \n",
" a | \n",
" 5 | \n",
"
\n",
" \n",
" | 6 | \n",
" b | \n",
" 6 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 1key data1\n",
"0 b 0\n",
"1 b 1\n",
"2 a 2\n",
"3 c 3\n",
"4 a 4\n",
"5 a 5\n",
"6 b 6"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we can also merge two datasets whose key columns have different names\n",
"df3=pd.DataFrame({'1key':['b','b','a','c','a','a','b'],'data1':range(7)})\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:15.914926Z",
"start_time": "2023-09-18T03:47:15.857720Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "jIS98LXykrbC",
"outputId": "98d78769-59aa-450d-fa76-e39be42a63de"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2key | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" d | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2key data2\n",
"0 a 0\n",
"1 b 1\n",
"2 d 2"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4=pd.DataFrame({'2key':['a','b','d'],'data2':range(3)})\n",
"df4"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:17.076458Z",
"start_time": "2023-09-18T03:47:17.052756Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 237
},
"id": "pqolG5Y4krbC",
"outputId": "6a99b91b-98ea-4a6b-ee5c-b47ed9f09e8d"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1key | \n",
" data1 | \n",
" 2key | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 0 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" b | \n",
" 6 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" a | \n",
" 2 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" | 4 | \n",
" a | \n",
" 4 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" a | \n",
" 5 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 1key data1 2key data2\n",
"0 b 0 b 1\n",
"1 b 1 b 1\n",
"2 b 6 b 1\n",
"3 a 2 a 0\n",
"4 a 4 a 0\n",
"5 a 5 a 0"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the default mode keeps only the intersection of the key values, which is called an inner join\n",
"pd.merge(df3,df4,left_on='1key',right_on='2key')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:18.436687Z",
"start_time": "2023-09-18T03:47:18.348378Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 508
},
"id": "oftoZPnYkrbD",
"outputId": "f8031665-5137-4d16-9bda-0d6bc4531260"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" key data1\n",
"0 b 0\n",
"1 b 1\n",
"2 a 2\n",
"3 c 3\n",
"4 a 4\n",
"5 a 5\n",
"6 b 6\n",
" key data2\n",
"0 a 0\n",
"1 b 1\n",
"2 d 2\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" data1 | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" b | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" b | \n",
" 1.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" b | \n",
" 6.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" a | \n",
" 2.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" a | \n",
" 4.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 5 | \n",
" a | \n",
" 5.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 6 | \n",
" c | \n",
" 3.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 7 | \n",
" d | \n",
" NaN | \n",
" 2.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key data1 data2\n",
"0 b 0.0 1.0\n",
"1 b 1.0 1.0\n",
"2 b 6.0 1.0\n",
"3 a 2.0 0.0\n",
"4 a 4.0 0.0\n",
"5 a 5.0 0.0\n",
"6 c 3.0 NaN\n",
"7 d NaN 2.0"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# when merge two datasets with outer connection\n",
"print(df1)\n",
"print(df2)\n",
"pd.merge(df1,df2,on='key',how='outer')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UMoL7ygJkrbD"
},
"source": [
"### 2.3 Data Transformation"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2BK5E9LZkrbD"
},
"source": [
"- String Transformation"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:26.817624Z",
"start_time": "2023-09-18T03:47:26.748509Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "yAB_qHQskrbD",
"outputId": "b694d31d-d9ab-4f42-e185-0c96408329ce"
},
"outputs": [
{
"data": {
"text/plain": [
"0 JAMES CAMERON\n",
"1 GORE VERBINSKI\n",
"2 SAM MENDES\n",
"3 CHRISTOPHER NOLAN\n",
"4 DOUG WALKER\n",
" ... \n",
"5038 SCOTT SMITH\n",
"5039 NaN\n",
"5040 BENJAMIN ROBERDS\n",
"5041 DANIEL HSIA\n",
"5042 JON GUNN\n",
"Name: director_name, Length: 5043, dtype: object"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lower and upper case\n",
"data[\"director_name\"].str.upper()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:28.230447Z",
"start_time": "2023-09-18T03:47:28.210928Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "qPmcxRjjkrbD",
"outputId": "7c44dc97-c992-4018-bd54-819749ce1a95"
},
"outputs": [
{
"data": {
"text/plain": [
"0 james cameron\n",
"1 gore verbinski\n",
"2 sam mendes\n",
"3 christopher nolan\n",
"4 doug walker\n",
" ... \n",
"5038 scott smith\n",
"5039 NaN\n",
"5040 benjamin roberds\n",
"5041 daniel hsia\n",
"5042 jon gunn\n",
"Name: director_name, Length: 5043, dtype: object"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lower and upper case\n",
"data[\"director_name\"].str.lower()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:28.991430Z",
"start_time": "2023-09-18T03:47:28.969436Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "cjNXtGKVkrbD",
"outputId": "a656f647-8173-4781-f511-8549299b6fa5"
},
"outputs": [
{
"data": {
"text/plain": [
"0 Avatar\n",
"1 Pirates of the Caribbean: At World's End\n",
"2 Spectre\n",
"3 The Dark Knight Rises\n",
"4 Star Wars: Episode VII - The Force Awakens\n",
" ... \n",
"5038 Signed Sealed Delivered\n",
"5039 The Following\n",
"5040 A Plague So Pleasant\n",
"5041 Shanghai Calling\n",
"5042 My Date with Drew\n",
"Name: movie_title, Length: 5043, dtype: object"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# remove special strings for whitespace, \\n\n",
"data['movie_title'].str.strip()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "yIQEQAiSkrbD"
},
"source": [
"- Number Transformation"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:30.814883Z",
"start_time": "2023-09-18T03:47:30.769872Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "WR4q_pyukrbD",
"outputId": "ddedff31-dd3e-4a0b-d3ab-48a3a6678c7c"
},
"outputs": [
{
"data": {
"text/plain": [
"0 178.0\n",
"1 169.0\n",
"2 148.0\n",
"3 164.0\n",
"4 NaN\n",
" ... \n",
"5038 87.0\n",
"5039 43.0\n",
"5040 76.0\n",
"5041 100.0\n",
"5042 90.0\n",
"Name: duration, Length: 5043, dtype: float64"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# unit transformation\n",
"\n",
"data[\"duration\"]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:31.515954Z",
"start_time": "2023-09-18T03:47:31.433241Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "wEIA6NVTkrbD",
"outputId": "b8cc4e61-844c-43eb-b9ef-74edb3024348"
},
"outputs": [
{
"data": {
"text/plain": [
"0 2.966667\n",
"1 2.816667\n",
"2 2.466667\n",
"3 2.733333\n",
"4 NaN\n",
" ... \n",
"5038 1.450000\n",
"5039 0.716667\n",
"5040 1.266667\n",
"5041 1.666667\n",
"5042 1.500000\n",
"Name: duration, Length: 5043, dtype: float64"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"duration\"] / 60"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:32.022681Z",
"start_time": "2023-09-18T03:47:31.926876Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "mGVjK7nakrbD",
"outputId": "f041f48b-16bb-4872-ce7b-1467359a83f1"
},
"outputs": [
{
"data": {
"text/plain": [
"0 0.339286\n",
"1 0.321429\n",
"2 0.279762\n",
"3 0.311508\n",
"4 NaN\n",
" ... \n",
"5038 0.158730\n",
"5039 0.071429\n",
"5040 0.136905\n",
"5041 0.184524\n",
"5042 0.164683\n",
"Name: duration, Length: 5043, dtype: float64"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# normalization\n",
"norm_duration = (data.duration - data.duration.min()) / (data.duration.max() - data.duration.min())\n",
"norm_duration"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:47:32.721861Z",
"start_time": "2023-09-18T03:47:32.652216Z"
},
"colab": {
"base_uri": "https://localhost:8080/",
"height": 458
},
"id": "dP7ElOT2krbD",
"outputId": "34d1ed8d-e771-4758-b4bb-820b81c65782"
},
"outputs": [
{
"data": {
"text/plain": [
"0 2.809767\n",
"1 2.452587\n",
"2 1.619169\n",
"3 2.254155\n",
"4 NaN\n",
" ... \n",
"5038 -0.801711\n",
"5039 -2.547920\n",
"5040 -1.238264\n",
"5041 -0.285786\n",
"5042 -0.682652\n",
"Name: duration, Length: 5043, dtype: float64"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# standardization\n",
"std_duration = (data.duration - data.duration.mean()) / data.duration.std()\n",
"std_duration"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 272
},
"id": "ZIWLjSwXkrbE",
"outputId": "758e2d2e-0c44-4b3c-aa34-4e58ffba78d3"
},
"outputs": [
{
"data": {
"text/plain": [
"(6.999, 91.0] 1054\n",
"(108.0, 122.0] 1028\n",
"(99.0, 108.0] 1011\n",
"(91.0, 99.0] 984\n",
"(122.0, 511.0] 951\n",
"Name: duration, dtype: int64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# discretization\n",
"# qcut: divie data points into M groups and each group has the basically same number of data points\n",
"m_cut = pd.qcut(data.duration, 5)\n",
"m_cut.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "CLrcHDbbkrbE"
},
"source": [
"## 3. Regular Expression\n",
"1.Metacharacters\n",
"- [] A set of characters\n",
"- \\\t Signals a special sequence (can also be used to escape special characters)\n",
"- .\t Any character (except newline character)\t\"he..o\"\n",
"- ^\t Starts with \"^hello\"\n",
"- Ends with \"planet\\$\"\n",
"- \\* Zero or more occurrences\t\"he.*o\"\n",
"- \\+ One or more occurrences\t\"he.+o\"\n",
"- ?\t Zero or one occurrences\t\"he.?o\"\n",
"- {} Exactly the specified number of occurrences\t\"he.{2}o\"\n",
"- |\t Either or\t\"falls|stays\"\n",
"- () Capture and group\n",
"\n",
"2.Special Sequences\n",
"- \\A\t Returns a match if the specified characters are at the beginning of the string\t\"\\AThe\"\n",
"- \\b\t Returns a match where the specified characters are at the beginning or at the end of a word\n",
"(the \"r\" in the beginning is making sure that the string is being treated as a \"raw string\")\tr\"\\bain\"\n",
"r\"ain\\b\"\n",
"- \\B\t Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word\n",
"(the \"r\" in the beginning is making sure that the string is being treated as a \"raw string\")\tr\"\\Bain\"\n",
"r\"ain\\B\"\n",
"- \\d\t Returns a match where the string contains digits (numbers from 0-9)\t\"\\d\"\n",
"- \\D\t Returns a match where the string DOES NOT contain digits\t\"\\D\"\n",
"- \\s\t Returns a match where the string contains a white space character\t\"\\s\"\n",
"- \\S\t Returns a match where the string DOES NOT contain a white space character\t\"\\S\"\n",
"- \\w\t Returns a match where the string contains any word characters (characters from a to Z, digits from 0-9, and the underscore _ character)\t\"\\w\"\n",
"- \\W\t Returns a match where the string DOES NOT contain any word characters\t\"\\W\"\n",
"- \\Z\t Returns a match if the specified characters are at the end of the string\n",
"\n",
"3.Sets\n",
"- [arn]\t Returns a match where one of the specified characters (a, r, or n) is present\n",
"- [a-n]\t Returns a match for any lower case character, alphabetically between a and n\n",
"- [^arn] Returns a match for any character EXCEPT a, r, and n\n",
"- [0123] Returns a match where any of the specified digits (0, 1, 2, or 3) are present\n",
"- [0-9]\t Returns a match for any digit between 0 and 9\n",
"- [0-5][0-9] Returns a match for any two-digit numbers from 00 and 59\n",
"- [a-zA-Z] Returns a match for any character alphabetically between a and z, lower case OR upper case\n",
"- [+] In sets, +, *, ., |, (), $,{} has no special meaning, so [+] means: return a match for any + character in the string\n",
"\n",
"4.Funtions\n",
"- The findall() function returns a list containing all matches.\n",
"- The search() function searches the string for a match, and returns a Match object if there is a match. If there is more than one match, only the first occurrence of the match will be returned. If no matches are found, the value None is returned.\n",
"- The split() function returns a list where the string has been split at each match.\n",
"- The sub() function replaces the matches with the text of your choice.\n",
"\n",
"5.Ojects\n",
"- A Match Object is an object containing information about the search and the result.\n",
"- The Match object has properties and methods used to retrieve information about the search, and the result:\n",
"\n",
"- .span() returns a tuple containing the start-, and end positions of the match.\n",
"\n",
"- .string returns the string passed into the function\n",
"\n",
"- .group() returns the part of the string where there was a match\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:12.049857Z",
"start_time": "2023-09-18T03:48:12.043616Z"
},
"id": "HV6Ogn21krbE"
},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:12.614389Z",
"start_time": "2023-09-18T03:48:12.583682Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7H5CtFJokrbE",
"outputId": "3a0d39f8-1734-4c42-b41d-c184a4ca2a42"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ai', 'ai']\n"
]
}
],
"source": [
"# findall function\n",
"txt = \"The rain in Spain\"\n",
"x = re.findall(\"ai\", txt)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:13.213704Z",
"start_time": "2023-09-18T03:48:13.165424Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_R3vTz-kkrbE",
"outputId": "5ffdf73f-b7ae-4a37-8bf9-b4282157af5e"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
}
],
"source": [
"# Return an empty list if no match was found:\n",
"txt = \"The rain in Spain\"\n",
"x = re.findall(\"Portugal\", txt)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:13.815165Z",
"start_time": "2023-09-18T03:48:13.761650Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IaXU0hASkrbE",
"outputId": "0305665b-2589-4f98-c9c5-f65efd615833"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The first white-space character is located in position: 3\n"
]
}
],
"source": [
"# search function\n",
"txt = \"The rain in Spain\"\n",
"x = re.search(\"\\s\", txt)\n",
"\n",
"print(\"The first white-space character is located in position:\", x.start())"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:14.294609Z",
"start_time": "2023-09-18T03:48:14.286784Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3n8B_GoMkrbE",
"outputId": "100a806d-8e93-4ecc-8e7f-ba18e33522ad"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n"
]
}
],
"source": [
"# If no matches are found, the value None is returned:\n",
"txt = \"The rain in Spain\"\n",
"x = re.search(\"Portugal\", txt)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:15.020439Z",
"start_time": "2023-09-18T03:48:15.012276Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b5a5JHi9krbE",
"outputId": "95684ab7-c94c-4401-fdea-a4334d60ef84"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['The', 'rain', 'in', 'Spain']\n"
]
}
],
"source": [
"# split function\n",
"txt = \"The rain in Spain\"\n",
"x = re.split(\"\\s\", txt)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:15.514395Z",
"start_time": "2023-09-18T03:48:15.416765Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YEA5w9gdkrbE",
"outputId": "f869d498-864b-4216-e75f-d528e165a561"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['The', 'rain in Spain']\n"
]
}
],
"source": [
"# Split the string only at the first occurrence:\n",
"txt = \"The rain in Spain\"\n",
"x = re.split(\"\\s\", txt, 1)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:16.714523Z",
"start_time": "2023-09-18T03:48:16.637813Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CLliIj2GkrbE",
"outputId": "9e497535-bc17-4a14-ad60-8888d0d24f1a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The9rain9in9Spain\n"
]
}
],
"source": [
"# Replace every white-space character with the number 9:\n",
"txt = \"The rain in Spain\"\n",
"x = re.sub(\"\\s\", \"9\", txt)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:17.714569Z",
"start_time": "2023-09-18T03:48:17.646947Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "L0yXdAeokrbF",
"outputId": "289c797b-9467-47b5-c33b-d1b518b0ccaa"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The9rain9in Spain\n"
]
}
],
"source": [
"# Replace the first 2 occurrences:\n",
"txt = \"The rain in Spain\"\n",
"x = re.sub(\"\\s\", \"9\", txt, 2)\n",
"print(x)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:18.316287Z",
"start_time": "2023-09-18T03:48:18.307583Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0Iis9jAfkrbF",
"outputId": "00d31da8-eb5e-4731-a95e-ec76270aed6b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Do a search that will return a Match Object:\n",
"txt = \"The rain in Spain\"\n",
"x = re.search(\"ai\", txt)\n",
"print(x) #this will print an object"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:19.002495Z",
"start_time": "2023-09-18T03:48:18.993177Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Y7q-Q085krbF",
"outputId": "41c42c6d-7a4c-41a9-e8cc-fddd1d235215"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(12, 17)\n"
]
}
],
"source": [
"# Print the position (start- and end-position) of the first match occurrence.\n",
"# The regular expression looks for any words that starts with an upper case \"S\":\n",
"txt = \"The rain in Spain\"\n",
"x = re.search(r\"\\bS\\w+\", txt)\n",
"print(x.span())"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:19.813881Z",
"start_time": "2023-09-18T03:48:19.779228Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zrwiUxrRkrbF",
"outputId": "e14d8a93-f22d-4c06-f1bb-90bdd1a4a632"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The rain in Spain\n"
]
}
],
"source": [
"# Print the string passed into the function:\n",
"txt = \"The rain in Spain\"\n",
"x = re.search(r\"\\bS\\w+\", txt)\n",
"print(x.string)\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:20.914398Z",
"start_time": "2023-09-18T03:48:20.874691Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "0kLFYrlMkrbF",
"outputId": "1049b244-42e4-4cfb-bd4d-08cc8353ce77"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Spain\n"
]
}
],
"source": [
"# Print the part of the string where there was a match.\n",
"# The regular expression looks for any words that starts with an upper case \"S\":\n",
"txt = \"The rain in Spain\"\n",
"x = re.search(r\"\\bS\\w+\", txt)\n",
"print(x.group())"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:48:22.314638Z",
"start_time": "2023-09-18T03:48:22.223651Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vFgGxVlNkrbF",
"outputId": "23dcce9a-e115-47d1-848b-5d6d82e3455d",
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['test@outlook.com', '123456@qq.com']\n"
]
}
],
"source": [
"# Construction a regular expression which could extract e-mail\n",
"pattern = re.compile(r\"[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(?:\\.[a-zA-Z0-9_-]+)\")\n",
"\n",
"strs = 'My personal e-mail is test@outlook.com, company e-mail is 123456@qq.com'\n",
"result = pattern.findall(strs)\n",
"\n",
"print(result)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F1N4BmOXkrbF"
},
"source": [
"## 4. Practice"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9xyENVXekrbF"
},
"source": [
"**data preprocessing**"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "VXn30yzAkrbF",
"outputId": "cb53816a-a2b8-4330-e37a-c3681ad90037"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4\n",
"0 1 2 3 4 NaN\n",
"1 2 3 4 5 6.0\n",
"2 3 4 5 6 7.0"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = [[1, 2, 3, 4, ],\n",
" [2, 3, 4, 5, 6],\n",
" [3, 4, 5, 6, 7]]\n",
"data = pd.DataFrame(a)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "hSA5JVM8krbF",
"outputId": "b34104e3-96b1-4df2-f112-54be3824a71c"
},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (3319737963.py, line 2)",
"output_type": "error",
"traceback": [
"\u001b[1;36m Cell \u001b[1;32mIn[59], line 2\u001b[1;36m\u001b[0m\n\u001b[1;33m norm_data =\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"# obtain the normalized data\n",
"norm_data =\n",
"norm_data"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "kaN38zLnkrbF",
"outputId": "4a6ec00b-a0ce-46e8-e558-be50a868c30e"
},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (2759590456.py, line 2)",
"output_type": "error",
"traceback": [
"\u001b[1;36m Cell \u001b[1;32mIn[60], line 2\u001b[1;36m\u001b[0m\n\u001b[1;33m std_data =\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"# obtain the standarized data\n",
"std_data =\n",
"std_data"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:22:05.915456Z",
"start_time": "2023-09-18T03:22:05.884455Z"
},
"id": "fOVc5VOKkrbG"
},
"source": [
"**regularization**"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"ExecuteTime": {
"end_time": "2023-09-18T03:20:34.514476Z",
"start_time": "2023-09-18T03:20:34.444321Z"
},
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "g11XHzTckrbG",
"outputId": "3b0c5f33-c94e-4943-a0d5-ed3c6fba867e"
},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (2933051834.py, line 2)",
"output_type": "error",
"traceback": [
"\u001b[1;36m Cell \u001b[1;32mIn[61], line 2\u001b[1;36m\u001b[0m\n\u001b[1;33m pattern = #insert your answer here\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"# Construct a regular expression which could extract date\n",
"pattern = #insert your answer here\n",
"strs = 'Today is 2022/09/13, today in the last year is 2021.09.13, today in the next year is 2023-09-13'\n",
"result = #insert your answer here\n",
"print(result)\n",
"# The answer is"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "KMFLhR3fkrbG"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "good",
"language": "python",
"name": "good"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"vscode": {
"interpreter": {
"hash": "88279d2366fe020547cde40dd65aa0e3aa662a6ec1f3ca12d88834876c85e1a6"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}