{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Y9Tga5BDkra8" }, "source": [ "# CS5481 - Tutorial 3\n", "## Data Preprocessing and Regular Expressions\n", "\n", "\n", "## Preparation\n", "- Python\n", "- Python Libraries\n", "  - Pandas\n", "  - re (part of the Python standard library; no installation needed)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2023-09-18T03:28:17.230Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "GFTw5qL4kra9", "outputId": "27f4f5fb-cba9-4b5a-bb01-c576ab6edafa" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.1.4)\n", "Requirement already satisfied: numpy<2,>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas) (1.26.4)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", "\u001b[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)\u001b[0m\u001b[31m\n", "\u001b[0m\u001b[31mERROR: No matching distribution found for re\u001b[0m\u001b[31m\n", "\u001b[0m" ] } ], "source": [ "!pip install pandas" ] }, { "cell_type": "markdown", "metadata": { "id": "sVwmV7dokra9" }, "source": [ "## 1. Import Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:28:53.632817Z", "start_time": "2023-09-18T03:28:53.034836Z" }, "id": "ubkMahTRkra9" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": { "id": "h5DXLw8ckra9" }, "source": [ "## 2. 
Data Preprocessing" ] }, { "cell_type": "markdown", "metadata": { "id": "IuFSmRA3kra-" }, "source": [ "### 2.1 Data Cleaning" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:33:58.665755Z", "start_time": "2023-09-18T03:33:58.598859Z" }, "id": "e3sCxYlfkra-" }, "outputs": [], "source": [ "data = pd.read_csv(r'movie_metadata.csv', encoding=\"utf-8\")" ] }, { "cell_type": "markdown", "metadata": { "id": "lS_CYbgXkra-" }, "source": [ "- Basic Operations" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:34:02.565803Z", "start_time": "2023-09-18T03:34:02.510710Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 307 }, "id": "wUUU2_--kra-", "outputId": "13ce574c-ba60-4bae-88d1-4fb9588beb76" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
4NaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentary...NaNNaNNaNNaNNaNNaN12.07.1NaN0
\n", "

5 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.0 \n", "1 Color Gore Verbinski 302.0 169.0 \n", "2 Color Sam Mendes 602.0 148.0 \n", "3 Color Christopher Nolan 813.0 164.0 \n", "4 NaN Doug Walker NaN NaN \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "2 0.0 161.0 Rory Kinnear \n", "3 22000.0 23000.0 Christian Bale \n", "4 131.0 NaN Rob Walker \n", "\n", " actor_1_facebook_likes gross genres ... \\\n", "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... \n", "1 40000.0 309404152.0 Action|Adventure|Fantasy ... \n", "2 11000.0 200074175.0 Action|Adventure|Thriller ... \n", "3 27000.0 448130642.0 Action|Thriller ... \n", "4 131.0 NaN Documentary ... \n", "\n", " num_user_for_reviews language country content_rating budget \\\n", "0 3054.0 English USA PG-13 237000000.0 \n", "1 1238.0 English USA PG-13 300000000.0 \n", "2 994.0 English UK PG-13 245000000.0 \n", "3 2701.0 English USA PG-13 250000000.0 \n", "4 NaN NaN NaN NaN NaN \n", "\n", " title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n", "0 2009.0 936.0 7.9 1.78 \n", "1 2007.0 5000.0 7.1 2.35 \n", "2 2015.0 393.0 6.8 2.35 \n", "3 2012.0 23000.0 8.5 2.35 \n", "4 NaN 12.0 7.1 NaN \n", "\n", " movie_facebook_likes \n", "0 33000 \n", "1 0 \n", "2 85000 \n", "3 164000 \n", "4 0 \n", "\n", "[5 rows x 28 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the first 5 lines of the file\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:34:04.255414Z", "start_time": "2023-09-18T03:34:04.206760Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 325 }, "id": "YU__Sh7pkra-", "outputId": "ff46f456-82ee-459b-8c37-a0d600fd1f75" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
5038ColorScott Smith1.087.02.0318.0Daphne Zuniga637.0NaNComedy|Drama...6.0EnglishCanadaNaNNaN2013.0470.07.7NaN84
5039ColorNaN43.043.0NaN319.0Valorie Curry841.0NaNCrime|Drama|Mystery|Thriller...359.0EnglishUSATV-14NaNNaN593.07.516.0032000
5040ColorBenjamin Roberds13.076.00.00.0Maxwell Moody0.0NaNDrama|Horror|Thriller...3.0EnglishUSANaN1400.02013.00.06.3NaN16
5041ColorDaniel Hsia14.0100.00.0489.0Daniel Henney946.010443.0Comedy|Drama|Romance...9.0EnglishUSAPG-13NaN2012.0719.06.32.35660
5042ColorJon Gunn43.090.016.016.0Brian Herzlinger86.085222.0Documentary...84.0EnglishUSAPG1100.02004.023.06.61.85456
\n", "

5 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "5038 Color Scott Smith 1.0 87.0 \n", "5039 Color NaN 43.0 43.0 \n", "5040 Color Benjamin Roberds 13.0 76.0 \n", "5041 Color Daniel Hsia 14.0 100.0 \n", "5042 Color Jon Gunn 43.0 90.0 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "5038 2.0 318.0 Daphne Zuniga \n", "5039 NaN 319.0 Valorie Curry \n", "5040 0.0 0.0 Maxwell Moody \n", "5041 0.0 489.0 Daniel Henney \n", "5042 16.0 16.0 Brian Herzlinger \n", "\n", " actor_1_facebook_likes gross genres ... \\\n", "5038 637.0 NaN Comedy|Drama ... \n", "5039 841.0 NaN Crime|Drama|Mystery|Thriller ... \n", "5040 0.0 NaN Drama|Horror|Thriller ... \n", "5041 946.0 10443.0 Comedy|Drama|Romance ... \n", "5042 86.0 85222.0 Documentary ... \n", "\n", " num_user_for_reviews language country content_rating budget \\\n", "5038 6.0 English Canada NaN NaN \n", "5039 359.0 English USA TV-14 NaN \n", "5040 3.0 English USA NaN 1400.0 \n", "5041 9.0 English USA PG-13 NaN \n", "5042 84.0 English USA PG 1100.0 \n", "\n", " title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n", "5038 2013.0 470.0 7.7 NaN \n", "5039 NaN 593.0 7.5 16.00 \n", "5040 2013.0 0.0 6.3 NaN \n", "5041 2012.0 719.0 6.3 2.35 \n", "5042 2004.0 23.0 6.6 1.85 \n", "\n", " movie_facebook_likes \n", "5038 84 \n", "5039 32000 \n", "5040 16 \n", "5041 660 \n", "5042 456 \n", "\n", "[5 rows x 28 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# show the last 5 lines of the file\n", "data.tail()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:34:07.023407Z", "start_time": "2023-09-18T03:34:06.998476Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 335 }, "id": "nUV3nBfPkra-", "outputId": "dda7fe83-3080-4e28-d2a4-17c58cb9273e" }, "outputs": [ { "data": { "text/plain": [ "count 5028.000000\n", "mean 107.201074\n", "std 
25.197441\n", "min 7.000000\n", "25% 93.000000\n", "50% 103.000000\n", "75% 118.000000\n", "max 511.000000\n", "Name: duration, dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check stat info of columns: data.columnname.describe()\n", "data.duration.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:38:51.325376Z", "start_time": "2023-09-18T03:38:51.311134Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "w4jXV9cAkra-", "outputId": "566a8516-fa0a-4ed1-dfa1-9f77fd7c903d" }, "outputs": [ { "data": { "text/plain": [ "0 Color\n", "1 Color\n", "2 Color\n", "3 Color\n", "4 NaN\n", " ... \n", "5038 Color\n", "5039 Color\n", "5040 Color\n", "5041 Color\n", "5042 Color\n", "Name: color, Length: 5043, dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# choose a column: data[columnname]\n", "data['color']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:38:54.516852Z", "start_time": "2023-09-18T03:38:54.492521Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 397 }, "id": "w1CDAQkckra-", "outputId": "ed4116a5-de00-40dc-f5f7-b3e4d494a5b1" }, "outputs": [ { "data": { "text/plain": [ "0 Color\n", "1 Color\n", "2 Color\n", "3 Color\n", "4 NaN\n", "5 Color\n", "6 Color\n", "7 Color\n", "8 Color\n", "9 Color\n", "Name: color, dtype: object" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# choose the first K lines: data['columnname'][:K]\n", "K = 10\n", "data[\"color\"][:K]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:39:08.441758Z", "start_time": "2023-09-18T03:39:08.415829Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "dG-J0eWokra-", "outputId": 
"08fc90f9-44f6-4cb0-d947-9565d2ec31f2" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_name
0ColorJames Cameron
1ColorGore Verbinski
2ColorSam Mendes
3ColorChristopher Nolan
4NaNDoug Walker
.........
5038ColorScott Smith
5039ColorNaN
5040ColorBenjamin Roberds
5041ColorDaniel Hsia
5042ColorJon Gunn
\n", "

5043 rows × 2 columns

\n", "
" ], "text/plain": [ " color director_name\n", "0 Color James Cameron\n", "1 Color Gore Verbinski\n", "2 Color Sam Mendes\n", "3 Color Christopher Nolan\n", "4 NaN Doug Walker\n", "... ... ...\n", "5038 Color Scott Smith\n", "5039 Color NaN\n", "5040 Color Benjamin Roberds\n", "5041 Color Daniel Hsia\n", "5042 Color Jon Gunn\n", "\n", "[5043 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# choose multiple columns: data[[\"column1\", \"column2\"]]\n", "data[[\"color\", \"director_name\"]]\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:40:36.171107Z", "start_time": "2023-09-18T03:40:36.105984Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 617 }, "id": "ahas9hYCkra_", "outputId": "ce8c3fd2-20a7-4b6e-8b37-d3ec15047e6b" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
6ColorSam Raimi392.0156.00.04000.0James Franco24000.0336530303.0Action|Adventure|Romance...1902.0EnglishUSAPG-13258000000.02007.011000.06.22.350
9ColorDavid Yates375.0153.0282.010000.0Daniel Radcliffe25000.0301956980.0Adventure|Family|Fantasy|Mystery...973.0EnglishUKPG250000000.02009.011000.07.52.3510000
..................................................................
4688ColorSteve James53.0170.023.02.0Arthur Agee7.07830611.0Documentary|Drama|Sport...74.0EnglishUSAPG-13700000.01994.06.08.31.330
4694ColorPeter Jackson446.0201.00.084.0Thomas Kretschmann6000.0218051260.0Action|Adventure|Drama|Romance...2618.0EnglishNew ZealandPG-13207000000.02005.0918.07.22.350
4708ColorMichael Wadleigh53.0215.014.0136.0Jimi Hendrix262.013300000.0Documentary|History|Music...63.0EnglishUSAR600000.01970.0227.08.12.200
4747Black and WhiteAkira Kurosawa153.0202.00.04.0Minoru Chiaki304.0269061.0Action|Adventure|Drama...596.0JapaneseJapanUnrated2000000.01954.08.08.71.3711000
4885Black and WhiteKing Vidor48.0151.054.06.0Renée Adorée81.0NaNDrama|Romance|War...45.0NaNUSANot Rated245000.01925.012.08.31.33226
\n", "

205 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.0 \n", "1 Color Gore Verbinski 302.0 169.0 \n", "3 Color Christopher Nolan 813.0 164.0 \n", "6 Color Sam Raimi 392.0 156.0 \n", "9 Color David Yates 375.0 153.0 \n", "... ... ... ... ... \n", "4688 Color Steve James 53.0 170.0 \n", "4694 Color Peter Jackson 446.0 201.0 \n", "4708 Color Michael Wadleigh 53.0 215.0 \n", "4747 Black and White Akira Kurosawa 153.0 202.0 \n", "4885 Black and White King Vidor 48.0 151.0 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "3 22000.0 23000.0 Christian Bale \n", "6 0.0 4000.0 James Franco \n", "9 282.0 10000.0 Daniel Radcliffe \n", "... ... ... ... \n", "4688 23.0 2.0 Arthur Agee \n", "4694 0.0 84.0 Thomas Kretschmann \n", "4708 14.0 136.0 Jimi Hendrix \n", "4747 0.0 4.0 Minoru Chiaki \n", "4885 54.0 6.0 Renée Adorée \n", "\n", " actor_1_facebook_likes gross genres \\\n", "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", "3 27000.0 448130642.0 Action|Thriller \n", "6 24000.0 336530303.0 Action|Adventure|Romance \n", "9 25000.0 301956980.0 Adventure|Family|Fantasy|Mystery \n", "... ... ... ... \n", "4688 7.0 7830611.0 Documentary|Drama|Sport \n", "4694 6000.0 218051260.0 Action|Adventure|Drama|Romance \n", "4708 262.0 13300000.0 Documentary|History|Music \n", "4747 304.0 269061.0 Action|Adventure|Drama \n", "4885 81.0 NaN Drama|Romance|War \n", "\n", " ... num_user_for_reviews language country content_rating \\\n", "0 ... 3054.0 English USA PG-13 \n", "1 ... 1238.0 English USA PG-13 \n", "3 ... 2701.0 English USA PG-13 \n", "6 ... 1902.0 English USA PG-13 \n", "9 ... 973.0 English UK PG \n", "... ... ... ... ... ... \n", "4688 ... 74.0 English USA PG-13 \n", "4694 ... 2618.0 English New Zealand PG-13 \n", "4708 ... 63.0 English USA R \n", "4747 ... 
596.0 Japanese Japan Unrated \n", "4885 ... 45.0 NaN USA Not Rated \n", "\n", " budget title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n", "0 237000000.0 2009.0 936.0 7.9 1.78 \n", "1 300000000.0 2007.0 5000.0 7.1 2.35 \n", "3 250000000.0 2012.0 23000.0 8.5 2.35 \n", "6 258000000.0 2007.0 11000.0 6.2 2.35 \n", "9 250000000.0 2009.0 11000.0 7.5 2.35 \n", "... ... ... ... ... ... \n", "4688 700000.0 1994.0 6.0 8.3 1.33 \n", "4694 207000000.0 2005.0 918.0 7.2 2.35 \n", "4708 600000.0 1970.0 227.0 8.1 2.20 \n", "4747 2000000.0 1954.0 8.0 8.7 1.37 \n", "4885 245000.0 1925.0 12.0 8.3 1.33 \n", "\n", " movie_facebook_likes \n", "0 33000 \n", "1 0 \n", "3 164000 \n", "6 0 \n", "9 10000 \n", "... ... \n", "4688 0 \n", "4694 0 \n", "4708 0 \n", "4747 11000 \n", "4885 226 \n", "\n", "[205 rows x 28 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# where filtering: data[data['columnname'] > condition]\n", "# choose films whose duration is larger than 150 mins\n", "data[data[\"duration\"] > 150]\n" ] }, { "cell_type": "markdown", "metadata": { "id": "ScdvgzeQkra_" }, "source": [ "- Process NAN Data\n", "1. fill value\n", "2. remove corresponding lines\n", "3. remove columns where many values are nan" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:40:39.165689Z", "start_time": "2023-09-18T03:40:39.113922Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 443 }, "id": "R1gY1jMKkra_", "outputId": "4a8bcfcc-3e43-40dc-cc68-9efe0084e226" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
1FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
3FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4TrueFalseTrueTrueFalseTrueFalseFalseTrueFalse...TrueTrueTrueTrueTrueTrueFalseFalseTrueFalse
..................................................................
5038FalseFalseFalseFalseFalseFalseFalseFalseTrueFalse...FalseFalseFalseTrueTrueFalseFalseFalseTrueFalse
5039FalseTrueFalseFalseTrueFalseFalseFalseTrueFalse...FalseFalseFalseFalseTrueTrueFalseFalseFalseFalse
5040FalseFalseFalseFalseFalseFalseFalseFalseTrueFalse...FalseFalseFalseTrueFalseFalseFalseFalseTrueFalse
5041FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
5042FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", "

5043 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 False False False False \n", "1 False False False False \n", "2 False False False False \n", "3 False False False False \n", "4 True False True True \n", "... ... ... ... ... \n", "5038 False False False False \n", "5039 False True False False \n", "5040 False False False False \n", "5041 False False False False \n", "5042 False False False False \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 False False False \n", "1 False False False \n", "2 False False False \n", "3 False False False \n", "4 False True False \n", "... ... ... ... \n", "5038 False False False \n", "5039 True False False \n", "5040 False False False \n", "5041 False False False \n", "5042 False False False \n", "\n", " actor_1_facebook_likes gross genres ... num_user_for_reviews \\\n", "0 False False False ... False \n", "1 False False False ... False \n", "2 False False False ... False \n", "3 False False False ... False \n", "4 False True False ... True \n", "... ... ... ... ... ... \n", "5038 False True False ... False \n", "5039 False True False ... False \n", "5040 False True False ... False \n", "5041 False False False ... False \n", "5042 False False False ... False \n", "\n", " language country content_rating budget title_year \\\n", "0 False False False False False \n", "1 False False False False False \n", "2 False False False False False \n", "3 False False False False False \n", "4 True True True True True \n", "... ... ... ... ... ... 
\n", "5038 False False True True False \n", "5039 False False False True True \n", "5040 False False True False False \n", "5041 False False False True False \n", "5042 False False False False False \n", "\n", " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \n", "0 False False False False \n", "1 False False False False \n", "2 False False False False \n", "3 False False False False \n", "4 False False True False \n", "... ... ... ... ... \n", "5038 False False True False \n", "5039 False False False False \n", "5040 False False True False \n", "5041 False False False False \n", "5042 False False False False \n", "\n", "[5043 rows x 28 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check all nan data\n", "data.isna()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:40:43.044017Z", "start_time": "2023-09-18T03:40:43.023342Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "frBhEjjwkra_", "outputId": "4384542b-9360-49cc-b9e9-3b879c966b9d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 USA\n", "1 USA\n", "2 UK\n", "3 USA\n", "4 cs5481\n", "Name: country, dtype: object\n", "0 178.000000\n", "1 169.000000\n", "2 148.000000\n", "3 164.000000\n", "4 107.201074\n", " ... 
\n", "5038 87.000000\n", "5039 43.000000\n", "5040 76.000000\n", "5041 100.000000\n", "5042 90.000000\n", "Name: duration, Length: 5043, dtype: float64\n" ] } ], "source": [ "# fill data with suitable values\n", "# for example, use \"cs5481\" to replace nan values in column \"country\"\n", "data.country = data.country.fillna(\"cs5481\")\n", "print(data.head().country)\n", "# use mean duration to replace nan values in column \"duration\"\n", "data.duration = data.duration.fillna(data.duration.mean())\n", "print(data.duration)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:00.088219Z", "start_time": "2023-09-18T03:41:00.019817Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 565 }, "id": "5PhEMJxrkra_", "outputId": "6f3727ed-4876-46c6-8faf-d5178efd73b6" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
5ColorAndrew Stanton462.0132.0475.0530.0Samantha Morton640.073058679.0Action|Adventure|Sci-Fi...738.0EnglishUSAPG-13263700000.02012.0632.06.62.3524000
..................................................................
5026ColorOlivier Assayas81.0110.0107.045.0Béatrice Dalle576.0136007.0Drama|Music|Romance...39.0FrenchFranceR4500.02004.0133.06.92.35171
5027ColorJafar Panahi64.090.0397.00.0Nargess Mamizadeh5.0673780.0Drama...26.0PersianIranNot Rated10000.02000.00.07.51.85697
5033ColorShane Carruth143.077.0291.08.0David Sullivan291.0424760.0Drama|Sci-Fi|Thriller...371.0EnglishUSAPG-137000.02004.045.07.01.8519000
5035ColorRobert Rodriguez56.081.00.06.0Peter Marquardt121.02040920.0Action|Crime|Drama|Romance|Thriller...130.0SpanishUSAR7000.01992.020.06.91.370
5042ColorJon Gunn43.090.016.016.0Brian Herzlinger86.085222.0Documentary...84.0EnglishUSAPG1100.02004.023.06.61.85456
\n", "

3756 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.0 \n", "1 Color Gore Verbinski 302.0 169.0 \n", "2 Color Sam Mendes 602.0 148.0 \n", "3 Color Christopher Nolan 813.0 164.0 \n", "5 Color Andrew Stanton 462.0 132.0 \n", "... ... ... ... ... \n", "5026 Color Olivier Assayas 81.0 110.0 \n", "5027 Color Jafar Panahi 64.0 90.0 \n", "5033 Color Shane Carruth 143.0 77.0 \n", "5035 Color Robert Rodriguez 56.0 81.0 \n", "5042 Color Jon Gunn 43.0 90.0 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "2 0.0 161.0 Rory Kinnear \n", "3 22000.0 23000.0 Christian Bale \n", "5 475.0 530.0 Samantha Morton \n", "... ... ... ... \n", "5026 107.0 45.0 Béatrice Dalle \n", "5027 397.0 0.0 Nargess Mamizadeh \n", "5033 291.0 8.0 David Sullivan \n", "5035 0.0 6.0 Peter Marquardt \n", "5042 16.0 16.0 Brian Herzlinger \n", "\n", " actor_1_facebook_likes gross \\\n", "0 1000.0 760505847.0 \n", "1 40000.0 309404152.0 \n", "2 11000.0 200074175.0 \n", "3 27000.0 448130642.0 \n", "5 640.0 73058679.0 \n", "... ... ... \n", "5026 576.0 136007.0 \n", "5027 5.0 673780.0 \n", "5033 291.0 424760.0 \n", "5035 121.0 2040920.0 \n", "5042 86.0 85222.0 \n", "\n", " genres ... num_user_for_reviews language \\\n", "0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English \n", "1 Action|Adventure|Fantasy ... 1238.0 English \n", "2 Action|Adventure|Thriller ... 994.0 English \n", "3 Action|Thriller ... 2701.0 English \n", "5 Action|Adventure|Sci-Fi ... 738.0 English \n", "... ... ... ... ... \n", "5026 Drama|Music|Romance ... 39.0 French \n", "5027 Drama ... 26.0 Persian \n", "5033 Drama|Sci-Fi|Thriller ... 371.0 English \n", "5035 Action|Crime|Drama|Romance|Thriller ... 130.0 Spanish \n", "5042 Documentary ... 
84.0 English \n", "\n", " country content_rating budget title_year actor_2_facebook_likes \\\n", "0 USA PG-13 237000000.0 2009.0 936.0 \n", "1 USA PG-13 300000000.0 2007.0 5000.0 \n", "2 UK PG-13 245000000.0 2015.0 393.0 \n", "3 USA PG-13 250000000.0 2012.0 23000.0 \n", "5 USA PG-13 263700000.0 2012.0 632.0 \n", "... ... ... ... ... ... \n", "5026 France R 4500.0 2004.0 133.0 \n", "5027 Iran Not Rated 10000.0 2000.0 0.0 \n", "5033 USA PG-13 7000.0 2004.0 45.0 \n", "5035 USA R 7000.0 1992.0 20.0 \n", "5042 USA PG 1100.0 2004.0 23.0 \n", "\n", " imdb_score aspect_ratio movie_facebook_likes \n", "0 7.9 1.78 33000 \n", "1 7.1 2.35 0 \n", "2 6.8 2.35 85000 \n", "3 8.5 2.35 164000 \n", "5 6.6 2.35 24000 \n", "... ... ... ... \n", "5026 6.9 2.35 171 \n", "5027 7.5 1.85 697 \n", "5033 7.0 1.85 19000 \n", "5035 6.9 1.37 0 \n", "5042 6.6 1.85 456 \n", "\n", "[3756 rows x 28 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove lines where some values are nan\n", "data.dropna()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:02.580068Z", "start_time": "2023-09-18T03:41:02.459971Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 565 }, "id": "f0EIh-sskra_", "outputId": "f5ab0441-5819-4aa9-a10b-34d21a3dc472" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.0000000.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.000000563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.0000000.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.00000022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
4NaNDoug WalkerNaN107.201074131.0NaNRob Walker131.0NaNDocumentary...NaNNaNcs5481NaNNaNNaN12.07.1NaN0
..................................................................
5038ColorScott Smith1.087.0000002.0318.0Daphne Zuniga637.0NaNComedy|Drama...6.0EnglishCanadaNaNNaN2013.0470.07.7NaN84
5039ColorNaN43.043.000000NaN319.0Valorie Curry841.0NaNCrime|Drama|Mystery|Thriller...359.0EnglishUSATV-14NaNNaN593.07.516.0032000
5040ColorBenjamin Roberds13.076.0000000.00.0Maxwell Moody0.0NaNDrama|Horror|Thriller...3.0EnglishUSANaN1400.02013.00.06.3NaN16
5041ColorDaniel Hsia14.0100.0000000.0489.0Daniel Henney946.010443.0Comedy|Drama|Romance...9.0EnglishUSAPG-13NaN2012.0719.06.32.35660
5042ColorJon Gunn43.090.00000016.016.0Brian Herzlinger86.085222.0Documentary...84.0EnglishUSAPG1100.02004.023.06.61.85456
\n", "

5043 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.000000 \n", "1 Color Gore Verbinski 302.0 169.000000 \n", "2 Color Sam Mendes 602.0 148.000000 \n", "3 Color Christopher Nolan 813.0 164.000000 \n", "4 NaN Doug Walker NaN 107.201074 \n", "... ... ... ... ... \n", "5038 Color Scott Smith 1.0 87.000000 \n", "5039 Color NaN 43.0 43.000000 \n", "5040 Color Benjamin Roberds 13.0 76.000000 \n", "5041 Color Daniel Hsia 14.0 100.000000 \n", "5042 Color Jon Gunn 43.0 90.000000 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "2 0.0 161.0 Rory Kinnear \n", "3 22000.0 23000.0 Christian Bale \n", "4 131.0 NaN Rob Walker \n", "... ... ... ... \n", "5038 2.0 318.0 Daphne Zuniga \n", "5039 NaN 319.0 Valorie Curry \n", "5040 0.0 0.0 Maxwell Moody \n", "5041 0.0 489.0 Daniel Henney \n", "5042 16.0 16.0 Brian Herzlinger \n", "\n", " actor_1_facebook_likes gross genres \\\n", "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", "2 11000.0 200074175.0 Action|Adventure|Thriller \n", "3 27000.0 448130642.0 Action|Thriller \n", "4 131.0 NaN Documentary \n", "... ... ... ... \n", "5038 637.0 NaN Comedy|Drama \n", "5039 841.0 NaN Crime|Drama|Mystery|Thriller \n", "5040 0.0 NaN Drama|Horror|Thriller \n", "5041 946.0 10443.0 Comedy|Drama|Romance \n", "5042 86.0 85222.0 Documentary \n", "\n", " ... num_user_for_reviews language country content_rating budget \\\n", "0 ... 3054.0 English USA PG-13 237000000.0 \n", "1 ... 1238.0 English USA PG-13 300000000.0 \n", "2 ... 994.0 English UK PG-13 245000000.0 \n", "3 ... 2701.0 English USA PG-13 250000000.0 \n", "4 ... NaN NaN cs5481 NaN NaN \n", "... ... ... ... ... ... ... \n", "5038 ... 6.0 English Canada NaN NaN \n", "5039 ... 359.0 English USA TV-14 NaN \n", "5040 ... 3.0 English USA NaN 1400.0 \n", "5041 ... 
9.0 English USA PG-13 NaN \n", "5042 ... 84.0 English USA PG 1100.0 \n", "\n", " title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n", "0 2009.0 936.0 7.9 1.78 \n", "1 2007.0 5000.0 7.1 2.35 \n", "2 2015.0 393.0 6.8 2.35 \n", "3 2012.0 23000.0 8.5 2.35 \n", "4 NaN 12.0 7.1 NaN \n", "... ... ... ... ... \n", "5038 2013.0 470.0 7.7 NaN \n", "5039 NaN 593.0 7.5 16.00 \n", "5040 2013.0 0.0 6.3 NaN \n", "5041 2012.0 719.0 6.3 2.35 \n", "5042 2004.0 23.0 6.6 1.85 \n", "\n", " movie_facebook_likes \n", "0 33000 \n", "1 0 \n", "2 85000 \n", "3 164000 \n", "4 0 \n", "... ... \n", "5038 84 \n", "5039 32000 \n", "5040 16 \n", "5041 660 \n", "5042 456 \n", "\n", "[5043 rows x 28 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dropping rows where only some values are NaN is aggressive, so we can instead remove only the rows where all values are NaN.\n", "data.dropna(how=\"all\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:04.380118Z", "start_time": "2023-09-18T03:41:04.220510Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 600 }, "id": "xJDx_kmAkra_", "outputId": "8e22425d-cca8-405c-eb5a-05dd28d558b5" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
5ColorAndrew Stanton462.0132.0475.0530.0Samantha Morton640.073058679.0Action|Adventure|Sci-Fi...738.0EnglishUSAPG-13263700000.02012.0632.06.62.3524000
..................................................................
5035ColorRobert Rodriguez56.081.00.06.0Peter Marquardt121.02040920.0Action|Crime|Drama|Romance|Thriller...130.0SpanishUSAR7000.01992.020.06.91.370
5036ColorAnthony ValloneNaN84.02.02.0John Considine45.0NaNCrime|Drama...1.0EnglishUSAPG-133250.02005.044.07.8NaN4
5037ColorEdward Burns14.095.00.0133.0Caitlin FitzGerald296.04584.0Comedy|Drama...14.0EnglishUSANot Rated9000.02011.0205.06.4NaN413
5041ColorDaniel Hsia14.0100.00.0489.0Daniel Henney946.010443.0Comedy|Drama|Romance...9.0EnglishUSAPG-13NaN2012.0719.06.32.35660
5042ColorJon Gunn43.090.016.016.0Brian Herzlinger86.085222.0Documentary...84.0EnglishUSAPG1100.02004.023.06.61.85456
\n", "

4848 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.0 \n", "1 Color Gore Verbinski 302.0 169.0 \n", "2 Color Sam Mendes 602.0 148.0 \n", "3 Color Christopher Nolan 813.0 164.0 \n", "5 Color Andrew Stanton 462.0 132.0 \n", "... ... ... ... ... \n", "5035 Color Robert Rodriguez 56.0 81.0 \n", "5036 Color Anthony Vallone NaN 84.0 \n", "5037 Color Edward Burns 14.0 95.0 \n", "5041 Color Daniel Hsia 14.0 100.0 \n", "5042 Color Jon Gunn 43.0 90.0 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "2 0.0 161.0 Rory Kinnear \n", "3 22000.0 23000.0 Christian Bale \n", "5 475.0 530.0 Samantha Morton \n", "... ... ... ... \n", "5035 0.0 6.0 Peter Marquardt \n", "5036 2.0 2.0 John Considine \n", "5037 0.0 133.0 Caitlin FitzGerald \n", "5041 0.0 489.0 Daniel Henney \n", "5042 16.0 16.0 Brian Herzlinger \n", "\n", " actor_1_facebook_likes gross \\\n", "0 1000.0 760505847.0 \n", "1 40000.0 309404152.0 \n", "2 11000.0 200074175.0 \n", "3 27000.0 448130642.0 \n", "5 640.0 73058679.0 \n", "... ... ... \n", "5035 121.0 2040920.0 \n", "5036 45.0 NaN \n", "5037 296.0 4584.0 \n", "5041 946.0 10443.0 \n", "5042 86.0 85222.0 \n", "\n", " genres ... num_user_for_reviews language \\\n", "0 Action|Adventure|Fantasy|Sci-Fi ... 3054.0 English \n", "1 Action|Adventure|Fantasy ... 1238.0 English \n", "2 Action|Adventure|Thriller ... 994.0 English \n", "3 Action|Thriller ... 2701.0 English \n", "5 Action|Adventure|Sci-Fi ... 738.0 English \n", "... ... ... ... ... \n", "5035 Action|Crime|Drama|Romance|Thriller ... 130.0 Spanish \n", "5036 Crime|Drama ... 1.0 English \n", "5037 Comedy|Drama ... 14.0 English \n", "5041 Comedy|Drama|Romance ... 9.0 English \n", "5042 Documentary ... 
84.0 English \n", "\n", " country content_rating budget title_year actor_2_facebook_likes \\\n", "0 USA PG-13 237000000.0 2009.0 936.0 \n", "1 USA PG-13 300000000.0 2007.0 5000.0 \n", "2 UK PG-13 245000000.0 2015.0 393.0 \n", "3 USA PG-13 250000000.0 2012.0 23000.0 \n", "5 USA PG-13 263700000.0 2012.0 632.0 \n", "... ... ... ... ... ... \n", "5035 USA R 7000.0 1992.0 20.0 \n", "5036 USA PG-13 3250.0 2005.0 44.0 \n", "5037 USA Not Rated 9000.0 2011.0 205.0 \n", "5041 USA PG-13 NaN 2012.0 719.0 \n", "5042 USA PG 1100.0 2004.0 23.0 \n", "\n", " imdb_score aspect_ratio movie_facebook_likes \n", "0 7.9 1.78 33000 \n", "1 7.1 2.35 0 \n", "2 6.8 2.35 85000 \n", "3 8.5 2.35 164000 \n", "5 6.6 2.35 24000 \n", "... ... ... ... \n", "5035 6.9 1.37 0 \n", "5036 7.8 NaN 4 \n", "5037 6.4 NaN 413 \n", "5041 6.3 2.35 660 \n", "5042 6.6 1.85 456 \n", "\n", "[4848 rows x 28 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can also set a threshold: keep only the rows that have at least 25 non-NaN values\n", "data.dropna(thresh=25)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:05.383611Z", "start_time": "2023-09-18T03:41:05.229722Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 565 }, "id": "KlzHN2cgkra_", "outputId": "b3fe5937-d68a-47c0-8896-055b0935c3aa" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.0000000.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.000000563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.0000000.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
3ColorChristopher Nolan813.0164.00000022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.02012.023000.08.52.35164000
4NaNDoug WalkerNaN107.201074131.0NaNRob Walker131.0NaNDocumentary...NaNNaNcs5481NaNNaNNaN12.07.1NaN0
..................................................................
5038ColorScott Smith1.087.0000002.0318.0Daphne Zuniga637.0NaNComedy|Drama...6.0EnglishCanadaNaNNaN2013.0470.07.7NaN84
5039ColorNaN43.043.000000NaN319.0Valorie Curry841.0NaNCrime|Drama|Mystery|Thriller...359.0EnglishUSATV-14NaNNaN593.07.516.0032000
5040ColorBenjamin Roberds13.076.0000000.00.0Maxwell Moody0.0NaNDrama|Horror|Thriller...3.0EnglishUSANaN1400.02013.00.06.3NaN16
5041ColorDaniel Hsia14.0100.0000000.0489.0Daniel Henney946.010443.0Comedy|Drama|Romance...9.0EnglishUSAPG-13NaN2012.0719.06.32.35660
5042ColorJon Gunn43.090.00000016.016.0Brian Herzlinger86.085222.0Documentary...84.0EnglishUSAPG1100.02004.023.06.61.85456
\n", "

5043 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.000000 \n", "1 Color Gore Verbinski 302.0 169.000000 \n", "2 Color Sam Mendes 602.0 148.000000 \n", "3 Color Christopher Nolan 813.0 164.000000 \n", "4 NaN Doug Walker NaN 107.201074 \n", "... ... ... ... ... \n", "5038 Color Scott Smith 1.0 87.000000 \n", "5039 Color NaN 43.0 43.000000 \n", "5040 Color Benjamin Roberds 13.0 76.000000 \n", "5041 Color Daniel Hsia 14.0 100.000000 \n", "5042 Color Jon Gunn 43.0 90.000000 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "2 0.0 161.0 Rory Kinnear \n", "3 22000.0 23000.0 Christian Bale \n", "4 131.0 NaN Rob Walker \n", "... ... ... ... \n", "5038 2.0 318.0 Daphne Zuniga \n", "5039 NaN 319.0 Valorie Curry \n", "5040 0.0 0.0 Maxwell Moody \n", "5041 0.0 489.0 Daniel Henney \n", "5042 16.0 16.0 Brian Herzlinger \n", "\n", " actor_1_facebook_likes gross genres \\\n", "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", "2 11000.0 200074175.0 Action|Adventure|Thriller \n", "3 27000.0 448130642.0 Action|Thriller \n", "4 131.0 NaN Documentary \n", "... ... ... ... \n", "5038 637.0 NaN Comedy|Drama \n", "5039 841.0 NaN Crime|Drama|Mystery|Thriller \n", "5040 0.0 NaN Drama|Horror|Thriller \n", "5041 946.0 10443.0 Comedy|Drama|Romance \n", "5042 86.0 85222.0 Documentary \n", "\n", " ... num_user_for_reviews language country content_rating budget \\\n", "0 ... 3054.0 English USA PG-13 237000000.0 \n", "1 ... 1238.0 English USA PG-13 300000000.0 \n", "2 ... 994.0 English UK PG-13 245000000.0 \n", "3 ... 2701.0 English USA PG-13 250000000.0 \n", "4 ... NaN NaN cs5481 NaN NaN \n", "... ... ... ... ... ... ... \n", "5038 ... 6.0 English Canada NaN NaN \n", "5039 ... 359.0 English USA TV-14 NaN \n", "5040 ... 3.0 English USA NaN 1400.0 \n", "5041 ... 
9.0 English USA PG-13 NaN \n", "5042 ... 84.0 English USA PG 1100.0 \n", "\n", " title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n", "0 2009.0 936.0 7.9 1.78 \n", "1 2007.0 5000.0 7.1 2.35 \n", "2 2015.0 393.0 6.8 2.35 \n", "3 2012.0 23000.0 8.5 2.35 \n", "4 NaN 12.0 7.1 NaN \n", "... ... ... ... ... \n", "5038 2013.0 470.0 7.7 NaN \n", "5039 NaN 593.0 7.5 16.00 \n", "5040 2013.0 0.0 6.3 NaN \n", "5041 2012.0 719.0 6.3 2.35 \n", "5042 2004.0 23.0 6.6 1.85 \n", "\n", " movie_facebook_likes \n", "0 33000 \n", "1 0 \n", "2 85000 \n", "3 164000 \n", "4 0 \n", "... ... \n", "5038 84 \n", "5039 32000 \n", "5040 16 \n", "5041 660 \n", "5042 456 \n", "\n", "[5043 rows x 28 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can remove columns where all values are nan\n", "data.dropna(axis=1, how=\"all\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:06.252799Z", "start_time": "2023-09-18T03:41:06.167587Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 597 }, "id": "8vZW2dxfkra_", "outputId": "5e7fba9c-f722-4b35-c43e-bbfd1f45658c" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
durationgenresmovie_titlenum_voted_userscast_total_facebook_likesmovie_imdb_linkcountryimdb_scoremovie_facebook_likes
0178.000000Action|Adventure|Fantasy|Sci-FiAvatar8862044834http://www.imdb.com/title/tt0499549/?ref_=fn_t...USA7.933000
1169.000000Action|Adventure|FantasyPirates of the Caribbean: At World's End47122048350http://www.imdb.com/title/tt0449088/?ref_=fn_t...USA7.10
2148.000000Action|Adventure|ThrillerSpectre27586811700http://www.imdb.com/title/tt2379713/?ref_=fn_t...UK6.885000
3164.000000Action|ThrillerThe Dark Knight Rises1144337106759http://www.imdb.com/title/tt1345836/?ref_=fn_t...USA8.5164000
4107.201074DocumentaryStar Wars: Episode VII - The Force Awakens  ...8143http://www.imdb.com/title/tt5289954/?ref_=fn_t...cs54817.10
..............................
503887.000000Comedy|DramaSigned Sealed Delivered6292283http://www.imdb.com/title/tt3000844/?ref_=fn_t...Canada7.784
503943.000000Crime|Drama|Mystery|ThrillerThe Following738391753http://www.imdb.com/title/tt2071645/?ref_=fn_t...USA7.532000
504076.000000Drama|Horror|ThrillerA Plague So Pleasant380http://www.imdb.com/title/tt2107644/?ref_=fn_t...USA6.316
5041100.000000Comedy|Drama|RomanceShanghai Calling12552386http://www.imdb.com/title/tt2070597/?ref_=fn_t...USA6.3660
504290.000000DocumentaryMy Date with Drew4285163http://www.imdb.com/title/tt0378407/?ref_=fn_t...USA6.6456
\n", "

5043 rows × 9 columns

\n", "
" ], "text/plain": [ " duration genres \\\n", "0 178.000000 Action|Adventure|Fantasy|Sci-Fi \n", "1 169.000000 Action|Adventure|Fantasy \n", "2 148.000000 Action|Adventure|Thriller \n", "3 164.000000 Action|Thriller \n", "4 107.201074 Documentary \n", "... ... ... \n", "5038 87.000000 Comedy|Drama \n", "5039 43.000000 Crime|Drama|Mystery|Thriller \n", "5040 76.000000 Drama|Horror|Thriller \n", "5041 100.000000 Comedy|Drama|Romance \n", "5042 90.000000 Documentary \n", "\n", " movie_title num_voted_users \\\n", "0 Avatar  886204 \n", "1 Pirates of the Caribbean: At World's End  471220 \n", "2 Spectre  275868 \n", "3 The Dark Knight Rises  1144337 \n", "4 Star Wars: Episode VII - The Force Awakens  ... 8 \n", "... ... ... \n", "5038 Signed Sealed Delivered  629 \n", "5039 The Following  73839 \n", "5040 A Plague So Pleasant  38 \n", "5041 Shanghai Calling  1255 \n", "5042 My Date with Drew  4285 \n", "\n", " cast_total_facebook_likes \\\n", "0 4834 \n", "1 48350 \n", "2 11700 \n", "3 106759 \n", "4 143 \n", "... ... \n", "5038 2283 \n", "5039 1753 \n", "5040 0 \n", "5041 2386 \n", "5042 163 \n", "\n", " movie_imdb_link country imdb_score \\\n", "0 http://www.imdb.com/title/tt0499549/?ref_=fn_t... USA 7.9 \n", "1 http://www.imdb.com/title/tt0449088/?ref_=fn_t... USA 7.1 \n", "2 http://www.imdb.com/title/tt2379713/?ref_=fn_t... UK 6.8 \n", "3 http://www.imdb.com/title/tt1345836/?ref_=fn_t... USA 8.5 \n", "4 http://www.imdb.com/title/tt5289954/?ref_=fn_t... cs5481 7.1 \n", "... ... ... ... \n", "5038 http://www.imdb.com/title/tt3000844/?ref_=fn_t... Canada 7.7 \n", "5039 http://www.imdb.com/title/tt2071645/?ref_=fn_t... USA 7.5 \n", "5040 http://www.imdb.com/title/tt2107644/?ref_=fn_t... USA 6.3 \n", "5041 http://www.imdb.com/title/tt2070597/?ref_=fn_t... USA 6.3 \n", "5042 http://www.imdb.com/title/tt0378407/?ref_=fn_t... USA 6.6 \n", "\n", " movie_facebook_likes \n", "0 33000 \n", "1 0 \n", "2 85000 \n", "3 164000 \n", "4 0 \n", "... ... 
\n", "5038 84 \n", "5039 32000 \n", "5040 16 \n", "5041 660 \n", "5042 456 \n", "\n", "[5043 rows x 9 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# or remove columns where some values are nan\n", "data.dropna(axis=1, how=\"any\")" ] }, { "cell_type": "markdown", "metadata": { "id": "M1RT1evZkra_" }, "source": [ "- Check Unreasonable Data\n", "1. Time\n", "2. Values with a range" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:36.472688Z", "start_time": "2023-09-18T03:41:36.319594Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 565 }, "id": "iPgfWcvTkrbA", "outputId": "d1aa676a-13c9-4e4b-af4e-f8202b9b7acd" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
10ColorZack Snyder673.0183.00.02000.0Lauren Cohan15000.0330249062.0Action|Adventure|Sci-Fi...3018.0EnglishUSAPG-13250000000.02016.04000.06.92.35197000
27ColorAnthony Russo516.0147.094.011000.0Scarlett Johansson21000.0407197282.0Action|Adventure|Sci-Fi...1022.0EnglishUSAPG-13250000000.02016.019000.08.22.3572000
57ColorJustin Lin322.0122.0681.0105.0Melissa Roxburgh998.0130468626.0Action|Adventure|Sci-Fi|Thriller...432.0EnglishUSAPG-13185000000.02016.0119.07.52.3530000
63ColorDavid Yates248.0110.0282.0103.0Alexander Skarsgård11000.0124051759.0Action|Adventure|Drama|Romance...239.0EnglishUSAPG-13180000000.02016.010000.06.62.3529000
65ColorBryan Singer396.0144.00.01000.0Michael Fassbender34000.0154985087.0Action|Adventure|Sci-Fi...622.0EnglishUSAPG-13178000000.02016.013000.07.32.3554000
..................................................................
4772ColorWarren Sheppard3.094.00.0212.0Randy Jay Burrell918.0NaNAction|Romance|Sport...2.0EnglishUSAPG-13150000.02016.0402.04.0NaN381
4773ColorDarren Lynn Bousman10.097.0163.0303.0Barry Bostwick636.0NaNHorror|Musical...20.0EnglishUSANaN500000.02016.0456.07.41.78707
4775ColorJoel Paul Reisig1.0108.0431.0317.0Joel Paul Reisig466.0NaNFamily...4.0EnglishUSAPG500000.02016.0431.05.7NaN0
4777ColorLuke Dye1.084.00.053.0Jeff Delaney385.0NaNFamily...1.0EnglishUSANaN500000.02016.0169.05.216.009
4953ColorNate Parker21.0120.0664.0400.0Nate Parker990.0NaNBiography|Drama...8.0EnglishUSAR10000000.02016.0664.05.42.350
\n", "

106 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "10 Color Zack Snyder 673.0 183.0 \n", "27 Color Anthony Russo 516.0 147.0 \n", "57 Color Justin Lin 322.0 122.0 \n", "63 Color David Yates 248.0 110.0 \n", "65 Color Bryan Singer 396.0 144.0 \n", "... ... ... ... ... \n", "4772 Color Warren Sheppard 3.0 94.0 \n", "4773 Color Darren Lynn Bousman 10.0 97.0 \n", "4775 Color Joel Paul Reisig 1.0 108.0 \n", "4777 Color Luke Dye 1.0 84.0 \n", "4953 Color Nate Parker 21.0 120.0 \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "10 0.0 2000.0 Lauren Cohan \n", "27 94.0 11000.0 Scarlett Johansson \n", "57 681.0 105.0 Melissa Roxburgh \n", "63 282.0 103.0 Alexander Skarsgård \n", "65 0.0 1000.0 Michael Fassbender \n", "... ... ... ... \n", "4772 0.0 212.0 Randy Jay Burrell \n", "4773 163.0 303.0 Barry Bostwick \n", "4775 431.0 317.0 Joel Paul Reisig \n", "4777 0.0 53.0 Jeff Delaney \n", "4953 664.0 400.0 Nate Parker \n", "\n", " actor_1_facebook_likes gross genres \\\n", "10 15000.0 330249062.0 Action|Adventure|Sci-Fi \n", "27 21000.0 407197282.0 Action|Adventure|Sci-Fi \n", "57 998.0 130468626.0 Action|Adventure|Sci-Fi|Thriller \n", "63 11000.0 124051759.0 Action|Adventure|Drama|Romance \n", "65 34000.0 154985087.0 Action|Adventure|Sci-Fi \n", "... ... ... ... \n", "4772 918.0 NaN Action|Romance|Sport \n", "4773 636.0 NaN Horror|Musical \n", "4775 466.0 NaN Family \n", "4777 385.0 NaN Family \n", "4953 990.0 NaN Biography|Drama \n", "\n", " ... num_user_for_reviews language country content_rating budget \\\n", "10 ... 3018.0 English USA PG-13 250000000.0 \n", "27 ... 1022.0 English USA PG-13 250000000.0 \n", "57 ... 432.0 English USA PG-13 185000000.0 \n", "63 ... 239.0 English USA PG-13 180000000.0 \n", "65 ... 622.0 English USA PG-13 178000000.0 \n", "... ... ... ... ... ... ... \n", "4772 ... 2.0 English USA PG-13 150000.0 \n", "4773 ... 20.0 English USA NaN 500000.0 \n", "4775 ... 
4.0 English USA PG 500000.0 \n", "4777 ... 1.0 English USA NaN 500000.0 \n", "4953 ... 8.0 English USA R 10000000.0 \n", "\n", " title_year actor_2_facebook_likes imdb_score aspect_ratio \\\n", "10 2016.0 4000.0 6.9 2.35 \n", "27 2016.0 19000.0 8.2 2.35 \n", "57 2016.0 119.0 7.5 2.35 \n", "63 2016.0 10000.0 6.6 2.35 \n", "65 2016.0 13000.0 7.3 2.35 \n", "... ... ... ... ... \n", "4772 2016.0 402.0 4.0 NaN \n", "4773 2016.0 456.0 7.4 1.78 \n", "4775 2016.0 431.0 5.7 NaN \n", "4777 2016.0 169.0 5.2 16.00 \n", "4953 2016.0 664.0 5.4 2.35 \n", "\n", " movie_facebook_likes \n", "10 197000 \n", "27 72000 \n", "57 30000 \n", "63 29000 \n", "65 54000 \n", "... ... \n", "4772 381 \n", "4773 707 \n", "4775 0 \n", "4777 9 \n", "4953 0 \n", "\n", "[106 rows x 28 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check title_year\n", "data[data[\"title_year\"] > 2015]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:45.541677Z", "start_time": "2023-09-18T03:41:45.510770Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 99 }, "id": "CNjfgDiikrbA", "outputId": "ef658e8f-ac60-40ca-c92a-f3454834e995" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
\n", "

0 rows × 28 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [color, director_name, num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_2_name, actor_1_facebook_likes, gross, genres, actor_1_name, movie_title, num_voted_users, cast_total_facebook_likes, actor_3_name, facenumber_in_poster, plot_keywords, movie_imdb_link, num_user_for_reviews, language, country, content_rating, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes]\n", "Index: []\n", "\n", "[0 rows x 28 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check imdb_score\n", "data[data[\"imdb_score\"] > 10]" ] }, { "cell_type": "markdown", "metadata": { "id": "Mh2xVJ5YkrbA" }, "source": [ "- Check Replicated Data" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:41:53.447219Z", "start_time": "2023-09-18T03:41:53.391774Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "oPSvXaVJkrbA", "outputId": "afd6ed09-7884-4619-8708-2e7c812fbfa3" }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", " ... \n", "5038 False\n", "5039 False\n", "5040 False\n", "5041 False\n", "5042 False\n", "Length: 5043, dtype: bool" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check duplicated data\n", "data.duplicated()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:42:01.062354Z", "start_time": "2023-09-18T03:42:01.036206Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "iDki8e0IkrbA", "outputId": "f2c942cd-19c2-4b71-d44a-0dba8ac125e4" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
brandstylerating
0Yum Yumcup4.0
1Yum Yumcup4.0
2Indomiecup3.5
3Indomiepack15.0
4Indomiepack5.0
\n", "
" ], "text/plain": [ " brand style rating\n", "0 Yum Yum cup 4.0\n", "1 Yum Yum cup 4.0\n", "2 Indomie cup 3.5\n", "3 Indomie pack 15.0\n", "4 Indomie pack 5.0" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# There is no duplicated lines in this file, thus we use a demo data to show how process it.\n", "df = pd.DataFrame({\n", " 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],\n", " 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],\n", " 'rating': [4, 4, 3.5, 15, 5]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:42:02.016556Z", "start_time": "2023-09-18T03:42:02.000715Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "id": "plbGsuAakrbA", "outputId": "76b7ec6c-9940-45db-bf39-f336f5aa4ea7" }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 True\n", "2 False\n", "3 False\n", "4 False\n", "dtype: bool" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By default, for each set of duplicated values, the first occurrence is set on False and all others on True.\n", "df.duplicated()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:42:05.815667Z", "start_time": "2023-09-18T03:42:05.796669Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "id": "PG37RNbykrbB", "outputId": "5bda2afc-05a9-40d3-bfd5-5f968ad89b23" }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "dtype: bool" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By using ‘last’, the last occurrence of each set of duplicated values is set on False and all others on True.\n", "df.duplicated(keep='last')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": 
"2023-09-18T03:42:09.517275Z", "start_time": "2023-09-18T03:42:09.485835Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "id": "aw27GjYAkrbB", "outputId": "ebbfc7da-2161-457e-e606-f61d7e9c62cc" }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 False\n", "3 False\n", "4 False\n", "dtype: bool" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By setting keep on False, all duplicates are True.\n", "df.duplicated(keep=False)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:42:13.522460Z", "start_time": "2023-09-18T03:42:13.507489Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 241 }, "id": "D1bcvb4nkrbB", "outputId": "947839cc-7d89-4d72-b5d3-d464b2247c4b" }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 True\n", "2 False\n", "3 True\n", "4 True\n", "dtype: bool" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To find duplicates on specific column(s), use subset.\n", "df.duplicated(subset=['brand'])" ] }, { "cell_type": "markdown", "metadata": { "id": "5K3PzCr8krbB" }, "source": [ "- Constrain Data Type" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:44:34.776087Z", "start_time": "2023-09-18T03:44:34.629436Z" }, "id": "01adGUHbkrbB" }, "outputs": [], "source": [ "# we can assume we know some columns' types and we can predefine it when reading data\n", "data = pd.read_csv(r'movie_metadata.csv', dtype={'num_voted_users': int, \"title_year\": str})" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:44:41.134748Z", "start_time": "2023-09-18T03:44:41.089632Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 307 }, "id": "IURKu35XkrbB", "outputId": "e688c65c-e07b-4d1c-c79b-008e554187c1" }, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenres...num_user_for_reviewslanguagecountrycontent_ratingbudgetrelease_dateactor_2_facebook_likesimdb_scoreaspect_ratiofacebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-Fi...3054.0EnglishUSAPG-13237000000.02009936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|Fantasy...1238.0EnglishUSAPG-13300000000.020075000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|Thriller...994.0EnglishUKPG-13245000000.02015393.06.82.3585000
3ColorChristopher Nolan813.0164.022000.023000.0Christian Bale27000.0448130642.0Action|Thriller...2701.0EnglishUSAPG-13250000000.0201223000.08.52.35164000
4NaNDoug WalkerNaNNaN131.0NaNRob Walker131.0NaNDocumentary...NaNNaNNaNNaNNaNNaN12.07.1NaN0
\n", "

5 rows × 28 columns

\n", "
" ], "text/plain": [ " color director_name num_critic_for_reviews duration \\\n", "0 Color James Cameron 723.0 178.0 \n", "1 Color Gore Verbinski 302.0 169.0 \n", "2 Color Sam Mendes 602.0 148.0 \n", "3 Color Christopher Nolan 813.0 164.0 \n", "4 NaN Doug Walker NaN NaN \n", "\n", " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", "0 0.0 855.0 Joel David Moore \n", "1 563.0 1000.0 Orlando Bloom \n", "2 0.0 161.0 Rory Kinnear \n", "3 22000.0 23000.0 Christian Bale \n", "4 131.0 NaN Rob Walker \n", "\n", " actor_1_facebook_likes gross genres ... \\\n", "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi ... \n", "1 40000.0 309404152.0 Action|Adventure|Fantasy ... \n", "2 11000.0 200074175.0 Action|Adventure|Thriller ... \n", "3 27000.0 448130642.0 Action|Thriller ... \n", "4 131.0 NaN Documentary ... \n", "\n", " num_user_for_reviews language country content_rating budget \\\n", "0 3054.0 English USA PG-13 237000000.0 \n", "1 1238.0 English USA PG-13 300000000.0 \n", "2 994.0 English UK PG-13 245000000.0 \n", "3 2701.0 English USA PG-13 250000000.0 \n", "4 NaN NaN NaN NaN NaN \n", "\n", " release_date actor_2_facebook_likes imdb_score aspect_ratio facebook_likes \n", "0 2009 936.0 7.9 1.78 33000 \n", "1 2007 5000.0 7.1 2.35 0 \n", "2 2015 393.0 6.8 2.35 85000 \n", "3 2012 23000.0 8.5 2.35 164000 \n", "4 NaN 12.0 7.1 NaN 0 \n", "\n", "[5 rows x 28 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can also rename columns for human understanding\n", "data = data.rename(columns = {'title_year':'release_date', 'movie_facebook_likes':'facebook_likes'})\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:45:23.357452Z", "start_time": "2023-09-18T03:45:23.214830Z" }, "id": "kWqh8VbNkrbB" }, "outputs": [], "source": [ "# after clean the data, we usually need to save the cleaned data to a new file\n", 
"data.to_csv('cleanfile.csv', encoding='utf-8')" ] }, { "cell_type": "markdown", "metadata": { "id": "RIajsttNkrbB" }, "source": [ "### 2.2 Data Integration" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:46:39.515799Z", "start_time": "2023-09-18T03:46:39.448042Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 269 }, "id": "61KOsZx0krbB", "outputId": "97d1998d-b047-45d5-e5bf-ba7ab1c9b8e1" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keydata1
0b0
1b1
2a2
3c3
4a4
5a5
6b6
\n", "
" ], "text/plain": [ " key data1\n", "0 b 0\n", "1 b 1\n", "2 a 2\n", "3 c 3\n", "4 a 4\n", "5 a 5\n", "6 b 6" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1=pd.DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})\n", "df1\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:46:48.916997Z", "start_time": "2023-09-18T03:46:48.851700Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "0uajPTV2krbC", "outputId": "c0f5cbe0-fc21-4bba-96b5-684fcd5ba7f5" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keydata2
0a0
1b1
2d2
\n", "
" ], "text/plain": [ " key data2\n", "0 a 0\n", "1 b 1\n", "2 d 2" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2=pd.DataFrame({'key':['a','b','d'],'data2':range(3)})\n", "df2" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:46:59.724261Z", "start_time": "2023-09-18T03:46:59.663746Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 237 }, "id": "DFIbTEoBkrbC", "outputId": "05094f6e-5f9d-4a4e-dd43-af69ec73e359" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keydata1data2
0b01
1b11
2b61
3a20
4a40
5a50
\n", "
" ], "text/plain": [ " key data1 data2\n", "0 b 0 1\n", "1 b 1 1\n", "2 b 6 1\n", "3 a 2 0\n", "4 a 4 0\n", "5 a 5 0" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can merge two datasets with pd.merge(), the default merged column is the common column\n", "pd.merge(df1, df2)\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:13.071316Z", "start_time": "2023-09-18T03:47:13.047754Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 158 }, "id": "3Ed-uKU0krbC", "outputId": "6dc5b39f-9d61-452a-dd85-46c3b6863ecf" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keydata1data2
0b01
1b11
2b61
3a20
4a40
5a50
\n", "
" ], "text/plain": [ " key data1 data2\n", "0 b 0 1\n", "1 b 1 1\n", "2 b 6 1\n", "3 a 2 0\n", "4 a 4 0\n", "5 a 5 0" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# of course, we could give the merged column\n", "pd.merge(df1,df2,on='key')\n" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:14.013049Z", "start_time": "2023-09-18T03:47:13.992690Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 269 }, "id": "LHd3iz2PkrbC", "outputId": "0f8040fd-96bc-4f08-8ce5-ca6710b21746" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1keydata1
0b0
1b1
2a2
3c3
4a4
5a5
6b6
\n", "
" ], "text/plain": [ " 1key data1\n", "0 b 0\n", "1 b 1\n", "2 a 2\n", "3 c 3\n", "4 a 4\n", "5 a 5\n", "6 b 6" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we could merge two datasets with different columns\n", "df3=pd.DataFrame({'1key':['b','b','a','c','a','a','b'],'data1':range(7)})\n", "df3" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:15.914926Z", "start_time": "2023-09-18T03:47:15.857720Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "jIS98LXykrbC", "outputId": "98d78769-59aa-450d-fa76-e39be42a63de" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
2keydata2
0a0
1b1
2d2
\n", "
" ], "text/plain": [ " 2key data2\n", "0 a 0\n", "1 b 1\n", "2 d 2" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4=pd.DataFrame({'2key':['a','b','d'],'data2':range(3)})\n", "df4" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:17.076458Z", "start_time": "2023-09-18T03:47:17.052756Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 237 }, "id": "pqolG5Y4krbC", "outputId": "6a99b91b-98ea-4a6b-ee5c-b47ed9f09e8d" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1keydata12keydata2
0b0b1
1b1b1
2b6b1
3a2a0
4a4a0
5a5a0
\n", "
" ], "text/plain": [ " 1key data1 2key data2\n", "0 b 0 b 1\n", "1 b 1 b 1\n", "2 b 6 b 1\n", "3 a 2 a 0\n", "4 a 4 a 0\n", "5 a 5 a 0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# default mode keeps the cross set of key values, which is called inner connection\n", "pd.merge(df3,df4,left_on='1key',right_on='2key')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:18.436687Z", "start_time": "2023-09-18T03:47:18.348378Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 508 }, "id": "oftoZPnYkrbD", "outputId": "f8031665-5137-4d16-9bda-0d6bc4531260" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " key data1\n", "0 b 0\n", "1 b 1\n", "2 a 2\n", "3 c 3\n", "4 a 4\n", "5 a 5\n", "6 b 6\n", " key data2\n", "0 a 0\n", "1 b 1\n", "2 d 2\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keydata1data2
0b0.01.0
1b1.01.0
2b6.01.0
3a2.00.0
4a4.00.0
5a5.00.0
6c3.0NaN
7dNaN2.0
\n", "
" ], "text/plain": [ " key data1 data2\n", "0 b 0.0 1.0\n", "1 b 1.0 1.0\n", "2 b 6.0 1.0\n", "3 a 2.0 0.0\n", "4 a 4.0 0.0\n", "5 a 5.0 0.0\n", "6 c 3.0 NaN\n", "7 d NaN 2.0" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# when merge two datasets with outer connection\n", "print(df1)\n", "print(df2)\n", "pd.merge(df1,df2,on='key',how='outer')" ] }, { "cell_type": "markdown", "metadata": { "id": "UMoL7ygJkrbD" }, "source": [ "### 2.3 Data Transformation" ] }, { "cell_type": "markdown", "metadata": { "id": "2BK5E9LZkrbD" }, "source": [ "- String Transformation" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:26.817624Z", "start_time": "2023-09-18T03:47:26.748509Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "yAB_qHQskrbD", "outputId": "b694d31d-d9ab-4f42-e185-0c96408329ce" }, "outputs": [ { "data": { "text/plain": [ "0 JAMES CAMERON\n", "1 GORE VERBINSKI\n", "2 SAM MENDES\n", "3 CHRISTOPHER NOLAN\n", "4 DOUG WALKER\n", " ... \n", "5038 SCOTT SMITH\n", "5039 NaN\n", "5040 BENJAMIN ROBERDS\n", "5041 DANIEL HSIA\n", "5042 JON GUNN\n", "Name: director_name, Length: 5043, dtype: object" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lower and upper case\n", "data[\"director_name\"].str.upper()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:28.230447Z", "start_time": "2023-09-18T03:47:28.210928Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "qPmcxRjjkrbD", "outputId": "7c44dc97-c992-4018-bd54-819749ce1a95" }, "outputs": [ { "data": { "text/plain": [ "0 james cameron\n", "1 gore verbinski\n", "2 sam mendes\n", "3 christopher nolan\n", "4 doug walker\n", " ... 
\n", "5038 scott smith\n", "5039 NaN\n", "5040 benjamin roberds\n", "5041 daniel hsia\n", "5042 jon gunn\n", "Name: director_name, Length: 5043, dtype: object" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# lower and upper case\n", "data[\"director_name\"].str.lower()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:28.991430Z", "start_time": "2023-09-18T03:47:28.969436Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "cjNXtGKVkrbD", "outputId": "a656f647-8173-4781-f511-8549299b6fa5" }, "outputs": [ { "data": { "text/plain": [ "0 Avatar\n", "1 Pirates of the Caribbean: At World's End\n", "2 Spectre\n", "3 The Dark Knight Rises\n", "4 Star Wars: Episode VII - The Force Awakens\n", " ... \n", "5038 Signed Sealed Delivered\n", "5039 The Following\n", "5040 A Plague So Pleasant\n", "5041 Shanghai Calling\n", "5042 My Date with Drew\n", "Name: movie_title, Length: 5043, dtype: object" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove special strings for whitespace, \\n\n", "data['movie_title'].str.strip()" ] }, { "cell_type": "markdown", "metadata": { "id": "yIQEQAiSkrbD" }, "source": [ "- Number Transformation" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:30.814883Z", "start_time": "2023-09-18T03:47:30.769872Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "WR4q_pyukrbD", "outputId": "ddedff31-dd3e-4a0b-d3ab-48a3a6678c7c" }, "outputs": [ { "data": { "text/plain": [ "0 178.0\n", "1 169.0\n", "2 148.0\n", "3 164.0\n", "4 NaN\n", " ... 
\n", "5038 87.0\n", "5039 43.0\n", "5040 76.0\n", "5041 100.0\n", "5042 90.0\n", "Name: duration, Length: 5043, dtype: float64" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# unit transformation\n", "\n", "data[\"duration\"]" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:31.515954Z", "start_time": "2023-09-18T03:47:31.433241Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "wEIA6NVTkrbD", "outputId": "b8cc4e61-844c-43eb-b9ef-74edb3024348" }, "outputs": [ { "data": { "text/plain": [ "0 2.966667\n", "1 2.816667\n", "2 2.466667\n", "3 2.733333\n", "4 NaN\n", " ... \n", "5038 1.450000\n", "5039 0.716667\n", "5040 1.266667\n", "5041 1.666667\n", "5042 1.500000\n", "Name: duration, Length: 5043, dtype: float64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[\"duration\"] / 60" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:32.022681Z", "start_time": "2023-09-18T03:47:31.926876Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "mGVjK7nakrbD", "outputId": "f041f48b-16bb-4872-ce7b-1467359a83f1" }, "outputs": [ { "data": { "text/plain": [ "0 0.339286\n", "1 0.321429\n", "2 0.279762\n", "3 0.311508\n", "4 NaN\n", " ... 
\n", "5038 0.158730\n", "5039 0.071429\n", "5040 0.136905\n", "5041 0.184524\n", "5042 0.164683\n", "Name: duration, Length: 5043, dtype: float64" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# normalization\n", "norm_duration = (data.duration - data.duration.min()) / (data.duration.max() - data.duration.min())\n", "norm_duration" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:47:32.721861Z", "start_time": "2023-09-18T03:47:32.652216Z" }, "colab": { "base_uri": "https://localhost:8080/", "height": 458 }, "id": "dP7ElOT2krbD", "outputId": "34d1ed8d-e771-4758-b4bb-820b81c65782" }, "outputs": [ { "data": { "text/plain": [ "0 2.809767\n", "1 2.452587\n", "2 1.619169\n", "3 2.254155\n", "4 NaN\n", " ... \n", "5038 -0.801711\n", "5039 -2.547920\n", "5040 -1.238264\n", "5041 -0.285786\n", "5042 -0.682652\n", "Name: duration, Length: 5043, dtype: float64" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# standardization\n", "std_duration = (data.duration - data.duration.mean()) / data.duration.std()\n", "std_duration" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 272 }, "id": "ZIWLjSwXkrbE", "outputId": "758e2d2e-0c44-4b3c-aa34-4e58ffba78d3" }, "outputs": [ { "data": { "text/plain": [ "(6.999, 91.0] 1054\n", "(108.0, 122.0] 1028\n", "(99.0, 108.0] 1011\n", "(91.0, 99.0] 984\n", "(122.0, 511.0] 951\n", "Name: duration, dtype: int64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# discretization\n", "# qcut: divie data points into M groups and each group has the basically same number of data points\n", "m_cut = pd.qcut(data.duration, 5)\n", "m_cut.value_counts()" ] }, { "cell_type": "markdown", "metadata": { "id": "CLrcHDbbkrbE" }, "source": [ "## 3. 
Regular Expression\n", "1.Metacharacters\n", "- []    A set of characters\n", "- \\\t   Signals a special sequence (can also be used to escape special characters)\n", "- .\t   Any character (except newline character)\t\"he..o\"\n", "- ^\t   Starts with \"^hello\"\n", "- \\$    Ends with \"planet\\$\"\n", "- \\*    Zero or more occurrences\t\"he.*o\"\n", "- \\+    One or more occurrences\t\"he.+o\"\n", "- ?\t   Zero or one occurrences\t\"he.?o\"\n", "- {}    Exactly the specified number of occurrences\t\"he.{2}o\"\n", "- |\t   Either or\t\"falls|stays\"\n", "- ()    Capture and group\n", "\n", "2.Special Sequences\n", "- \\A\t   Returns a match if the specified characters are at the beginning of the string\t\"\\AThe\"\n", "- \\b\t   Returns a match where the specified characters are at the beginning or at the end of a word\n", "(the \"r\" in the beginning is making sure that the string is being treated as a \"raw string\")\tr\"\\bain\"\n", "r\"ain\\b\"\n", "- \\B\t   Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word\n", "(the \"r\" in the beginning is making sure that the string is being treated as a \"raw string\")\tr\"\\Bain\"\n", "r\"ain\\B\"\n", "- \\d\t   Returns a match where the string contains digits (numbers from 0-9)\t\"\\d\"\n", "- \\D\t   Returns a match where the string DOES NOT contain digits\t\"\\D\"\n", "- \\s\t   Returns a match where the string contains a white space character\t\"\\s\"\n", "- \\S\t   Returns a match where the string DOES NOT contain a white space character\t\"\\S\"\n", "- \\w\t   Returns a match where the string contains any word characters (characters from a to Z, digits from 0-9, and the underscore _ character)\t\"\\w\"\n", "- \\W\t   Returns a match where the string DOES NOT contain any word characters\t\"\\W\"\n", "- \\Z\t   Returns a match if the specified characters are at the end of the string\n", "\n", "3.Sets\n", "- [arn]\t   Returns a match where one of the specified 
characters (a, r, or n) is present\n", "- [a-n]\t   Returns a match for any lower case character, alphabetically between a and n\n", "- [^arn]    Returns a match for any character EXCEPT a, r, and n\n", "- [0123]    Returns a match where any of the specified digits (0, 1, 2, or 3) are present\n", "- [0-9]\t   Returns a match for any digit between 0 and 9\n", "- [0-5][0-9]    Returns a match for any two-digit number from 00 to 59\n", "- [a-zA-Z]    Returns a match for any character alphabetically between a and z, lower case OR upper case\n", "- [+]    In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means: return a match for any + character in the string\n", "\n", "4.Functions\n", "- The findall() function returns a list containing all matches.\n", "- The search() function searches the string for a match, and returns a Match object if there is a match. If there is more than one match, only the first occurrence of the match will be returned. If no matches are found, the value None is returned.\n", "- The split() function returns a list where the string has been split at each match.\n", "- The sub() function replaces the matches with the text of your choice.\n", "\n", "5.Objects\n", "- A Match Object is an object containing information about the search and the result.\n", "- The Match object has properties and methods used to retrieve information about the search, and the result:\n", "\n", "- .span() returns a tuple containing the start-, and end positions of the match.\n", "\n", "- .string returns the string passed into the function\n", "\n", "- .group() returns the part of the string where there was a match\n" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:12.049857Z", "start_time": "2023-09-18T03:48:12.043616Z" }, "id": "HV6Ogn21krbE" }, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": 
"2023-09-18T03:48:12.614389Z", "start_time": "2023-09-18T03:48:12.583682Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "7H5CtFJokrbE", "outputId": "3a0d39f8-1734-4c42-b41d-c184a4ca2a42" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['ai', 'ai']\n" ] } ], "source": [ "# findall function\n", "txt = \"The rain in Spain\"\n", "x = re.findall(\"ai\", txt)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:13.213704Z", "start_time": "2023-09-18T03:48:13.165424Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "_R3vTz-kkrbE", "outputId": "5ffdf73f-b7ae-4a37-8bf9-b4282157af5e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[]\n" ] } ], "source": [ "# Return an empty list if no match was found:\n", "txt = \"The rain in Spain\"\n", "x = re.findall(\"Portugal\", txt)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:13.815165Z", "start_time": "2023-09-18T03:48:13.761650Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "IaXU0hASkrbE", "outputId": "0305665b-2589-4f98-c9c5-f65efd615833" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The first white-space character is located in position: 3\n" ] } ], "source": [ "# search function (the pattern is a raw string so the regex escape reaches re unchanged)\n", "txt = \"The rain in Spain\"\n", "x = re.search(r\"\\s\", txt)\n", "\n", "print(\"The first white-space character is located in position:\", x.start())" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:14.294609Z", "start_time": "2023-09-18T03:48:14.286784Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "3n8B_GoMkrbE", "outputId": "100a806d-8e93-4ecc-8e7f-ba18e33522ad" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "None\n" 
] } ], "source": [ "# If no matches are found, the value 
None is returned:\n", "txt = \"The rain in Spain\"\n", "x = re.search(\"Portugal\", txt)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:15.020439Z", "start_time": "2023-09-18T03:48:15.012276Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "b5a5JHi9krbE", "outputId": "95684ab7-c94c-4401-fdea-a4334d60ef84" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['The', 'rain', 'in', 'Spain']\n" ] } ], "source": [ "# split function\n", "txt = \"The rain in Spain\"\n", "x = re.split(r\"\\s\", txt)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:15.514395Z", "start_time": "2023-09-18T03:48:15.416765Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "YEA5w9gdkrbE", "outputId": "f869d498-864b-4216-e75f-d528e165a561" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['The', 'rain in Spain']\n" ] } ], "source": [ "# Split the string only at the first occurrence (maxsplit is passed by keyword):\n", "txt = \"The rain in Spain\"\n", "x = re.split(r\"\\s\", txt, maxsplit=1)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:16.714523Z", "start_time": "2023-09-18T03:48:16.637813Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "CLliIj2GkrbE", "outputId": "9e497535-bc17-4a14-ad60-8888d0d24f1a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The9rain9in9Spain\n" ] } ], "source": [ "# Replace every white-space character with the number 9:\n", "txt = \"The rain in Spain\"\n", "x = re.sub(r\"\\s\", \"9\", txt)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:17.714569Z", "start_time": "2023-09-18T03:48:17.646947Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "L0yXdAeokrbF", "outputId": 
"289c797b-9467-47b5-c33b-d1b518b0ccaa" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The9rain9in Spain\n" ] } ], "source": [ "# Replace the first 2 occurrences (count is passed by keyword):\n", "txt = \"The rain in Spain\"\n", "x = re.sub(r\"\\s\", \"9\", txt, count=2)\n", "print(x)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:18.316287Z", "start_time": "2023-09-18T03:48:18.307583Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "0Iis9jAfkrbF", "outputId": "00d31da8-eb5e-4731-a95e-ec76270aed6b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Do a search that will return a Match Object:\n", "txt = \"The rain in Spain\"\n", "x = re.search(\"ai\", txt)\n", "print(x)  # prints the Match object" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:19.002495Z", "start_time": "2023-09-18T03:48:18.993177Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "Y7q-Q085krbF", "outputId": "41c42c6d-7a4c-41a9-e8cc-fddd1d235215" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(12, 17)\n" ] } ], "source": [ "# Print the position (start- and end-position) of the first match occurrence.\n", "# The regular expression looks for any word that starts with an upper case \"S\":\n", "txt = \"The rain in Spain\"\n", "x = re.search(r\"\\bS\\w+\", txt)\n", "print(x.span())" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:19.813881Z", "start_time": "2023-09-18T03:48:19.779228Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "zrwiUxrRkrbF", "outputId": "e14d8a93-f22d-4c06-f1bb-90bdd1a4a632" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The rain in Spain\n" ] } ], "source": [ "# Print the string passed into the function:\n", "txt = \"The rain in Spain\"\n", "x = 
re.search(r\"\\bS\\w+\", txt)\n", "print(x.string)\n" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:20.914398Z", "start_time": "2023-09-18T03:48:20.874691Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "0kLFYrlMkrbF", "outputId": "1049b244-42e4-4cfb-bd4d-08cc8353ce77" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spain\n" ] } ], "source": [ "# Print the part of the string where there was a match.\n", "# The regular expression looks for any word that starts with an upper case \"S\":\n", "txt = \"The rain in Spain\"\n", "x = re.search(r\"\\bS\\w+\", txt)\n", "print(x.group())" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:48:22.314638Z", "start_time": "2023-09-18T03:48:22.223651Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "vFgGxVlNkrbF", "outputId": "23dcce9a-e115-47d1-848b-5d6d82e3455d", "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['test@outlook.com', '123456@qq.com']\n" ] } ], "source": [ "# Construct a regular expression that extracts e-mail addresses.\n", "# The local part may contain dots and plus signs; the trailing + on the last group\n", "# lets the domain have several dot-separated labels (e.g. mail.example.co.uk).\n", "pattern = re.compile(r\"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9_-]+(?:\\.[a-zA-Z0-9_-]+)+\")\n", "\n", "strs = 'My personal e-mail is test@outlook.com, company e-mail is 123456@qq.com'\n", "result = pattern.findall(strs)\n", "\n", "print(result)" ] }, { "cell_type": "markdown", "metadata": { "id": "F1N4BmOXkrbF" }, "source": [ "## 4. Practice" ] }, { "cell_type": "markdown", "metadata": { "id": "9xyENVXekrbF" }, "source": [ "**data preprocessing**" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "VXn30yzAkrbF", "outputId": "cb53816a-a2b8-4330-e37a-c3681ad90037" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
01234NaN
123456.0
234567.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 2 3 4 NaN\n", "1 2 3 4 5 6.0\n", "2 3 4 5 6 7.0" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# the first row has only four values, so pandas fills column 4 of that row with NaN\n", "a = [[1, 2, 3, 4, ],\n", " [2, 3, 4, 5, 6],\n", " [3, 4, 5, 6, 7]]\n", "data = pd.DataFrame(a)\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "hSA5JVM8krbF", "outputId": "b34104e3-96b1-4df2-f112-54be3824a71c" }, "outputs": [], "source": [ "# obtain the normalized data\n", "norm_data = None  # TODO: insert your answer here\n", "norm_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 143 }, "id": "kaN38zLnkrbF", "outputId": "4a6ec00b-a0ce-46e8-e558-be50a868c30e" }, "outputs": [], "source": [ "# obtain the standardized data\n", "std_data = None  # TODO: insert your answer here\n", "std_data" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:22:05.915456Z", "start_time": "2023-09-18T03:22:05.884455Z" }, "id": "fOVc5VOKkrbG" }, "source": [ "**regular expression**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2023-09-18T03:20:34.514476Z", "start_time": "2023-09-18T03:20:34.444321Z" }, "colab": { "base_uri": "https://localhost:8080/" }, "id": "g11XHzTckrbG", "outputId": 
"3b0c5f33-c94e-4943-a0d5-ed3c6fba867e" }, "outputs": [], "source": [ "# Construct a regular expression that extracts the dates\n", "pattern = None  # TODO: insert your answer here\n", "strs = 'Today is 2022/09/13, today in the last year is 2021.09.13, today in the next year is 2023-09-13'\n", "result = None  # TODO: insert your answer here\n", "print(result)\n", "# Expected output: ['2022/09/13', '2021.09.13', '2023-09-13']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "KMFLhR3fkrbG" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "good", "language": "python", "name": "good" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "vscode": { "interpreter": { "hash": "88279d2366fe020547cde40dd65aa0e3aa662a6ec1f3ca12d88834876c85e1a6" } } }, "nbformat": 4, "nbformat_minor": 4 }