{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "sample_datasets.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyMHJcfhFEEQm/LsB7gh7HZ0",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
" "
]
},
{
"cell_type": "code",
"source": [
"# Install pydataset into the running kernel's environment; 0.2.0 is the release shown in this cell's recorded output\n",
"%pip install pydataset==0.2.0"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WHg_b6MTGuZu",
"outputId": "ecbfd75a-bfbd-4967-e566-a5594e7456ad"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting pydataset\n",
" Downloading pydataset-0.2.0.tar.gz (15.9 MB)\n",
"\u001b[K |████████████████████████████████| 15.9 MB 3.8 MB/s \n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from pydataset) (1.3.5)\n",
"Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (1.21.6)\n",
"Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (2022.1)\n",
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pydataset) (2.8.2)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->pydataset) (1.15.0)\n",
"Building wheels for collected packages: pydataset\n",
" Building wheel for pydataset (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939432 sha256=a806d6fc5bf803cbd7f0adf0dadb93635522366f99f7334a9dcb209087a69360\n",
" Stored in directory: /root/.cache/pip/wheels/32/26/30/d71562a19eed948eaada9a61b4d722fa358657a3bfb5d151e2\n",
"Successfully built pydataset\n",
"Installing collected packages: pydataset\n",
"Successfully installed pydataset-0.2.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Bring in pydataset's loader function\n",
"from pydataset import data\n",
"\n",
"# Called with no arguments, data() returns the catalogue of available datasets (id and title)\n",
"print(data())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "As1heSJCGuWg",
"outputId": "b0792618-1712-4115-c189-5404ebfffc8f"
},
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" dataset_id title\n",
"0 AirPassengers Monthly Airline Passenger Numbers 1949-1960\n",
"1 BJsales Sales Data with Leading Indicator\n",
"2 BOD Biochemical Oxygen Demand\n",
"3 Formaldehyde Determination of Formaldehyde\n",
"4 HairEyeColor Hair and Eye Color of Statistics Students\n",
".. ... ...\n",
"752 VerbAgg Verbal Aggression item responses\n",
"753 cake Breakage Angle of Chocolate Cakes\n",
"754 cbpp Contagious bovine pleuropneumonia\n",
"755 grouseticks Data on red grouse ticks from Elston et al. 2001\n",
"756 sleepstudy Reaction times in a sleep deprivation study\n",
"\n",
"[757 rows x 2 columns]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"def glimpse(df, n=5):\n",
"    \"\"\"Print the shape of a DataFrame, then show its first and last rows.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Table to inspect.\n",
"    n : int, default 5\n",
"        Number of rows to show from each end of the table.\n",
"    \"\"\"\n",
"    print(f\"{df.shape[0]} rows and {df.shape[1]} columns\")\n",
"    # display() is supplied by IPython in notebook sessions and renders the rich table repr\n",
"    display(df.head(n))\n",
"    display(df.tail(n))"
],
"metadata": {
"id": "S8ag0sluGuTZ"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Passing a dataset id to data() returns that dataset as a dataframe\n",
"df = data('cake')\n",
"glimpse(df)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 413
},
"id": "Mab4X3gbGuRE",
"outputId": "93a85f27-34e0-4927-a919-4cc9107a7009"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"270 rows and 5 columns\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" replicate recipe temperature angle temp\n",
"1 1 A 175 42 175\n",
"2 1 A 185 46 185\n",
"3 1 A 195 47 195\n",
"4 1 A 205 39 205\n",
"5 1 A 215 53 215"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" replicate \n",
" recipe \n",
" temperature \n",
" angle \n",
" temp \n",
" \n",
" \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
" A \n",
" 175 \n",
" 42 \n",
" 175 \n",
" \n",
" \n",
" 2 \n",
" 1 \n",
" A \n",
" 185 \n",
" 46 \n",
" 185 \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
" A \n",
" 195 \n",
" 47 \n",
" 195 \n",
" \n",
" \n",
" 4 \n",
" 1 \n",
" A \n",
" 205 \n",
" 39 \n",
" 205 \n",
" \n",
" \n",
" 5 \n",
" 1 \n",
" A \n",
" 215 \n",
" 53 \n",
" 215 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" replicate recipe temperature angle temp\n",
"266 15 C 185 28 185\n",
"267 15 C 195 25 195\n",
"268 15 C 205 25 205\n",
"269 15 C 215 31 215\n",
"270 15 C 225 25 225"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" replicate \n",
" recipe \n",
" temperature \n",
" angle \n",
" temp \n",
" \n",
" \n",
" \n",
" \n",
" 266 \n",
" 15 \n",
" C \n",
" 185 \n",
" 28 \n",
" 185 \n",
" \n",
" \n",
" 267 \n",
" 15 \n",
" C \n",
" 195 \n",
" 25 \n",
" 195 \n",
" \n",
" \n",
" 268 \n",
" 15 \n",
" C \n",
" 205 \n",
" 25 \n",
" 205 \n",
" \n",
" \n",
" 269 \n",
" 15 \n",
" C \n",
" 215 \n",
" 31 \n",
" 215 \n",
" \n",
" \n",
" 270 \n",
" 15 \n",
" C \n",
" 225 \n",
" 25 \n",
" 225 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# seaborn is imported under its conventional alias\n",
"import seaborn as sns\n",
"\n",
"# Print the names of the example datasets seaborn can load\n",
"print(sns.get_dataset_names())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "liCs1voBGuOx",
"outputId": "b5222ee7-176f-4dc2-cad6-d5574cfd83ba"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'taxis', 'tips', 'titanic']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Load one of the example datasets by name as a dataframe\n",
"df = sns.load_dataset('flights')\n",
"glimpse(df)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 413
},
"id": "dmH_dMegKY-M",
"outputId": "5ec16c95-3cc3-439e-94d5-f9ebc186ccd1"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"144 rows and 3 columns\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" year month passengers\n",
"0 1949 Jan 112\n",
"1 1949 Feb 118\n",
"2 1949 Mar 132\n",
"3 1949 Apr 129\n",
"4 1949 May 121"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" year \n",
" month \n",
" passengers \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1949 \n",
" Jan \n",
" 112 \n",
" \n",
" \n",
" 1 \n",
" 1949 \n",
" Feb \n",
" 118 \n",
" \n",
" \n",
" 2 \n",
" 1949 \n",
" Mar \n",
" 132 \n",
" \n",
" \n",
" 3 \n",
" 1949 \n",
" Apr \n",
" 129 \n",
" \n",
" \n",
" 4 \n",
" 1949 \n",
" May \n",
" 121 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" year month passengers\n",
"139 1960 Aug 606\n",
"140 1960 Sep 508\n",
"141 1960 Oct 461\n",
"142 1960 Nov 390\n",
"143 1960 Dec 432"
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" year \n",
" month \n",
" passengers \n",
" \n",
" \n",
" \n",
" \n",
" 139 \n",
" 1960 \n",
" Aug \n",
" 606 \n",
" \n",
" \n",
" 140 \n",
" 1960 \n",
" Sep \n",
" 508 \n",
" \n",
" \n",
" 141 \n",
" 1960 \n",
" Oct \n",
" 461 \n",
" \n",
" \n",
" 142 \n",
" 1960 \n",
" Nov \n",
" 390 \n",
" \n",
" \n",
" 143 \n",
" 1960 \n",
" Dec \n",
" 432 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Import the dataset fetcher\n",
"from sklearn.datasets import fetch_california_housing\n",
"\n",
"# Load data as pandas objects (will download the data if it's the first time loading)\n",
"housing = fetch_california_housing(as_frame=True)\n",
"# With as_frame=True the result already carries features and target together as .frame,\n",
"# so there is no need to join the two pieces by hand\n",
"df = housing.frame\n",
"glimpse(df)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 413
},
"id": "HMEilRlhKY7i",
"outputId": "392243f4-a55e-40ae-aa44-6947e672ebf8"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"20640 rows and 9 columns\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n",
"0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n",
"1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n",
"2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n",
"3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n",
"4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n",
"\n",
" Longitude MedHouseVal \n",
"0 -122.23 4.526 \n",
"1 -122.22 3.585 \n",
"2 -122.24 3.521 \n",
"3 -122.25 3.413 \n",
"4 -122.25 3.422 "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" MedInc \n",
" HouseAge \n",
" AveRooms \n",
" AveBedrms \n",
" Population \n",
" AveOccup \n",
" Latitude \n",
" Longitude \n",
" MedHouseVal \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 8.3252 \n",
" 41.0 \n",
" 6.984127 \n",
" 1.023810 \n",
" 322.0 \n",
" 2.555556 \n",
" 37.88 \n",
" -122.23 \n",
" 4.526 \n",
" \n",
" \n",
" 1 \n",
" 8.3014 \n",
" 21.0 \n",
" 6.238137 \n",
" 0.971880 \n",
" 2401.0 \n",
" 2.109842 \n",
" 37.86 \n",
" -122.22 \n",
" 3.585 \n",
" \n",
" \n",
" 2 \n",
" 7.2574 \n",
" 52.0 \n",
" 8.288136 \n",
" 1.073446 \n",
" 496.0 \n",
" 2.802260 \n",
" 37.85 \n",
" -122.24 \n",
" 3.521 \n",
" \n",
" \n",
" 3 \n",
" 5.6431 \n",
" 52.0 \n",
" 5.817352 \n",
" 1.073059 \n",
" 558.0 \n",
" 2.547945 \n",
" 37.85 \n",
" -122.25 \n",
" 3.413 \n",
" \n",
" \n",
" 4 \n",
" 3.8462 \n",
" 52.0 \n",
" 6.281853 \n",
" 1.081081 \n",
" 565.0 \n",
" 2.181467 \n",
" 37.85 \n",
" -122.25 \n",
" 3.422 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n",
"20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 \n",
"20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 \n",
"20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 \n",
"20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 \n",
"20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 \n",
"\n",
" Longitude MedHouseVal \n",
"20635 -121.09 0.781 \n",
"20636 -121.21 0.771 \n",
"20637 -121.22 0.923 \n",
"20638 -121.32 0.847 \n",
"20639 -121.24 0.894 "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" MedInc \n",
" HouseAge \n",
" AveRooms \n",
" AveBedrms \n",
" Population \n",
" AveOccup \n",
" Latitude \n",
" Longitude \n",
" MedHouseVal \n",
" \n",
" \n",
" \n",
" \n",
" 20635 \n",
" 1.5603 \n",
" 25.0 \n",
" 5.045455 \n",
" 1.133333 \n",
" 845.0 \n",
" 2.560606 \n",
" 39.48 \n",
" -121.09 \n",
" 0.781 \n",
" \n",
" \n",
" 20636 \n",
" 2.5568 \n",
" 18.0 \n",
" 6.114035 \n",
" 1.315789 \n",
" 356.0 \n",
" 3.122807 \n",
" 39.49 \n",
" -121.21 \n",
" 0.771 \n",
" \n",
" \n",
" 20637 \n",
" 1.7000 \n",
" 17.0 \n",
" 5.205543 \n",
" 1.120092 \n",
" 1007.0 \n",
" 2.325635 \n",
" 39.43 \n",
" -121.22 \n",
" 0.923 \n",
" \n",
" \n",
" 20638 \n",
" 1.8672 \n",
" 18.0 \n",
" 5.329513 \n",
" 1.171920 \n",
" 741.0 \n",
" 2.123209 \n",
" 39.43 \n",
" -121.32 \n",
" 0.847 \n",
" \n",
" \n",
" 20639 \n",
" 2.3886 \n",
" 16.0 \n",
" 5.254717 \n",
" 1.162264 \n",
" 1387.0 \n",
" 2.616981 \n",
" 39.37 \n",
" -121.24 \n",
" 0.894 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# statsmodels exposes its bundled datasets under sm.datasets\n",
"import statsmodels.api as sm\n",
"\n",
"# load_pandas() returns a dataset object; its data entry is the dataframe\n",
"df = sm.datasets.macrodata.load_pandas().data\n",
"glimpse(df)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 467
},
"id": "SRbMPdLKKY5J",
"outputId": "36a9e4f2-d5a9-4a86-c6a1-9684afe58e71"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
" import pandas.util.testing as tm\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"203 rows and 14 columns\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" year quarter realgdp realcons realinv realgovt realdpi cpi \\\n",
"0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 \n",
"1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 \n",
"2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 \n",
"3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 \n",
"4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 \n",
"\n",
" m1 tbilrate unemp pop infl realint \n",
"0 139.7 2.82 5.8 177.146 0.00 0.00 \n",
"1 141.7 3.08 5.1 177.830 2.34 0.74 \n",
"2 140.5 3.82 5.3 178.657 2.74 1.09 \n",
"3 140.0 4.33 5.6 179.386 0.27 4.06 \n",
"4 139.6 3.50 5.2 180.007 2.31 1.19 "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" year \n",
" quarter \n",
" realgdp \n",
" realcons \n",
" realinv \n",
" realgovt \n",
" realdpi \n",
" cpi \n",
" m1 \n",
" tbilrate \n",
" unemp \n",
" pop \n",
" infl \n",
" realint \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1959.0 \n",
" 1.0 \n",
" 2710.349 \n",
" 1707.4 \n",
" 286.898 \n",
" 470.045 \n",
" 1886.9 \n",
" 28.98 \n",
" 139.7 \n",
" 2.82 \n",
" 5.8 \n",
" 177.146 \n",
" 0.00 \n",
" 0.00 \n",
" \n",
" \n",
" 1 \n",
" 1959.0 \n",
" 2.0 \n",
" 2778.801 \n",
" 1733.7 \n",
" 310.859 \n",
" 481.301 \n",
" 1919.7 \n",
" 29.15 \n",
" 141.7 \n",
" 3.08 \n",
" 5.1 \n",
" 177.830 \n",
" 2.34 \n",
" 0.74 \n",
" \n",
" \n",
" 2 \n",
" 1959.0 \n",
" 3.0 \n",
" 2775.488 \n",
" 1751.8 \n",
" 289.226 \n",
" 491.260 \n",
" 1916.4 \n",
" 29.35 \n",
" 140.5 \n",
" 3.82 \n",
" 5.3 \n",
" 178.657 \n",
" 2.74 \n",
" 1.09 \n",
" \n",
" \n",
" 3 \n",
" 1959.0 \n",
" 4.0 \n",
" 2785.204 \n",
" 1753.7 \n",
" 299.356 \n",
" 484.052 \n",
" 1931.3 \n",
" 29.37 \n",
" 140.0 \n",
" 4.33 \n",
" 5.6 \n",
" 179.386 \n",
" 0.27 \n",
" 4.06 \n",
" \n",
" \n",
" 4 \n",
" 1960.0 \n",
" 1.0 \n",
" 2847.699 \n",
" 1770.5 \n",
" 331.722 \n",
" 462.199 \n",
" 1955.5 \n",
" 29.54 \n",
" 139.6 \n",
" 3.50 \n",
" 5.2 \n",
" 180.007 \n",
" 2.31 \n",
" 1.19 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" year quarter realgdp realcons realinv realgovt realdpi \\\n",
"198 2008.0 3.0 13324.600 9267.7 1990.693 991.551 9838.3 \n",
"199 2008.0 4.0 13141.920 9195.3 1857.661 1007.273 9920.4 \n",
"200 2009.0 1.0 12925.410 9209.2 1558.494 996.287 9926.4 \n",
"201 2009.0 2.0 12901.504 9189.0 1456.678 1023.528 10077.5 \n",
"202 2009.0 3.0 12990.341 9256.0 1486.398 1044.088 10040.6 \n",
"\n",
" cpi m1 tbilrate unemp pop infl realint \n",
"198 216.889 1474.7 1.17 6.0 305.270 -3.16 4.33 \n",
"199 212.174 1576.5 0.12 6.9 305.952 -8.79 8.91 \n",
"200 212.671 1592.8 0.22 8.1 306.547 0.94 -0.71 \n",
"201 214.469 1653.6 0.18 9.2 307.226 3.37 -3.19 \n",
"202 216.385 1673.9 0.12 9.6 308.013 3.56 -3.44 "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" year \n",
" quarter \n",
" realgdp \n",
" realcons \n",
" realinv \n",
" realgovt \n",
" realdpi \n",
" cpi \n",
" m1 \n",
" tbilrate \n",
" unemp \n",
" pop \n",
" infl \n",
" realint \n",
" \n",
" \n",
" \n",
" \n",
" 198 \n",
" 2008.0 \n",
" 3.0 \n",
" 13324.600 \n",
" 9267.7 \n",
" 1990.693 \n",
" 991.551 \n",
" 9838.3 \n",
" 216.889 \n",
" 1474.7 \n",
" 1.17 \n",
" 6.0 \n",
" 305.270 \n",
" -3.16 \n",
" 4.33 \n",
" \n",
" \n",
" 199 \n",
" 2008.0 \n",
" 4.0 \n",
" 13141.920 \n",
" 9195.3 \n",
" 1857.661 \n",
" 1007.273 \n",
" 9920.4 \n",
" 212.174 \n",
" 1576.5 \n",
" 0.12 \n",
" 6.9 \n",
" 305.952 \n",
" -8.79 \n",
" 8.91 \n",
" \n",
" \n",
" 200 \n",
" 2009.0 \n",
" 1.0 \n",
" 12925.410 \n",
" 9209.2 \n",
" 1558.494 \n",
" 996.287 \n",
" 9926.4 \n",
" 212.671 \n",
" 1592.8 \n",
" 0.22 \n",
" 8.1 \n",
" 306.547 \n",
" 0.94 \n",
" -0.71 \n",
" \n",
" \n",
" 201 \n",
" 2009.0 \n",
" 2.0 \n",
" 12901.504 \n",
" 9189.0 \n",
" 1456.678 \n",
" 1023.528 \n",
" 10077.5 \n",
" 214.469 \n",
" 1653.6 \n",
" 0.18 \n",
" 9.2 \n",
" 307.226 \n",
" 3.37 \n",
" -3.19 \n",
" \n",
" \n",
" 202 \n",
" 2009.0 \n",
" 3.0 \n",
" 12990.341 \n",
" 9256.0 \n",
" 1486.398 \n",
" 1044.088 \n",
" 10040.6 \n",
" 216.385 \n",
" 1673.9 \n",
" 0.12 \n",
" 9.6 \n",
" 308.013 \n",
" 3.56 \n",
" -3.44 \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Install NLTK into the running kernel's environment (explicit magic, no reliance on automagic)\n",
"%pip install nltk"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "V4i-MFxeKY2x",
"outputId": "03ea599b-5ec9-433e-ecc1-7dea252b72e7"
},
"execution_count": 32,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: nltk in /usr/local/lib/python3.7/dist-packages (3.2.5)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from nltk) (1.15.0)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# NLTK provides the corpus downloader\n",
"import nltk\n",
"\n",
"# Fetch the movie_reviews corpus; this is only needed the first time\n",
"nltk.download('movie_reviews')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gnQAwiSgKY0M",
"outputId": "0d02eb88-0117-4eaa-b71d-1fb3acf3647c"
},
"execution_count": 33,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[nltk_data] Downloading package movie_reviews to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/movie_reviews.zip.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 33
}
]
},
{
"cell_type": "code",
"source": [
"# Import packages\n",
"import pandas as pd\n",
"from nltk.corpus import movie_reviews\n",
"\n",
"# Build (label, raw text) pairs; the label is the part of each file id before the first slash\n",
"documents = [\n",
"    (fileid.split('/', 1)[0], movie_reviews.raw(fileid))\n",
"    for fileid in movie_reviews.fileids()\n",
"]\n",
"# Convert to dataframe: one row per review\n",
"df = pd.DataFrame(documents, columns=['target', 'document'])\n",
"glimpse(df)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 413
},
"id": "7d37DToOKYx7",
"outputId": "db6112c3-9013-42ae-af8c-2bb0db2e93bd"
},
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"2000 rows and 2 columns\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" target \\\n",
"0 neg \n",
"1 neg \n",
"2 neg \n",
"3 neg \n",
"4 neg \n",
"\n",
" document \n",
"0 plot : two teen couples go to a church party , drink and then drive . \\nthey get into ... \n",
"1 the happy bastard's quick movie review \\ndamn that y2k bug . \\nit's got a head start i... \n",
"2 it is movies like these that make a jaded movie viewer thankful for the invention of t... \n",
"3 \" quest for camelot \" is warner bros . ' first feature-length , fully-animated attemp... \n",
"4 synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potenti... "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" target \n",
" document \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" neg \n",
" plot : two teen couples go to a church party , drink and then drive . \\nthey get into ... \n",
" \n",
" \n",
" 1 \n",
" neg \n",
" the happy bastard's quick movie review \\ndamn that y2k bug . \\nit's got a head start i... \n",
" \n",
" \n",
" 2 \n",
" neg \n",
" it is movies like these that make a jaded movie viewer thankful for the invention of t... \n",
" \n",
" \n",
" 3 \n",
" neg \n",
" \" quest for camelot \" is warner bros . ' first feature-length , fully-animated attemp... \n",
" \n",
" \n",
" 4 \n",
" neg \n",
" synopsis : a mentally unstable man undergoing psychotherapy saves a boy from a potenti... \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" target \\\n",
"1995 pos \n",
"1996 pos \n",
"1997 pos \n",
"1998 pos \n",
"1999 pos \n",
"\n",
" document \n",
"1995 wow ! what a movie . \\nit's everything a movie can be : funny , dramatic , interesting... \n",
"1996 richard gere can be a commanding actor , but he's not always in great films . \\neveryt... \n",
"1997 glory--starring matthew broderick , denzel washington , and morgan freeman--is the tru... \n",
"1998 steven spielberg's second epic film on world war ii is an unquestioned masterpiece of ... \n",
"1999 truman ( \" true-man \" ) burbank is the perfect name for jim carrey's character in this... "
],
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" target \n",
" document \n",
" \n",
" \n",
" \n",
" \n",
" 1995 \n",
" pos \n",
" wow ! what a movie . \\nit's everything a movie can be : funny , dramatic , interesting... \n",
" \n",
" \n",
" 1996 \n",
" pos \n",
" richard gere can be a commanding actor , but he's not always in great films . \\neveryt... \n",
" \n",
" \n",
" 1997 \n",
" pos \n",
" glory--starring matthew broderick , denzel washington , and morgan freeman--is the tru... \n",
" \n",
" \n",
" 1998 \n",
" pos \n",
" steven spielberg's second epic film on world war ii is an unquestioned masterpiece of ... \n",
" \n",
" \n",
" 1999 \n",
" pos \n",
" truman ( \" true-man \" ) burbank is the perfect name for jim carrey's character in this... \n",
" \n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {}
}
]
}
]
}