{
 "cells": [
  {
   "cell_type": "markdown", "metadata": {},
   "source": [
    "# Handling missing values (NaN) in pandas\n",
    "\n",
    "Worked examples of removing (`dropna`), replacing (`fillna`, `ffill`, `bfill`) and detecting (`isnull`) missing values in a `DataFrame` and a `Series`."
   ]
  },
  {
   "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import math"
   ]
  },
  {
   "cell_type": "markdown", "metadata": {},
   "source": [
    "## Load the sample data and inspect NaN\n",
    "\n",
    "The sample CSV loads with its missing values represented as `NaN`. `NaN` never compares equal to anything (including `np.nan` itself), so test for it with `pd.isnull`, `np.isnan` or `math.isnan` instead of `==`."
   ]
  },
  {
   "cell_type": "code", "execution_count": 2, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "1 NaN NaN NaN NaN NaN\n",
     "2 Charlie NaN CA NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen NaN CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "df = pd.read_csv('data/src/sample_pandas_normal_nan.csv')\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 3, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "name object\n",
     "age float64\n",
     "state object\n",
     "point float64\n",
     "other float64\n",
     "dtype: object\n"
    ] }
   ],
   "source": [
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 4, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "nan\n",
     "<class 'float'>\n"
    ] }
   ],
   "source": [
    "print(df.at[1, 'name'])\n",
    "print(type(df.at[1, 'name']))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 5, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "nan\n",
     "<class 'numpy.float64'>\n"
    ] }
   ],
   "source": [
    "print(df.at[0, 'point'])\n",
    "print(type(df.at[0, 'point']))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 6, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "True\n",
     "True\n",
     "True\n"
    ] }
   ],
   "source": [
    "print(pd.isnull(df.at[0, 'point']))\n",
    "print(np.isnan(df.at[0, 'point']))\n",
    "print(math.isnan(df.at[0, 'point']))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 7, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "False\n"
    ] }
   ],
   "source": [
    "print(df.at[0, 'point'] == np.nan)"
   ]
  },
  {
   "cell_type": "markdown", "metadata": {},
   "source": [
    "## Remove missing values: `dropna()`\n",
    "\n",
    "- `how='all'` drops rows (or columns with `axis=1`) in which every value is missing; `how='any'` (the default) drops those containing at least one missing value.\n",
    "- `thresh=n` keeps only rows/columns that have at least `n` non-missing values.\n",
    "- `subset` limits the check to the given column labels (or row labels when `axis=1`)."
   ]
  },
  {
   "cell_type": "code", "execution_count": 8, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "2 Charlie NaN CA NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen NaN CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(how='all'))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 9, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 Alice 24.0 NY NaN\n",
     "1 NaN NaN NaN NaN\n",
     "2 Charlie NaN CA NaN\n",
     "3 Dave 68.0 TX 70.0\n",
     "4 Ellen NaN CA 88.0\n",
     "5 Frank 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(how='all', axis=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 10, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 Alice 24.0 NY NaN\n",
     "2 Charlie NaN CA NaN\n",
     "3 Dave 68.0 TX 70.0\n",
     "4 Ellen NaN CA 88.0\n",
     "5 Frank 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(how='all').dropna(how='all', axis=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 11, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 Alice 24.0 NY NaN\n",
     "2 Charlie NaN CA NaN\n",
     "3 Dave 68.0 TX 70.0\n",
     "4 Ellen NaN CA 88.0\n",
     "5 Frank 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "df2 = df.dropna(how='all').dropna(how='all', axis=1)\n",
    "print(df2)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 12, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "3 Dave 68.0 TX 70.0\n"
    ] }
   ],
   "source": [
    "print(df2.dropna(how='any'))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 13, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "3 Dave 68.0 TX 70.0\n"
    ] }
   ],
   "source": [
    "print(df2.dropna())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 14, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name\n",
     "0 Alice\n",
     "2 Charlie\n",
     "3 Dave\n",
     "4 Ellen\n",
     "5 Frank\n"
    ] }
   ],
   "source": [
    "print(df2.dropna(how='any', axis=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 15, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen NaN CA 88.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(thresh=3))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 16, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state\n",
     "0 Alice 24.0 NY\n",
     "1 NaN NaN NaN\n",
     "2 Charlie NaN CA\n",
     "3 Dave 68.0 TX\n",
     "4 Ellen NaN CA\n",
     "5 Frank 30.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(thresh=3, axis=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 17, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(subset=['age']))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 18, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(subset=['age', 'state']))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 19, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "2 Charlie NaN CA NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen NaN CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(subset=['age', 'state'], how='all'))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 20, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name state\n",
     "0 Alice NY\n",
     "1 NaN NaN\n",
     "2 Charlie CA\n",
     "3 Dave TX\n",
     "4 Ellen CA\n",
     "5 Frank NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(subset=[0, 4], axis=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 21, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 Alice 24.0 NY NaN\n",
     "1 NaN NaN NaN NaN\n",
     "2 Charlie NaN CA NaN\n",
     "3 Dave 68.0 TX 70.0\n",
     "4 Ellen NaN CA 88.0\n",
     "5 Frank 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.dropna(subset=[0, 4], axis=1, how='all'))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 22, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 24.0\n",
     "1 NaN\n",
     "2 NaN\n",
     "3 68.0\n",
     "4 NaN\n",
     "5 30.0\n",
     "Name: age, dtype: float64\n"
    ] }
   ],
   "source": [
    "s = df['age']\n",
    "print(s)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 23, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 24.0\n",
     "3 68.0\n",
     "5 30.0\n",
     "Name: age, dtype: float64\n"
    ] }
   ],
   "source": [
    "print(s.dropna())"
   ]
  },
  {
   "cell_type": "markdown", "metadata": {},
   "source": [
    "## Replace missing values: `fillna()`, `ffill()`, `bfill()`\n",
    "\n",
    "- A scalar fills every `NaN`; a dict or `Series` keyed by column name fills per column (for a `Series`, the dict keys are index labels).\n",
    "- `numeric_only=True` is passed to `mean()` / `median()` because `df` contains string columns; pandas 2.0 and later raise `TypeError` without it.\n",
    "- `ffill()` / `bfill()` are used instead of `fillna(method=...)`, which is deprecated since pandas 2.1."
   ]
  },
  {
   "cell_type": "code", "execution_count": 24, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY 0.0 0.0\n",
     "1 0 0.0 0 0.0 0.0\n",
     "2 Charlie 0.0 CA 0.0 0.0\n",
     "3 Dave 68.0 TX 70.0 0.0\n",
     "4 Ellen 0.0 CA 88.0 0.0\n",
     "5 Frank 30.0 0 0.0 0.0\n"
    ] }
   ],
   "source": [
    "print(df.fillna(0))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 25, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY 0.0 NaN\n",
     "1 XXX 20.0 NaN 0.0 NaN\n",
     "2 Charlie 20.0 CA 0.0 NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 20.0 CA 88.0 NaN\n",
     "5 Frank 30.0 NaN 0.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.fillna({'name': 'XXX', 'age': 20, 'point': 0}))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 26, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "name ZZZ\n",
     "age 100\n",
     "dtype: object\n"
    ] }
   ],
   "source": [
    "s_for_fill = pd.Series(['ZZZ', 100], index=['name', 'age'])\n",
    "print(s_for_fill)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 27, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "1 ZZZ 100.0 NaN NaN NaN\n",
     "2 Charlie 100.0 CA NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 100.0 CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.fillna(s_for_fill))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 28, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "age 40.666667\n",
     "point 79.000000\n",
     "other NaN\n",
     "dtype: float64\n"
    ] }
   ],
   "source": [
    "print(df.mean(numeric_only=True))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 29, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.000000 NY 79.0 NaN\n",
     "1 NaN 40.666667 NaN 79.0 NaN\n",
     "2 Charlie 40.666667 CA 79.0 NaN\n",
     "3 Dave 68.000000 TX 70.0 NaN\n",
     "4 Ellen 40.666667 CA 88.0 NaN\n",
     "5 Frank 30.000000 NaN 79.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.fillna(df.mean(numeric_only=True)))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 30, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY 79.0 NaN\n",
     "1 NaN 30.0 NaN 79.0 NaN\n",
     "2 Charlie 30.0 CA 79.0 NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 30.0 CA 88.0 NaN\n",
     "5 Frank 30.0 NaN 79.0 NaN\n"
    ] },
    { "name": "stderr", "output_type": "stream", "text": [
     "/usr/local/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1018: RuntimeWarning: Mean of empty slice\n",
     " return np.nanmean(a, axis, out=out, keepdims=keepdims)\n",
     "/usr/local/lib/python3.6/site-packages/numpy/lib/function_base.py:4033: RuntimeWarning: All-NaN slice encountered\n",
     " r = func(a, **kwargs)\n"
    ] }
   ],
   "source": [
    "print(df.fillna(df.median(numeric_only=True)))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 31, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY 70.0 NaN\n",
     "1 Alice 24.0 CA 70.0 NaN\n",
     "2 Charlie 24.0 CA 70.0 NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 24.0 CA 88.0 NaN\n",
     "5 Frank 30.0 CA 70.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.fillna(df.mode().iloc[0]))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 32, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "1 Alice 24.0 NY NaN NaN\n",
     "2 Charlie 24.0 CA NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 68.0 CA 88.0 NaN\n",
     "5 Frank 30.0 CA 88.0 NaN\n"
    ] }
   ],
   "source": [
    "print(df.ffill())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 33, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY 70.0 NaN\n",
     "1 Charlie 68.0 CA 70.0 NaN\n",
     "2 Charlie 68.0 CA 70.0 NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 30.0 CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.bfill())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 34, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "1 Charlie NaN CA NaN NaN\n",
     "2 Charlie 68.0 CA 70.0 NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen 30.0 CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.bfill(limit=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 35, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 24.0\n",
     "1 NaN\n",
     "2 NaN\n",
     "3 68.0\n",
     "4 NaN\n",
     "5 30.0\n",
     "Name: age, dtype: float64\n"
    ] }
   ],
   "source": [
    "s = df['age']\n",
    "print(s)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 36, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 24.0\n",
     "1 100.0\n",
     "2 100.0\n",
     "3 68.0\n",
     "4 100.0\n",
     "5 30.0\n",
     "Name: age, dtype: float64\n"
    ] }
   ],
   "source": [
    "print(s.fillna(100))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 37, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 24.0\n",
     "1 100.0\n",
     "2 NaN\n",
     "3 68.0\n",
     "4 0.0\n",
     "5 30.0\n",
     "Name: age, dtype: float64\n"
    ] }
   ],
   "source": [
    "print(s.fillna({1: 100, 4: 0}))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 38, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 24.0\n",
     "1 NaN\n",
     "2 68.0\n",
     "3 68.0\n",
     "4 30.0\n",
     "5 30.0\n",
     "Name: age, dtype: float64\n"
    ] }
   ],
   "source": [
    "print(s.bfill(limit=1))"
   ]
  },
  {
   "cell_type": "markdown", "metadata": {},
   "source": [
    "## Detect missing values: `isnull()`\n",
    "\n",
    "`isnull()` returns a boolean mask; combine it with `any()` and boolean indexing to select the rows or columns that contain missing values."
   ]
  },
  {
   "cell_type": "code", "execution_count": 39, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "1 NaN NaN NaN NaN NaN\n",
     "2 Charlie NaN CA NaN NaN\n",
     "3 Dave 68.0 TX 70.0 NaN\n",
     "4 Ellen NaN CA 88.0 NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 40, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 True\n",
     "1 True\n",
     "2 True\n",
     "3 False\n",
     "4 False\n",
     "5 True\n",
     "Name: point, dtype: bool\n"
    ] }
   ],
   "source": [
    "print(df['point'].isnull())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 41, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point other\n",
     "0 Alice 24.0 NY NaN NaN\n",
     "1 NaN NaN NaN NaN NaN\n",
     "2 Charlie NaN CA NaN NaN\n",
     "5 Frank 30.0 NaN NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df[df['point'].isnull()])"
   ]
  },
  {
   "cell_type": "code", "execution_count": 42, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "name False\n",
     "age True\n",
     "state False\n",
     "point True\n",
     "other True\n",
     "Name: 2, dtype: bool\n"
    ] }
   ],
   "source": [
    "print(df.iloc[2].isnull())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 43, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " age point other\n",
     "0 24.0 NaN NaN\n",
     "1 NaN NaN NaN\n",
     "2 NaN NaN NaN\n",
     "3 68.0 70.0 NaN\n",
     "4 NaN 88.0 NaN\n",
     "5 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df.loc[:, df.iloc[2].isnull()])"
   ]
  },
  {
   "cell_type": "code", "execution_count": 44, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 Alice 24.0 NY NaN\n",
     "2 Charlie NaN CA NaN\n",
     "3 Dave 68.0 TX 70.0\n",
     "4 Ellen NaN CA 88.0\n",
     "5 Frank 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "df2 = df.dropna(how='all').dropna(how='all', axis=1)\n",
    "print(df2)"
   ]
  },
  {
   "cell_type": "code", "execution_count": 45, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 False False False True\n",
     "2 False True False True\n",
     "3 False False False False\n",
     "4 False True False False\n",
     "5 False False True True\n"
    ] }
   ],
   "source": [
    "print(df2.isnull())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 46, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "0 True\n",
     "2 True\n",
     "3 False\n",
     "4 True\n",
     "5 True\n",
     "dtype: bool\n"
    ] }
   ],
   "source": [
    "print(df2.isnull().any(axis=1))"
   ]
  },
  {
   "cell_type": "code", "execution_count": 47, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " name age state point\n",
     "0 Alice 24.0 NY NaN\n",
     "2 Charlie NaN CA NaN\n",
     "4 Ellen NaN CA 88.0\n",
     "5 Frank 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df2[df2.isnull().any(axis=1)])"
   ]
  },
  {
   "cell_type": "code", "execution_count": 48, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     "name False\n",
     "age True\n",
     "state True\n",
     "point True\n",
     "dtype: bool\n"
    ] }
   ],
   "source": [
    "print(df2.isnull().any())"
   ]
  },
  {
   "cell_type": "code", "execution_count": 49, "metadata": {},
   "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
     " age state point\n",
     "0 24.0 NY NaN\n",
     "2 NaN CA NaN\n",
     "3 68.0 TX 70.0\n",
     "4 NaN CA 88.0\n",
     "5 30.0 NaN NaN\n"
    ] }
   ],
   "source": [
    "print(df2.loc[:, df2.isnull().any()])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
  "language_info": {
   "codemirror_mode": { "name": "ipython", "version": 3 },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}