{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " name age state point\n", "0 Alice 24.0 NY 64.0\n", "1 NaN NaN NaN NaN\n", "2 Charlie 18.0 CA 70.0\n", "3 Dave 68.0 TX 70.0\n", "4 Ellen 24.0 CA 88.0\n", "5 Frank 30.0 NY 57.0\n" ] } ], "source": [ "df = pd.read_csv('data/src/sample_pandas_normal.csv')\n", "df.iloc[1] = np.nan\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['NY' nan 'CA' 'TX']\n", "<class 'numpy.ndarray'>\n" ] } ], "source": [ "u = df['state'].unique()\n", "print(u)\n", "print(type(u))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY 2\n", "CA 2\n", "TX 1\n", "Name: state, dtype: int64\n", "<class 'pandas.core.series.Series'>\n" ] } ], "source": [ "vc = df['state'].value_counts()\n", "print(vc)\n", "print(type(vc))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TX 1\n", "CA 2\n", "NY 2\n", "Name: state, dtype: int64\n" ] } ], "source": [ "print(df['state'].value_counts(ascending=True))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CA 2\n", "NY 2\n", "TX 1\n", "Name: state, dtype: int64\n" ] } ], "source": [ "print(df['state'].value_counts(sort=False))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY 2\n", "CA 2\n", "TX 1\n", "NaN 1\n", "Name: state, dtype: int64\n" ] } ], "source": [ "print(df['state'].value_counts(dropna=False))" ] }, { "cell_type": "code", "execution_count": 8, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY 0.333333\n", "CA 0.333333\n", "TX 0.166667\n", "NaN 0.166667\n", "Name: state, dtype: float64\n" ] } ], "source": [ "print(df['state'].value_counts(dropna=False, normalize=True))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3\n", "<class 'int'>\n" ] } ], "source": [ "nu = df['state'].nunique()\n", "print(nu)\n", "print(type(nu))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4\n" ] } ], "source": [ "print(df['state'].nunique(dropna=False))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name 5\n", "age 4\n", "state 3\n", "point 4\n", "dtype: int64\n", "<class 'pandas.core.series.Series'>\n" ] } ], "source": [ "nu_col = df.nunique()\n", "print(nu_col)\n", "print(type(nu_col))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name 6\n", "age 5\n", "state 4\n", "point 5\n", "dtype: int64\n" ] } ], "source": [ "print(df.nunique(dropna=False))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 4\n", "1 1\n", "2 4\n", "3 4\n", "4 4\n", "5 4\n", "dtype: int64\n" ] } ], "source": [ "print(df.nunique(dropna=False, axis='columns'))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3\n" ] } ], "source": [ "print(df['state'].nunique())" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name 5\n", "age 4\n", "state 3\n", "point 4\n", "dtype: int64\n" ] } ], "source": [ "print(df.nunique())" ] }, { "cell_type": "code", 
"execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['NY', nan, 'CA', 'TX']\n", "<class 'list'>\n" ] } ], "source": [ "print(df['state'].unique().tolist())\n", "print(type(df['state'].unique().tolist()))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['NY', 'CA', 'TX']\n", "<class 'list'>\n" ] } ], "source": [ "print(df['state'].value_counts().index.tolist())\n", "print(type(df['state'].value_counts().index.tolist()))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['NY' 'CA' 'TX' nan]\n", "<class 'numpy.ndarray'>\n" ] } ], "source": [ "print(df['state'].value_counts(dropna=False).index.values)\n", "print(type(df['state'].value_counts(dropna=False).index.values))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2\n" ] } ], "source": [ "print(df['state'].value_counts()['NY'])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2\n" ] } ], "source": [ "print(df['state'].value_counts().NY)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY : 2\n", "CA : 2\n", "TX : 1\n" ] } ], "source": [ "for index, value in df['state'].value_counts().items():\n", "    print(index, ': ', value)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'NY': 2, 'CA': 2, 'TX': 1}\n", "<class 'dict'>\n" ] } ], "source": [ "d = df['state'].value_counts().to_dict()\n", "print(d)\n", "print(type(d))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2\n" ] } ], "source": [ "print(d['NY'])" ] }, { "cell_type": 
"code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY : 2\n", "CA : 2\n", "TX : 1\n" ] } ], "source": [ "for key, value in d.items():\n", " print(key, ': ', value)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY 2\n", "CA 2\n", "TX 1\n", "Name: state, dtype: int64\n" ] } ], "source": [ "print(df['state'].value_counts())" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "NY\n" ] } ], "source": [ "print(df['state'].value_counts().index[0])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2\n" ] } ], "source": [ "print(df['state'].value_counts().iat[0])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name Frank\n", "age 24\n", "state NY\n", "point 70\n", "dtype: object\n" ] } ], "source": [ "print(df.apply(lambda x: x.value_counts().index[0]))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name 1\n", "age 2\n", "state 2\n", "point 2\n", "dtype: int64\n" ] } ], "source": [ "print(df.apply(lambda x: x.value_counts().iat[0]))" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 CA\n", "1 NY\n", "dtype: object\n" ] } ], "source": [ "print(df['state'].mode())" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['CA', 'NY']\n" ] } ], "source": [ "print(df['state'].mode().tolist())" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "[24.0]\n" ] } ], "source": [ "print(df['age'].mode().tolist())" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name [Alice, Charlie, Dave, Ellen, Frank]\n", "age [24.0]\n", "state [CA, NY]\n", "point [70.0]\n", "dtype: object\n" ] } ], "source": [ "s_mode = df.apply(lambda x: x.mode().tolist())\n", "print(s_mode)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'pandas.core.series.Series'>\n" ] } ], "source": [ "print(type(s_mode))" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Alice', 'Charlie', 'Dave', 'Ellen', 'Frank']\n" ] } ], "source": [ "print(s_mode['name'])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<class 'list'>\n" ] } ], "source": [ "print(type(s_mode['name']))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " name age state point\n", "0 Alice 24.0 CA 70.0\n", "1 Charlie NaN NY NaN\n", "2 Dave NaN NaN NaN\n", "3 Ellen NaN NaN NaN\n", "4 Frank NaN NaN NaN\n" ] } ], "source": [ "print(df.mode())" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name 5\n", "age 1\n", "state 2\n", "point 1\n", "dtype: int64\n" ] } ], "source": [ "print(df.mode().count())" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " name age state point\n", "count 6 6 6 6\n", "unique 6 5 4 5\n", "top Frank 24.0 CA 70.0\n", "freq 1 2 2 2\n" ] } ], "source": [ "print(df.astype('str').describe())" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "name Frank\n", "age 24.0\n", "state CA\n", "point 70.0\n", "Name: top, dtype: object\n" ] } ], "source": [ "print(df.astype('str').describe().loc['top'])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }