{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 快速查看整体信息\n",
"上一章讲到了控制DataFrame显示的一些参数,本章则具体讲解一下如何获得对DataFrame的整体认知。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"E:\\ML\\实战\\pandas实用教程 - 副本\n"
]
}
],
"source": [
"!cd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. `.info()`\n",
"这是DataFrame才可用的API,快捷查看多种信息:总行数和列数、每列元素类型和non-NaN的个数,总内存。\n",
"#### `DataFrame.info(verbose=None, memory_usage=True, null_counts=True)`\n",
"- verbose:True or False,字面意思是冗长的,也就说如何DataFrame有很多列,是否显示所有列的信息,如果为否,那么会省略一部分;\n",
"- memory_usage:True or False,默认为True,是否查看DataFrame的内存使用情况;\n",
"- null_counts:True or False,默认为True,是否统计NaN值的个数。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" ... | \n",
" 90 | \n",
" 91 | \n",
" 92 | \n",
" 93 | \n",
" 94 | \n",
" 95 | \n",
" 96 | \n",
" 97 | \n",
" 98 | \n",
" 99 | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
0 rows × 100 columns
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]\n",
"Index: []\n",
"\n",
"[0 rows x 100 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame( columns = range(0,100))\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Index: 0 entries\n",
"Data columns (total 100 columns):\n",
"0 0 non-null object\n",
"1 0 non-null object\n",
"2 0 non-null object\n",
"3 0 non-null object\n",
"4 0 non-null object\n",
"5 0 non-null object\n",
"6 0 non-null object\n",
"7 0 non-null object\n",
"8 0 non-null object\n",
"9 0 non-null object\n",
"10 0 non-null object\n",
"11 0 non-null object\n",
"12 0 non-null object\n",
"13 0 non-null object\n",
"14 0 non-null object\n",
"15 0 non-null object\n",
"16 0 non-null object\n",
"17 0 non-null object\n",
"18 0 non-null object\n",
"19 0 non-null object\n",
"20 0 non-null object\n",
"21 0 non-null object\n",
"22 0 non-null object\n",
"23 0 non-null object\n",
"24 0 non-null object\n",
"25 0 non-null object\n",
"26 0 non-null object\n",
"27 0 non-null object\n",
"28 0 non-null object\n",
"29 0 non-null object\n",
"30 0 non-null object\n",
"31 0 non-null object\n",
"32 0 non-null object\n",
"33 0 non-null object\n",
"34 0 non-null object\n",
"35 0 non-null object\n",
"36 0 non-null object\n",
"37 0 non-null object\n",
"38 0 non-null object\n",
"39 0 non-null object\n",
"40 0 non-null object\n",
"41 0 non-null object\n",
"42 0 non-null object\n",
"43 0 non-null object\n",
"44 0 non-null object\n",
"45 0 non-null object\n",
"46 0 non-null object\n",
"47 0 non-null object\n",
"48 0 non-null object\n",
"49 0 non-null object\n",
"50 0 non-null object\n",
"51 0 non-null object\n",
"52 0 non-null object\n",
"53 0 non-null object\n",
"54 0 non-null object\n",
"55 0 non-null object\n",
"56 0 non-null object\n",
"57 0 non-null object\n",
"58 0 non-null object\n",
"59 0 non-null object\n",
"60 0 non-null object\n",
"61 0 non-null object\n",
"62 0 non-null object\n",
"63 0 non-null object\n",
"64 0 non-null object\n",
"65 0 non-null object\n",
"66 0 non-null object\n",
"67 0 non-null object\n",
"68 0 non-null object\n",
"69 0 non-null object\n",
"70 0 non-null object\n",
"71 0 non-null object\n",
"72 0 non-null object\n",
"73 0 non-null object\n",
"74 0 non-null object\n",
"75 0 non-null object\n",
"76 0 non-null object\n",
"77 0 non-null object\n",
"78 0 non-null object\n",
"79 0 non-null object\n",
"80 0 non-null object\n",
"81 0 non-null object\n",
"82 0 non-null object\n",
"83 0 non-null object\n",
"84 0 non-null object\n",
"85 0 non-null object\n",
"86 0 non-null object\n",
"87 0 non-null object\n",
"88 0 non-null object\n",
"89 0 non-null object\n",
"90 0 non-null object\n",
"91 0 non-null object\n",
"92 0 non-null object\n",
"93 0 non-null object\n",
"94 0 non-null object\n",
"95 0 non-null object\n",
"96 0 non-null object\n",
"97 0 non-null object\n",
"98 0 non-null object\n",
"99 0 non-null object\n",
"dtypes: object(100)\n",
"memory usage: 0.0+ bytes\n"
]
}
],
"source": [
"df.info() # 直接默认设置即可"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"# 2. `.ndim, .shape, .size`\n",
"查看维数,形状,元素个数。"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" NaN | \n",
" 2.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 3.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"0 NaN 2.0\n",
"1 3.0 NaN"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame( [[np.nan, 2],[3,np.nan]], columns = ['A','B'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.ndim # 返回维度数,Series一维,DataFrame两维,平时很少用到,不过有时会在循环中用到"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2, 2)"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape # (行数,列数)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.size # 元素个数,rows×cols"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"# 3. `.head(), .tail()`\n",
"默认分别查看头5行和后5行。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### `Series/DataFrame.head(n=5)`\n",
"#### `Series/DataFrame.tail(n=5)`"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 1\n",
"2 2\n",
"3 3\n",
"4 4\n",
"dtype: int32"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s = pd.Series( range(0,5))\n",
"s"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0\n",
"1 1\n",
"2 2\n",
"dtype: int32"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2 2\n",
"3 3\n",
"4 4\n",
"dtype: int32"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s.tail(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"# 4. `.memory_usage()`\n",
"比info中内存显示更可控一些,单位是**字节**。\n",
"#### `Series/DataFrame.memory_usage(index=True, deep=False)`\n",
"- index:是否显示索引占用的内存,毫无疑问索引也占用内存;\n",
"- deep:是否显示object类型的列消耗的系统资源,由于pandas中object元素只是一个引用,我估计这个deep是指显示真实的内存占用。"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index 80\n",
"numeric 24\n",
"object 24\n",
"dtype: int64"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.memory_usage(deep = False) # Index即索引占用内存"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index 80\n",
"numeric 24\n",
"object 186\n",
"dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.memory_usage(deep = True) # object 型占用的内存变大"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"# 5. `.describe()`\n",
"快速查看每一列的统计信息,默认排除所有NaN元素。\n",
"#### `DataFrame.describe( include= [np.number])`\n",
"- include:'all'或者[np.number 或 np.object]。numberic只对元素属性为数值的列做数值统计,object只对元素属性为object的列做类字符串统计。"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" numeric | \n",
" object | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" a | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" b | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" b | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" numeric object\n",
"0 1 a\n",
"1 2 b\n",
"2 1 b"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame( [[1,'a'],[2,'b'],[1,'b']], columns = ['numeric','object'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"numeric int64\n",
"object object\n",
"dtype: object"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" numeric | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 3.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 1.333333 | \n",
"
\n",
" \n",
" std | \n",
" 0.577350 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 1.500000 | \n",
"
\n",
" \n",
" max | \n",
" 2.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" numeric\n",
"count 3.000000\n",
"mean 1.333333\n",
"std 0.577350\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 1.000000\n",
"75% 1.500000\n",
"max 2.000000"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe() # 默认只对数值列进行统计 "
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" object | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 3 | \n",
"
\n",
" \n",
" unique | \n",
" 2 | \n",
"
\n",
" \n",
" top | \n",
" b | \n",
"
\n",
" \n",
" freq | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" object\n",
"count 3\n",
"unique 2\n",
"top b\n",
"freq 2"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe( include=[np.object]) # 只对object型列进行统计,类别统计方式,只统计这四种"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" numeric | \n",
" object | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 3.000000 | \n",
" 3 | \n",
"
\n",
" \n",
" unique | \n",
" NaN | \n",
" 2 | \n",
"
\n",
" \n",
" top | \n",
" NaN | \n",
" b | \n",
"
\n",
" \n",
" freq | \n",
" NaN | \n",
" 2 | \n",
"
\n",
" \n",
" mean | \n",
" 1.333333 | \n",
" NaN | \n",
"
\n",
" \n",
" std | \n",
" 0.577350 | \n",
" NaN | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
" NaN | \n",
"
\n",
" \n",
" 25% | \n",
" 1.000000 | \n",
" NaN | \n",
"
\n",
" \n",
" 50% | \n",
" 1.000000 | \n",
" NaN | \n",
"
\n",
" \n",
" 75% | \n",
" 1.500000 | \n",
" NaN | \n",
"
\n",
" \n",
" max | \n",
" 2.000000 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" numeric object\n",
"count 3.000000 3\n",
"unique NaN 2\n",
"top NaN b\n",
"freq NaN 2\n",
"mean 1.333333 NaN\n",
"std 0.577350 NaN\n",
"min 1.000000 NaN\n",
"25% 1.000000 NaN\n",
"50% 1.000000 NaN\n",
"75% 1.500000 NaN\n",
"max 2.000000 NaN"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe( include = 'all') # 数值序列和object序列共同统计的信息只有count: non-NaN元素个数"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2rc2"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "141px",
"width": "253px"
},
"navigate_menu": true,
"number_sections": false,
"sideBar": true,
"threshold": "3",
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": true,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}