{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "import pandas as pd\n", "from pandas import Series,DataFrame" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 数据分析数据挖掘\n", "# 有数据情况下:\n", "# 数据预处理\n", "# 数据清洗(空数据,异常值)\n", "# 数据集成(多个数据合并到一起,级联)数据可能存放在多个表中\n", "# 数据转化\n", "# 数据规约(属性减少(不重要的属性删除),数据减少去重操作)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[ 5, 12, 67, 29, 46, 103, 53, 53, 139, 87],\n", " [126, 33, 55, 104, 45, 70, 96, 133, 116, 43],\n", " [ 84, 45, 17, 42, 19, 11, 125, 43, 54, 39],\n", " [ 97, 68, 99, 90, 28, 60, 135, 84, 111, 63],\n", " [114, 56, 30, 81, 48, 73, 119, 65, 20, 22]])" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([[115, 128, 122, 127, 4, 135, 26, 25, 131, 139],\n", " [ 66, 119, 37, 136, 101, 40, 102, 127, 148, 127],\n", " [ 89, 80, 140, 133, 51, 142, 47, 27, 54, 23],\n", " [ 64, 127, 33, 128, 60, 106, 67, 94, 110, 76],\n", " [ 6, 21, 23, 96, 10, 62, 26, 79, 149, 43],\n", " [116, 143, 132, 118, 68, 21, 57, 133, 124, 124]])" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 首先看numpy数组的集成\n", "nd1 = np.random.randint(0,150,size = (5,10))\n", "\n", "nd2 = np.random.randint(0,150,size = (6,10))\n", "display(nd1,nd2)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[ 5, 12, 67, 29, 46, 103, 53, 53, 139, 87],\n", " [126, 33, 55, 104, 45, 70, 96, 133, 116, 43],\n", " [ 84, 45, 17, 42, 19, 11, 125, 43, 54, 39],\n", " [ 97, 68, 99, 90, 28, 60, 135, 84, 111, 63],\n", " [114, 56, 30, 81, 48, 73, 119, 65, 20, 22],\n", " [115, 128, 122, 127, 4, 135, 26, 25, 131, 139],\n", " [ 66, 119, 37, 136, 101, 40, 102, 127, 148, 127],\n", " [ 89, 80, 140, 133, 51, 142, 47, 27, 54, 23],\n", " [ 64, 127, 33, 128, 60, 106, 67, 94, 110, 76],\n", " [ 6, 21, 23, 96, 10, 62, 26, 79, 149, 43],\n", " [116, 143, 132, 118, 68, 21, 57, 133, 124, 124]])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 原来数据一个5行,一个是6行,级联之后变成了11行\n", "nd3 = np.concatenate([nd1,nd2],axis = 0)\n", "nd3" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[110, 38, 144, 92, 38, 2, 67, 2, 103, 81],\n", " [ 56, 61, 61, 22, 108, 145, 95, 44, 40, 100],\n", " [ 65, 74, 85, 123, 47, 117, 35, 55, 120, 20],\n", " [ 15, 9, 4, 84, 71, 133, 140, 13, 71, 91],\n", " [ 94, 31, 41, 5, 7, 32, 50, 24, 18, 120]])" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([[ 65, 149, 86, 138, 98],\n", " [136, 49, 102, 45, 140],\n", " [ 13, 124, 94, 81, 73],\n", " [ 82, 38, 0, 75, 94],\n", " [146, 28, 143, 61, 49]])" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nd1 = np.random.randint(0,150,size = (5,10))\n", "\n", "nd2 = np.random.randint(0,150,size = (5,5))\n", "display(nd1,nd2)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[110, 38, 144, 92, 38, 2, 67, 2, 103, 81, 65, 149, 86,\n", " 138, 98],\n", " [ 56, 61, 61, 22, 108, 145, 95, 44, 40, 100, 136, 49, 102,\n", " 45, 140],\n", " [ 65, 74, 85, 123, 47, 117, 35, 55, 120, 20, 13, 124, 94,\n", " 81, 73],\n", " [ 15, 9, 4, 84, 71, 133, 140, 13, 71, 91, 82, 38, 0,\n", " 75, 94],\n", " [ 94, 31, 41, 5, 7, 32, 50, 24, 18, 120, 146, 28, 143,\n", " 61, 49]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# axis = 0行级联(第一维度的级联),axis = 1(第二个维度的级联,列的级联)\n", "np.concatenate((nd1,nd2),axis = 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# pandas级联操作,pandas基于numpy\n", "# pandas的级联类似" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
A1135380
B1354052
C1441864
\n", "
" ], "text/plain": [ " Python Math En\n", "A 113 53 80\n", "B 135 40 52\n", "C 144 18 64" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
D126118146
E1478127
F87631
G359533
H13011791
I12498122
\n", "
" ], "text/plain": [ " Python Math En\n", "D 126 118 146\n", "E 147 81 27\n", "F 87 63 1\n", "G 35 95 33\n", "H 130 117 91\n", "I 124 98 122" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df1 = DataFrame(np.random.randint(0,150,size = (3,3)),index = list('ABC'),columns=['Python','Math','En'])\n", "\n", "df2 = DataFrame(np.random.randint(0,150,size = (6,3)),index = list('DEFGHI'),columns=['Python','Math','En'])\n", "\n", "display(df1,df2)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
A1135380
B1354052
C1441864
D126118146
E1478127
F87631
G359533
H13011791
I12498122
\n", "
" ], "text/plain": [ " Python Math En\n", "A 113 53 80\n", "B 135 40 52\n", "C 144 18 64\n", "D 126 118 146\n", "E 147 81 27\n", "F 87 63 1\n", "G 35 95 33\n", "H 130 117 91\n", "I 124 98 122" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas汇总数据,数据集成\n", "df1.append(df2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
A1135380
B1354052
C1441864
D126118146
E1478127
F87631
G359533
H13011791
I12498122
\n", "
" ], "text/plain": [ " Python Math En\n", "A 113 53 80\n", "B 135 40 52\n", "C 144 18 64\n", "D 126 118 146\n", "E 147 81 27\n", "F 87 63 1\n", "G 35 95 33\n", "H 130 117 91\n", "I 124 98 122" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df1,df2])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "d:\\python36\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", "of pandas will change to not sort by default.\n", "\n", "To accept the future behavior, pass 'sort=False'.\n", "\n", "To retain the current behavior and silence the warning, pass 'sort=True'.\n", "\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEnPythonMathEn
A113.053.080.0NaNNaNNaN
B135.040.052.0NaNNaNNaN
C144.018.064.0NaNNaNNaN
DNaNNaNNaN126.0118.0146.0
ENaNNaNNaN147.081.027.0
FNaNNaNNaN87.063.01.0
GNaNNaNNaN35.095.033.0
HNaNNaNNaN130.0117.091.0
INaNNaNNaN124.098.0122.0
\n", "
" ], "text/plain": [ " Python Math En Python Math En\n", "A 113.0 53.0 80.0 NaN NaN NaN\n", "B 135.0 40.0 52.0 NaN NaN NaN\n", "C 144.0 18.0 64.0 NaN NaN NaN\n", "D NaN NaN NaN 126.0 118.0 146.0\n", "E NaN NaN NaN 147.0 81.0 27.0\n", "F NaN NaN NaN 87.0 63.0 1.0\n", "G NaN NaN NaN 35.0 95.0 33.0\n", "H NaN NaN NaN 130.0 117.0 91.0\n", "I NaN NaN NaN 124.0 98.0 122.0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df1,df2],axis = 1,ignore_index = False)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
A225813
B995735
C512824
E560111
F13723121
G4978115
\n", "
" ], "text/plain": [ " Python Math En\n", "A 22 58 13\n", "B 99 57 35\n", "C 51 28 24\n", "E 5 60 111\n", "F 137 23 121\n", "G 49 78 115" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
A11811381
B5122126
C0115128
E10013094
F4993140
G705994
\n", "
" ], "text/plain": [ " Python Math En\n", "A 118 113 81\n", "B 51 22 126\n", "C 0 115 128\n", "E 100 130 94\n", "F 49 93 140\n", "G 70 59 94" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 期中\n", "df1 = DataFrame(np.random.randint(0,150,size = (6,3)),index = list('ABCEFG'),columns=['Python','Math','En'])\n", "\n", "# 期末\n", "df2 = DataFrame(np.random.randint(0,150,size = (6,3)),index = list('ABCEFG'),columns=['Python','Math','En'])\n", "\n", "display(df1,df2)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
期中A225813
B995735
C512824
E560111
F13723121
G4978115
期末A11811381
B5122126
C0115128
E10013094
F4993140
G705994
\n", "
" ], "text/plain": [ " Python Math En\n", "期中 A 22 58 13\n", " B 99 57 35\n", " C 51 28 24\n", " E 5 60 111\n", " F 137 23 121\n", " G 49 78 115\n", "期末 A 118 113 81\n", " B 51 22 126\n", " C 0 115 128\n", " E 100 130 94\n", " F 49 93 140\n", " G 70 59 94" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = pd.concat([df1,df2],axis = 0,keys = ['期中','期末'])\n", "df3" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PythonMathEn
A期中225813
期末11811381
B期中995735
期末5122126
C期中512824
期末0115128
E期中560111
期末10013094
F期中13723121
期末4993140
G期中4978115
期末705994
\n", "
" ], "text/plain": [ " Python Math En\n", "A 期中 22 58 13\n", " 期末 118 113 81\n", "B 期中 99 57 35\n", " 期末 51 22 126\n", "C 期中 51 28 24\n", " 期末 0 115 128\n", "E 期中 5 60 111\n", " 期末 100 130 94\n", "F 期中 137 23 121\n", " 期末 49 93 140\n", "G 期中 49 78 115\n", " 期末 70 59 94" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.unstack(level = 0).stack()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }