{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "import string\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'KXDIWQKMP'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "randStr = lambda str_type,length:\"\".join(np.random.choice(np.array(list(eval(str_type))),length))\n", "randStr(\"string.ascii_uppercase\",9)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iovrrxnfinsumvdfjcidjgesfubmrglvuhfv
00.9876540.2061040.8029200.0111570.8606180.575871
10.7063970.8600830.9392300.4361940.5570810.706964
20.0431390.7294350.5974880.7009980.9741930.917758
30.3160800.4615470.8445400.5101430.9084750.877330
40.8288390.1776700.6108330.3282380.3276970.689756
\n", "
" ], "text/plain": [ " iovrrx nfinsu mvdfjc idjges fubmrg lvuhfv\n", "0 0.987654 0.206104 0.802920 0.011157 0.860618 0.575871\n", "1 0.706397 0.860083 0.939230 0.436194 0.557081 0.706964\n", "2 0.043139 0.729435 0.597488 0.700998 0.974193 0.917758\n", "3 0.316080 0.461547 0.844540 0.510143 0.908475 0.877330\n", "4 0.828839 0.177670 0.610833 0.328238 0.327697 0.689756" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def mkRandDf(length=None, width=None, str_kind=None, str_len=None):\n", " \"\"\"\n", " Parameters\n", " ----------\n", " length : int\n", " width : int\n", " str_kind : str\n", " Any of the types of strings in strings module e.g.\n", " \"acscii_lower\", \"ascii_upper\", \"digits\", ect...\n", " str_len : int\n", " \"\"\"\n", " str_kind = str_kind or \"ascii_uppercase\"\n", " str_kind = \"string.\"+ str_kind\n", " str_len = width or 10\n", " \n", " randStr = lambda str_type, length :\"\".join(np.random.choice(np.array(list(eval(str_type))),length))\n", " \n", " column_names = [randStr(str_kind, width) for i in range(width)]\n", " \n", " rand_df = pd.DataFrame(np.random.rand(length, width))\n", " rand_df.columns = column_names\n", " \n", " return rand_df\n", "\n", "my_df = mkRandDf(5, 6, str_kind=\"ascii_lowercase\")\n", "my_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "--- \n", "# `ix` to slice up like a `numpy` array." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0.046928\n", "1 0.736357\n", "2 0.020734\n", "3 0.973336\n", "4 0.827972\n", "Name: aekbyu, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_df.ix[:,0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Use `to_frame` to make a sliced series into a data frame of its own." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
aekbyu
00.046928
10.736357
20.020734
30.973336
40.827972
\n", "
" ], "text/plain": [ " aekbyu\n", "0 0.046928\n", "1 0.736357\n", "2 0.020734\n", "3 0.973336\n", "4 0.827972" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_df.ix[:,0].to_frame()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "### MultiIndexing" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
leftmiddleright
00.9129920.5448380.489038
10.9775530.2655880.342345
20.2662700.1213330.871679
\n", "
" ], "text/plain": [ " left middle right\n", "0 0.912992 0.544838 0.489038\n", "1 0.977553 0.265588 0.342345\n", "2 0.266270 0.121333 0.871679" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_array = np.random.rand(3,3)\n", "data = pd.DataFrame(data_array,columns = [\"left\",\"middle\",\"right\"])\n", "data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def superGroup(dataframe=None,new_level=None):\n", " if type(dataframe.columns) == pd.indexes.base.Index:\n", " out_df = pd.DataFrame(dataframe.values,index=dataframe.index,columns=pd.MultiIndex.from_product([[new_level],dataframe.columns]))\n", " return out_df\n", " if type(dataframe.columns) == pd.indexes.multi.MultiIndex:\n", " levels = [list(i.values) for i in dataframe.columns.levels]\n", " levels = [[new_level]]+levels\n", " out_df = pd.DataFrame(dataframe.values, index = dataframe.index, columns = pd.MultiIndex.from_product(levels))\n", " return out_df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Most New
Newest
New
leftmiddleright
00.3943290.2988360.683910
10.8471830.6304190.171109
20.7473840.0587200.129397
\n", "
" ], "text/plain": [ " Most New \n", " Newest \n", " New \n", " left middle right\n", "0 0.394329 0.298836 0.683910\n", "1 0.847183 0.630419 0.171109\n", "2 0.747384 0.058720 0.129397" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hey = superGroup(data,\"New\")\n", "works = superGroup(hey,\"Newest\")\n", "superGroup(works,\"Most New\")\n", "# hey.columns.labels" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "code review question" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10000 loops, best of 3: 25.3 µs per loop\n" ] } ], "source": [ "import re\n", "import pandas as pd\n", "import numpy as np\n", "color = [\"1xYellow ; 2xRed \",\n", " \"2xYellow ; 1xOrange \",\n", " \"3xYellow ; 2xGreen \",\n", " \"1xYellow ; 1xRed \",\n", " \"2xYellow ; 1xRed \"]\n", "numbers = np.random.rand(len(color))\n", "ex_df = pd.DataFrame(np.array([numbers,color]).T,\n", " columns = [\"numbers\",\"colors\"])\n", "rx = re.compile(\"x(\\w+)\\s\")\n", "just_colors = ex_df.colors.apply(rx.findall)\n", "%timeit present_colors = set(sum(just_colors,[]))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# present_colors" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'Green ', 'Orange ', 'Red ', 'Yellow '}" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "present_colors = set()\n", "for value in ex_df['colors'].values:\n", " for color in [x.strip() for x in value.split(';')]:\n", " present_colors.add(color.split('x')[-1])\n", "\n", "present_colors = set(sum(([color.split('x')[-1] for color in value.split(';')]\n", " for value in ex_df['colors'].values), []))\n", "present_colors" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "---\n", "## [CODEREVIEW - Is there a more efficient way to subtract multiple columns in a PANDAS DataFrame by a series (or single column)?](http://codereview.stackexchange.com/questions/156447/is-there-a-more-efficient-way-to-subtract-multiple-columns-in-a-pandas-dataframe)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " iovrrx nfinsu mvdfjc idjges fubmrg lvuhfv\n", "0 0.987654 0.206104 0.802920 0.011157 0.860618 0.575871\n", "1 0.706397 0.860083 0.939230 0.436194 0.557081 0.706964\n", "2 0.043139 0.729435 0.597488 0.700998 0.974193 0.917758\n", "3 0.316080 0.461547 0.844540 0.510143 0.908475 0.877330\n", "4 0.828839 0.177670 0.610833 0.328238 0.327697 0.689756\n" ] } ], "source": [ "print(my_df)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "section_A = my_df.ix[:,:3]\n", "section_B = my_df.ix[:,3:]\n", "# section_B" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " iovrrx nfinsu mvdfjc idjges fubmrg lvuhfv\n", "0 1.667378 -0.593258 1.368628 -4.800610 1.468744 0.889117\n", "1 0.056992 0.340988 0.467991 -0.638518 -0.285601 0.058149\n", "2 -3.467018 0.612699 0.324830 0.555330 1.030127 0.944032\n", "3 -0.941776 -0.395590 0.476099 -0.251165 0.581380 0.531053\n", "4 0.933714 -1.288174 0.493400 -0.402633 -0.405015 0.668708\n" ] } ], "source": [ "log_div_ave = my_df.apply(np.log2).values.T - my_df.apply(np.log2).mean(axis=1).values\n", "\n", "log_div_ave = pd.DataFrame(log_div_ave.T,columns=my_df.columns)\n", "\n", "print(log_div_ave)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 1 }