{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Retain in Pandas\n", "## Similar question: \n", "https://stackoverflow.com/questions/33569265/retain-function-in-python\n", "\n", "### Any better way to do this than using a for loop or apply?" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateQuantityPriceTotal SharesCost
02017-01-101000117.22491000117224.90
12017-02-10-500130.5928500-65296.40
22017-03-101500137.53162000206297.40
32017-04-10-2000141.51500-283030.00
42017-05-10500151.488450075744.20
52017-06-09500147.8657100073932.85
62017-07-10500143.9750150071987.50
72017-08-10-1000154.7636500-154763.60
82017-09-11-200160.9215300-32184.30
92017-10-101000155.34161300155341.60
\n", "
" ], "text/plain": [ " Date Quantity Price Total Shares Cost\n", "0 2017-01-10 1000 117.2249 1000 117224.90\n", "1 2017-02-10 -500 130.5928 500 -65296.40\n", "2 2017-03-10 1500 137.5316 2000 206297.40\n", "3 2017-04-10 -2000 141.5150 0 -283030.00\n", "4 2017-05-10 500 151.4884 500 75744.20\n", "5 2017-06-09 500 147.8657 1000 73932.85\n", "6 2017-07-10 500 143.9750 1500 71987.50\n", "7 2017-08-10 -1000 154.7636 500 -154763.60\n", "8 2017-09-11 -200 160.9215 300 -32184.30\n", "9 2017-10-10 1000 155.3416 1300 155341.60" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aapl = pd.read_csv('AAPL_exmaple.csv', parse_dates=['Date'])\n", "aapl['Total Shares'] = aapl['Quantity'].cumsum()\n", "aapl['Cost'] = aapl['Quantity']*aapl['Price']\n", "aapl" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateQuantityPriceTotal SharesCostAve Cost
02017-01-101000117.22491000117224.90117.224900
12017-02-10-500130.5928500-65296.40117.224900
22017-03-101500137.53162000206297.40132.454925
32017-04-10-2000141.51500-283030.000.000000
42017-05-10500151.488450075744.20151.488400
52017-06-09500147.8657100073932.85149.677050
62017-07-10500143.9750150071987.50147.776367
72017-08-10-1000154.7636500-154763.60147.776367
82017-09-11-200160.9215300-32184.30147.776367
92017-10-101000155.34161300155341.60153.595777
\n", "
" ], "text/plain": [ " Date Quantity Price Total Shares Cost Ave Cost\n", "0 2017-01-10 1000 117.2249 1000 117224.90 117.224900\n", "1 2017-02-10 -500 130.5928 500 -65296.40 117.224900\n", "2 2017-03-10 1500 137.5316 2000 206297.40 132.454925\n", "3 2017-04-10 -2000 141.5150 0 -283030.00 0.000000\n", "4 2017-05-10 500 151.4884 500 75744.20 151.488400\n", "5 2017-06-09 500 147.8657 1000 73932.85 149.677050\n", "6 2017-07-10 500 143.9750 1500 71987.50 147.776367\n", "7 2017-08-10 -1000 154.7636 500 -154763.60 147.776367\n", "8 2017-09-11 -200 160.9215 300 -32184.30 147.776367\n", "9 2017-10-10 1000 155.3416 1300 155341.60 153.595777" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_ave_cost(df):\n", " for index, row in df.iterrows():\n", " if index == 0:\n", " df.loc[index,'Ave Cost'] = row['Price']\n", " elif row['Total Shares'] == 0:\n", " df.loc[index,'Ave Cost'] = 0.0\n", " else:\n", " if row['Quantity'] > 0:\n", " df.loc[index,'Ave Cost'] = ((df.loc[index - 1,'Ave Cost'] * df.loc[index - 1,'Total Shares']) \\\n", " + row['Cost'])/row['Total Shares']\n", " else:\n", " df.loc[index,'Ave Cost'] = df.loc[index - 1,'Ave Cost']\n", " return df\n", "\n", "aapl_ave_cost = get_ave_cost(aapl)\n", "aapl_ave_cost" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 loops, best of 3: 4.81 ms per loop\n" ] } ], "source": [ "%timeit get_ave_cost(aapl)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The root problem is using for loops instead of a vectorized solution." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }