{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# pandas Tip 정리" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# aggregate시 /기분으로 join\n", "sample_df[[\"id\",\"period2\",\"event\"]].groupby([\"id\",\"period2\"]).aggregate(lambda x: '/'.join(str(v) for v in x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# timestamp -> datetime\n", "df[\"period\"] = pd.to_datetime(df[\"period\"],unit=\"us\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# display max rows 설정\n", "pd.set_option('display.max_rows', 1000)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# event가 user_engagement가 아닌 df\n", "df = df[df[\"event\"] != \"user_engagement\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# event에 start/load/abc 이런식으로 있는 경우 테이블로 늘림\n", "group_df[\"event\"].str.split('/').apply(pd.Series, 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# CLK_filter를 하고 TakePhoto를 하지 않은 사람\n", "for i in range(0,len(group_df1)):\n", " if sum(group_df1.ix[i,1:].str.contains('CLK_Filter')[~group_df1.ix[i,1:].str.contains('TakePhoto')]) >= 1:\n", " cluster_list.append(1)\n", " else:\n", " cluster_list.append(0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# df 우측에 단순히 붙이기 (axis=0이면 하단에 붙이기)\n", "pd.concat(['group_df1','temp_df'], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# list를 dataframe으로 변경\n", "temp_df = pd.DataFrame(result_list, columns=[\"cluster\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# ''로 join한 후 우측으로 붙이기!\n", "group_df1 = df[[\"id\",\"event\"]].groupby([\"id\"]).aggregate(lambda x: ''.join(str(v) for v in x))\n", "print(\"1\")\n", "group_df1 = pd.concat([group_df1, group_df1[\"event\"].str.split('Launch').apply(pd.Series, 1)], axis=1)\n", "print(\"2\")\n", "group_df1 = group_df1.fillna(\"\")\n", "print(\"3\")\n", "group_df1 = group_df1.reset_index(drop=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# event가 CLK_Album이 아닌 이벤트만 설정\n", "df = df[df[\"event\"] != \"CLK_Album\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# index 기준으로 랜덤 샘플링\n", "df1 = df.ix[np.random.choice(df.index, 10000)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# df에 있는 event_date(datetime 객체)를 필터링하고 싶은 경우! 맨 아래 쿼리도 동일한 결과값 나옴..! 하지만 더 편하네요 쩐당\n", "df.query('20170501 <= event_date <= 20170507') \n", "# df[df['event_date'] >= '2017-05-01'][df['event_date'] <= '2017-05-07']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# lag 함수 구현\n", "df['Data_lagged'] = df.groupby(['Group'])['Data'].shift(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 누적합\n", "df['asq_cum']=df['asq'].cumsum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# timedelta64 -> float64 변환\n", "data['difference'].astype('timedelta64[D]')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# pivot\n", "day_df[day_df['event_name'] == 'event_name'][['event_date','platform','tu']].pivot(index='event_date',columns='platform',values='tu').plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 이동평균\n", "series.rolling(window=3, center=False).mean().plot(style='--') # window는 간격을 의미!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# 기간 mast\n", "mask = (df['date'] > start_date) & (df['date'] <= end_date)\n", "df.loc[mask]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# re-ordering columns in dataframe column name\n", "df.reindex_axis(sorted(df.columns), axis=1)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# key value 반전\n", "df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in test_dict.items() ]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# value에 특정 값을 포함하는 dataframe 추출 ( | = or)\n", "df[df['A'].str.contains(\"Hello|Britain\")==True]\n", "\n", "# 특정 값이 1개라면\n", "df[df['A'].str.contains(\"hello\")]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# numpy 객체로 변환해 용량 줄이는 코드\n", "import numpy as np\n", "for c, dtype in zip(df.columns, df.dtypes):\n", " if dtype == np.float64:\n", " prop[c] = prop[c].astype(np.float32)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, "labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false } }, "nbformat": 4, "nbformat_minor": 2 }