{ "cells": [
 { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#third-party imports, one per line\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import matplotlib.pyplot as plt\n",
  "%matplotlib inline" ] },
 { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#model data: sample of the target output format, previewed in the next cell\n",
  "#(the name df is re-assigned further down with the concatenated survey exports)\n",
  "df = pd.read_csv('ex/data/days-simulated-v2.tsv')" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
day
00,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1...
14,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1...
20,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,...
30,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,...
40,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1...
\n", "
" ], "text/plain": [ " day\n", "0 0,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1...\n", "1 4,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1...\n", "2 0,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,...\n", "3 0,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,...\n", "4 0,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1..." ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "#target structure: activity, duration, activity, duration, ...\n",
  "df.head()" ] },
 { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#load the three survey exports\n",
  "df1,df2,df3=[pd.read_csv(path) for path in ('ex/1.csv','ex/2.csv','ex/3.csv')]" ] },
 { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [
  "#stack the exports; the first row of the 2nd and 3rd file is skipped\n",
  "#(no need for headers twice, df headers completely identical)\n",
  "survey_parts=[df1,df2[1:],df3[1:]]\n",
  "df=pd.concat(survey_parts)" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#weekday dataframe: columns 9..488; reset_index() keeps the old row index as an extra first column\n",
  "weekday_cols=df.columns[9:489]\n",
  "hkoz=df[weekday_cols].reset_index()" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#weekend dataframe: columns 489..968\n",
  "weekend_cols=df.columns[489:969]\n",
  "hetv=df[weekend_cols].reset_index()" ] },
 { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#descriptive stats dataframe: every column from 969 onwards\n",
  "desc_cols=df.columns[969:]\n",
  "desc=df[desc_cols].reset_index()" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#survey metadata dataframe: columns 2 and 3\n",
  "meta_cols=df.columns[2:4]\n",
  "time=df[meta_cols].reset_index()" ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [
  "#activity groups: 17 groups, the list position (0-16) is the group code\n",
  "activities=[['Alv\\xc3\\xa1s'],\n",
  "['Zuhany / Mosd\\xc3\\xb3'],\n",
  "['\\xc3\\x89tkez\\xc3\\xa9s', '\\xc3\\x89tterem/Vend\\xc3\\xa9gl\\xc5\\x91'],\n",
  "[u'Munka (irodai)', 'Munka (k\\xc3\\xa9tkezi)'],\n",
  "[u'Internet', u'Telefon/Chat/Facebook'],\n",
  "['V\\xc3\\xa1s\\xc3\\xa1rl\\xc3\\xa1s'],\n",
  "['Vall\\xc3\\xa1sgyakorl\\xc3\\xa1s','\\xc3\\x96nk\\xc3\\xa9ntess\\xc3\\xa9g'],\n",
  "[u'TV/Film', u'Mozi'],\n",
  "['Olvas\\xc3\\xa1s', '\\xc3\\x9ajs\\xc3\\xa1g/Keresztrejtv\\xc3\\xa9ny'],\n",
  "['H\\xc3\\xa1zimunka/Gyerekfel\\xc3\\xbcgyelet'],\n",
  "['Hivatalos elint\\xc3\\xa9znival\\xc3\\xb3k'],\n",
  "[u'Sport', 'Edz\\xc5\\x91terem/Sz\\xc3\\xa9ps\\xc3\\xa9gszalon'],\n",
  "['Egy\\xc3\\xa9b Hobby', 'PC j\\xc3\\xa1t\\xc3\\xa9k', 'Kert\\xc3\\xa9szked\\xc3\\xa9s/Bark\\xc3\\xa1csol\\xc3\\xa1s', 'Rokonl\\xc3\\xa1togat\\xc3\\xa1s', 'M\\xc3\\xa1s'],\n",
  "['Tanul\\xc3\\xa1s', 'Mag\\xc3\\xa1n\\xc3\\xb3ra'],\n",
  "['Sz\\xc3\\xb3rakoz\\xc3\\xb3hely/K\\xc3\\xa1v\\xc3\\xa9z\\xc3\\xb3/Pub'],\n",
  "['S\\xc3\\xa9ta/Kutyas\\xc3\\xa9t\\xc3\\xa1ltat\\xc3\\xa1s', 'Term\\xc3\\xa9szet/Kir\\xc3\\xa1ndul\\xc3\\xa1s'],\n",
  "['Utaz\\xc3\\xa1s/Vezet\\xc3\\xa9s']]" ] },
 { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#bin activities into activity groups: every activity name maps to the position of its group\n",
  "actidict={}\n",
  "for group_code,group_names in enumerate(activities):\n",
  "    for activity_name in group_names:\n",
  "        actidict[activity_name]=group_code" ] },
 { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'Olvas\\xc3\\xa1s': 8,\n", " 'Rokonl\\xc3\\xa1togat\\xc3\\xa1s': 12,\n", " u'Mozi': 7,\n", " 'M\\xc3\\xa1s': 12,\n", " u'Internet': 4,\n", " '\\xc3\\x89tterem/Vend\\xc3\\xa9gl\\xc5\\x91': 2,\n", " 'H\\xc3\\xa1zimunka/Gyerekfel\\xc3\\xbcgyelet': 9,\n", " 'S\\xc3\\xa9ta/Kutyas\\xc3\\xa9t\\xc3\\xa1ltat\\xc3\\xa1s': 15,\n", " '\\xc3\\x89tkez\\xc3\\xa9s': 2,\n", " 'Vall\\xc3\\xa1sgyakorl\\xc3\\xa1s': 6,\n", " '\\xc3\\x96nk\\xc3\\xa9ntess\\xc3\\xa9g': 6,\n", " '\\xc3\\x9ajs\\xc3\\xa1g/Keresztrejtv\\xc3\\xa9ny': 8,\n", " 'Edz\\xc5\\x91terem/Sz\\xc3\\xa9ps\\xc3\\xa9gszalon': 11,\n", " 'Egy\\xc3\\xa9b Hobby': 12,\n", "
u'TV/Film': 7,\n", " 'Alv\\xc3\\xa1s': 0,\n", " 'Utaz\\xc3\\xa1s/Vezet\\xc3\\xa9s': 16,\n", " 'Sz\\xc3\\xb3rakoz\\xc3\\xb3hely/K\\xc3\\xa1v\\xc3\\xa9z\\xc3\\xb3/Pub': 14,\n", " 'Mag\\xc3\\xa1n\\xc3\\xb3ra': 13,\n", " 'PC j\\xc3\\xa1t\\xc3\\xa9k': 12,\n", " u'Sport': 11,\n", " 'Hivatalos elint\\xc3\\xa9znival\\xc3\\xb3k': 10,\n", " 'Kert\\xc3\\xa9szked\\xc3\\xa9s/Bark\\xc3\\xa1csol\\xc3\\xa1s': 12,\n", " u'Telefon/Chat/Facebook': 4,\n", " 'V\\xc3\\xa1s\\xc3\\xa1rl\\xc3\\xa1s': 5,\n", " u'Munka (irodai)': 3,\n", " 'Munka (k\\xc3\\xa9tkezi)': 3,\n", " 'Term\\xc3\\xa9szet/Kir\\xc3\\xa1ndul\\xc3\\xa1s': 15,\n", " 'Tanul\\xc3\\xa1s': 13,\n", " 'Zuhany / Mosd\\xc3\\xb3': 1}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "#binned activities into activity groups\n",
  "actidict" ] },
 { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#promote the header row stored as row 0 to column names, then drop that row and the\n",
  "#column that ends up labelled 0 (presumably the old index column added by reset_index())\n",
  "#guarded: once row 0 is gone a re-run is a no-op instead of a KeyError\n",
  "if 0 in hetv.index:\n",
  "    hetv.columns=hetv.loc[0].values\n",
  "    hetv=hetv[1:].drop(0,axis=1)" ] },
 { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#extract and linearize data from pandas dataframe:\n",
  "#for every row keep the column labels whose cell is not NaN\n",
  "hvegdata={}\n",
  "for i in hetv.index:\n",
  "    row=hetv.loc[i]\n",
  "    hvegdata[i]=[label for label,value in zip(row.index,row.values)\n",
  "                 if str(value).lower()!='nan']" ] },
 { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#create timematrix - timeslice:activity list (worked example for the single row j=1)\n",
  "j=1\n",
  "timematrix={}\n",
  "for i in hvegdata[j]:\n",
  "    dash=i.find('-') #labels are assumed to read 'activity - timeslice'; the first dash is the separator\n",
  "    activity=i[:dash-1]\n",
  "    timeslice=i[dash+2:]\n",
  "    timematrix.setdefault(timeslice,[]).append(actidict[activity])" ] },
 { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create correct timeslice order to 
start day at 04:00\n",
  "#sort the timeslice labels, then rotate left by two so the first two sorted labels move to the end\n",
  "#list(timematrix) yields the keys on Python 2 and Python 3 alike\n",
  "parseorder=np.roll(np.sort(list(timematrix)),-2)" ] },
 { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#create output list, with shared timeslots\n",
  "output=[]\n",
  "for slot_key in parseorder:\n",
  "    helper=timematrix[slot_key]\n",
  "    np.random.shuffle(helper) #in place: the list stored in timematrix is reordered too\n",
  "    output.append(helper[:3]) #max 3 activities within 90 minutes, picked at random" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0], [0], [0], [2, 1, 0], [13], [13], [12, 4, 7], [2], [2], [0], [0], [0], [0]]\n" ] } ], "source": [
  "print(output)" ] },
 { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 90, 0, 90, 0, 90, 2, 30, 1, 30, 0, 30, 13, 90, 13, 90, 12, 30, 4, 30, 7, 30, 2, 90, 2, 90, 0, 90, 0, 90, 0, 90, 0, 90]\n" ] } ], "source": [
  "#create output CSV list: activity, duration, activity, duration, ...\n",
  "output2=[]\n",
  "fixed=90 # survey 90 min timeslices are fixed\n",
  "for slot in output:\n",
  "    for act in slot:\n",
  "        output2.append(act)\n",
  "        output2.append(fixed//len(slot)) #floor division: equal integer share of the slice\n",
  "print(output2)" ] },
 { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1170" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "#minutes in perfect day: every second element of output2 is a duration\n",
  "sum(output2[1::2])" ] },
 { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 264, 2, 31, 1, 21, 0, 25, 13, 191, 12, 20, 4, 22, 7, 21, 2, 166, 0, 350]\n" ] } ], "source": [ "#create output CSV list: activity, duration, activity, duration, ... 
including pruning\n",
  "#consecutive repeats of one activity are merged into a single entry\n",
  "output2=[]\n",
  "fixed=90 # survey 90 min timeslices are fixed\n",
  "current=999 #sentinel: differs from every activity code, so the first activity always opens an entry\n",
  "for slot in output:\n",
  "    for act in slot:\n",
  "        if act!=current:\n",
  "            current=act\n",
  "            output2.append(act)\n",
  "            #randomize a bit better movement: integer share of the slice, shifted by -15..+14 minutes\n",
  "            #explicit bounds: a bare uniform(30) would set low=30 and leave high at its default 1.0\n",
  "            output2.append(fixed//len(slot)-15+int(np.random.uniform(0,30)))\n",
  "        else:\n",
  "            output2[-1]+=fixed//len(slot)\n",
  "print(output2)" ] },
 { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1111" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "#total minutes after merging and jitter: every second element of output2 is a duration\n",
  "sum(output2[1::2])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "## Parse all data\n",
  "\n",
  "Repeat the single-respondent steps above for every respondent, generating three randomized day sequences per respondent." ] },
 { "cell_type": "code", "execution_count": 351, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#one CSV row (activity,duration,activity,duration,...) per generated day\n",
  "output4=[]\n",
  "for j in hvegdata:\n",
  "    #create timematrix - timeslice:activity list\n",
  "    timematrix={}\n",
  "    for i in hvegdata[j]:\n",
  "        dash=i.find('-')\n",
  "        activity=i[:dash-1]\n",
  "        timeslice=i[dash+2:]\n",
  "        timematrix.setdefault(timeslice,[]).append(actidict[activity])\n",
  "    #create correct timeslice order to start day at 04:00\n",
  "    parseorder=np.roll(np.sort(list(timematrix)),-2)\n",
  "    for x in range(3): #create 3 randomized person-instances\n",
  "        #create output list, with shared timeslots\n",
  "        output=[]\n",
  "        for slot_key in parseorder:\n",
  "            helper=timematrix[slot_key]\n",
  "            np.random.shuffle(helper)\n",
  "            output.append(helper[:3]) #max 3 activities within 90 minutes\n",
  "        #create output CSV list: activity, duration, activity, duration, ...\n",
  "        output2=[]\n",
  "        fixed=90 # survey 90 min timeslices are fixed\n",
  "        current=999\n",
  "        for slot in output:\n",
  "            for act in slot:\n",
  "                if act!=current:\n",
  "                    current=act\n",
  "                    output2.append(act)\n",
  "                    #NOTE(review): unlike the single-respondent cell above there is no -15 offset here,\n",
  "                    #so the jitter can only lengthen a duration - confirm this is intended\n",
  "                    output2.append(fixed//len(slot)+int(np.random.uniform(0,30)))\n",
  "                else:\n",
  "                    output2[-1]+=fixed//len(slot)\n",
  "        output4.append(','.join(str(value) for value in output2))" ] },
 { "cell_type": "code", "execution_count": 352, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#single-column frame, same layout as the model data\n",
  "savedata=pd.DataFrame(output4)\n",
  "savedata.columns=['day']" ] },
 { "cell_type": "code", "execution_count": 353, "metadata": { "collapsed": false }, "outputs": [], "source": [
  "#NOTE(review): the rows come from hvegdata (built from hetv, the weekend frame) but the file is\n",
  "#named after hkoz (the weekday frame) - confirm the intended file name\n",
  "savedata.to_csv('hkoz.csv',index=False)" ] },
 { "cell_type": "code", "execution_count": 354, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "798" ] }, "execution_count": 354, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "len(savedata)" ] } ],
 "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }