{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#model data: previously simulated days, loaded as a sample of the target format\n", "df=pd.read_csv('ex/data/days-simulated-v2.tsv')" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
day
00,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1...
14,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1...
20,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,...
30,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,...
40,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1...
\n", "
" ], "text/plain": [ " day\n", "0 0,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1...\n", "1 4,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1...\n", "2 0,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,...\n", "3 0,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,...\n", "4 0,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1..." ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#target structure: activity, duration, activity, duration, ...\n", "df.head()" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#raw survey exports (three files sharing identical headers)\n", "df1=pd.read_csv('ex/1.csv')\n", "df2=pd.read_csv('ex/2.csv')\n", "df3=pd.read_csv('ex/3.csv')" ] },
{ "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#stack the three files; row 0 of each file repeats the same sub-header, so keep it only once\n", "#kept under its own name so that df (the model data shown above) is not overwritten\n", "survey=pd.concat([df1,df2[1:],df3[1:]])" ] },
{ "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create weekday dataframe (480 answer columns, presumably 30 activities x 16 timeslices - TODO confirm)\n", "hkoz=survey[survey.columns[9:489]].reset_index()" ] },
{ "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create weekend dataframe (the next 480 answer columns)\n", "hetv=survey[survey.columns[489:969]].reset_index()" ] },
{ "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create descriptive stats dataframe (all remaining columns)\n", "desc=survey[survey.columns[969:]].reset_index()" ] },
{ "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create survey metadata dataframe\n", "time=survey[survey.columns[2:4]].reset_index()" ] },
{ "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#activity groups: position in this list is the group id (17 groups, ids 0-16; the last one collects the miscellaneous activities)\n", "activities=[['Alv\\xc3\\xa1s'],\n", "['Zuhany / Mosd\\xc3\\xb3'],\n", "['\\xc3\\x89tkez\\xc3\\xa9s', '\\xc3\\x89tterem/Vend\\xc3\\xa9gl\\xc5\\x91'],\n", "[u'Munka (irodai)', 'Munka 
(k\\xc3\\xa9tkezi)'],\n", "[u'Internet', u'Telefon/Chat/Facebook'],\n", "['V\\xc3\\xa1s\\xc3\\xa1rl\\xc3\\xa1s'],\n", "['Vall\\xc3\\xa1sgyakorl\\xc3\\xa1s'],\n", "[u'TV/Film', u'Mozi'],\n", "['Olvas\\xc3\\xa1s', '\\xc3\\x9ajs\\xc3\\xa1g/Keresztrejtv\\xc3\\xa9ny'],\n", "['H\\xc3\\xa1zimunka/Gyerekfel\\xc3\\xbcgyelet'],\n", "['Hivatalos elint\\xc3\\xa9znival\\xc3\\xb3k'],\n", "[u'Sport', 'Edz\\xc5\\x91terem/Sz\\xc3\\xa9ps\\xc3\\xa9gszalon'],\n", "['Utaz\\xc3\\xa1s/Vezet\\xc3\\xa9s'],\n", "['Tanul\\xc3\\xa1s', 'Mag\\xc3\\xa1n\\xc3\\xb3ra'],\n", "['Sz\\xc3\\xb3rakoz\\xc3\\xb3hely/K\\xc3\\xa1v\\xc3\\xa9z\\xc3\\xb3/Pub'],\n", "['S\\xc3\\xa9ta/Kutyas\\xc3\\xa9t\\xc3\\xa1ltat\\xc3\\xa1s', 'Term\\xc3\\xa9szet/Kir\\xc3\\xa1ndul\\xc3\\xa1s'],\n", "['Egy\\xc3\\xa9b Hobby', 'PC j\\xc3\\xa1t\\xc3\\xa9k','\\xc3\\x96nk\\xc3\\xa9ntess\\xc3\\xa9g', 'Kert\\xc3\\xa9szked\\xc3\\xa9s/Bark\\xc3\\xa1csol\\xc3\\xa1s', 'Rokonl\\xc3\\xa1togat\\xc3\\xa1s', 'M\\xc3\\xa1s']]" ] },
{ "cell_type": "code", "execution_count": 113, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#bin activities into activity groups: activity name -> id of the group that lists it\n", "actidict={}\n", "for group_id,group in enumerate(activities):\n", "    for name in group:\n", "        actidict[name]=group_id" ] },
{ "cell_type": "code", "execution_count": 114, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'Olvas\\xc3\\xa1s': 8,\n", " 'Rokonl\\xc3\\xa1togat\\xc3\\xa1s': 16,\n", " u'Mozi': 7,\n", " 'M\\xc3\\xa1s': 16,\n", " u'Internet': 4,\n", " '\\xc3\\x89tterem/Vend\\xc3\\xa9gl\\xc5\\x91': 2,\n", " 'H\\xc3\\xa1zimunka/Gyerekfel\\xc3\\xbcgyelet': 9,\n", " 'S\\xc3\\xa9ta/Kutyas\\xc3\\xa9t\\xc3\\xa1ltat\\xc3\\xa1s': 15,\n", " '\\xc3\\x89tkez\\xc3\\xa9s': 2,\n", " 'Vall\\xc3\\xa1sgyakorl\\xc3\\xa1s': 6,\n", " '\\xc3\\x96nk\\xc3\\xa9ntess\\xc3\\xa9g': 16,\n", " '\\xc3\\x9ajs\\xc3\\xa1g/Keresztrejtv\\xc3\\xa9ny': 8,\n", " 'Edz\\xc5\\x91terem/Sz\\xc3\\xa9ps\\xc3\\xa9gszalon': 11,\n", " 'Egy\\xc3\\xa9b Hobby': 16,\n", " 
u'TV/Film': 7,\n", " 'Alv\\xc3\\xa1s': 0,\n", " 'Utaz\\xc3\\xa1s/Vezet\\xc3\\xa9s': 12,\n", " 'Sz\\xc3\\xb3rakoz\\xc3\\xb3hely/K\\xc3\\xa1v\\xc3\\xa9z\\xc3\\xb3/Pub': 14,\n", " 'Mag\\xc3\\xa1n\\xc3\\xb3ra': 13,\n", " 'PC j\\xc3\\xa1t\\xc3\\xa9k': 16,\n", " u'Sport': 11,\n", " 'Hivatalos elint\\xc3\\xa9znival\\xc3\\xb3k': 10,\n", " 'Kert\\xc3\\xa9szked\\xc3\\xa9s/Bark\\xc3\\xa1csol\\xc3\\xa1s': 16,\n", " u'Telefon/Chat/Facebook': 4,\n", " 'V\\xc3\\xa1s\\xc3\\xa1rl\\xc3\\xa1s': 5,\n", " u'Munka (irodai)': 3,\n", " 'Munka (k\\xc3\\xa9tkezi)': 3,\n", " 'Term\\xc3\\xa9szet/Kir\\xc3\\xa1ndul\\xc3\\xa1s': 15,\n", " 'Tanul\\xc3\\xa1s': 13,\n", " 'Zuhany / Mosd\\xc3\\xb3': 1}" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#binned activities into activity groups\n", "actidict" ] },
{ "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#promote row 0 (the sub-header row) to column names, then drop that row and the column labelled 0\n", "#(the old-index column that reset_index() added)\n", "#guarded: once row 0 is gone, re-running this cell is a no-op instead of a KeyError\n", "if 0 in hkoz.index:\n", "    hkoz.columns=hkoz.loc[0].values\n", "    hkoz=hkoz[1:].drop(0,axis=1)" ] },
{ "cell_type": "code", "execution_count": 96, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#extract and linearize data from pandas dataframe\n", "#hkozdata: row label -> list of the column labels whose cell is filled in for that row\n", "hkozdata={}\n", "for i in hkoz.index:\n", "    row=hkoz.loc[i]\n", "    hkozdata[i]=[label for label,value in zip(row.index,row.values) if pd.notnull(value)]" ] },
{ "cell_type": "code", "execution_count": 115, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create timematrix - timeslice:activity list\n", "j=1 #single-respondent demo; row label 1 is the first row left after the header row was dropped\n", "timematrix={}\n", "for i in hkozdata[j]:\n", "    #column labels are split at the first '-': 'ACTIVITY - TIMESLICE'\n", "    cut=i.find('-')\n", "    activity=i[:cut-1]\n", "    timeslice=i[cut+2:]\n", "    timematrix.setdefault(timeslice,[]).append(actidict[activity])" ] },
{ "cell_type": "code", "execution_count": 130, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create correct timeslice order to 
start day at 04:00\n", "parseorder=np.roll(np.sort(timematrix.keys()),-2)" ] },
{ "cell_type": "code", "execution_count": 164, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#create output list, with shared timeslots\n", "output=[]\n", "for k in range(len(parseorder)):\n", "    helper=timematrix[parseorder[k]]\n", "    np.random.shuffle(helper) #in place: also reorders the list stored in timematrix\n", "    output.append(helper[:3]) #keep at most 3 activities per 90 minute timeslice" ] },
{ "cell_type": "code", "execution_count": 170, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0], [0, 1], [13], [13], [13, 2], [13], [13], [13], [4, 16, 2], [13], [13], [7, 4, 16], [0], [0], [0], [0]]\n" ] } ], "source": [ "print(output)" ] },
{ "cell_type": "code", "execution_count": 283, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 90, 0, 90, 1, 45, 0, 45, 12, 90, 3, 90, 3, 90, 2, 30, 4, 30, 3, 30, 3, 90, 3, 90, 3, 90, 3, 90, 4, 45, 5, 45, 4, 30, 12, 30, 2, 30, 1, 45, 0, 45, 0, 90, 0, 90]\n" ] } ], "source": [ "#create output CSV list: activity, duration, activity, duration, ...\n", "output2=[]\n", "fixed=90 # survey 90 min timeslices are fixed\n", "for k in range(len(output)):\n", "    for z in range(len(output[k])):\n", "        output2.append(output[k][z])\n", "        output2.append(fixed//len(output[k])) #a timeslice is split evenly among its activities\n", "print(output2)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#merge consecutive entries of the same activity into one entry by adding up their durations\n", "#helper keeps the flat layout of output2: activity, duration, activity, duration, ...\n", "helper=[]\n", "for k in range(0,len(output2),2):\n", "    if helper and helper[-2]==output2[k]:\n", "        helper[-1]+=output2[k+1] #same activity as the previous entry: extend it\n", "    else:\n", "        helper.extend(output2[k:k+2]) #different activity: start a new entry\n", "print(helper)" ] },
{ "cell_type": "code",
"execution_count": 285, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 90, 0, 90, 1, 45, 0, 45, 12, 90, 3, 90, 3, 90, 2, 30, 4, 30, 3, 30, 3, 90, 3, 90, 3, 90, 3, 90, 4, 45, 5, 45, 4, 30, 12, 30, 2, 30, 1, 45, 0, 45, 0, 90, 0, 90]\n" ] } ], "source": [ "print(output2)" ] },
{ "cell_type": "code", "execution_count": 286, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{0: [0, 1, 3, 20, 21, 22],\n", " 1: [2, 19],\n", " 2: [7, 18],\n", " 3: [5, 6, 9, 10, 11, 12, 13],\n", " 4: [8, 14, 16],\n", " 5: [15],\n", " 12: [4, 17]}" ] }, "execution_count": 286, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#activityslice: activity id -> entry numbers (0-based) at which it occurs in output2\n", "#NOTE(review): the notebook displayed this name without defining it; the definition below is\n", "#reconstructed so that it reproduces the recorded output - confirm it matches the lost cell\n", "activityslice={}\n", "for k in range(len(output2)//2):\n", "    activityslice.setdefault(output2[k*2],[]).append(k)\n", "activityslice" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Parse all data" ] },
{ "cell_type": "code", "execution_count": 282, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#build 3 randomized days per respondent, each serialized as 'activity,duration,activity,duration,...'\n", "np.random.seed(0) #fixed seed so that re-running the notebook exports the same file\n", "fixed=90 # survey 90 min timeslices are fixed\n", "output4=[]\n", "for j in sorted(hkozdata): #sorted: deterministic row order in the export\n", "    #create timematrix - timeslice:activity list\n", "    timematrix={}\n", "    for i in hkozdata[j]:\n", "        cut=i.find('-')\n", "        activity=i[:cut-1]\n", "        timeslice=i[cut+2:]\n", "        timematrix.setdefault(timeslice,[]).append(actidict[activity])\n", "    if not timematrix:\n", "        continue #respondent with no filled cell: would only produce empty rows\n", "    #create correct timeslice order to start day at 04:00\n", "    parseorder=np.roll(np.sort(list(timematrix.keys())),-2)\n", "    for x in range(3): #create 3 randomized person-instances\n", "        #create output list, with shared timeslots\n", "        output=[]\n", "        for k in range(len(parseorder)):\n", "            helper=timematrix[parseorder[k]]\n", "            np.random.shuffle(helper)\n", "            output.append(helper[:3]) #keep at most 3 activities per 90 minute timeslice\n", "        #create output CSV list: activity, duration, activity, duration, ...\n", "        output2=[]\n", "        for k in range(len(output)):\n", "            for z in range(len(output[k])):\n", "                output2.append(output[k][z])\n", "                output2.append(fixed//len(output[k]))\n", "        output4.append(','.join(str(v) for v in output2))" ] },
{ "cell_type": "code", "execution_count": 264, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#one row per simulated day, single column 'day' (same layout as the model data)\n", "savedata=pd.DataFrame({'day':output4})" ] },
{ "cell_type": "code", "execution_count": 265, "metadata": { "collapsed": false }, "outputs": [], "source": [ "savedata.to_csv('hkoz.csv',index=False)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }