{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd, numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#model data\n",
    "df=pd.read_csv('ex/data/days-simulated-v2.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>day</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 day\n",
       "0  0,270,5,32,10,73,16,25,5,165,2,35,4,300,1,53,1...\n",
       "1  4,150,16,7,4,623,16,5,8,35,16,20,5,30,2,10,8,1...\n",
       "2  0,270,1,75,16,30,9,15,16,5,8,40,16,5,10,10,16,...\n",
       "3  0,240,1,60,10,50,16,5,5,55,2,32,16,8,10,10,16,...\n",
       "4  0,170,2,20,16,5,4,285,2,15,3,205,16,5,6,6,16,1..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#target structure: activity, duration, activity, duration, ...\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df1=pd.read_csv('ex/1.csv')\n",
    "df2=pd.read_csv('ex/2.csv')\n",
    "df3=pd.read_csv('ex/3.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df=pd.concat([df1,df2[1:],df3[1:]]) #no need for headers twice, df headers completely identical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create weekday dataframe\n",
    "hkoz=df[df.columns[9:489]].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create weekend dataframe\n",
    "hetv=df[df.columns[489:969]].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create descriptive stats dataframe\n",
    "desc=df[df.columns[969:]].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create survey metadata dataframe\n",
    "time=df[df.columns[2:4]].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#top 16 activity groups\n",
    "activities=[['Alv\\xc3\\xa1s'],\n",
    "['Zuhany / Mosd\\xc3\\xb3'],\n",
    "['\\xc3\\x89tkez\\xc3\\xa9s', '\\xc3\\x89tterem/Vend\\xc3\\xa9gl\\xc5\\x91'],\n",
    "[u'Munka (irodai)', 'Munka (k\\xc3\\xa9tkezi)'],\n",
    "[u'Internet', u'Telefon/Chat/Facebook'],\n",
    "['V\\xc3\\xa1s\\xc3\\xa1rl\\xc3\\xa1s'],\n",
    "['Vall\\xc3\\xa1sgyakorl\\xc3\\xa1s','\\xc3\\x96nk\\xc3\\xa9ntess\\xc3\\xa9g'],\n",
    "[u'TV/Film', u'Mozi'],\n",
    "['Olvas\\xc3\\xa1s', '\\xc3\\x9ajs\\xc3\\xa1g/Keresztrejtv\\xc3\\xa9ny'],\n",
    "['H\\xc3\\xa1zimunka/Gyerekfel\\xc3\\xbcgyelet'],\n",
    "['Hivatalos elint\\xc3\\xa9znival\\xc3\\xb3k'],\n",
    "[u'Sport', 'Edz\\xc5\\x91terem/Sz\\xc3\\xa9ps\\xc3\\xa9gszalon'],\n",
    "['Egy\\xc3\\xa9b Hobby', 'PC j\\xc3\\xa1t\\xc3\\xa9k', 'Kert\\xc3\\xa9szked\\xc3\\xa9s/Bark\\xc3\\xa1csol\\xc3\\xa1s', 'Rokonl\\xc3\\xa1togat\\xc3\\xa1s',  'M\\xc3\\xa1s'],\n",
    "['Tanul\\xc3\\xa1s',  'Mag\\xc3\\xa1n\\xc3\\xb3ra'],\n",
    "['Sz\\xc3\\xb3rakoz\\xc3\\xb3hely/K\\xc3\\xa1v\\xc3\\xa9z\\xc3\\xb3/Pub'],\n",
    "['S\\xc3\\xa9ta/Kutyas\\xc3\\xa9t\\xc3\\xa1ltat\\xc3\\xa1s', 'Term\\xc3\\xa9szet/Kir\\xc3\\xa1ndul\\xc3\\xa1s'],\n",
    "['Utaz\\xc3\\xa1s/Vezet\\xc3\\xa9s']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#bin activities into activity groups\n",
    "actidict={}\n",
    "for i in range(len(activities)):\n",
    "    for j in range(len(activities[i])):\n",
    "        actidict[activities[i][j]]=i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Olvas\\xc3\\xa1s': 8,\n",
       " 'Rokonl\\xc3\\xa1togat\\xc3\\xa1s': 12,\n",
       " u'Mozi': 7,\n",
       " 'M\\xc3\\xa1s': 12,\n",
       " u'Internet': 4,\n",
       " '\\xc3\\x89tterem/Vend\\xc3\\xa9gl\\xc5\\x91': 2,\n",
       " 'H\\xc3\\xa1zimunka/Gyerekfel\\xc3\\xbcgyelet': 9,\n",
       " 'S\\xc3\\xa9ta/Kutyas\\xc3\\xa9t\\xc3\\xa1ltat\\xc3\\xa1s': 15,\n",
       " '\\xc3\\x89tkez\\xc3\\xa9s': 2,\n",
       " 'Vall\\xc3\\xa1sgyakorl\\xc3\\xa1s': 6,\n",
       " '\\xc3\\x96nk\\xc3\\xa9ntess\\xc3\\xa9g': 6,\n",
       " '\\xc3\\x9ajs\\xc3\\xa1g/Keresztrejtv\\xc3\\xa9ny': 8,\n",
       " 'Edz\\xc5\\x91terem/Sz\\xc3\\xa9ps\\xc3\\xa9gszalon': 11,\n",
       " 'Egy\\xc3\\xa9b Hobby': 12,\n",
       " u'TV/Film': 7,\n",
       " 'Alv\\xc3\\xa1s': 0,\n",
       " 'Utaz\\xc3\\xa1s/Vezet\\xc3\\xa9s': 16,\n",
       " 'Sz\\xc3\\xb3rakoz\\xc3\\xb3hely/K\\xc3\\xa1v\\xc3\\xa9z\\xc3\\xb3/Pub': 14,\n",
       " 'Mag\\xc3\\xa1n\\xc3\\xb3ra': 13,\n",
       " 'PC j\\xc3\\xa1t\\xc3\\xa9k': 12,\n",
       " u'Sport': 11,\n",
       " 'Hivatalos elint\\xc3\\xa9znival\\xc3\\xb3k': 10,\n",
       " 'Kert\\xc3\\xa9szked\\xc3\\xa9s/Bark\\xc3\\xa1csol\\xc3\\xa1s': 12,\n",
       " u'Telefon/Chat/Facebook': 4,\n",
       " 'V\\xc3\\xa1s\\xc3\\xa1rl\\xc3\\xa1s': 5,\n",
       " u'Munka (irodai)': 3,\n",
       " 'Munka (k\\xc3\\xa9tkezi)': 3,\n",
       " 'Term\\xc3\\xa9szet/Kir\\xc3\\xa1ndul\\xc3\\xa1s': 15,\n",
       " 'Tanul\\xc3\\xa1s': 13,\n",
       " 'Zuhany / Mosd\\xc3\\xb3': 1}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#binned activities into activity groups\n",
    "actidict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# run only once\n",
    "hetv.columns=hetv.loc[0].values\n",
    "hetv=hetv[1:].drop(0,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#extract and linearize data from pandas dataframe\n",
    "hvegdata={}\n",
    "for i in hetv.index:\n",
    "    index=hetv.loc[i].index\n",
    "    values=hetv.loc[i].values\n",
    "    helper=[]\n",
    "    for j in range(len(values)):\n",
    "        if str(values[j]).lower()!='nan':\n",
    "            helper.append(index[j])\n",
    "    hvegdata[i]=helper                "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create timematrix - timeslice:activity list\n",
    "j=1\n",
    "timematrix={}\n",
    "for i in hvegdata[j]:\n",
    "    activity=i[:i.find('-')-1]\n",
    "    timeslice=i[i.find('-')+2:]\n",
    "    if timeslice not in timematrix:timematrix[timeslice]=[]\n",
    "    timematrix[timeslice].append(actidict[activity])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create correct timeslice order to start day at 04:00\n",
    "parseorder=np.roll(np.sort(timematrix.keys()),-2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create output list, with shared timeslots\n",
    "output=[]\n",
    "for k in range(len(parseorder)):\n",
    "    helper=timematrix[parseorder[k]]\n",
    "    np.random.shuffle(helper)\n",
    "    output.append(helper[:3]) #max 3 activities within 90 minutes, but create 3 randomized persons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0], [0], [0], [2, 1, 0], [13], [13], [12, 4, 7], [2], [2], [0], [0], [0], [0]]\n"
     ]
    }
   ],
   "source": [
    "print output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 90, 0, 90, 0, 90, 2, 30, 1, 30, 0, 30, 13, 90, 13, 90, 12, 30, 4, 30, 7, 30, 2, 90, 2, 90, 0, 90, 0, 90, 0, 90, 0, 90]\n"
     ]
    }
   ],
   "source": [
    "#create output CSV list: activity, duration, activity, duration, ...\n",
    "output2=[]\n",
    "fixed=90 # survey 90 min timeslices are fixed\n",
    "for k in range(len(output)):\n",
    "    for z in range(len(output[k])):\n",
    "        output2.append(output[k][z])\n",
    "        output2.append(fixed/(len(output[k])))\n",
    "print output2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1170"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#minutes in perfect day\n",
    "sum([output2[i*2+1] for i in range(len(output2)/2)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0, 264, 2, 31, 1, 21, 0, 25, 13, 191, 12, 20, 4, 22, 7, 21, 2, 166, 0, 350]\n"
     ]
    }
   ],
   "source": [
    "#create output CSV list: activity, duration, activity, duration, ... including pruning\n",
    "output2=[]\n",
    "fixed=90 # survey 90 min timeslices are fixed\n",
    "current=999\n",
    "for k in range(len(output)):\n",
    "    for z in range(len(output[k])):\n",
    "        if output[k][z]!=current:\n",
    "            current=output[k][z]\n",
    "            output2.append(output[k][z])\n",
    "            output2.append(fixed/(len(output[k]))-15+int(np.random.uniform(30))) #randomize a bit better movement\n",
    "        else:\n",
    "            output2[-1]+=fixed/(len(output[k]))        \n",
    "print output2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1111"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sum([output2[i*2+1] for i in range(len(output2)/2)])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parse all data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 351,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#create timematrix - timeslice:activity list\n",
    "output4=[]\n",
    "for j in hvegdata:\n",
    "    timematrix={}\n",
    "    for i in hvegdata[j]:\n",
    "        activity=i[:i.find('-')-1]\n",
    "        timeslice=i[i.find('-')+2:]\n",
    "        if timeslice not in timematrix:timematrix[timeslice]=[]\n",
    "        timematrix[timeslice].append(actidict[activity])\n",
    "    #create correct timeslice order to start day at 04:00\n",
    "    parseorder=np.roll(np.sort(timematrix.keys()),-2)\n",
    "    #create output list, with shared timeslots\n",
    "    for x in range(3): #create 3 randomized person-instances\n",
    "        output=[]\n",
    "        for k in range(len(parseorder)):\n",
    "            helper=timematrix[parseorder[k]]\n",
    "            np.random.shuffle(helper)\n",
    "            output.append(helper[:3]) #max 3 activities within 90 minutes, but create 3 randomized persons\n",
    "        #create output CSV list: activity, duration, activity, duration, ...\n",
    "        output2=[]\n",
    "        fixed=90 # survey 90 min timeslices are fixed\n",
    "        current=999\n",
    "        for k in range(len(output)):\n",
    "            for z in range(len(output[k])):\n",
    "                if output[k][z]!=current:\n",
    "                    current=output[k][z]\n",
    "                    output2.append(output[k][z])\n",
    "                    output2.append(fixed/(len(output[k]))+int(np.random.uniform(30)))\n",
    "                else:\n",
    "                    output2[-1]+=fixed/(len(output[k])) \n",
    "        output4.append(str(output2)[1:-1].replace(' ',''))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 352,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "savedata=pd.DataFrame(output4)\n",
    "savedata.columns=['day']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 353,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "savedata.to_csv('hkoz.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "798"
      ]
     },
     "execution_count": 354,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(savedata)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}