{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring and initial data cleaning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is my working note for the process of initial data cleaning. For a short version, go to cleaning_short.ipynb which contains functions for cleaning this dataset" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "%reload_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from pathlib import Path\n", "import json\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[PosixPath('data/houston.csv'), PosixPath('data/location_history.json')]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PATH = Path('data')\n", "list(PATH.iterdir())" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "j_file = json.load((PATH/'location_history.json').open())" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(j_file)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": false }, "outputs": [], "source": [ "df = pd.DataFrame.from_dict(j_file['locations'])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyactivityaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracy
64788730NaNNaNNaN296678426-9527666181386536707631NaNNaN
64788830[{'timestampMs': '1386536651324', 'activity': ...NaNNaN296678240-9527666061386536647690NaNNaN
64788930NaNNaNNaN296678308-9527665921386536587653NaNNaN
64789030NaNNaNNaN296678200-9527664441386536527693NaNNaN
64789111NaNNaNNaN296678076-9527666691386536468150NaNNaN
64789212NaNNaNNaN296678120-9527664691386536423014NaNNaN
6478935[{'timestampMs': '1386536388113', 'activity': ...31.0320.0296677802-95276584313865363778980.0NaN
6478946NaN31.0320.0296677783-95276582413865363728590.0NaN
6478953NaN30.0307.0296677688-95276572913865363528000.0NaN
6478968[{'timestampMs': '1386536307870', 'activity': ...30.0161.0296676529-95276454913865363071630.0NaN
\n", "
" ], "text/plain": [ " accuracy activity altitude \\\n", "647887 30 NaN NaN \n", "647888 30 [{'timestampMs': '1386536651324', 'activity': ... NaN \n", "647889 30 NaN NaN \n", "647890 30 NaN NaN \n", "647891 11 NaN NaN \n", "647892 12 NaN NaN \n", "647893 5 [{'timestampMs': '1386536388113', 'activity': ... 31.0 \n", "647894 6 NaN 31.0 \n", "647895 3 NaN 30.0 \n", "647896 8 [{'timestampMs': '1386536307870', 'activity': ... 30.0 \n", "\n", " heading latitudeE7 longitudeE7 timestampMs velocity \\\n", "647887 NaN 296678426 -952766618 1386536707631 NaN \n", "647888 NaN 296678240 -952766606 1386536647690 NaN \n", "647889 NaN 296678308 -952766592 1386536587653 NaN \n", "647890 NaN 296678200 -952766444 1386536527693 NaN \n", "647891 NaN 296678076 -952766669 1386536468150 NaN \n", "647892 NaN 296678120 -952766469 1386536423014 NaN \n", "647893 320.0 296677802 -952765843 1386536377898 0.0 \n", "647894 320.0 296677783 -952765824 1386536372859 0.0 \n", "647895 307.0 296677688 -952765729 1386536352800 0.0 \n", "647896 161.0 296676529 -952764549 1386536307163 0.0 \n", "\n", " verticalAccuracy \n", "647887 NaN \n", "647888 NaN \n", "647889 NaN \n", "647890 NaN \n", "647891 NaN \n", "647892 NaN \n", "647893 NaN \n", "647894 NaN \n", "647895 NaN \n", "647896 NaN " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail(10) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: data is already sorted by timestamp (newest to oldest)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df.timestampMs = df.timestampMs.astype(np.int64)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# df['date_time'] = pd.to_datetime(df.timestampMs,unit='ms').dt.tz_localize('utc').dt.tz_convert('US/Central')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyactivityaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracydate_time
016NaN118.0NaN390846488-7715274551531424546373NaN2.02018-07-12 14:42:26.373000-05:00
116NaN118.0NaN390846471-7715274441531423345272NaN2.02018-07-12 14:22:25.272000-05:00
216NaN118.0NaN390846486-7715274751531422368498NaN2.02018-07-12 14:06:08.498000-05:00
316NaN118.0NaN390846477-7715274821531421692554NaN2.02018-07-12 13:54:52.554000-05:00
426NaNNaNNaN390845015-7715270641531421092427NaNNaN2018-07-12 13:44:52.427000-05:00
\n", "
" ], "text/plain": [ " accuracy activity altitude heading latitudeE7 longitudeE7 \\\n", "0 16 NaN 118.0 NaN 390846488 -771527455 \n", "1 16 NaN 118.0 NaN 390846471 -771527444 \n", "2 16 NaN 118.0 NaN 390846486 -771527475 \n", "3 16 NaN 118.0 NaN 390846477 -771527482 \n", "4 26 NaN NaN NaN 390845015 -771527064 \n", "\n", " timestampMs velocity verticalAccuracy date_time \n", "0 1531424546373 NaN 2.0 2018-07-12 14:42:26.373000-05:00 \n", "1 1531423345272 NaN 2.0 2018-07-12 14:22:25.272000-05:00 \n", "2 1531422368498 NaN 2.0 2018-07-12 14:06:08.498000-05:00 \n", "3 1531421692554 NaN 2.0 2018-07-12 13:54:52.554000-05:00 \n", "4 1531421092427 NaN NaN 2018-07-12 13:44:52.427000-05:00 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Extract data in Houston/TX area: from 2013 to 1/30/2017 6:18 am\n", "df_houston = df[df.timestampMs <= 1485778729042].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(270054, 9)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_houston.shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# note that when saving to csv, data structure type (dictionary or list) will be converted into string\n", "# df_houston.to_csv(PATH/'houston.csv',index=False)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# df_houston = pd.read_csv(PATH/'houston.csv')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Activity (list) - exclude long list of activity" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df_act = df_houston[~df_houston.activity.isnull()].copy()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.3827827027187155" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# % of non-null activity records\n", "len(df_act)/ len(df_houston)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyactivityaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracy
0545[{'timestampMs': '1485778729815', 'activity': ...23.0NaN299853940-9534810151485778729042NaNNaN
1585[{'timestampMs': '1485778608624', 'activity': ...19.0NaN299853159-9534823131485778672000NaNNaN
28[{'timestampMs': '1485778474389', 'activity': ...4.0224.0299850980-9534891481485778297913NaNNaN
48[{'timestampMs': '1485778148328', 'activity': ...4.0224.0299850980-9534891481485778148040NaNNaN
58[{'timestampMs': '1485778027987', 'activity': ...4.0224.0299850980-9534891481485778028950NaNNaN
6292[{'timestampMs': '1485777980551', 'activity': ...16.0186.0299849212-9534871051485777980208NaNNaN
7292[{'timestampMs': '1485777850168', 'activity': ...16.0186.0299849212-9534871051485777812675NaNNaN
8145[{'timestampMs': '1485777709913', 'activity': ...15.077.0299853950-9534902221485777721949NaNNaN
943[{'timestampMs': '1485777678185', 'activity': ...16.0NaN299849213-9534898811485777656000NaNNaN
10106[{'timestampMs': '1485777555400', 'activity': ...-45.06.0299852839-9534853931485777566000NaNNaN
\n", "
" ], "text/plain": [ " accuracy activity altitude \\\n", "0 545 [{'timestampMs': '1485778729815', 'activity': ... 23.0 \n", "1 585 [{'timestampMs': '1485778608624', 'activity': ... 19.0 \n", "2 8 [{'timestampMs': '1485778474389', 'activity': ... 4.0 \n", "4 8 [{'timestampMs': '1485778148328', 'activity': ... 4.0 \n", "5 8 [{'timestampMs': '1485778027987', 'activity': ... 4.0 \n", "6 292 [{'timestampMs': '1485777980551', 'activity': ... 16.0 \n", "7 292 [{'timestampMs': '1485777850168', 'activity': ... 16.0 \n", "8 145 [{'timestampMs': '1485777709913', 'activity': ... 15.0 \n", "9 43 [{'timestampMs': '1485777678185', 'activity': ... 16.0 \n", "10 106 [{'timestampMs': '1485777555400', 'activity': ... -45.0 \n", "\n", " heading latitudeE7 longitudeE7 timestampMs velocity \\\n", "0 NaN 299853940 -953481015 1485778729042 NaN \n", "1 NaN 299853159 -953482313 1485778672000 NaN \n", "2 224.0 299850980 -953489148 1485778297913 NaN \n", "4 224.0 299850980 -953489148 1485778148040 NaN \n", "5 224.0 299850980 -953489148 1485778028950 NaN \n", "6 186.0 299849212 -953487105 1485777980208 NaN \n", "7 186.0 299849212 -953487105 1485777812675 NaN \n", "8 77.0 299853950 -953490222 1485777721949 NaN \n", "9 NaN 299849213 -953489881 1485777656000 NaN \n", "10 6.0 299852839 -953485393 1485777566000 NaN \n", "\n", " verticalAccuracy \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "4 NaN \n", "5 NaN \n", "6 NaN \n", "7 NaN \n", "8 NaN \n", "9 NaN \n", "10 NaN " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: activity feature contains a list of smaller activities, stored as dictionary" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# how many smaller activities are there in 'activity' feature\n", "df_act_expand = df_act.activity.apply(pd.Series)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...89909192939495969798
0{'timestampMs': '1485778729815', 'activity': [...NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1{'timestampMs': '1485778608624', 'activity': [...NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2{'timestampMs': '1485778474389', 'activity': [...{'timestampMs': '1485778410435', 'activity': [...NaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4{'timestampMs': '1485778148328', 'activity': [...NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5{'timestampMs': '1485778027987', 'activity': [...NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 99 columns

\n", "
" ], "text/plain": [ " 0 \\\n", "0 {'timestampMs': '1485778729815', 'activity': [... \n", "1 {'timestampMs': '1485778608624', 'activity': [... \n", "2 {'timestampMs': '1485778474389', 'activity': [... \n", "4 {'timestampMs': '1485778148328', 'activity': [... \n", "5 {'timestampMs': '1485778027987', 'activity': [... \n", "\n", " 1 2 3 4 5 6 \\\n", "0 NaN NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN NaN NaN \n", "2 {'timestampMs': '1485778410435', 'activity': [... NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN \n", "5 NaN NaN NaN NaN NaN NaN \n", "\n", " 7 8 9 ... 89 90 91 92 93 94 95 96 97 98 \n", "0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "5 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[5 rows x 99 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act_expand.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(103372, 99)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act_expand.shape " ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "0 0.000000\n", "1 68.148715\n", "2 82.064344\n", "3 88.444479\n", "4 92.835422\n", "5 95.221678\n", "6 97.024928\n", "7 98.041798\n", "8 98.710027\n", "9 99.117744\n", "10 99.385036\n", "11 99.557419\n", "12 99.681381\n", "13 99.751109\n", "14 99.819869\n", "15 99.859575\n", "16 99.879912\n", "17 99.897345\n", "18 99.913808\n", "19 99.929303\n", "20 99.940925\n", "21 99.944798\n", "22 99.954483\n", "23 99.960294\n", "24 99.964167\n", "25 99.965136\n", "26 99.969010\n", "27 99.969978\n", "28 99.972883\n", "29 99.973852\n", " ... \n", "69 99.998063\n", "70 99.998063\n", "71 99.998063\n", "72 99.998063\n", "73 99.998063\n", "74 99.998063\n", "75 99.998063\n", "76 99.998063\n", "77 99.999032\n", "78 99.999032\n", "79 99.999032\n", "80 99.999032\n", "81 99.999032\n", "82 99.999032\n", "83 99.999032\n", "84 99.999032\n", "85 99.999032\n", "86 99.999032\n", "87 99.999032\n", "88 99.999032\n", "89 99.999032\n", "90 99.999032\n", "91 99.999032\n", "92 99.999032\n", "93 99.999032\n", "94 99.999032\n", "95 99.999032\n", "96 99.999032\n", "97 99.999032\n", "98 99.999032\n", "Length: 99, dtype: float64" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# null values percentages for each col\n", "((df_act_expand.isnull().sum()) / len(df_act_expand))*100" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8822559999999982" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "100-99.117744" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: A record can have maximum of 99 small activites. However, very few records (< 1%) have >= 9 small activities. We will treat these records as outliers and drop them" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "null_sum = df_act_expand.isnull().sum()\n", "null_sum[(null_sum / len(df_act_expand))*100 <= 99.0].index\n", "# cols 0 to 8 have <= 99.0% null values" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9\n", "99\n" ] } ], "source": [ "col_del_start = null_sum[(null_sum / len(df_act_expand))*100 > 99.0].index[0]\n", "col_del_end = df_act_expand.shape[1]\n", "\n", "print(col_del_start)\n", "print(col_del_end)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Int64Index([ 52, 751, 754, 756, 757, 856, 1354, 2155,\n", " 2191, 2199,\n", " ...\n", " 258650, 258655, 258669, 258705, 258724, 258960, 258963, 258968,\n", " 258991, 258994],\n", " dtype='int64', length=912)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get records (by indices) that have non-null values starting at col_del_start\n", "row_drops = df_act_expand[(~df_act_expand.loc[:,range(col_del_start,col_del_end)].isnull()).sum(axis=1) >=1].index\n", "row_drops" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df_houston2 = df_houston.drop(row_drops)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(269142, 9)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_houston2.shape" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sanity check\n", "len(df_houston) - len(df_houston2) == len(row_drops)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "df_houston2.reset_index(drop=True,inplace=True)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# df_houston2.to_csv(PATH/'houston.csv',index=False)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyactivityaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracydate_time
0545[{'timestampMs': '1485778729815', 'activity': ...23.0NaN299853940-9534810151485778729042NaNNaN2017-01-30 06:18:49.042000-06:00
1585[{'timestampMs': '1485778608624', 'activity': ...19.0NaN299853159-9534823131485778672000NaNNaN2017-01-30 06:17:52-06:00
28[{'timestampMs': '1485778474389', 'activity': ...4.0224.0299850980-9534891481485778297913NaNNaN2017-01-30 06:11:37.913000-06:00
38NaN4.0224.0299850980-9534891481485778233440NaNNaN2017-01-30 06:10:33.440000-06:00
48[{'timestampMs': '1485778148328', 'activity': ...4.0224.0299850980-9534891481485778148040NaNNaN2017-01-30 06:09:08.040000-06:00
\n", "
" ], "text/plain": [ " accuracy activity altitude \\\n", "0 545 [{'timestampMs': '1485778729815', 'activity': ... 23.0 \n", "1 585 [{'timestampMs': '1485778608624', 'activity': ... 19.0 \n", "2 8 [{'timestampMs': '1485778474389', 'activity': ... 4.0 \n", "3 8 NaN 4.0 \n", "4 8 [{'timestampMs': '1485778148328', 'activity': ... 4.0 \n", "\n", " heading latitudeE7 longitudeE7 timestampMs velocity \\\n", "0 NaN 299853940 -953481015 1485778729042 NaN \n", "1 NaN 299853159 -953482313 1485778672000 NaN \n", "2 224.0 299850980 -953489148 1485778297913 NaN \n", "3 224.0 299850980 -953489148 1485778233440 NaN \n", "4 224.0 299850980 -953489148 1485778148040 NaN \n", "\n", " verticalAccuracy date_time \n", "0 NaN 2017-01-30 06:18:49.042000-06:00 \n", "1 NaN 2017-01-30 06:17:52-06:00 \n", "2 NaN 2017-01-30 06:11:37.913000-06:00 \n", "3 NaN 2017-01-30 06:10:33.440000-06:00 \n", "4 NaN 2017-01-30 06:09:08.040000-06:00 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_houston2.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Activity (dictionary) - Extract small activites from dictionary" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df_act = df_houston2[~df_houston2.activity.isnull()].copy()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(102460, 10)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act.shape" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['timestampMs', 'activity'])" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act.iloc[3].activity[0].keys()" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check if # of dict keys are consistent\n", "(df_act.activity.apply(lambda x: len(x[0].keys())) <2).sum()" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "29" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(df_act.activity.apply(lambda x: len(x[0].keys())) >2).sum()" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19233 (timestampMs, activity, extra)\n", "19234 (timestampMs, activity, extra)\n", "19236 (timestampMs, activity, extra)\n", "19237 (timestampMs, activity, extra)\n", "19238 (timestampMs, activity, extra)\n", "19242 (timestampMs, activity, extra)\n", "19243 (timestampMs, activity, extra)\n", "19244 (timestampMs, activity, extra)\n", "19246 (timestampMs, activity, extra)\n", "19250 (timestampMs, activity, extra)\n", "19301 (timestampMs, activity, extra)\n", "19303 (timestampMs, activity, extra)\n", "19304 (timestampMs, activity, extra)\n", "19305 (timestampMs, activity, extra)\n", "19307 (timestampMs, activity, extra)\n", "19309 (timestampMs, activity, extra)\n", "19310 (timestampMs, activity, extra)\n", "19315 (timestampMs, activity, extra)\n", "19318 (timestampMs, activity, extra)\n", "19352 (timestampMs, activity, extra)\n", "19353 (timestampMs, activity, extra)\n", "19354 (timestampMs, activity, extra)\n", "19356 (timestampMs, activity, extra)\n", "19357 (timestampMs, activity, extra)\n", "19358 (timestampMs, activity, extra)\n", "19360 (timestampMs, activity, extra)\n", "19363 (timestampMs, activity, extra)\n", "19365 (timestampMs, activity, extra)\n", "19366 (timestampMs, activity, extra)\n", "Name: activity, dtype: object" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)].activity.apply(lambda x: x[0].keys())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: Almost all records have 'activity' dictionary containing 2 keys: timestampMs and activity. Only 29 (out of 300k+) of them have an extra key: extra. Let's see what these keys contain" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'activity': [{'confidence': 82, 'type': 'IN_VEHICLE'},\n", " {'confidence': 15, 'type': 'STILL'},\n", " {'confidence': 13, 'type': 'UNKNOWN'},\n", " {'confidence': 2, 'type': 'ON_FOOT'},\n", " {'confidence': 2, 'type': 'WALKING'}],\n", " 'extra': [{'intVal': 100,\n", " 'name': 'vehicle_personal_confidence',\n", " 'type': 'VALUE'}],\n", " 'timestampMs': '1483476045607'},\n", " {'activity': [{'confidence': 100, 'type': 'TILTING'}],\n", " 'timestampMs': '1483476045545'}]" ] }, "execution_count": 136, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)].loc[19233,'activity']\n", "# look at few records that have 3 keys" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'activity': [{'confidence': 74, 'type': 'IN_VEHICLE'},\n", " {'confidence': 26, 'type': 'UNKNOWN'}],\n", " 'extra': [{'intVal': 100,\n", " 'name': 'vehicle_personal_confidence',\n", " 'type': 'VALUE'}],\n", " 'timestampMs': '1483475547194'},\n", " {'activity': [{'confidence': 100, 'type': 'TILTING'}],\n", " 'timestampMs': '1483475547028'},\n", " {'activity': [{'confidence': 100, 'type': 'IN_VEHICLE'},\n", " {'confidence': 23, 'type': 'STILL'}],\n", " 'extra': [{'intVal': 100,\n", " 'name': 'vehicle_personal_confidence',\n", " 'type': 'VALUE'}],\n", " 'timestampMs': '1483475523282'}]" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)].loc[19242,'activity']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 'Extra' key" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [], "source": [ "df_act_2act = df_act[(df_act.activity.apply(lambda x: len(x[0].keys())) >2)]" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19233 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19234 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19236 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19237 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19238 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19242 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19243 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19244 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19246 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19250 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19301 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19303 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19304 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19305 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19307 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19309 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19310 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19315 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19318 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19352 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19353 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19354 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19356 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19357 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19358 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19360 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19363 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19365 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "19366 {'type': 'VALUE', 'name': 'vehicle_personal_co...\n", "Name: activity, dtype: object" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act_2act[df_act_2act.activity.apply(lambda x: len(x[0]['extra'][0].keys()) == 3)].activity.apply(lambda x: x[0]['extra'][0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: extra list only contains 1 value (a list size 1) and they are all the same: personal vehicle related info" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create new data record for each non-null activity" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert this (an example of Activity feature)\n", "```\n", "[{'activity': [{'confidence': 74, 'type': 'IN_VEHICLE'},\n", " {'confidence': 26, 'type': 'UNKNOWN'}],\n", " 'extra': [{'intVal': 100,\n", " 'name': 'vehicle_personal_confidence',\n", " 'type': 'VALUE'}],\n", " 'timestampMs': '1483475547194'}]\n", "```\n", "to this\n", "\n", "| activity_conf1 | activity_type1 | activity_conf2 | activity_type2 | extra_intVal | extra_name | extra_type | timestampMs |\n", "|----------------|----------------|----------------|----------------|--------------|------------|------------|-------------|\n", "| 74| IN_VEHICLE | 26 | UNKNOWN | 100 | vehicle_personal_confidence | VALUE | 1483475547194\n", "\n", "(Google already sorted 'confidence' from high to low, so we will record only the first 2 confidence. We will replace the record timestamp with this timestamp)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['accuracy', 'activity', 'altitude', 'heading', 'latitudeE7',\n", " 'longitudeE7', 'timestampMs', 'velocity', 'verticalAccuracy'],\n", " dtype='object')" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_houston2.columns" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "df_act = df_houston2[~df_houston2.activity.isnull()].copy() # get all the non-null activity record" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(102460,)" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act.timestampMs.value_counts().shape" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyactivityaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracy
0545[{'timestampMs': '1485778729815', 'activity': ...23.0NaN299853940-9534810151485778729042NaNNaN
1585[{'timestampMs': '1485778608624', 'activity': ...19.0NaN299853159-9534823131485778672000NaNNaN
28[{'timestampMs': '1485778474389', 'activity': ...4.0224.0299850980-9534891481485778297913NaNNaN
\n", "
" ], "text/plain": [ " accuracy activity altitude \\\n", "0 545 [{'timestampMs': '1485778729815', 'activity': ... 23.0 \n", "1 585 [{'timestampMs': '1485778608624', 'activity': ... 19.0 \n", "2 8 [{'timestampMs': '1485778474389', 'activity': ... 4.0 \n", "\n", " heading latitudeE7 longitudeE7 timestampMs velocity verticalAccuracy \n", "0 NaN 299853940 -953481015 1485778729042 NaN NaN \n", "1 NaN 299853159 -953482313 1485778672000 NaN NaN \n", "2 224.0 299850980 -953489148 1485778297913 NaN NaN " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_act.head(3)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "temp_act = df_act.iloc[0:10]" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyactivityaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracy
0545[{'timestampMs': '1485778729815', 'activity': ...23.0NaN299853940-9534810151485778729042NaNNaN
1585[{'timestampMs': '1485778608624', 'activity': ...19.0NaN299853159-9534823131485778672000NaNNaN
28[{'timestampMs': '1485778474389', 'activity': ...4.0224.0299850980-9534891481485778297913NaNNaN
48[{'timestampMs': '1485778148328', 'activity': ...4.0224.0299850980-9534891481485778148040NaNNaN
58[{'timestampMs': '1485778027987', 'activity': ...4.0224.0299850980-9534891481485778028950NaNNaN
\n", "
" ], "text/plain": [ " accuracy activity altitude \\\n", "0 545 [{'timestampMs': '1485778729815', 'activity': ... 23.0 \n", "1 585 [{'timestampMs': '1485778608624', 'activity': ... 19.0 \n", "2 8 [{'timestampMs': '1485778474389', 'activity': ... 4.0 \n", "4 8 [{'timestampMs': '1485778148328', 'activity': ... 4.0 \n", "5 8 [{'timestampMs': '1485778027987', 'activity': ... 4.0 \n", "\n", " heading latitudeE7 longitudeE7 timestampMs velocity verticalAccuracy \n", "0 NaN 299853940 -953481015 1485778729042 NaN NaN \n", "1 NaN 299853159 -953482313 1485778672000 NaN NaN \n", "2 224.0 299850980 -953489148 1485778297913 NaN NaN \n", "4 224.0 299850980 -953489148 1485778148040 NaN NaN \n", "5 224.0 299850980 -953489148 1485778028950 NaN NaN " ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_act.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Recap: Activity List can contain >=1 dictionary\n", "\n", "Dictionary has keys: timestampms, activity and extra\n", "\n", "Value of 'activity' key in dictionary is a list that can have >=1 dict, each dict has keys: confidence, type. Only pick 2 highest confidences\n", "\n", "Value of 'extra' key in dictionary is a list that can have >=1 dict, each dict has keys: intval, name and type" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1, 2, 4, 5, 6, 7, 8, 9, 10])" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_act.index.values" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['accuracy',\n", " 'altitude',\n", " 'heading',\n", " 'latitudeE7',\n", " 'longitudeE7',\n", " 'velocity',\n", " 'verticalAccuracy']" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col_to_drop = ['timestampMs','activity']\n", "org_cols=[col for col in temp_act.columns.values.tolist() if col not in col_to_drop]\n", "org_cols" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "new_cols = ['timestampMs','act_conf1','act_type1','act_cont2','act_type2',\n", " 'extra_intVal','extra_name','extra_type']" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "\n", "temp_combined_df = pd.DataFrame(columns=org_cols + new_cols)\n", "\n", "for idx,acts in zip(temp_act.index.values,temp_act.activity.values):\n", " temp_new_df = pd.DataFrame(columns=new_cols)\n", " c=0\n", " for act in acts:\n", " row=[]\n", " row.append(act['timestampMs'])\n", " # get activity max confidence and type, which is the first dict in activity list\n", " cof = act['activity'][0]\n", " row+=[cof['confidence'],cof['type']]\n", " # get secondary activity conf and type\n", " if len(act['activity']) > 1:\n", " cof = act['activity'][1]\n", " row+=[cof['confidence'],cof['type']]\n", " else:\n", " row+=[np.NaN,np.NaN]\n", " # extra\n", " if 'extra' in act:\n", " ex = act['extra'][0] # take only 1 item in extra list\n", " row+=[ex['intVal'],ex['name'],ex['type']]\n", " else:\n", " row+=[np.NaN,np.NaN,np.NaN]\n", " temp_new_df.loc[c]=row\n", " c+=1\n", " \n", " \n", " temp_org_df = pd.DataFrame([temp_act.loc[idx]]*c,index=range(0,c)) # copy the rest of features and save it to another df\n", " temp_org_df.drop(col_to_drop,axis=1,inplace=True)\n", " temp_combined_df=temp_combined_df.append(pd.concat([temp_org_df,temp_new_df],axis=1),ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyaltitudeheadinglatitudeE7longitudeE7velocityverticalAccuracytimestampMsact_conf1act_type1act_cont2act_type2extra_intValextra_nameextra_type
054523.0NaN299853940-953481015NaNNaN1485778729815100STILLNaNNaNNaNNaNNaN
158519.0NaN299853159-953482313NaNNaN1485778608624100STILLNaNNaNNaNNaNNaN
284.0224.0299850980-953489148NaNNaN1485778474389100STILLNaNNaNNaNNaNNaN
384.0224.0299850980-953489148NaNNaN1485778410435100STILLNaNNaNNaNNaNNaN
484.0224.0299850980-953489148NaNNaN1485778148328100STILLNaNNaNNaNNaNNaN
584.0224.0299850980-953489148NaNNaN148577802798771STILL19IN_VEHICLENaNNaNNaN
629216.0186.0299849212-953487105NaNNaN148577798055187STILL13IN_VEHICLENaNNaNNaN
729216.0186.0299849212-953487105NaNNaN1485777920344100TILTINGNaNNaNNaNNaNNaN
829216.0186.0299849212-953487105NaNNaN1485777850168100TILTINGNaNNaNNaNNaNNaN
929216.0186.0299849212-953487105NaNNaN148577784019890STILL10IN_VEHICLENaNNaNNaN
1014515.077.0299853950-953490222NaNNaN148577770991344STILL42UNKNOWNNaNNaNNaN
1114515.077.0299853950-953490222NaNNaN1485777694049100TILTINGNaNNaNNaNNaNNaN
124316.0NaN299849213-953489881NaNNaN148577767818575STILL16UNKNOWNNaNNaNNaN
134316.0NaN299849213-953489881NaNNaN1485777676309100TILTINGNaNNaNNaNNaNNaN
144316.0NaN299849213-953489881NaNNaN148577767443369STILL13IN_VEHICLENaNNaNNaN
154316.0NaN299849213-953489881NaNNaN1485777614917100TILTINGNaNNaNNaNNaNNaN
16106-45.06.0299852839-953485393NaNNaN148577755540085STILL15IN_VEHICLENaNNaNNaN
\n", "
" ], "text/plain": [ " accuracy altitude heading latitudeE7 longitudeE7 velocity \\\n", "0 545 23.0 NaN 299853940 -953481015 NaN \n", "1 585 19.0 NaN 299853159 -953482313 NaN \n", "2 8 4.0 224.0 299850980 -953489148 NaN \n", "3 8 4.0 224.0 299850980 -953489148 NaN \n", "4 8 4.0 224.0 299850980 -953489148 NaN \n", "5 8 4.0 224.0 299850980 -953489148 NaN \n", "6 292 16.0 186.0 299849212 -953487105 NaN \n", "7 292 16.0 186.0 299849212 -953487105 NaN \n", "8 292 16.0 186.0 299849212 -953487105 NaN \n", "9 292 16.0 186.0 299849212 -953487105 NaN \n", "10 145 15.0 77.0 299853950 -953490222 NaN \n", "11 145 15.0 77.0 299853950 -953490222 NaN \n", "12 43 16.0 NaN 299849213 -953489881 NaN \n", "13 43 16.0 NaN 299849213 -953489881 NaN \n", "14 43 16.0 NaN 299849213 -953489881 NaN \n", "15 43 16.0 NaN 299849213 -953489881 NaN \n", "16 106 -45.0 6.0 299852839 -953485393 NaN \n", "\n", " verticalAccuracy timestampMs act_conf1 act_type1 act_cont2 act_type2 \\\n", "0 NaN 1485778729815 100 STILL NaN NaN \n", "1 NaN 1485778608624 100 STILL NaN NaN \n", "2 NaN 1485778474389 100 STILL NaN NaN \n", "3 NaN 1485778410435 100 STILL NaN NaN \n", "4 NaN 1485778148328 100 STILL NaN NaN \n", "5 NaN 1485778027987 71 STILL 19 IN_VEHICLE \n", "6 NaN 1485777980551 87 STILL 13 IN_VEHICLE \n", "7 NaN 1485777920344 100 TILTING NaN NaN \n", "8 NaN 1485777850168 100 TILTING NaN NaN \n", "9 NaN 1485777840198 90 STILL 10 IN_VEHICLE \n", "10 NaN 1485777709913 44 STILL 42 UNKNOWN \n", "11 NaN 1485777694049 100 TILTING NaN NaN \n", "12 NaN 1485777678185 75 STILL 16 UNKNOWN \n", "13 NaN 1485777676309 100 TILTING NaN NaN \n", "14 NaN 1485777674433 69 STILL 13 IN_VEHICLE \n", "15 NaN 1485777614917 100 TILTING NaN NaN \n", "16 NaN 1485777555400 85 STILL 15 IN_VEHICLE \n", "\n", " extra_intVal extra_name extra_type \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "5 NaN NaN NaN \n", "6 NaN NaN NaN \n", "7 NaN NaN NaN \n", "8 NaN NaN NaN \n", "9 NaN NaN NaN \n", "10 NaN NaN NaN \n", "11 NaN NaN NaN \n", "12 NaN NaN NaN \n", "13 NaN NaN NaN \n", "14 NaN NaN NaN \n", "15 NaN NaN NaN \n", "16 NaN NaN NaN " ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_combined_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Combined non-null-activity dataframe to null-activity dataframe" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "df_non_act = df_houston2[df_houston2.activity.isnull()].copy() # get all the null activity record" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "temp_non_act=df_non_act.iloc[:10].copy()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "temp_non_act.drop('activity',axis=1,inplace=True)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracy
384.0224.0299850980-9534891481485778233440NaNNaN
13395-18.0NaN299857978-9534937531485777399000NaNNaN
1453-3.0335.0299855823-9534904041485777360000NaNNaN
1513854.030.0299850044-9534900811485777320000NaNNaN
161700NaNNaN299819691-9535421281485777244889NaNNaN
191700NaNNaN299819691-9535421281485776979169NaNNaN
212360.0111.0299851691-9534880971485776857000NaNNaN
23125356.0248.0299846068-9534871621485776770000NaNNaN
2923NaNNaN299852733-9534934931485776105741NaNNaN
3122NaNNaN299856137-9534983971485775997117NaNNaN
\n", "
" ], "text/plain": [ " accuracy altitude heading latitudeE7 longitudeE7 timestampMs \\\n", "3 8 4.0 224.0 299850980 -953489148 1485778233440 \n", "13 395 -18.0 NaN 299857978 -953493753 1485777399000 \n", "14 53 -3.0 335.0 299855823 -953490404 1485777360000 \n", "15 138 54.0 30.0 299850044 -953490081 1485777320000 \n", "16 1700 NaN NaN 299819691 -953542128 1485777244889 \n", "19 1700 NaN NaN 299819691 -953542128 1485776979169 \n", "21 23 60.0 111.0 299851691 -953488097 1485776857000 \n", "23 125 356.0 248.0 299846068 -953487162 1485776770000 \n", "29 23 NaN NaN 299852733 -953493493 1485776105741 \n", "31 22 NaN NaN 299856137 -953498397 1485775997117 \n", "\n", " velocity verticalAccuracy \n", "3 NaN NaN \n", "13 NaN NaN \n", "14 NaN NaN \n", "15 NaN NaN \n", "16 NaN NaN \n", "19 NaN NaN \n", "21 NaN NaN \n", "23 NaN NaN \n", "29 NaN NaN \n", "31 NaN NaN " ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_non_act" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyaltitudeheadinglatitudeE7longitudeE7velocityverticalAccuracytimestampMsact_conf1act_type1act_cont2act_type2extra_intValextra_nameextra_type
054523.0NaN299853940-953481015NaNNaN1485778729815100STILLNaNNaNNaNNaNNaN
158519.0NaN299853159-953482313NaNNaN1485778608624100STILLNaNNaNNaNNaNNaN
284.0224.0299850980-953489148NaNNaN1485778474389100STILLNaNNaNNaNNaNNaN
384.0224.0299850980-953489148NaNNaN1485778410435100STILLNaNNaNNaNNaNNaN
484.0224.0299850980-953489148NaNNaN1485778148328100STILLNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " accuracy altitude heading latitudeE7 longitudeE7 velocity \\\n", "0 545 23.0 NaN 299853940 -953481015 NaN \n", "1 585 19.0 NaN 299853159 -953482313 NaN \n", "2 8 4.0 224.0 299850980 -953489148 NaN \n", "3 8 4.0 224.0 299850980 -953489148 NaN \n", "4 8 4.0 224.0 299850980 -953489148 NaN \n", "\n", " verticalAccuracy timestampMs act_conf1 act_type1 act_cont2 act_type2 \\\n", "0 NaN 1485778729815 100 STILL NaN NaN \n", "1 NaN 1485778608624 100 STILL NaN NaN \n", "2 NaN 1485778474389 100 STILL NaN NaN \n", "3 NaN 1485778410435 100 STILL NaN NaN \n", "4 NaN 1485778148328 100 STILL NaN NaN \n", "\n", " extra_intVal extra_name extra_type \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN " ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_combined_df.head()" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyaltitudeheadinglatitudeE7longitudeE7timestampMsvelocityverticalAccuracyact_conf1act_type1act_cont2act_type2extra_intValextra_nameextra_type
084.0224.0299850980-9534891481485778233440NaNNaNNaNNaNNaNNaNNaNNaNNaN
1395-18.0NaN299857978-9534937531485777399000NaNNaNNaNNaNNaNNaNNaNNaNNaN
253-3.0335.0299855823-9534904041485777360000NaNNaNNaNNaNNaNNaNNaNNaNNaN
313854.030.0299850044-9534900811485777320000NaNNaNNaNNaNNaNNaNNaNNaNNaN
41700NaNNaN299819691-9535421281485777244889NaNNaNNaNNaNNaNNaNNaNNaNNaN
51700NaNNaN299819691-9535421281485776979169NaNNaNNaNNaNNaNNaNNaNNaNNaN
62360.0111.0299851691-9534880971485776857000NaNNaNNaNNaNNaNNaNNaNNaNNaN
7125356.0248.0299846068-9534871621485776770000NaNNaNNaNNaNNaNNaNNaNNaNNaN
823NaNNaN299852733-9534934931485776105741NaNNaNNaNNaNNaNNaNNaNNaNNaN
922NaNNaN299856137-9534983971485775997117NaNNaNNaNNaNNaNNaNNaNNaNNaN
1054523.0NaN299853940-9534810151485778729815NaNNaN100STILLNaNNaNNaNNaNNaN
1158519.0NaN299853159-9534823131485778608624NaNNaN100STILLNaNNaNNaNNaNNaN
1284.0224.0299850980-9534891481485778474389NaNNaN100STILLNaNNaNNaNNaNNaN
1384.0224.0299850980-9534891481485778410435NaNNaN100STILLNaNNaNNaNNaNNaN
1484.0224.0299850980-9534891481485778148328NaNNaN100STILLNaNNaNNaNNaNNaN
1584.0224.0299850980-9534891481485778027987NaNNaN71STILL19IN_VEHICLENaNNaNNaN
1629216.0186.0299849212-9534871051485777980551NaNNaN87STILL13IN_VEHICLENaNNaNNaN
1729216.0186.0299849212-9534871051485777920344NaNNaN100TILTINGNaNNaNNaNNaNNaN
1829216.0186.0299849212-9534871051485777850168NaNNaN100TILTINGNaNNaNNaNNaNNaN
1929216.0186.0299849212-9534871051485777840198NaNNaN90STILL10IN_VEHICLENaNNaNNaN
2014515.077.0299853950-9534902221485777709913NaNNaN44STILL42UNKNOWNNaNNaNNaN
2114515.077.0299853950-9534902221485777694049NaNNaN100TILTINGNaNNaNNaNNaNNaN
224316.0NaN299849213-9534898811485777678185NaNNaN75STILL16UNKNOWNNaNNaNNaN
234316.0NaN299849213-9534898811485777676309NaNNaN100TILTINGNaNNaNNaNNaNNaN
244316.0NaN299849213-9534898811485777674433NaNNaN69STILL13IN_VEHICLENaNNaNNaN
254316.0NaN299849213-9534898811485777614917NaNNaN100TILTINGNaNNaNNaNNaNNaN
26106-45.06.0299852839-9534853931485777555400NaNNaN85STILL15IN_VEHICLENaNNaNNaN
\n", "
" ], "text/plain": [ " accuracy altitude heading latitudeE7 longitudeE7 timestampMs \\\n", "0 8 4.0 224.0 299850980 -953489148 1485778233440 \n", "1 395 -18.0 NaN 299857978 -953493753 1485777399000 \n", "2 53 -3.0 335.0 299855823 -953490404 1485777360000 \n", "3 138 54.0 30.0 299850044 -953490081 1485777320000 \n", "4 1700 NaN NaN 299819691 -953542128 1485777244889 \n", "5 1700 NaN NaN 299819691 -953542128 1485776979169 \n", "6 23 60.0 111.0 299851691 -953488097 1485776857000 \n", "7 125 356.0 248.0 299846068 -953487162 1485776770000 \n", "8 23 NaN NaN 299852733 -953493493 1485776105741 \n", "9 22 NaN NaN 299856137 -953498397 1485775997117 \n", "10 545 23.0 NaN 299853940 -953481015 1485778729815 \n", "11 585 19.0 NaN 299853159 -953482313 1485778608624 \n", "12 8 4.0 224.0 299850980 -953489148 1485778474389 \n", "13 8 4.0 224.0 299850980 -953489148 1485778410435 \n", "14 8 4.0 224.0 299850980 -953489148 1485778148328 \n", "15 8 4.0 224.0 299850980 -953489148 1485778027987 \n", "16 292 16.0 186.0 299849212 -953487105 1485777980551 \n", "17 292 16.0 186.0 299849212 -953487105 1485777920344 \n", "18 292 16.0 186.0 299849212 -953487105 1485777850168 \n", "19 292 16.0 186.0 299849212 -953487105 1485777840198 \n", "20 145 15.0 77.0 299853950 -953490222 1485777709913 \n", "21 145 15.0 77.0 299853950 -953490222 1485777694049 \n", "22 43 16.0 NaN 299849213 -953489881 1485777678185 \n", "23 43 16.0 NaN 299849213 -953489881 1485777676309 \n", "24 43 16.0 NaN 299849213 -953489881 1485777674433 \n", "25 43 16.0 NaN 299849213 -953489881 1485777614917 \n", "26 106 -45.0 6.0 299852839 -953485393 1485777555400 \n", "\n", " velocity verticalAccuracy act_conf1 act_type1 act_cont2 act_type2 \\\n", "0 NaN NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN \n", "5 NaN NaN NaN NaN NaN NaN \n", "6 NaN NaN NaN NaN NaN NaN \n", "7 NaN NaN NaN NaN NaN NaN \n", "8 NaN NaN NaN NaN NaN NaN \n", "9 NaN NaN NaN NaN NaN NaN \n", "10 NaN NaN 100 STILL NaN NaN \n", "11 NaN NaN 100 STILL NaN NaN \n", "12 NaN NaN 100 STILL NaN NaN \n", "13 NaN NaN 100 STILL NaN NaN \n", "14 NaN NaN 100 STILL NaN NaN \n", "15 NaN NaN 71 STILL 19 IN_VEHICLE \n", "16 NaN NaN 87 STILL 13 IN_VEHICLE \n", "17 NaN NaN 100 TILTING NaN NaN \n", "18 NaN NaN 100 TILTING NaN NaN \n", "19 NaN NaN 90 STILL 10 IN_VEHICLE \n", "20 NaN NaN 44 STILL 42 UNKNOWN \n", "21 NaN NaN 100 TILTING NaN NaN \n", "22 NaN NaN 75 STILL 16 UNKNOWN \n", "23 NaN NaN 100 TILTING NaN NaN \n", "24 NaN NaN 69 STILL 13 IN_VEHICLE \n", "25 NaN NaN 100 TILTING NaN NaN \n", "26 NaN NaN 85 STILL 15 IN_VEHICLE \n", "\n", " extra_intVal extra_name extra_type \n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "5 NaN NaN NaN \n", "6 NaN NaN NaN \n", "7 NaN NaN NaN \n", "8 NaN NaN NaN \n", "9 NaN NaN NaN \n", "10 NaN NaN NaN \n", "11 NaN NaN NaN \n", "12 NaN NaN NaN \n", "13 NaN NaN NaN \n", "14 NaN NaN NaN \n", "15 NaN NaN NaN \n", "16 NaN NaN NaN \n", "17 NaN NaN NaN \n", "18 NaN NaN NaN \n", "19 NaN NaN NaN \n", "20 NaN NaN NaN \n", "21 NaN NaN NaN \n", "22 NaN NaN NaN \n", "23 NaN NaN NaN \n", "24 NaN NaN NaN \n", "25 NaN NaN NaN \n", "26 NaN NaN NaN " ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "temp_non_act.append(temp_combined_df,ignore_index=True,sort=False) # combine all the null and non-null activity records" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }