{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Curate_StudyAppDownloadReason_Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:55.009598Z", "start_time": "2018-12-28T19:54:37.669242Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Welcome, Abhishek Pratap!\n", "\n" ] } ], "source": [ "%matplotlib inline\n", "\n", "import datetime as dt\n", "import itertools as it\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import matplotlib.pyplot as plt\n", "from IPython.core.interactiveshell import InteractiveShell\n", "import synapseclient\n", "from synapseclient import Activity, File, Schema, Table, as_table_columns\n", "from tqdm import tqdm\n", "\n", "InteractiveShell.ast_node_interactivity = 'all'\n", "syn = synapseclient.Synapse()\n", "syn.login()\n", "\n", "tqdm.pandas()\n", "\n", "def isnum(x):\n", " if x is None:\n", " return False\n", " try:\n", " float(x)\n", " return True\n", " except ValueError:\n", " return False" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:57.272327Z", "start_time": "2018-12-28T19:54:55.020146Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
brightenidstartweekuser_idsent_time_localsent_time_utcresponse_localresponse_utcresponse_idWhy did you download this app?For \"other\", please type in box
0BLUE-000482014-08-01010431NaTNaT2014-08-01 07:00:092014-08-01 11:00:09166331for fun|for mental health reasons [e.g. depres...NaN
1BLUE-000492014-08-01010470NaTNaT2014-08-01 12:31:582014-08-01 16:31:58166824for brain health [e.g. better memory]|to impro...NaN
2BLUE-000502014-08-09010519NaTNaT2014-08-08 19:31:062014-08-09 02:31:06173157for management of daily problems|for brain hea...NaN
3BLUE-000502014-08-094105192014-09-06 09:00:012014-09-06 16:00:012014-09-08 18:21:292014-09-09 01:21:29198454for management of daily problems|for brain hea...NaN
4BLUE-000502014-08-0912105192014-11-01 08:00:052014-11-01 16:00:052014-11-02 09:16:222014-11-02 17:16:22256858for mood [e.g. sadness]|for brain health [e.g....NaN
\n", "
" ], "text/plain": [ " brightenid start week user_id sent_time_local \\\n", "0 BLUE-00048 2014-08-01 0 10431 NaT \n", "1 BLUE-00049 2014-08-01 0 10470 NaT \n", "2 BLUE-00050 2014-08-09 0 10519 NaT \n", "3 BLUE-00050 2014-08-09 4 10519 2014-09-06 09:00:01 \n", "4 BLUE-00050 2014-08-09 12 10519 2014-11-01 08:00:05 \n", "\n", " sent_time_utc response_local response_utc response_id \\\n", "0 NaT 2014-08-01 07:00:09 2014-08-01 11:00:09 166331 \n", "1 NaT 2014-08-01 12:31:58 2014-08-01 16:31:58 166824 \n", "2 NaT 2014-08-08 19:31:06 2014-08-09 02:31:06 173157 \n", "3 2014-09-06 16:00:01 2014-09-08 18:21:29 2014-09-09 01:21:29 198454 \n", "4 2014-11-01 16:00:05 2014-11-02 09:16:22 2014-11-02 17:16:22 256858 \n", "\n", " Why did you download this app? \\\n", "0 for fun|for mental health reasons [e.g. depres... \n", "1 for brain health [e.g. better memory]|to impro... \n", "2 for management of daily problems|for brain hea... \n", "3 for management of daily problems|for brain hea... \n", "4 for mood [e.g. sadness]|for brain health [e.g.... \n", "\n", " For \"other\", please type in box \n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
usernameWhy did you download this app?createdAt
0EN05039Fun,My mental health,My mood,Managing daily is...2016-09-03 17:13:21
1EN05331My mental health,Brain health,Improve work2017-01-22 23:04:11
2EN00387My mental health,My mood,Brain health,Fun2016-10-28 08:37:06
3EN00322My mental health2016-09-07 21:32:32
4EN00478My mental health,Managing daily issues,My mood...2016-11-11 07:21:42
\n", "
" ], "text/plain": [ " username Why did you download this app? \\\n", "0 EN05039 Fun,My mental health,My mood,Managing daily is... \n", "1 EN05331 My mental health,Brain health,Improve work \n", "2 EN00387 My mental health,My mood,Brain health,Fun \n", "3 EN00322 My mental health \n", "4 EN00478 My mental health,Managing daily issues,My mood... \n", "\n", " createdAt \n", "0 2016-09-03 17:13:21 \n", "1 2017-01-22 23:04:11 \n", "2 2016-10-28 08:37:06 \n", "3 2016-09-07 21:32:32 \n", "4 2016-11-11 07:21:42 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "v1sid, v2sid = 'syn10250489', 'syn17023091'\n", "\n", "v1r = pd.read_excel(syn.get(v1sid).path)\n", "v2r = pd.read_csv(syn.get(v2sid).path, parse_dates=['createdAt'])\n", "\n", "v1r.head()\n", "v2r.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### V1 Data Prep" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:57.410111Z", "start_time": "2018-12-28T19:54:57.275614Z" } }, "outputs": [], "source": [ "# drop the uneeded columns\n", "v1 = v1r.drop(columns=[\n", " 'week',\t'user_id',\t'sent_time_local',\t'sent_time_utc', 'response_utc', 'response_id'\n", "]).rename(columns={\n", " 'brightenid': 'participant_id',\n", " 'response_local':'dt_response',\n", " 'Why did you download this app?':'apps',\n", " 'For \"other\", please type in box':'other_description'\n", "})\n", "\n", "# conver to lowercase for lookups\n", "v1.apps = v1.apps.apply(lambda x: x.lower() if not isnum(x) else 'none')\n", "\n", "# add indicators for different app usage\n", "v1['fun'] = v1.apps.apply(lambda x: int(x.find('fun') > -1))\n", "v1['mental_health'] = v1.apps.apply(lambda x: int(x.find('mental health') > -1))\n", "v1['mood'] = v1.apps.apply(lambda x: int(x.find('mood') > -1))\n", "v1['managing_daily_issues'] = v1.apps.apply(lambda x: int(x.find('management of daily ') > -1))\n", "v1['improve_work'] = v1.apps.apply(lambda x: int(x.find('improve work') > -1))\n", "v1['brain_health'] = v1.apps.apply(lambda x: int(x.find('brain health') > -1))\n", "v1['improve_relationships'] = v1.apps.apply(lambda x: int(x.find('improve relationships') > -1))\n", "v1['other'] = v1.apps.apply(lambda x: int(x.find('other') > -1))\n", "\n", "v1.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:57.423884Z", "start_time": "2018-12-28T19:54:57.414380Z" } }, "outputs": [], "source": [ "# print the unique application reasons\n", "t = [print(a) for a in pd.unique(list(it.chain(*[t.split('|') for t in v1.apps])))]; del t" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:57.724949Z", "start_time": "2018-12-28T19:54:57.429488Z" } }, "outputs": [], "source": [ "plt.hist([len(a) for a in v1.other_description if not isnum(a)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.216979Z", "start_time": "2018-12-28T19:54:57.728659Z" } }, "outputs": [], "source": [ "v1['day'] = [(t.dt_response - t.start).days + 1 for t in v1.itertuples()]\n", "v1.day.hist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Extract two common topics I saw a quick read through the other descriptions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.243131Z", "start_time": "2018-12-28T19:54:58.223308Z" }, "scrolled": false }, "outputs": [], "source": [ "incentive_words = [\n", " 'paid', 'pay', 'mone', 'compens', 'gift', 'incentive', '$', 'finan', 'incom', 'reimb', 'craigs', 'pd'\n", "]\n", "\n", "def is_incentive(s):\n", " if not isinstance(s, str):\n", " return 0\n", " else:\n", " return int(any(s.find(a) > -1 for a in incentive_words))\n", " \n", "v1['happ_inc'] = pd.to_numeric(v1.other_description.apply(is_incentive), downcast='integer')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.299327Z", "start_time": "2018-12-28T19:54:58.248110Z" } }, "outputs": [], "source": [ "for_the_study_words = [\n", " 'part', 'require', 'for study', 'told to', 'asked to', 'request', 'to be', 'to do', 'brighten', 'assignment', 'ucsf', 'for a', 'study'\n", "]\n", "\n", "def is_for_the_study(s):\n", " if not isinstance(s, str):\n", " return int(0)\n", " else:\n", " return int(any(s.find(a) > -1 for a in for_the_study_words))\n", " \n", "v1['happ_fts'] = v1.other_description.apply(is_for_the_study)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.360346Z", "start_time": "2018-12-28T19:54:58.303805Z" } }, "outputs": [], "source": [ "v1 = v1.drop(columns=['start', 'day'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### V2 Data Prep" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.439887Z", "start_time": "2018-12-28T19:54:58.364685Z" } }, "outputs": [], "source": [ "# drop the uneeded columns\n", "v2 = v2r.rename(columns={\n", " 'username': 'participant_id',\n", " 'createdAt':'dt_response',\n", " 'Why did you download this app?':'apps'\n", "})\n", "\n", "# conver to lowercase for lookups\n", "v2.apps = v2.apps.apply(lambda x: x.lower() if not isnum(x) else 'none')\n", "\n", "# add indicators for different app usage\n", "v2['fun'] = v2.apps.apply(lambda x: int(x.find('fun') > -1))\n", "v2['mental_health'] = v2.apps.apply(lambda x: int(x.find('my mental health') > -1))\n", "v2['mood'] = v2.apps.apply(lambda x: int(x.find('my mood') > -1))\n", "v2['managing_daily_issues'] = v2.apps.apply(lambda x: int(x.find('managing daily issues') > -1))\n", "v2['improve_work'] = v2.apps.apply(lambda x: int(x.find('improve work') > -1))\n", "v2['brain_health'] = v2.apps.apply(lambda x: int(x.find('brain health') > -1))\n", "v2['improve_relationships'] = v2.apps.apply(lambda x: int(x.find('improve relationships') > -1))\n", "v2['other'] = v2.apps.apply(lambda x: int(x.find('other') > -1))\n", "\n", "# add the study flag\n", "v2.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.463531Z", "start_time": "2018-12-28T19:54:58.444738Z" } }, "outputs": [], "source": [ "t = [print(a) for a in pd.unique(list(it.chain(*[t.split(',') for t in v2.apps])))]; del t" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combine the DataFrames" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:58.541761Z", "start_time": "2018-12-28T19:54:58.468717Z" } }, "outputs": [], "source": [ "combined = pd.concat([v1, v2], sort=False)\n", "\n", "combined = combined.drop(columns=['apps']).rename(columns={\n", " 'brain_health': 'happ_bh',\n", " 'fun': 'happ_f',\n", " 'improve_relationships': 'happ_ir',\n", " 'improve_work': 'happ_iw',\n", " 'managing_daily_issues': 'happ_mdi',\n", " 'mental_health': 'happ_mh',\n", " 'mood':'happ_m',\n", " 'other': 'happ_o',\n", " 'other_description':'happ_o_description'\n", "}).loc[:, [\n", " 'participant_id', 'dt_response', \n", " 'happ_bh', 'happ_f', 'happ_fts', 'happ_inc', 'happ_ir', 'happ_iw', 'happ_m', 'happ_mdi', 'happ_mh', 'happ_o',\n", " 'happ_o_description'\n", "]]\n", "\n", "combined = combined.fillna(0)\n", "\n", "# make sure they're all the correct type. for some reason pandas was converting fts and inc to floats\n", "# but we don't need that many bits. plus i like consistency\n", "for c in combined.columns:\n", " if c.find('description') > -1:\n", " continue \n", " \n", " if c.find('app') > -1:\n", " combined[c] = combined[c].astype(int)\n", "\n", "combined.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add week into study" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "metasid = 'syn27082597'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:54:59.936285Z", "start_time": "2018-12-28T19:54:58.545530Z" } }, "outputs": [], "source": [ "metadata = syn.tableQuery(f'SELECT participant_id, startdate FROM {metasid}').asDataFrame(convert_to_datetime=True)\n", "metadata.startdate = pd.to_datetime(metadata.startdate)\n", "\n", "# add in the participants start date as a new column\n", "combined = pd.merge(combined, metadata, on='participant_id', how='left')\n", "\n", "# get the time difference in weeks as a float\n", "combined['week'] = [\n", " d.days/7 for d in (\n", " combined.dt_response.apply(\n", " lambda x: dt.datetime(year=x.year, month=x.month, day=x.day))-combined.startdate\n", " )\n", "]\n", "\n", "# convert the week number to an int by taking the floor\n", "combined.week = combined.week.progress_apply(lambda x: np.int16(np.floor(x))+1)# if not pd.isnull(x) else np.nan)\n", "\n", "# remove the start date\n", "combined = combined.drop(columns=['startdate'], errors='ignore')\n", "\n", "# reorder the columns\n", "cols = list(combined.columns)\n", "cols = cols[0:2] + ['week'] + cols[2:-1]\n", "combined = combined.reindex(columns=cols)\n", "\n", "combined.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Localize timestamps" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:55:00.056246Z", "start_time": "2018-12-28T19:54:59.951047Z" } }, "outputs": [], "source": [ "# localize timestamps\n", "combined['dt_response'] = [\n", " str(t.tz_localize('UTC'))\n", " for t in combined.dt_response\n", "]\n", "\n", "combined.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set provenance and upload to Synapse" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2018-12-28T19:55:21.790792Z", "start_time": "2018-12-28T19:55:00.061005Z" } }, "outputs": [], "source": [ "t = syn.delete(\n", " syn.tableQuery('select * from syn17022426')\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "start_time": "2018-12-28T19:54:37.763Z" } }, "outputs": [], "source": [ "final = syn.store(Table(\n", " Schema(\n", " name='Health Applications',\n", " columns=as_table_columns(combined), \n", " parent='syn10848316'),\n", " combined\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "start_time": "2018-12-28T19:54:37.767Z" } }, "outputs": [], "source": [ "final = syn.setProvenance(\n", " 'syn17022426',\n", " activity=Activity(\n", " name='Combine V1 and V2 data',\n", " description='Process and combine the data collected during study 1 and study 2',\n", " used=['syn17023091', v1sid, v2sid, 'syn12181332', metasid],\n", " executed=[\n", " dict(\n", " name='Curate_StudyAppDownloadReason_Data',\n", " url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_StudyAppDownloadReason_Data.ipynb'\n", " )\n", " ]\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }