{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Curate_StudyAppDownloadReason_Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:55.009598Z",
"start_time": "2018-12-28T19:54:37.669242Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Welcome, Abhishek Pratap!\n",
"\n"
]
}
],
"source": [
"%matplotlib inline\n",
"\n",
"import datetime as dt\n",
"import itertools as it\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"import synapseclient\n",
"from synapseclient import Activity, File, Schema, Table, as_table_columns\n",
"from tqdm import tqdm\n",
"\n",
"InteractiveShell.ast_node_interactivity = 'all'\n",
"syn = synapseclient.Synapse()\n",
"syn.login()\n",
"\n",
"tqdm.pandas()\n",
"\n",
"def isnum(x):\n",
" if x is None:\n",
" return False\n",
" try:\n",
" float(x)\n",
" return True\n",
" except ValueError:\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:57.272327Z",
"start_time": "2018-12-28T19:54:55.020146Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" brightenid | \n",
" start | \n",
" week | \n",
" user_id | \n",
" sent_time_local | \n",
" sent_time_utc | \n",
" response_local | \n",
" response_utc | \n",
" response_id | \n",
" Why did you download this app? | \n",
" For \"other\", please type in box | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" BLUE-00048 | \n",
" 2014-08-01 | \n",
" 0 | \n",
" 10431 | \n",
" NaT | \n",
" NaT | \n",
" 2014-08-01 07:00:09 | \n",
" 2014-08-01 11:00:09 | \n",
" 166331 | \n",
" for fun|for mental health reasons [e.g. depres... | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" BLUE-00049 | \n",
" 2014-08-01 | \n",
" 0 | \n",
" 10470 | \n",
" NaT | \n",
" NaT | \n",
" 2014-08-01 12:31:58 | \n",
" 2014-08-01 16:31:58 | \n",
" 166824 | \n",
" for brain health [e.g. better memory]|to impro... | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" BLUE-00050 | \n",
" 2014-08-09 | \n",
" 0 | \n",
" 10519 | \n",
" NaT | \n",
" NaT | \n",
" 2014-08-08 19:31:06 | \n",
" 2014-08-09 02:31:06 | \n",
" 173157 | \n",
" for management of daily problems|for brain hea... | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" BLUE-00050 | \n",
" 2014-08-09 | \n",
" 4 | \n",
" 10519 | \n",
" 2014-09-06 09:00:01 | \n",
" 2014-09-06 16:00:01 | \n",
" 2014-09-08 18:21:29 | \n",
" 2014-09-09 01:21:29 | \n",
" 198454 | \n",
" for management of daily problems|for brain hea... | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" BLUE-00050 | \n",
" 2014-08-09 | \n",
" 12 | \n",
" 10519 | \n",
" 2014-11-01 08:00:05 | \n",
" 2014-11-01 16:00:05 | \n",
" 2014-11-02 09:16:22 | \n",
" 2014-11-02 17:16:22 | \n",
" 256858 | \n",
" for mood [e.g. sadness]|for brain health [e.g.... | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" brightenid start week user_id sent_time_local \\\n",
"0 BLUE-00048 2014-08-01 0 10431 NaT \n",
"1 BLUE-00049 2014-08-01 0 10470 NaT \n",
"2 BLUE-00050 2014-08-09 0 10519 NaT \n",
"3 BLUE-00050 2014-08-09 4 10519 2014-09-06 09:00:01 \n",
"4 BLUE-00050 2014-08-09 12 10519 2014-11-01 08:00:05 \n",
"\n",
" sent_time_utc response_local response_utc response_id \\\n",
"0 NaT 2014-08-01 07:00:09 2014-08-01 11:00:09 166331 \n",
"1 NaT 2014-08-01 12:31:58 2014-08-01 16:31:58 166824 \n",
"2 NaT 2014-08-08 19:31:06 2014-08-09 02:31:06 173157 \n",
"3 2014-09-06 16:00:01 2014-09-08 18:21:29 2014-09-09 01:21:29 198454 \n",
"4 2014-11-01 16:00:05 2014-11-02 09:16:22 2014-11-02 17:16:22 256858 \n",
"\n",
" Why did you download this app? \\\n",
"0 for fun|for mental health reasons [e.g. depres... \n",
"1 for brain health [e.g. better memory]|to impro... \n",
"2 for management of daily problems|for brain hea... \n",
"3 for management of daily problems|for brain hea... \n",
"4 for mood [e.g. sadness]|for brain health [e.g.... \n",
"\n",
" For \"other\", please type in box \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" username | \n",
" Why did you download this app? | \n",
" createdAt | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" EN05039 | \n",
" Fun,My mental health,My mood,Managing daily is... | \n",
" 2016-09-03 17:13:21 | \n",
"
\n",
" \n",
" 1 | \n",
" EN05331 | \n",
" My mental health,Brain health,Improve work | \n",
" 2017-01-22 23:04:11 | \n",
"
\n",
" \n",
" 2 | \n",
" EN00387 | \n",
" My mental health,My mood,Brain health,Fun | \n",
" 2016-10-28 08:37:06 | \n",
"
\n",
" \n",
" 3 | \n",
" EN00322 | \n",
" My mental health | \n",
" 2016-09-07 21:32:32 | \n",
"
\n",
" \n",
" 4 | \n",
" EN00478 | \n",
" My mental health,Managing daily issues,My mood... | \n",
" 2016-11-11 07:21:42 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" username Why did you download this app? \\\n",
"0 EN05039 Fun,My mental health,My mood,Managing daily is... \n",
"1 EN05331 My mental health,Brain health,Improve work \n",
"2 EN00387 My mental health,My mood,Brain health,Fun \n",
"3 EN00322 My mental health \n",
"4 EN00478 My mental health,Managing daily issues,My mood... \n",
"\n",
" createdAt \n",
"0 2016-09-03 17:13:21 \n",
"1 2017-01-22 23:04:11 \n",
"2 2016-10-28 08:37:06 \n",
"3 2016-09-07 21:32:32 \n",
"4 2016-11-11 07:21:42 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"v1sid, v2sid = 'syn10250489', 'syn17023091'\n",
"\n",
"v1r = pd.read_excel(syn.get(v1sid).path)\n",
"v2r = pd.read_csv(syn.get(v2sid).path, parse_dates=['createdAt'])\n",
"\n",
"v1r.head()\n",
"v2r.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### V1 Data Prep"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:57.410111Z",
"start_time": "2018-12-28T19:54:57.275614Z"
}
},
"outputs": [],
"source": [
"# drop the uneeded columns\n",
"v1 = v1r.drop(columns=[\n",
" 'week',\t'user_id',\t'sent_time_local',\t'sent_time_utc', 'response_utc', 'response_id'\n",
"]).rename(columns={\n",
" 'brightenid': 'participant_id',\n",
" 'response_local':'dt_response',\n",
" 'Why did you download this app?':'apps',\n",
" 'For \"other\", please type in box':'other_description'\n",
"})\n",
"\n",
"# conver to lowercase for lookups\n",
"v1.apps = v1.apps.apply(lambda x: x.lower() if not isnum(x) else 'none')\n",
"\n",
"# add indicators for different app usage\n",
"v1['fun'] = v1.apps.apply(lambda x: int(x.find('fun') > -1))\n",
"v1['mental_health'] = v1.apps.apply(lambda x: int(x.find('mental health') > -1))\n",
"v1['mood'] = v1.apps.apply(lambda x: int(x.find('mood') > -1))\n",
"v1['managing_daily_issues'] = v1.apps.apply(lambda x: int(x.find('management of daily ') > -1))\n",
"v1['improve_work'] = v1.apps.apply(lambda x: int(x.find('improve work') > -1))\n",
"v1['brain_health'] = v1.apps.apply(lambda x: int(x.find('brain health') > -1))\n",
"v1['improve_relationships'] = v1.apps.apply(lambda x: int(x.find('improve relationships') > -1))\n",
"v1['other'] = v1.apps.apply(lambda x: int(x.find('other') > -1))\n",
"\n",
"v1.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:57.423884Z",
"start_time": "2018-12-28T19:54:57.414380Z"
}
},
"outputs": [],
"source": [
"# print the unique application reasons\n",
"t = [print(a) for a in pd.unique(list(it.chain(*[t.split('|') for t in v1.apps])))]; del t"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:57.724949Z",
"start_time": "2018-12-28T19:54:57.429488Z"
}
},
"outputs": [],
"source": [
"plt.hist([len(a) for a in v1.other_description if not isnum(a)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.216979Z",
"start_time": "2018-12-28T19:54:57.728659Z"
}
},
"outputs": [],
"source": [
"v1['day'] = [(t.dt_response - t.start).days + 1 for t in v1.itertuples()]\n",
"v1.day.hist()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Extract two common topics I saw a quick read through the other descriptions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.243131Z",
"start_time": "2018-12-28T19:54:58.223308Z"
},
"scrolled": false
},
"outputs": [],
"source": [
"incentive_words = [\n",
" 'paid', 'pay', 'mone', 'compens', 'gift', 'incentive', '$', 'finan', 'incom', 'reimb', 'craigs', 'pd'\n",
"]\n",
"\n",
"def is_incentive(s):\n",
" if not isinstance(s, str):\n",
" return 0\n",
" else:\n",
" return int(any(s.find(a) > -1 for a in incentive_words))\n",
" \n",
"v1['happ_inc'] = pd.to_numeric(v1.other_description.apply(is_incentive), downcast='integer')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.299327Z",
"start_time": "2018-12-28T19:54:58.248110Z"
}
},
"outputs": [],
"source": [
"for_the_study_words = [\n",
" 'part', 'require', 'for study', 'told to', 'asked to', 'request', 'to be', 'to do', 'brighten', 'assignment', 'ucsf', 'for a', 'study'\n",
"]\n",
"\n",
"def is_for_the_study(s):\n",
" if not isinstance(s, str):\n",
" return int(0)\n",
" else:\n",
" return int(any(s.find(a) > -1 for a in for_the_study_words))\n",
" \n",
"v1['happ_fts'] = v1.other_description.apply(is_for_the_study)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.360346Z",
"start_time": "2018-12-28T19:54:58.303805Z"
}
},
"outputs": [],
"source": [
"v1 = v1.drop(columns=['start', 'day'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### V2 Data Prep"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.439887Z",
"start_time": "2018-12-28T19:54:58.364685Z"
}
},
"outputs": [],
"source": [
"# drop the uneeded columns\n",
"v2 = v2r.rename(columns={\n",
" 'username': 'participant_id',\n",
" 'createdAt':'dt_response',\n",
" 'Why did you download this app?':'apps'\n",
"})\n",
"\n",
"# conver to lowercase for lookups\n",
"v2.apps = v2.apps.apply(lambda x: x.lower() if not isnum(x) else 'none')\n",
"\n",
"# add indicators for different app usage\n",
"v2['fun'] = v2.apps.apply(lambda x: int(x.find('fun') > -1))\n",
"v2['mental_health'] = v2.apps.apply(lambda x: int(x.find('my mental health') > -1))\n",
"v2['mood'] = v2.apps.apply(lambda x: int(x.find('my mood') > -1))\n",
"v2['managing_daily_issues'] = v2.apps.apply(lambda x: int(x.find('managing daily issues') > -1))\n",
"v2['improve_work'] = v2.apps.apply(lambda x: int(x.find('improve work') > -1))\n",
"v2['brain_health'] = v2.apps.apply(lambda x: int(x.find('brain health') > -1))\n",
"v2['improve_relationships'] = v2.apps.apply(lambda x: int(x.find('improve relationships') > -1))\n",
"v2['other'] = v2.apps.apply(lambda x: int(x.find('other') > -1))\n",
"\n",
"# add the study flag\n",
"v2.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.463531Z",
"start_time": "2018-12-28T19:54:58.444738Z"
}
},
"outputs": [],
"source": [
"t = [print(a) for a in pd.unique(list(it.chain(*[t.split(',') for t in v2.apps])))]; del t"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combine the DataFrames"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:58.541761Z",
"start_time": "2018-12-28T19:54:58.468717Z"
}
},
"outputs": [],
"source": [
"combined = pd.concat([v1, v2], sort=False)\n",
"\n",
"combined = combined.drop(columns=['apps']).rename(columns={\n",
" 'brain_health': 'happ_bh',\n",
" 'fun': 'happ_f',\n",
" 'improve_relationships': 'happ_ir',\n",
" 'improve_work': 'happ_iw',\n",
" 'managing_daily_issues': 'happ_mdi',\n",
" 'mental_health': 'happ_mh',\n",
" 'mood':'happ_m',\n",
" 'other': 'happ_o',\n",
" 'other_description':'happ_o_description'\n",
"}).loc[:, [\n",
" 'participant_id', 'dt_response', \n",
" 'happ_bh', 'happ_f', 'happ_fts', 'happ_inc', 'happ_ir', 'happ_iw', 'happ_m', 'happ_mdi', 'happ_mh', 'happ_o',\n",
" 'happ_o_description'\n",
"]]\n",
"\n",
"combined = combined.fillna(0)\n",
"\n",
"# make sure they're all the correct type. for some reason pandas was converting fts and inc to floats\n",
"# but we don't need that many bits. plus i like consistency\n",
"for c in combined.columns:\n",
" if c.find('description') > -1:\n",
" continue \n",
" \n",
" if c.find('app') > -1:\n",
" combined[c] = combined[c].astype(int)\n",
"\n",
"combined.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add week into study"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"metasid = 'syn27082597'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:54:59.936285Z",
"start_time": "2018-12-28T19:54:58.545530Z"
}
},
"outputs": [],
"source": [
"metadata = syn.tableQuery(f'SELECT participant_id, startdate FROM {metasid}').asDataFrame(convert_to_datetime=True)\n",
"metadata.startdate = pd.to_datetime(metadata.startdate)\n",
"\n",
"# add in the participants start date as a new column\n",
"combined = pd.merge(combined, metadata, on='participant_id', how='left')\n",
"\n",
"# get the time difference in weeks as a float\n",
"combined['week'] = [\n",
" d.days/7 for d in (\n",
" combined.dt_response.apply(\n",
" lambda x: dt.datetime(year=x.year, month=x.month, day=x.day))-combined.startdate\n",
" )\n",
"]\n",
"\n",
"# convert the week number to an int by taking the floor\n",
"combined.week = combined.week.progress_apply(lambda x: np.int16(np.floor(x))+1)# if not pd.isnull(x) else np.nan)\n",
"\n",
"# remove the start date\n",
"combined = combined.drop(columns=['startdate'], errors='ignore')\n",
"\n",
"# reorder the columns\n",
"cols = list(combined.columns)\n",
"cols = cols[0:2] + ['week'] + cols[2:-1]\n",
"combined = combined.reindex(columns=cols)\n",
"\n",
"combined.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Localize timestamps"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:55:00.056246Z",
"start_time": "2018-12-28T19:54:59.951047Z"
}
},
"outputs": [],
"source": [
"# localize timestamps\n",
"combined['dt_response'] = [\n",
" str(t.tz_localize('UTC'))\n",
" for t in combined.dt_response\n",
"]\n",
"\n",
"combined.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set provenance and upload to Synapse"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2018-12-28T19:55:21.790792Z",
"start_time": "2018-12-28T19:55:00.061005Z"
}
},
"outputs": [],
"source": [
"t = syn.delete(\n",
" syn.tableQuery('select * from syn17022426')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"start_time": "2018-12-28T19:54:37.763Z"
}
},
"outputs": [],
"source": [
"final = syn.store(Table(\n",
" Schema(\n",
" name='Health Applications',\n",
" columns=as_table_columns(combined), \n",
" parent='syn10848316'),\n",
" combined\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"start_time": "2018-12-28T19:54:37.767Z"
}
},
"outputs": [],
"source": [
"final = syn.setProvenance(\n",
" 'syn17022426',\n",
" activity=Activity(\n",
" name='Combine V1 and V2 data',\n",
" description='Process and combine the data collected during study 1 and study 2',\n",
" used=['syn17023091', v1sid, v2sid, 'syn12181332', metasid],\n",
" executed=[\n",
" dict(\n",
" name='Curate_StudyAppDownloadReason_Data',\n",
" url='https://github.com/apratap/BRIGHTEN-Data-Release/blob/master/Curate_StudyAppDownloadReason_Data.ipynb'\n",
" )\n",
" ]\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}