{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "# tools for handling files\n", "import sys\n", "import os\n", "\n", "# pandas/numpy for handling data\n", "import pandas as pd\n", "import numpy as np\n", "from pandas import ExcelWriter\n", "from pandas import ExcelFile\n", "\n", "# for reading individual telomere length data from files\n", "from ast import literal_eval\n", "\n", "# for grabbing individual cells\n", "import more_itertools\n", "\n", "# my module containing functions for handling/visualizing/analyzing telomere length/chr rearrangement data\n", "import telomere_methods_rad_patient as trp\n", "\n", "# incase reloading modules is required\n", "import importlib\n", "%load_ext autoreload\n", "%autoreload " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", " \n", "\n", "...\n", "\n", " \n", "\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading Telomere Length Data from TeloFISH\n", "---" ] }, { "cell_type": "markdown", "metadata": { "toc-hr-collapsed": false }, "source": [ "## Extracting telomere length data output from ImageJ from all radiation therapy patients" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SW9A non irrad.xlsx data extraction in progress..\n", "BJ1 for SW9_.xlsx data extraction in progress..\n", "SW11A non irrad.xlsx data extraction in progress..\n", "BJ1 for SW15_.xlsx data extraction in progress..\n", "SW6A non irrad.xlsx data extraction in progress..\n", "SW6A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW8B.xlsx data extraction in progress..\n", "SW14A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW8A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW5A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW8C.xlsx data extraction in progress..\n", "SW1A non irrad.xlsx data extraction in progress..\n", "BJ1 for SW11_.xlsx data extraction in progress..\n", "SW16A non irrad.xlsx data extraction in progress..\n", "BJ1 for SW13_.xlsx data extraction in progress..\n", "BJ-hTERT for SW9_.xlsx data extraction in progress..\n", "BJ1 for SW14_.xlsx data extraction in progress..\n", "SW9B.xlsx data extraction in progress..\n", "BJ1 for SW8_.xlsx data extraction in progress..\n", "SW_1_ok_3_C_.xlsx data extraction in progress..\n", "SW3A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW11A irrad @ 4 Gy.xlsx data extraction in progress..\n", "BJ1 for SW16_.xlsx data extraction in progress..\n", "BJ1 for SW12_.xlsx data extraction in progress..\n", "SW8A non irrad.xlsx data extraction in progress..\n", "BJ-hTERT for SW8_.xlsx data extraction in progress..\n", "SW10A non irrad.xlsx data extraction in progress..\n", "SW12A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW9C.xlsx data extraction in progress..\n", "BJ1 for SW10_.xlsx data extraction in progress..\n", "SW7A non irrad.xlsx data extraction in progress..\n", "SW1A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW13A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW1B.xlsx data extraction in progress..\n", "BJ-hTERT for SW6_.xlsx data extraction in progress..\n", "SW13B.xlsx data extraction in progress..\n", "BJ1 for SW2_.xlsx data extraction in progress..\n", "SW2A non irrad.xlsx data extraction in progress..\n", "SW5C.xlsx data extraction in progress..\n", "SW15C.xlsx data extraction in progress..\n", "SW7C.xlsx data extraction in progress..\n", "SW11B.xlsx data extraction in progress..\n", "SW3B.xlsx data extraction in progress..\n", "BJ-hTERT for SW15_.xlsx data extraction in progress..\n", "SW15A non irrad.xlsx data extraction in progress..\n", "SW12A non irrad.xlsx data extraction in progress..\n", "BJ-hTERT for SW11_.xlsx data extraction in progress..\n", "SW3C.xlsx data extraction in progress..\n", "SW11C.xlsx data extraction in progress..\n", "SW7B.xlsx data extraction in progress..\n", "SW15B.xlsx data extraction in progress..\n", "BJ1 for SW6_.xlsx data extraction in progress..\n", "BJ-hTERT for SW2_.xlsx data extraction in progress..\n", "SW5B.xlsx data extraction in progress..\n", "SW5A non irrad.xlsx data extraction in progress..\n", "SW1C.xlsx data extraction in progress..\n", "BJ-hTERT for SW13_.xlsx data extraction in progress..\n", "SW10A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW2A irrad @ 4 Gy.xlsx data extraction in progress..\n", "BJ1 for SW1_.xlsx data extraction in progress..\n", "SW10B.xlsx data extraction in progress..\n", "BJ-hTERT for SW5_.xlsx data extraction in progress..\n", "SW2B.xlsx data extraction in progress..\n", "SW13A non irrad.xlsx data extraction in progress..\n", "SW14C.xlsx data extraction in progress..\n", "SW6C.xlsx data extraction in progress..\n", "SW9A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW16A irrad @ 4 Gy.xlsx data extraction in progress..\n", "BJ-hTERT for SW14_.xlsx data extraction in progress..\n", "BJ-hTERT for SW16_.xlsx data extraction in progress..\n", "SW16C.xlsx data extraction in progress..\n", "BJ1 for SW3_.xlsx data extraction in progress..\n", "SW12B.xlsx data extraction in progress..\n", "BJ-hTERT for SW7_.xlsx data extraction in progress..\n", "SW12C.xlsx data extraction in progress..\n", "SW16B.xlsx data extraction in progress..\n", "BJ-hTERT for SW3_.xlsx data extraction in progress..\n", "BJ1 for SW7_.xlsx data extraction in progress..\n", "BJ-hTERT for SW12_.xlsx data extraction in progress..\n", "SW3A non irrad.xlsx data extraction in progress..\n", "SW15A irrad @ 4 Gy.xlsx data extraction in progress..\n", "SW7A irrad @ 4 Gy.xlsx data extraction in progress..\n", "BJ-hTERT for SW10_.xlsx data extraction in progress..\n", "SW6B.xlsx data extraction in progress..\n", "SW14B.xlsx data extraction in progress..\n", "BJ-hTERT for SW1_.xlsx data extraction in progress..\n", "SW14A non irrad.xlsx data extraction in progress..\n", "BJ1 for SW5_.xlsx data extraction in progress..\n", "SW2C.xlsx data extraction in progress..\n", "SW10C.xlsx data extraction in progress..\n", "completed file collection\n" ] } ], "source": [ "all_patients_dict = trp.generate_dictionary_from_TeloLength_data('../data/raw patient teloFISH data/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Making dataframe from dict w/ all patients telomere length data, contains telo means & individual telos as list" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(59, 7)\n" ] } ], "source": [ "all_patients_df = trp.generate_dataframe_from_dict(all_patients_dict)\n", "\n", "# don't need telo means per cell @ this time\n", "all_patients_df = all_patients_df.drop(['cell data'], axis=1)\n", "print(all_patients_df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving all patients telomere length data for later retrieval" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# changing telo data to list in prep for saving to csv\n", "all_patients_df['telo data'] = all_patients_df['telo data'].apply(lambda row: row.tolist())\n", "all_patients_df.to_csv('../data/compiled patient data csv files/all_patients_df.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Generating all patients telo df containing telo counts per quartile melted into tidy data format " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>patient id</th>\n", " <th>timepoint</th>\n", " <th>telo data</th>\n", " <th>telo means</th>\n", " <th>relative Q</th>\n", " <th>Q freq counts</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>1 non irrad</td>\n", " <td>[79.18994405924711, 58.07204491719145, 95.0279...</td>\n", " <td>84.796483</td>\n", " <td>Q1</td>\n", " <td>1195.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>2 irrad @ 4 Gy</td>\n", " <td>[149.93296075217452, 138.31843562348496, 106.6...</td>\n", " <td>90.975826</td>\n", " <td>Q1</td>\n", " <td>724.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>3 B</td>\n", " <td>[176.32960877192357, 111.92066838585988, 123.5...</td>\n", " <td>116.779989</td>\n", " <td>Q1</td>\n", " <td>231.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>4 C</td>\n", " <td>[144.65363114822472, 84.46927366319692, 78.133...</td>\n", " <td>99.346299</td>\n", " <td>Q1</td>\n", " <td>372.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " patient id timepoint \\\n", "0 1 1 non irrad \n", "1 1 2 irrad @ 4 Gy \n", "2 1 3 B \n", "3 1 4 C \n", "\n", " telo data telo means relative Q \\\n", "0 [79.18994405924711, 58.07204491719145, 95.0279... 84.796483 Q1 \n", "1 [149.93296075217452, 138.31843562348496, 106.6... 90.975826 Q1 \n", "2 [176.32960877192357, 111.92066838585988, 123.5... 116.779989 Q1 \n", "3 [144.65363114822472, 84.46927366319692, 78.133... 99.346299 Q1 \n", "\n", " Q freq counts \n", "0 1195.0 \n", "1 724.0 \n", "2 231.0 \n", "3 372.0 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "melted_all_patients_df = pd.melt(\n", " all_patients_df,\n", " id_vars = [col for col in all_patients_df.columns if col != 'Q1' and col != 'Q2-3' and col != 'Q4'],\n", " var_name='relative Q',\n", " value_name='Q freq counts')\n", "\n", "melted_all_patients_df['Q freq counts'] = melted_all_patients_df['Q freq counts'].astype('float64')\n", "melted_all_patients_df.head(4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving melted all patients df to csv" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "melted_all_patients_df.to_csv('../data/compiled patient data csv files/melted_all_patients_df.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivoted Dataframe w/ timepoints as columns, and telomere length means for each patient timepoint in rows" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "pivot_patients_telo_means_df = all_patients_df.pivot(index='patient id', columns='timepoint', values='telo means')\n", "pivot_patients_telo_means_df = pivot_patients_telo_means_df.drop(13)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Saving pivoted telo means df to file" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "pivot_patients_telo_means_df.to_csv('../data/compiled patient data csv files/pivot_patients_telo_means_df.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exploding individual telomere length measurements from contained list into dataframe (i.e row per individual telomere) while retaining related column info" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(59, 4600)\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " <th>5</th>\n", " <th>6</th>\n", " <th>7</th>\n", " <th>8</th>\n", " <th>9</th>\n", " <th>...</th>\n", " <th>4590</th>\n", " <th>4591</th>\n", " <th>4592</th>\n", " <th>4593</th>\n", " <th>4594</th>\n", " <th>4595</th>\n", " <th>4596</th>\n", " <th>4597</th>\n", " <th>4598</th>\n", " <th>4599</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>79.189944</td>\n", " <td>58.072045</td>\n", " <td>95.027933</td>\n", " <td>106.642458</td>\n", " <td>55.960334</td>\n", " <td>62.296089</td>\n", " <td>81.301676</td>\n", " <td>109.810056</td>\n", " <td>123.536313</td>\n", " <td>181.608938</td>\n", " <td>...</td>\n", " <td>71.798883</td>\n", " <td>122.480447</td>\n", " <td>50.681564</td>\n", " <td>81.301676</td>\n", " <td>40.122905</td>\n", " <td>115.088234</td>\n", " <td>114.032379</td>\n", " <td>59.128492</td>\n", " <td>63.351322</td>\n", " <td>46.457636</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>149.932961</td>\n", " <td>138.318436</td>\n", " <td>106.642458</td>\n", " <td>101.363128</td>\n", " <td>101.363128</td>\n", " <td>80.245810</td>\n", " <td>111.921788</td>\n", " <td>127.758499</td>\n", " <td>102.418994</td>\n", " <td>57.016760</td>\n", " <td>...</td>\n", " <td>47.513966</td>\n", " <td>92.916201</td>\n", " <td>84.469274</td>\n", " <td>89.748603</td>\n", " <td>47.513966</td>\n", " <td>102.418994</td>\n", " <td>117.201117</td>\n", " <td>107.697247</td>\n", " <td>92.916201</td>\n", " <td>71.798883</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>176.329609</td>\n", " <td>111.920668</td>\n", " <td>123.535077</td>\n", " <td>177.385475</td>\n", " <td>117.201117</td>\n", " <td>139.374302</td>\n", " <td>99.251397</td>\n", " <td>51.736913</td>\n", " <td>145.708040</td>\n", " <td>154.156424</td>\n", " <td>...</td>\n", " <td>127.759776</td>\n", " <td>66.519553</td>\n", " <td>92.916201</td>\n", " <td>48.569832</td>\n", " <td>145.709497</td>\n", " <td>114.033519</td>\n", " <td>156.268156</td>\n", " <td>137.262570</td>\n", " <td>141.486033</td>\n", " <td>182.664804</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>144.653631</td>\n", " <td>84.469274</td>\n", " <td>78.133297</td>\n", " <td>139.372908</td>\n", " <td>98.195531</td>\n", " <td>53.849162</td>\n", " <td>68.631285</td>\n", " <td>62.296089</td>\n", " <td>76.022346</td>\n", " <td>125.578131</td>\n", " <td>...</td>\n", " <td>112.977654</td>\n", " <td>109.808958</td>\n", " <td>111.920668</td>\n", " <td>127.759776</td>\n", " <td>105.586592</td>\n", " <td>100.307262</td>\n", " <td>128.814354</td>\n", " <td>143.597765</td>\n", " <td>157.664571</td>\n", " <td>115.089385</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>4 rows × 4600 columns</p>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4 5 \\\n", "0 79.189944 58.072045 95.027933 106.642458 55.960334 62.296089 \n", "1 149.932961 138.318436 106.642458 101.363128 101.363128 80.245810 \n", "2 176.329609 111.920668 123.535077 177.385475 117.201117 139.374302 \n", "3 144.653631 84.469274 78.133297 139.372908 98.195531 53.849162 \n", "\n", " 6 7 8 9 ... 4590 \\\n", "0 81.301676 109.810056 123.536313 181.608938 ... 71.798883 \n", "1 111.921788 127.758499 102.418994 57.016760 ... 47.513966 \n", "2 99.251397 51.736913 145.708040 154.156424 ... 127.759776 \n", "3 68.631285 62.296089 76.022346 125.578131 ... 112.977654 \n", "\n", " 4591 4592 4593 4594 4595 4596 \\\n", "0 122.480447 50.681564 81.301676 40.122905 115.088234 114.032379 \n", "1 92.916201 84.469274 89.748603 47.513966 102.418994 117.201117 \n", "2 66.519553 92.916201 48.569832 145.709497 114.033519 156.268156 \n", "3 109.808958 111.920668 127.759776 105.586592 100.307262 128.814354 \n", "\n", " 4597 4598 4599 \n", "0 59.128492 63.351322 46.457636 \n", "1 107.697247 92.916201 71.798883 \n", "2 137.262570 141.486033 182.664804 \n", "3 143.597765 157.664571 115.089385 \n", "\n", "[4 rows x 4600 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# can imagine the lists containing the individual telos per patient exploding to the right; maintains the index relationship\n", "explode_telos_raw = all_patients_df['telo data'].apply(pd.Series)\n", "\n", "print(explode_telos_raw.shape)\n", "explode_telos_raw.head(4)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>patient id</th>\n", " <th>timepoint</th>\n", " <th>telo means</th>\n", " <th>individual telomeres</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>1 non irrad</td>\n", " <td>84.796483</td>\n", " <td>79.189944</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>2 irrad @ 4 Gy</td>\n", " <td>90.975826</td>\n", " <td>149.932961</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>3 B</td>\n", " <td>116.779989</td>\n", " <td>176.329609</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>4 C</td>\n", " <td>99.346299</td>\n", " <td>144.653631</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " patient id timepoint telo means individual telomeres\n", "0 1 1 non irrad 84.796483 79.189944\n", "1 1 2 irrad @ 4 Gy 90.975826 149.932961\n", "2 1 3 B 116.779989 176.329609\n", "3 1 4 C 99.346299 144.653631" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_telos_all_patients_df = (explode_telos_raw\n", " \n", " # we'll merge the exploded telos df w/ our original all patients df on the index!\n", " .merge(all_patients_df, right_index = True, left_index = True)\n", " .drop(['telo data', 'Q1', 'Q2-3', 'Q4'], axis = 1)\n", " .melt(id_vars = ['patient id', 'timepoint', 'telo means'], value_name = \"individual telomeres\") \n", " .drop(\"variable\", axis = 1)\n", " .dropna())\n", "\n", "exploded_telos_all_patients_df.head(4)" ] }, { "cell_type": "markdown", "metadata": { "toc-hr-collapsed": true, "toc-nb-collapsed": true }, "source": [ "## Saving exploded telomere df for later retrieval" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "exploded_telos_all_patients_df.to_csv('../data/compiled patient data csv files/exploded_telos_all_patients_df.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading Chromosome Aberration Data from Subtelo-dGH \n", "---" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<DirEntry 'SW14_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW3_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW8_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW13_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW9_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW5_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW12_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW15_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW2_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW10_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW7_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW1_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW16_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW11_timepoints_subtelodGH.xlsx'>\n", "<DirEntry 'SW6_timepoints_subtelodGH.xlsx'>\n" ] } ], "source": [ "all_chr_aberr_df = trp.make_dataframe_chr_aberr_data('../data/dGH scoresheets/')" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "all_chr_aberr_df.to_csv('../data/compiled patient data csv files/all_chr_aberr_df.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": { "toc-hr-collapsed": true, "toc-nb-collapsed": true }, "source": [ "# Loading Complete Blood Count data" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# loading excel file\n", "cbc_data = pd.read_excel('../data/to colorado.xlsx')\n", "\n", "# minor data cleaning\n", "cbc_data.rename({'patient': 'patient id',\n", " 'mrn': 'timepoint'}, axis=1, inplace=True)\n", "\n", "def extract_patient_ID(row):\n", " if 'SW' in row:\n", " row = row.replace('SW', ' ').strip()\n", " return row\n", "\n", "cbc_data['patient id'] = cbc_data['patient id'].apply(lambda row: extract_patient_ID(row))\n", "cbc_data['patient id'] = cbc_data['patient id'].astype('int64')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# saving to file\n", "cbc_data.to_csv('../data/compiled patient data csv files/cleaned cbc data.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "toc-autonumbering": true }, "nbformat": 4, "nbformat_minor": 4 }