{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib.ticker as ticker\n", "# IPython.display is the public import path; importing `display` from\n", "# IPython.core.display is deprecated.\n", "from IPython.display import display, HTML\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Notebook Styling \n", "# Apply seaborn's default plot theme to all matplotlib figures.\n", "sns.set()\n", "# None removes the column cap, so wide frames are rendered in full.\n", "pd.options.display.max_columns = None\n", "display(HTML(\"\"))\n", "# Render every float with five decimal places.\n", "pd.set_option('display.float_format',lambda x: '%.5f' % x)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDReason for absenceMonth of absenceDay of the weekSeasonsTransportation expenseDistance from Residence to WorkService timeAgeWork load Average/dayHit targetDisciplinary failureEducationSonSocial drinkerSocial smokerPetWeightHeightBody mass indexAbsenteeism time in hours
01126731289361333239.554009701210190172304
1360731118131850239.554009711110098178310
2323741179511838239.554009701010089170312
37775127951439239.554009701211068168244
41123751289361333239.554009701210190172302
\n", "
" ], "text/plain": [ " ID Reason for absence Month of absence Day of the week Seasons \\\n", "0 11 26 7 3 1 \n", "1 36 0 7 3 1 \n", "2 3 23 7 4 1 \n", "3 7 7 7 5 1 \n", "4 11 23 7 5 1 \n", "\n", " Transportation expense Distance from Residence to Work Service time Age \\\n", "0 289 36 13 33 \n", "1 118 13 18 50 \n", "2 179 51 18 38 \n", "3 279 5 14 39 \n", "4 289 36 13 33 \n", "\n", " Work load Average/day Hit target Disciplinary failure Education Son \\\n", "0 239.55400 97 0 1 2 \n", "1 239.55400 97 1 1 1 \n", "2 239.55400 97 0 1 0 \n", "3 239.55400 97 0 1 2 \n", "4 239.55400 97 0 1 2 \n", "\n", " Social drinker Social smoker Pet Weight Height Body mass index \\\n", "0 1 0 1 90 172 30 \n", "1 1 0 0 98 178 31 \n", "2 1 0 0 89 170 31 \n", "3 1 1 0 68 168 24 \n", "4 1 0 1 90 172 30 \n", "\n", " Absenteeism time in hours \n", "0 4 \n", "1 0 \n", "2 2 \n", "3 4 \n", "4 2 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "CSV_PATH = os.path.join('data', 'hr', 'Absenteeism_at_work.csv')\n", "absenteeism = pd.read_csv(CSV_PATH, encoding='latin1', sep=';') \n", "absenteeism.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(740, 21)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "absenteeism.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 740 entries, 0 to 739\n", "Data columns (total 21 columns):\n", "ID 740 non-null int64\n", "Reason for absence 740 non-null int64\n", "Month of absence 740 non-null int64\n", "Day of the week 740 non-null int64\n", "Seasons 740 non-null int64\n", "Transportation expense 740 non-null int64\n", "Distance from Residence to Work 740 non-null int64\n", "Service time 740 non-null int64\n", "Age 740 non-null int64\n", "Work load Average/day 740 non-null float64\n", "Hit target 740 
non-null int64\n", "Disciplinary failure 740 non-null int64\n", "Education 740 non-null int64\n", "Son 740 non-null int64\n", "Social drinker 740 non-null int64\n", "Social smoker 740 non-null int64\n", "Pet 740 non-null int64\n", "Weight 740 non-null int64\n", "Height 740 non-null int64\n", "Body mass index 740 non-null int64\n", "Absenteeism time in hours 740 non-null int64\n", "dtypes: float64(1), int64(20)\n", "memory usage: 121.5 KB\n" ] } ], "source": [ "absenteeism.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Categorical Feature: Reason for absence \n", "\n", "The **Reason for absence** feature is a numeric code with 28 documented values, listed below. The code 0 also occurs in the data (see the second row of the preview above) but has no entry in this list.\n", "\n", "1. Certain infectious and parasitic diseases \n", "2. Neoplasms \n", "3. Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism \n", "4. Endocrine, nutritional and metabolic diseases \n", "5. Mental and behavioural disorders \n", "6. Diseases of the nervous system \n", "7. Diseases of the eye and adnexa \n", "8. Diseases of the ear and mastoid process \n", "9. Diseases of the circulatory system \n", "10. Diseases of the respiratory system \n", "11. Diseases of the digestive system \n", "12. Diseases of the skin and subcutaneous tissue \n", "13. Diseases of the musculoskeletal system and connective tissue \n", "14. Diseases of the genitourinary system \n", "15. Pregnancy, childbirth and the puerperium \n", "16. Certain conditions originating in the perinatal period \n", "17. Congenital malformations, deformations and chromosomal abnormalities \n", "18. Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified \n", "19. Injury, poisoning and certain other consequences of external causes \n", "20. External causes of morbidity and mortality\n", "21. Factors influencing health status and contact with health services \n", "22. patient follow-up \n", "23. medical consultation \n", "24. blood donation \n", "25. 
laboratory examination \n", "26. unjustified absence \n", "27. physiotherapy \n", "28. dental consultation" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# The reason codes are labels rather than quantities, so store the column\n", "# with pandas' categorical dtype instead of int64.\n", "absenteeism['Reason for absence'] = absenteeism['Reason for absence'].astype('category')" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:py36]", "language": "python", "name": "conda-env-py36-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }