{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [CMS.gov](https://data.cms.gov/browse?category=Special+Programs%2FInitiatives+-+Long+Term+Care+Facility+Staffing+Payroll-Based+Journal&q=PBJ+Daily+Nurse+Staffing+CY+2017+Q4&sortBy=relevance)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Cleaning for RI Nursing Staff PBJ" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "using pandas to compile multiple csv files from different reporting periods into one" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import glob as glob\n", "import csv" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# get data set with only wanted columns and state of RI \n", "\n", "files = glob.glob('./data/raw/*.csv')\n", "\n", "nursing_staff = []\n", "non_nursing_staff = []\n", "\n", "for f in files:\n", " \n", " part = f[22:-4]\n", " \n", " df = pd.read_csv(f, low_memory=False)\n", " df = df[df['STATE'] == 'RI']\n", " \n", " df = df[[x for x in df if not x.strip().endswith('_emp')]]\n", " df = df[[x for x in df if not x.strip().endswith('_ctr')]]\n", " \n", " df['job_type'] = part\n", " \n", " if part == 'part1':\n", " nursing_staff.append(df)\n", " \n", " else:\n", " non_nursing_staff.append(df)\n", "\n", "nursing = pd.concat(nursing_staff, sort=False)\n", "non_nursing = pd.concat(non_nursing_staff, sort=False)\n", "\n", "nursing.to_csv('./data/clean/nursing_clean.csv', index=False)\n", "non_nursing.to_csv('./data/clean/non_nursing_clean.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "files = glob.glob('./data/clean/*clean.csv')\n", "\n", "nursing_staff = []\n", "non_nursing_staff = []\n", "\n", "for f in files:\n", " \n", " part = f[13:-10]\n", " \n", " # cleaning data\n", "\n", " df = pd.read_csv(f, low_memory=False)\n", " df = df.reset_index(drop=True)\n", "\n", " df = df.rename(columns=lambda x: x.strip())\n", "\n", " cols = ['COUNTY_FIPS', 'CY_Qtr', 'Location 1']\n", "\n", " df = df.drop(columns=cols)\n", "\n", " df = df.loc[:,~df.columns.duplicated()]\n", "\n", " x = ['PROVNUM', 'PROVNAME', 'CITY', 'STATE', 'COUNTY_NAME', 'WorkDate', 'MDScensus', 'job_type']\n", "\n", " hours_cols = df.columns.drop(x).tolist()\n", "\n", " df['total_hrs'] = df[hours_cols].sum(axis=1)\n", " hours_cols.append('total_hrs')\n", "\n", " df = pd.melt(df, id_vars=x, value_vars=hours_cols)\n", " \n", " if part == 'nursing':\n", " nursing_staff.append(df)\n", " \n", " else:\n", " non_nursing_staff.append(df)\n", "\n", "\n", "nursing = pd.concat(nursing_staff, sort=False)\n", "non_nursing = pd.concat(non_nursing_staff, sort=False)\n", "\n", "nursing.to_csv('./data/clean/nursing_final.csv', index=False)\n", "non_nursing.to_csv('./data/clean/non_nursing_final.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# merging files\n", "\n", "nursing = pd.read_csv('./data/clean/nursing_final.csv')\n", "non_nursing = pd.read_csv('./data/clean/non_nursing_final.csv')\n", "\n", "df = pd.concat([nursing, non_nursing])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# grabbing provider info\n", "\n", "providers = pd.read_csv('./data/extra/provider_info.csv')\n", "\n", "# getting lat/lng of facilities\n", "\n", "providers['lat_lon'] = providers['Location'].apply(lambda x: x[x.find(\"(\")+1:x.find(\")\")]).str.split(', ')\n", "\n", "providers['lat'] = providers['lat_lon'].apply(lambda x: x[0])\n", "providers['lng'] = providers['lat_lon'].apply(lambda x: x[1])\n", "\n", "# drop list\n", "\n", "f = './files/provider_info_drop.txt'\n", "drop = [line.rstrip() for line in open(f)]\n", "\n", "providers = providers.drop(columns=drop)\n", "\n", "# getting number of beds and lat/lng\n", "\n", "max_beds = providers[['Federal Provider Number', 'Number of Certified Beds', 'lat', 'lng']]\n", "max_beds = max_beds.set_index('Federal Provider Number')\n", "\n", "df = df.merge(max_beds, how='left', left_on='PROVNUM', right_on='Federal Provider Number')\n", "\n", "# removing homes lacking max num of beds\n", "\n", "df = df.dropna(subset=['Number of Certified Beds'])\n", "\n", "# getting capacity\n", "\n", "df['capacity'] = df['MDScensus']/df['Number of Certified Beds']\n", "\n", "# replacing part1/part2\n", "\n", "df['job_type'] = df['job_type'].replace('part1' ,'nursing_staff')\n", "df['job_type'] = df['job_type'].replace('part2' ,'non_nursing_staff')\n", "\n", "\n", "# clean up variable names\n", "\n", "df['variable'] = df['variable'].str.lower()\n", "\n", "def clean(x):\n", " if x.lower().startswith('hrs'):\n", " return x[4:]\n", " else:\n", " return x\n", " \n", "df['variable'] = df['variable'].apply(clean)\n", "df['variable'] = df['variable'].str.replace('_', '')\n", "\n", "var_names = csv.reader(open('./files/variable_names.csv'))\n", "var_names = dict(var_names)\n", "\n", "df['variable'] = df.variable.map(var_names)\n", "\n", "df.to_csv('./data/final/ri_pbj_staffing_final.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }