{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Created by [SmirkyGraphs](http://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks).\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Rhode Island DOH Inspections" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from pandas.io.json import json_normalize\n", "import pandas as pd\n", "import numpy as np\n", "import requests\n", "import json\n", "import csv\n", "import glob\n", "import os\n", "import re" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": true }, "outputs": [], "source": [ "facility_url = 'https://ri.healthinspections.us/ri/API/index.cfm/facilities/'\n", "inspection_url = 'https://ri.healthinspections.us/ri/API/index.cfm/inspectionsData/'\n", "report_url = 'https://ri.healthinspections.us/'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "## Getting Data from Site\n" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "finished\n" ] } ], "source": [ "# Download all locations json to get facility ID\n", "\n", "for i in range(9999):\n", " try:\n", " file = './raw/Facility/json/json' + str(i) + '.json'\n", " facility_url2 = facility_url + str(i) + '/0.json'\n", " response = json.loads(requests.get(facility_url2).text)\n", " if not response:\n", " print('finished')\n", " break\n", " else:\n", " with open(file, 'w') as outfile:\n", " json.dump(response, outfile)\n", " except:\n", " pass" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Transforming into csv and turning json into columns\n", "\n", "for i in range(9999):\n", " try:\n", " file = './raw/Facility/json/json' + str(i) + '.json'\n", " f = open(file)\n", " data = json.load(f)\n", " df = pd.DataFrame.from_dict(data)\n", " df_json = df.to_json(orient='records')\n", " df = json_normalize(json.loads(df_json), meta=['key'])\n", " output = './raw/Facility/csv/csv' + str(i) + '.csv'\n", " df.to_csv(output)\n", " except:\n", " pass" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Combine all csv files into 1\n", "\n", "# Set File Path\n", "path = './raw/Facility/csv/'\n", "allFiles = glob.glob(path + \"/*.csv\")\n", "\n", "# Load Data into Dataframe\n", "frame = pd.DataFrame()\n", "list_ = []\n", "for file_ in allFiles:\n", " df = pd.read_csv(file_,index_col=None, header=0)\n", " list_.append(df)\n", "frame = pd.concat(list_)\n", "\n", "df = frame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "## Cleaning the Data" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Data Cleaning\n", "\n", "def trim_type(column):\n", " x = column[14:]\n", " return x\n", "\n", "def trim_inspection(column):\n", " x = column[21:]\n", " return x\n", "\n", "def trim_license(column):\n", " x = column[11:]\n", " return x\n", "\n", "df['License Type'] = df['columns.2'].apply(trim_type)\n", "df['Last Inspection Date'] = df['columns.1'].apply(trim_inspection)\n", "df['License Number'] = df['columns.3'].apply(trim_license)\n", "\n", "df = df.reset_index()\n", "df = df.drop(['Unnamed: 0','bookmarked', 'index', 'columns.0', 'columns.1', 'columns.2', 'columns.3'], axis=1)\n", "\n", "\n", "df.to_csv('./cleaned/Facilities.csv', index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "## Getting Inspection Data from Site" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Downloading all inspections\n", "\n", "List = df['id'].tolist()\n", "\n", "\n", "for ids in List:\n", " try:\n", " file = './raw/Inspection/json/' + ids + '.json'\n", " inspection_url2 = inspection_url + ids + '.json'\n", " response = json.loads(requests.get(inspection_url2).text)\n", " with open(file, 'w') as outfile:\n", " json.dump(response, outfile)\n", " except:\n", " pass" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Transforming into csv and turning json into columns\n", "\n", "for ids in List:\n", " file = './raw/Inspection/json/' + ids + '.json'\n", " f = open(file)\n", " data = json.load(f)\n", " df = pd.DataFrame.from_dict(data)\n", " df_json = df.to_json(orient='records')\n", " df = json_normalize(json.loads(df_json), meta=['key'])\n", " output = './raw/Inspection/csv/' + ids + '.csv'\n", " df.to_csv(output)" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Combine all csv files into 1\n", "\n", "# Set File Path\n", "path = './raw/Inspection/csv/'\n", "allFiles = glob.glob(path + \"/*.csv\")\n", "\n", "# Load Data into Dataframe\n", "frame = pd.DataFrame()\n", "list_ = []\n", "for file_ in allFiles:\n", " df = pd.read_csv(file_,index_col=None, header=0)\n", " list_.append(df)\n", "frame = pd.concat(list_)\n", "\n", "df = frame" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Unpivot the columns\n", "\n", "x = ['Unnamed: 0', 'columns.0', 'columns.1', 'columns.2', 'facilityId', 'inspectionId', 'printablePath']\n", "\n", "df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Remove nulls\n", "\n", "df = df[(df['variable'] == 'violations.0') | (df['value'].notnull())]" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Data Cleaning\n", "\n", "def trim_date(column):\n", " x = column[16:]\n", " x = x.replace('-','/')\n", " return x\n", "\n", "def trim_purpose(column):\n", " x = column.replace('Inspection Purpose: ','')\n", " return x\n", "\n", "def trim_num(column):\n", " x = column[11:]\n", " return x\n", "\n", "def trim_violation(column):\n", " x = column[2:-2]\n", " return x\n", "\n", "def violation_id(column):\n", " x = column[:2]\n", " x = re.sub('[^0-9]','', x)\n", " x = x.strip()\n", " return x\n", "\n", "def violation_desc(column):\n", " y = column.find('-') + 2\n", " x = column[y:]\n", " return x\n", "\n", "def clean_url(column):\n", " x = column.replace('../', 'https://ri.healthinspections.us/')\n", " return x\n", " \n", "\n", "df['Inspection Date'] = df['columns.0'].apply(trim_date)\n", "df['Inspection Purpose'] = df['columns.1'].astype(str).apply(trim_purpose)\n", "df['Violation Num'] = df['variable'].apply(trim_num)\n", "df['value'] = df['value'].astype(str).apply(trim_violation)\n", "df['Report URL'] = df['printablePath'].astype(str).apply(clean_url)\n", "\n", "# splitting value\n", "df['Violation Id'] = df['value'].astype(str).apply(violation_id)\n", "df['Violation Desc'] = df['value'].astype(str).apply(violation_desc)\n", "\n", "\n", "df = df.reset_index()\n", "df = df.drop(['index', 'Unnamed: 0', 'columns.0', 'columns.1', 'columns.2', 'printablePath',\n", " 'variable', 'value'], axis=1)\n", "\n", "\n", "df.to_csv('./cleaned/Inspections.csv', index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }