{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Created by [SmirkyGraphs](http://smirkygraphs.github.io/). Code: [GitHub](https://github.com/SmirkyGraphs/Python-Notebooks).\n",
    "<hr>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Rhode Island DOH Inspections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pandas.io.json import json_normalize\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "import json\n",
    "import csv\n",
    "import glob\n",
    "import os\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "facility_url = 'https://ri.healthinspections.us/ri/API/index.cfm/facilities/'\n",
    "inspection_url = 'https://ri.healthinspections.us/ri/API/index.cfm/inspectionsData/'\n",
    "report_url = 'https://ri.healthinspections.us/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<hr>\n",
    "## Getting Data from Site\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "finished\n"
     ]
    }
   ],
   "source": [
    "# Download all locations json to get facility ID\n",
    "\n",
    "for i in range(9999):\n",
    "    try:\n",
    "        file = './raw/Facility/json/json' + str(i) + '.json'\n",
    "        facility_url2 = facility_url + str(i) + '/0.json'\n",
    "        response = json.loads(requests.get(facility_url2).text)\n",
    "        if not response:\n",
    "            print('finished')\n",
    "            break\n",
    "        else:\n",
    "            with open(file, 'w') as outfile:\n",
    "                json.dump(response, outfile)\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Transforming into csv and turning json into columns\n",
    "\n",
    "for i in range(9999):\n",
    "    try:\n",
    "        file = './raw/Facility/json/json' + str(i) + '.json'\n",
    "        f = open(file)\n",
    "        data = json.load(f)\n",
    "        df = pd.DataFrame.from_dict(data)\n",
    "        df_json = df.to_json(orient='records')\n",
    "        df = json_normalize(json.loads(df_json), meta=['key'])\n",
    "        output = './raw/Facility/csv/csv' + str(i) + '.csv'\n",
    "        df.to_csv(output)\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Combine all csv files into 1\n",
    "\n",
    "# Set File Path\n",
    "path = './raw/Facility/csv/'\n",
    "allFiles = glob.glob(path + \"/*.csv\")\n",
    "\n",
    "# Load Data into Dataframe\n",
    "frame = pd.DataFrame()\n",
    "list_ = []\n",
    "for file_ in allFiles:\n",
    "    df = pd.read_csv(file_,index_col=None, header=0)\n",
    "    list_.append(df)\n",
    "frame = pd.concat(list_)\n",
    "\n",
    "df = frame"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<hr>\n",
    "## Cleaning the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data Cleaning\n",
    "\n",
    "def trim_type(column):\n",
    "    x = column[14:]\n",
    "    return x\n",
    "\n",
    "def trim_inspection(column):\n",
    "    x = column[21:]\n",
    "    return x\n",
    "\n",
    "def trim_license(column):\n",
    "    x = column[11:]\n",
    "    return x\n",
    "\n",
    "df['License Type'] = df['columns.2'].apply(trim_type)\n",
    "df['Last Inspection Date'] = df['columns.1'].apply(trim_inspection)\n",
    "df['License Number'] = df['columns.3'].apply(trim_license)\n",
    "\n",
    "df = df.reset_index()\n",
    "df = df.drop(['Unnamed: 0','bookmarked', 'index', 'columns.0', 'columns.1', 'columns.2', 'columns.3'], axis=1)\n",
    "\n",
    "\n",
    "df.to_csv('./cleaned/Facilities.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<hr>\n",
    "## Getting Inspection Data from Site"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Downloading all inspections\n",
    "\n",
    "List = df['id'].tolist()\n",
    "\n",
    "\n",
    "for ids in List:\n",
    "    try:\n",
    "        file = './raw/Inspection/json/' + ids + '.json'\n",
    "        inspection_url2 = inspection_url + ids + '.json'\n",
    "        response = json.loads(requests.get(inspection_url2).text)\n",
    "        with open(file, 'w') as outfile:\n",
    "            json.dump(response, outfile)\n",
    "    except:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Transforming into csv and turning json into columns\n",
    "\n",
    "for ids in List:\n",
    "    file = './raw/Inspection/json/' + ids + '.json'\n",
    "    f = open(file)\n",
    "    data = json.load(f)\n",
    "    df = pd.DataFrame.from_dict(data)\n",
    "    df_json = df.to_json(orient='records')\n",
    "    df = json_normalize(json.loads(df_json), meta=['key'])\n",
    "    output = './raw/Inspection/csv/' + ids + '.csv'\n",
    "    df.to_csv(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Combine all csv files into 1\n",
    "\n",
    "# Set File Path\n",
    "path = './raw/Inspection/csv/'\n",
    "allFiles = glob.glob(path + \"/*.csv\")\n",
    "\n",
    "# Load Data into Dataframe\n",
    "frame = pd.DataFrame()\n",
    "list_ = []\n",
    "for file_ in allFiles:\n",
    "    df = pd.read_csv(file_,index_col=None, header=0)\n",
    "    list_.append(df)\n",
    "frame = pd.concat(list_)\n",
    "\n",
    "df = frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Unpivot the columns\n",
    "\n",
    "x = ['Unnamed: 0', 'columns.0', 'columns.1', 'columns.2', 'facilityId', 'inspectionId', 'printablePath']\n",
    "\n",
    "df = pd.melt(df, id_vars=x, value_vars=df.columns.drop(x).tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Remove nulls\n",
    "\n",
    "df = df[(df['variable'] == 'violations.0') | (df['value'].notnull())]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data Cleaning\n",
    "\n",
    "def trim_date(column):\n",
    "    x = column[16:]\n",
    "    x = x.replace('-','/')\n",
    "    return x\n",
    "\n",
    "def trim_purpose(column):\n",
    "    x = column.replace('Inspection Purpose: ','')\n",
    "    return x\n",
    "\n",
    "def trim_num(column):\n",
    "    x = column[11:]\n",
    "    return x\n",
    "\n",
    "def trim_violation(column):\n",
    "    x = column[2:-2]\n",
    "    return x\n",
    "\n",
    "def violation_id(column):\n",
    "    x = column[:2]\n",
    "    x = re.sub('[^0-9]','', x)\n",
    "    x = x.strip()\n",
    "    return x\n",
    "\n",
    "def violation_desc(column):\n",
    "    y = column.find('-') + 2\n",
    "    x = column[y:]\n",
    "    return x\n",
    "\n",
    "def clean_url(column):\n",
    "    x = column.replace('../', 'https://ri.healthinspections.us/')\n",
    "    return x\n",
    "    \n",
    "\n",
    "df['Inspection Date'] = df['columns.0'].apply(trim_date)\n",
    "df['Inspection Purpose'] = df['columns.1'].astype(str).apply(trim_purpose)\n",
    "df['Violation Num'] = df['variable'].apply(trim_num)\n",
    "df['value'] = df['value'].astype(str).apply(trim_violation)\n",
    "df['Report URL'] = df['printablePath'].astype(str).apply(clean_url)\n",
    "\n",
    "# splitting value\n",
    "df['Violation Id'] = df['value'].astype(str).apply(violation_id)\n",
    "df['Violation Desc'] = df['value'].astype(str).apply(violation_desc)\n",
    "\n",
    "\n",
    "df = df.reset_index()\n",
    "df = df.drop(['index', 'Unnamed: 0', 'columns.0', 'columns.1', 'columns.2', 'printablePath',\n",
    "             'variable', 'value'], axis=1)\n",
    "\n",
    "\n",
    "df.to_csv('./cleaned/Inspections.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}