{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Wrangling in Pandas - Worked Examples\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reviewing our earlier application of Data Wrangling to Craigslist Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# import libraries and read in the csv file\n",
    "import re as re, pandas as pd, numpy as np, requests, json\n",
    "df = pd.read_csv('bay.csv')\n",
    "print(df[:5])\n",
    "\n",
    "# clean price and neighborhood\n",
    "df.price = df.price.str.strip('$').astype('float64')\n",
    "df.neighborhood = df.neighborhood.str.strip().str.strip('(').str.strip(')')\n",
    "\n",
    "# break out the date into month day year columns\n",
    "df['month'] = df['date'].str.split().str[0]\n",
    "df['day'] = df['date'].str.split().str[1].astype('int32')\n",
    "df['year'] = df['date'].str.split().str[2].astype('int32')\n",
    "del df['date']\n",
    "\n",
    "def clean_br(value):\n",
    "    if isinstance(value, str):\n",
    "        end = value.find('br')\n",
    "        if end == -1:\n",
    "            return None\n",
    "        else:\n",
    "            start = value.find('/') + 2\n",
    "            return int(value[start:end])\n",
    "df['bedrooms'] = df['bedrooms'].map(clean_br)\n",
    "\n",
    "def clean_sqft(value):\n",
    "    if isinstance(value, str):\n",
    "        end = value.find('ft')\n",
    "        if end == -1:\n",
    "            return None\n",
    "        else:\n",
    "            if value.find('br') == -1:\n",
    "                start = value.find('/') + 2\n",
    "            else:\n",
    "                start = value.find('-') + 2\n",
    "            return int(value[start:end])\n",
    "df['sqft'] = df['sqft'].map(clean_sqft)\n",
    "\n",
    "\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Let's do some wrangling on this dataset:\n",
    "1. Find outliers in rent, say below 200 or above 10,000\n",
    "1. Analyze the data without missing data\n",
    "1. Create a dataset that removes the outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df['price'].dropna().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df['price'][(df['price'] < 200)].dropna().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df['price'][(df['price'] > 10000)].dropna().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Let's get a quantile value at the 99 percentile to see the value that the top one percent of our records exceed\n",
    "df['price'].dropna().quantile(.99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "filtered = df[(df['price'] < 10000) & (df['price'] > 200)]\n",
    "filtered.dropna().describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## OK, now on your own: \n",
    "1. Filter out records with more than 4 bedrooms\n",
    "2. Create dummy variables for each bedroom count (e.g. bed_1 would have 1 for rows with 1 bedroom, 0 for others), and merge them with the dataframe\n",
    "3. Filter sqft < 500 and > 3000\n",
    "4. Create a set of 5 bins for price that include all values, and do counts of how many records are in each category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "BedFilter = filtered[(filtered['bedrooms']<5)]\n",
    "BedFilter.dropna().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dummies = pd.get_dummies(BedFilter['bedrooms'],prefix='bedrooms')\n",
    "BedFilterWithDummies = BedFilter.join(dummies)\n",
    "BedFilterWithDummies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bedfilter_dummies2=pd.merge(BedFilter, dummies, left_index=True, right_index=True)\n",
    "bedfilter_dummies2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bins = [0, 500, 1000, 1500, 2000, 99999]\n",
    "cats = pd.cut(df['price'], bins)\n",
    "pd.value_counts(cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}