{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Wrangling in Pandas - Worked Examples\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reviewing our earlier application of Data Wrangling to Craigslist Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# import libraries and read in the csv file\n", "import re as re, pandas as pd, numpy as np, requests, json\n", "df = pd.read_csv('bay.csv')\n", "print(df[:5])\n", "\n", "# clean price and neighborhood\n", "df.price = df.price.str.strip('$').astype('float64')\n", "df.neighborhood = df.neighborhood.str.strip().str.strip('(').str.strip(')')\n", "\n", "# break out the date into month day year columns\n", "df['month'] = df['date'].str.split().str[0]\n", "df['day'] = df['date'].str.split().str[1].astype('int32')\n", "df['year'] = df['date'].str.split().str[2].astype('int32')\n", "del df['date']\n", "\n", "def clean_br(value):\n", " if isinstance(value, str):\n", " end = value.find('br')\n", " if end == -1:\n", " return None\n", " else:\n", " start = value.find('/') + 2\n", " return int(value[start:end])\n", "df['bedrooms'] = df['bedrooms'].map(clean_br)\n", "\n", "def clean_sqft(value):\n", " if isinstance(value, str):\n", " end = value.find('ft')\n", " if end == -1:\n", " return None\n", " else:\n", " if value.find('br') == -1:\n", " start = value.find('/') + 2\n", " else:\n", " start = value.find('-') + 2\n", " return int(value[start:end])\n", "df['sqft'] = df['sqft'].map(clean_sqft)\n", "\n", "\n", "\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Let's do some wrangling on this dataset:\n", "1. Find outliers in rent, say below 200 or above 10,000\n", "1. Analyze the data without missing data\n", "1. Create a dataset that removes the outliers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['price'].dropna().describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['price'][(df['price'] < 200)].dropna().describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df['price'][(df['price'] > 10000)].dropna().describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Let's get a quantile value at the 99 percentile to see the value that the top one percent of our records exceed\n", "df['price'].dropna().quantile(.99)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "filtered = df[(df['price'] < 10000) & (df['price'] > 200)]\n", "filtered.dropna().describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## OK, now on your own: \n", "1. Filter out records with more than 4 bedrooms\n", "2. Create dummy variables for each bedroom count (e.g. bed_1 would have 1 for rows with 1 bedroom, 0 for others), and merge them with the dataframe\n", "3. Filter sqft < 500 and > 3000\n", "4. Create a set of 5 bins for price that include all values, and do counts of how many records are in each category" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "BedFilter = filtered[(filtered['bedrooms']<5)]\n", "BedFilter.dropna().describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dummies = pd.get_dummies(BedFilter['bedrooms'],prefix='bedrooms')\n", "BedFilterWithDummies = BedFilter.join(dummies)\n", "BedFilterWithDummies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "bedfilter_dummies2=pd.merge(BedFilter, dummies, left_index=True, right_index=True)\n", "bedfilter_dummies2.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "bins = [0, 500, 1000, 1500, 2000, 99999]\n", "cats = pd.cut(df['price'], bins)\n", "pd.value_counts(cats)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }