{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# RESEARCH IN PYTHON: DESCRIPTIVE STATISTICS AND EXPLORATORY DATA ANALYSIS\n",
    "# by J. NATHAN MATIAS March 10, 2015\n",
    "\n",
    "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
    "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
    "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n",
    "# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
    "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n",
    "# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n",
    "# THE SOFTWARE.\n",
    "\n",
    "# THINGS TO IMPORT\n",
    "# This is a good baseline set of libraries to import by default if you're rushed for time.\n",
    "\n",
    "import codecs # load UTF-8 Content\n",
    "import json # load JSON files\n",
    "import pandas as pd # Pandas handles dataframes\n",
    "import numpy as np # Numpy handles lots of basic maths operations\n",
    "import matplotlib.pyplot as plt # Matplotlib for plotting\n",
    "import seaborn as sns # Seaborn for beautiful plots\n",
    "from dateutil import * # I prefer dateutil for parsing dates\n",
    "import math # transformations\n",
    "import statsmodels.formula.api as smf # for doing statistical regression\n",
    "import statsmodels.api as sm # access to the wider statsmodels library, including R datasets\n",
    "from collections import Counter # Counter is useful for grouping and counting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Acquire a Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Datasets from the Rdatasets collection are accessible via Statsmodels\n",
    "# http://vincentarelbundock.github.io/Rdatasets/\n",
    "\n",
    "# U. S. State Public-School Expenditures\n",
    "# code book: http://vincentarelbundock.github.io/Rdatasets/doc/carData/Anscombe.html\n",
    "# The observations are the U. S. states plus Washington, D. C. in 1970.\n",
    "# education = Per-capita education expenditures, dollars.\n",
    "# income = Per-capita income, dollars.\n",
    "# young = Proportion under 18, per 1000.\n",
    "# urban = Proportion urban, per 1000.\n",
    "\n",
    "# The datasets that used to ship with the R package \"car\" now live in \"carData\",\n",
    "# and Rdatasets indexes Anscombe under that package name. Requesting it from \"car\"\n",
    "# no longer resolves, so the package argument must be \"carData\".\n",
    "expenditures = sm.datasets.get_rdataset(\"Anscombe\", \"carData\")\n",
    "# assign a variable to the Pandas dataframe for this dataset\n",
    "expenditures_df = expenditures.data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [ "
| \n", " | education | \n", "income | \n", "young | \n", "urban | \n", "
|---|---|---|---|---|
| count | \n", "51.000000 | \n", "51.000000 | \n", "51.000000 | \n", "51.000000 | \n", "
| mean | \n", "196.313725 | \n", "3225.294118 | \n", "358.886275 | \n", "664.509804 | \n", "
| std | \n", "46.454490 | \n", "560.025974 | \n", "23.959975 | \n", "151.344821 | \n", "
| min | \n", "112.000000 | \n", "2081.000000 | \n", "326.200000 | \n", "322.000000 | \n", "
| 25% | \n", "165.000000 | \n", "2785.500000 | \n", "342.050000 | \n", "552.500000 | \n", "
| 50% | \n", "192.000000 | \n", "3257.000000 | \n", "354.100000 | \n", "664.000000 | \n", "
| 75% | \n", "228.500000 | \n", "3612.000000 | \n", "369.150000 | \n", "790.500000 | \n", "
| max | \n", "372.000000 | \n", "4425.000000 | \n", "439.700000 | \n", "1000.000000 | \n", "