{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n", "2.2.2\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "import matplotlib\n", "\n", "%matplotlib inline\n", "\n", "print pd.__version__\n", "print matplotlib.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Missing Values in Pandas (Vid-16)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading table\n", "# making seperator as comma\n", "df = pd.read_table(\n", " 'http://bit.ly/uforeports', \n", " sep=','\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 False True False False False\n", "1 False True False False False\n", "2 False True False False False\n", "3 False True False False False\n", "4 False True False False False" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# isnull() method returns a df of T/F considering if NaN occurs in a particular cell\n", "df.isnull().head()\n", "# notnull() is just inverse of isnull()\n", "# df.notnull().head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "City 25\n", "Colors Reported 15359\n", "Shape Reported 2644\n", "State 0\n", "Time 0\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pandas converts T to 1 and F to 0\n", "# since isnull() returns T to whereever NaN exists\n", "# .sum() over it gives the total number of NaN in the dataframe\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>12</th>\n", " <td>Belton</td>\n", " <td>RED</td>\n", " <td>SPHERE</td>\n", " <td>SC</td>\n", " <td>6/30/1939 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>Bering Sea</td>\n", " <td>RED</td>\n", " <td>OTHER</td>\n", " <td>AK</td>\n", " <td>4/30/1943 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>Portsmouth</td>\n", " <td>RED</td>\n", " <td>FORMATION</td>\n", " <td>VA</td>\n", " <td>7/10/1945 1:30</td>\n", " </tr>\n", " <tr>\n", " <th>44</th>\n", " <td>Blairsden</td>\n", " <td>GREEN</td>\n", " <td>SPHERE</td>\n", " <td>CA</td>\n", " <td>6/30/1946 19:00</td>\n", " </tr>\n", " <tr>\n", " <th>82</th>\n", " <td>San Jose</td>\n", " <td>BLUE</td>\n", " <td>CHEVRON</td>\n", " <td>CA</td>\n", " <td>7/15/1947 21:00</td>\n", " </tr>\n", " <tr>\n", " <th>84</th>\n", " <td>Modesto</td>\n", " <td>BLUE</td>\n", " <td>DISK</td>\n", " <td>CA</td>\n", " <td>8/8/1947 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>91</th>\n", " <td>Scipio</td>\n", " <td>RED</td>\n", " <td>SPHERE</td>\n", " <td>IN</td>\n", " <td>5/10/1948 19:00</td>\n", " </tr>\n", " <tr>\n", " <th>111</th>\n", " <td>Tarrant City</td>\n", " <td>ORANGE</td>\n", " <td>CIRCLE</td>\n", " <td>AL</td>\n", " <td>8/15/1949 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>129</th>\n", " <td>Napa</td>\n", " <td>GREEN</td>\n", " <td>DISK</td>\n", " <td>CA</td>\n", " <td>6/10/1950 0:00</td>\n", " </tr>\n", " <tr>\n", " <th>138</th>\n", " <td>Coeur d'Alene</td>\n", " <td>ORANGE</td>\n", " <td>CIGAR</td>\n", " <td>ID</td>\n", " <td>7/2/1950 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>152</th>\n", " <td>Irving</td>\n", " <td>BLUE</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>4/15/1951 0:30</td>\n", " </tr>\n", " <tr>\n", " <th>157</th>\n", " <td>Greenville</td>\n", " <td>GREEN</td>\n", " <td>DISK</td>\n", " <td>MS</td>\n", " <td>6/15/1951 20:30</td>\n", " </tr>\n", " <tr>\n", " <th>163</th>\n", " <td>Green River</td>\n", " <td>GREEN</td>\n", " <td>SPHERE</td>\n", " <td>WY</td>\n", " <td>7/3/1951 12:00</td>\n", " </tr>\n", " <tr>\n", " <th>164</th>\n", " <td>Provo</td>\n", " <td>BLUE</td>\n", " <td>DISK</td>\n", " <td>UT</td>\n", " <td>7/10/1951 23:30</td>\n", " </tr>\n", " <tr>\n", " <th>174</th>\n", " <td>Greenville</td>\n", " <td>ORANGE</td>\n", " <td>TRIANGLE</td>\n", " <td>TX</td>\n", " <td>4/15/1952 16:00</td>\n", " </tr>\n", " <tr>\n", " <th>178</th>\n", " <td>Norfolk</td>\n", " <td>RED</td>\n", " <td>FIREBALL</td>\n", " <td>VA</td>\n", " <td>6/1/1952 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>202</th>\n", " <td>Arlington</td>\n", " <td>GREEN</td>\n", " <td>OVAL</td>\n", " <td>VA</td>\n", " <td>7/13/1952 21:00</td>\n", " </tr>\n", " <tr>\n", " <th>226</th>\n", " <td>Cambridge</td>\n", " <td>RED</td>\n", " <td>SPHERE</td>\n", " <td>MA</td>\n", " <td>4/1/1953 18:00</td>\n", " </tr>\n", " <tr>\n", " <th>229</th>\n", " <td>Midwest City</td>\n", " <td>YELLOW</td>\n", " <td>FIREBALL</td>\n", " <td>OK</td>\n", " <td>4/15/1953 16:00</td>\n", " </tr>\n", " <tr>\n", " <th>238</th>\n", " <td>Cleveland</td>\n", " <td>RED</td>\n", " <td>FIREBALL</td>\n", " <td>OH</td>\n", " <td>6/30/1953 0:00</td>\n", " </tr>\n", " <tr>\n", " <th>249</th>\n", " <td>Artesia</td>\n", " <td>ORANGE</td>\n", " <td>OTHER</td>\n", " <td>NM</td>\n", " <td>8/15/1953 19:00</td>\n", " </tr>\n", " <tr>\n", " <th>256</th>\n", " <td>Pendleton</td>\n", " <td>GREEN</td>\n", " <td>DISK</td>\n", " <td>IN</td>\n", " <td>11/21/1953 22:30</td>\n", " </tr>\n", " <tr>\n", " <th>288</th>\n", " <td>St. Louis Airport</td>\n", " <td>RED</td>\n", " <td>OVAL</td>\n", " <td>MO</td>\n", " <td>7/1/1954 21:00</td>\n", " </tr>\n", " <tr>\n", " <th>289</th>\n", " <td>Los Angeles</td>\n", " <td>RED</td>\n", " <td>CIRCLE</td>\n", " <td>CA</td>\n", " <td>7/1/1954 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>304</th>\n", " <td>Beaumont</td>\n", " <td>RED</td>\n", " <td>DISK</td>\n", " <td>TX</td>\n", " <td>9/9/1954 12:30</td>\n", " </tr>\n", " <tr>\n", " <th>311</th>\n", " <td>Red Bank</td>\n", " <td>ORANGE</td>\n", " <td>CIRCLE</td>\n", " <td>NJ</td>\n", " <td>12/15/1954 23:10</td>\n", " </tr>\n", " <tr>\n", " <th>314</th>\n", " <td>Holbrook</td>\n", " <td>YELLOW</td>\n", " <td>EGG</td>\n", " <td>MA</td>\n", " <td>5/1/1955 15:00</td>\n", " </tr>\n", " <tr>\n", " <th>323</th>\n", " <td>Terre Haute</td>\n", " <td>ORANGE</td>\n", " <td>CIRCLE</td>\n", " <td>IN</td>\n", " <td>6/15/1955 0:00</td>\n", " </tr>\n", " <tr>\n", " <th>354</th>\n", " <td>Memphis</td>\n", " <td>ORANGE</td>\n", " <td>CYLINDER</td>\n", " <td>TN</td>\n", " <td>6/1/1956 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>363</th>\n", " <td>Vista</td>\n", " <td>ORANGE</td>\n", " <td>CIGAR</td>\n", " <td>CA</td>\n", " <td>6/15/1956 19:15</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>18110</th>\n", " <td>Gilbert</td>\n", " <td>RED</td>\n", " <td>LIGHT</td>\n", " <td>AZ</td>\n", " <td>12/11/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " <th>18115</th>\n", " <td>Madison</td>\n", " <td>RED GREEN</td>\n", " <td>TEARDROP</td>\n", " <td>OH</td>\n", " <td>12/12/2000 18:45</td>\n", " </tr>\n", " <tr>\n", " <th>18117</th>\n", " <td>Concord</td>\n", " <td>RED GREEN</td>\n", " <td>CIRCLE</td>\n", " <td>NH</td>\n", " <td>12/12/2000 23:10</td>\n", " </tr>\n", " <tr>\n", " <th>18121</th>\n", " <td>Redwood Valley</td>\n", " <td>BLUE</td>\n", " <td>LIGHT</td>\n", " <td>CA</td>\n", " <td>12/15/2000 1:09</td>\n", " </tr>\n", " <tr>\n", " <th>18123</th>\n", " <td>Waldorf</td>\n", " <td>GREEN</td>\n", " <td>LIGHT</td>\n", " <td>MD</td>\n", " <td>12/15/2000 5:30</td>\n", " </tr>\n", " <tr>\n", " <th>18127</th>\n", " <td>North Pole</td>\n", " <td>ORANGE</td>\n", " <td>SPHERE</td>\n", " <td>AK</td>\n", " <td>12/15/2000 16:36</td>\n", " </tr>\n", " <tr>\n", " <th>18129</th>\n", " <td>Beaver</td>\n", " <td>RED</td>\n", " <td>DISK</td>\n", " <td>PA</td>\n", " <td>12/15/2000 18:25</td>\n", " </tr>\n", " <tr>\n", " <th>18134</th>\n", " <td>Graham</td>\n", " <td>GREEN</td>\n", " <td>LIGHT</td>\n", " <td>WA</td>\n", " <td>12/16/2000 0:20</td>\n", " </tr>\n", " <tr>\n", " <th>18141</th>\n", " <td>Sebastopol</td>\n", " <td>YELLOW</td>\n", " <td>CIRCLE</td>\n", " <td>CA</td>\n", " <td>12/17/2000 18:00</td>\n", " </tr>\n", " <tr>\n", " <th>18144</th>\n", " <td>Medina</td>\n", " <td>ORANGE</td>\n", " <td>TRIANGLE</td>\n", " <td>OH</td>\n", " <td>12/17/2000 19:30</td>\n", " </tr>\n", " <tr>\n", " <th>18148</th>\n", " <td>Highland Park</td>\n", " <td>BLUE</td>\n", " <td>VARIOUS</td>\n", " <td>NJ</td>\n", " <td>12/18/2000 2:30</td>\n", " </tr>\n", " <tr>\n", " <th>18158</th>\n", " <td>Woodland</td>\n", " <td>ORANGE</td>\n", " <td>LIGHT</td>\n", " <td>CA</td>\n", " <td>12/19/2000 23:30</td>\n", " </tr>\n", " <tr>\n", " <th>18167</th>\n", " <td>Gilbert</td>\n", " <td>RED</td>\n", " <td>CIRCLE</td>\n", " <td>AZ</td>\n", " <td>12/21/2000 19:05</td>\n", " </tr>\n", " <tr>\n", " <th>18170</th>\n", " <td>MM 110</td>\n", " <td>BLUE</td>\n", " <td>CIRCLE</td>\n", " <td>AZ</td>\n", " <td>12/22/2000 3:00</td>\n", " </tr>\n", " <tr>\n", " <th>18172</th>\n", " <td>Toomsuba</td>\n", " <td>RED</td>\n", " <td>OVAL</td>\n", " <td>MS</td>\n", " <td>12/23/2000 4:00</td>\n", " </tr>\n", " <tr>\n", " <th>18177</th>\n", " <td>Chandler</td>\n", " <td>RED</td>\n", " <td>LIGHT</td>\n", " <td>AZ</td>\n", " <td>12/23/2000 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>18181</th>\n", " <td>Fortuna</td>\n", " <td>ORANGE</td>\n", " <td>CIRCLE</td>\n", " <td>CA</td>\n", " <td>12/24/2000 18:00</td>\n", " </tr>\n", " <tr>\n", " <th>18184</th>\n", " <td>Plymouth</td>\n", " <td>GREEN</td>\n", " <td>FIREBALL</td>\n", " <td>OH</td>\n", " <td>12/24/2000 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>18191</th>\n", " <td>Fallston</td>\n", " <td>RED</td>\n", " <td>VARIOUS</td>\n", " <td>MD</td>\n", " <td>12/25/2000 19:15</td>\n", " </tr>\n", " <tr>\n", " <th>18192</th>\n", " <td>Atlanta</td>\n", " <td>ORANGE</td>\n", " <td>LIGHT</td>\n", " <td>GA</td>\n", " <td>12/25/2000 20:30</td>\n", " </tr>\n", " <tr>\n", " <th>18194</th>\n", " <td>Walpole</td>\n", " <td>GREEN</td>\n", " <td>FIREBALL</td>\n", " <td>NH</td>\n", " <td>12/26/2000 18:20</td>\n", " </tr>\n", " <tr>\n", " <th>18195</th>\n", " <td>Walpole</td>\n", " <td>GREEN</td>\n", " <td>FIREBALL</td>\n", " <td>NH</td>\n", " <td>12/26/2000 18:20</td>\n", " </tr>\n", " <tr>\n", " <th>18196</th>\n", " <td>Brocton</td>\n", " <td>GREEN</td>\n", " <td>OVAL</td>\n", " <td>MA</td>\n", " <td>12/26/2000 18:23</td>\n", " </tr>\n", " <tr>\n", " <th>18197</th>\n", " <td>Greenfiled</td>\n", " <td>GREEN</td>\n", " <td>FIREBALL</td>\n", " <td>MA</td>\n", " <td>12/26/2000 18:30</td>\n", " </tr>\n", " <tr>\n", " <th>18210</th>\n", " <td>Monument Valley</td>\n", " <td>BLUE</td>\n", " <td>OTHER</td>\n", " <td>UT</td>\n", " <td>12/28/2000 17:51</td>\n", " </tr>\n", " <tr>\n", " <th>18213</th>\n", " <td>Pasadena</td>\n", " <td>GREEN</td>\n", " <td>FIREBALL</td>\n", " <td>CA</td>\n", " <td>12/28/2000 19:10</td>\n", " </tr>\n", " <tr>\n", " <th>18216</th>\n", " <td>Garden Grove</td>\n", " <td>ORANGE</td>\n", " <td>LIGHT</td>\n", " <td>CA</td>\n", " <td>12/29/2000 16:10</td>\n", " </tr>\n", " <tr>\n", " <th>18220</th>\n", " <td>Shasta Lake</td>\n", " <td>BLUE</td>\n", " <td>DISK</td>\n", " <td>CA</td>\n", " <td>12/29/2000 20:30</td>\n", " </tr>\n", " <tr>\n", " <th>18233</th>\n", " <td>Anchorage</td>\n", " <td>RED</td>\n", " <td>VARIOUS</td>\n", " <td>AK</td>\n", " <td>12/31/2000 21:00</td>\n", " </tr>\n", " <tr>\n", " <th>18239</th>\n", " <td>Eagle River</td>\n", " <td>RED</td>\n", " <td>LIGHT</td>\n", " <td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>2486 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State \\\n", "12 Belton RED SPHERE SC \n", "19 Bering Sea RED OTHER AK \n", "36 Portsmouth RED FORMATION VA \n", "44 Blairsden GREEN SPHERE CA \n", "82 San Jose BLUE CHEVRON CA \n", "84 Modesto BLUE DISK CA \n", "91 Scipio RED SPHERE IN \n", "111 Tarrant City ORANGE CIRCLE AL \n", "129 Napa GREEN DISK CA \n", "138 Coeur d'Alene ORANGE CIGAR ID \n", "152 Irving BLUE DISK KS \n", "157 Greenville GREEN DISK MS \n", "163 Green River GREEN SPHERE WY \n", "164 Provo BLUE DISK UT \n", "174 Greenville ORANGE TRIANGLE TX \n", "178 Norfolk RED FIREBALL VA \n", "202 Arlington GREEN OVAL VA \n", "226 Cambridge RED SPHERE MA \n", "229 Midwest City YELLOW FIREBALL OK \n", "238 Cleveland RED FIREBALL OH \n", "249 Artesia ORANGE OTHER NM \n", "256 Pendleton GREEN DISK IN \n", "288 St. Louis Airport RED OVAL MO \n", "289 Los Angeles RED CIRCLE CA \n", "304 Beaumont RED DISK TX \n", "311 Red Bank ORANGE CIRCLE NJ \n", "314 Holbrook YELLOW EGG MA \n", "323 Terre Haute ORANGE CIRCLE IN \n", "354 Memphis ORANGE CYLINDER TN \n", "363 Vista ORANGE CIGAR CA \n", "... ... ... ... ... \n", "18110 Gilbert RED LIGHT AZ \n", "18115 Madison RED GREEN TEARDROP OH \n", "18117 Concord RED GREEN CIRCLE NH \n", "18121 Redwood Valley BLUE LIGHT CA \n", "18123 Waldorf GREEN LIGHT MD \n", "18127 North Pole ORANGE SPHERE AK \n", "18129 Beaver RED DISK PA \n", "18134 Graham GREEN LIGHT WA \n", "18141 Sebastopol YELLOW CIRCLE CA \n", "18144 Medina ORANGE TRIANGLE OH \n", "18148 Highland Park BLUE VARIOUS NJ \n", "18158 Woodland ORANGE LIGHT CA \n", "18167 Gilbert RED CIRCLE AZ \n", "18170 MM 110 BLUE CIRCLE AZ \n", "18172 Toomsuba RED OVAL MS \n", "18177 Chandler RED LIGHT AZ \n", "18181 Fortuna ORANGE CIRCLE CA \n", "18184 Plymouth GREEN FIREBALL OH \n", "18191 Fallston RED VARIOUS MD \n", "18192 Atlanta ORANGE LIGHT GA \n", "18194 Walpole GREEN FIREBALL NH \n", "18195 Walpole GREEN FIREBALL NH \n", "18196 Brocton GREEN OVAL MA \n", "18197 Greenfiled GREEN FIREBALL MA \n", "18210 Monument Valley BLUE OTHER UT \n", "18213 Pasadena GREEN FIREBALL CA \n", "18216 Garden Grove ORANGE LIGHT CA \n", "18220 Shasta Lake BLUE DISK CA \n", "18233 Anchorage RED VARIOUS AK \n", "18239 Eagle River RED LIGHT WI \n", "\n", " Time \n", "12 6/30/1939 20:00 \n", "19 4/30/1943 23:00 \n", "36 7/10/1945 1:30 \n", "44 6/30/1946 19:00 \n", "82 7/15/1947 21:00 \n", "84 8/8/1947 22:00 \n", "91 5/10/1948 19:00 \n", "111 8/15/1949 22:00 \n", "129 6/10/1950 0:00 \n", "138 7/2/1950 13:00 \n", "152 4/15/1951 0:30 \n", "157 6/15/1951 20:30 \n", "163 7/3/1951 12:00 \n", "164 7/10/1951 23:30 \n", "174 4/15/1952 16:00 \n", "178 6/1/1952 22:00 \n", "202 7/13/1952 21:00 \n", "226 4/1/1953 18:00 \n", "229 4/15/1953 16:00 \n", "238 6/30/1953 0:00 \n", "249 8/15/1953 19:00 \n", "256 11/21/1953 22:30 \n", "288 7/1/1954 21:00 \n", "289 7/1/1954 22:00 \n", "304 9/9/1954 12:30 \n", "311 12/15/1954 23:10 \n", "314 5/1/1955 15:00 \n", "323 6/15/1955 0:00 \n", "354 6/1/1956 20:00 \n", "363 6/15/1956 19:15 \n", "... ... \n", "18110 12/11/2000 23:45 \n", "18115 12/12/2000 18:45 \n", "18117 12/12/2000 23:10 \n", "18121 12/15/2000 1:09 \n", "18123 12/15/2000 5:30 \n", "18127 12/15/2000 16:36 \n", "18129 12/15/2000 18:25 \n", "18134 12/16/2000 0:20 \n", "18141 12/17/2000 18:00 \n", "18144 12/17/2000 19:30 \n", "18148 12/18/2000 2:30 \n", "18158 12/19/2000 23:30 \n", "18167 12/21/2000 19:05 \n", "18170 12/22/2000 3:00 \n", "18172 12/23/2000 4:00 \n", "18177 12/23/2000 22:00 \n", "18181 12/24/2000 18:00 \n", "18184 12/24/2000 22:00 \n", "18191 12/25/2000 19:15 \n", "18192 12/25/2000 20:30 \n", "18194 12/26/2000 18:20 \n", "18195 12/26/2000 18:20 \n", "18196 12/26/2000 18:23 \n", "18197 12/26/2000 18:30 \n", "18210 12/28/2000 17:51 \n", "18213 12/28/2000 19:10 \n", "18216 12/29/2000 16:10 \n", "18220 12/29/2000 20:30 \n", "18233 12/31/2000 21:00 \n", "18239 12/31/2000 23:45 \n", "\n", "[2486 rows x 5 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Handling Missing Values\n", "## Drop Missing Values\n", "\n", "# dropna() method drops al those rows from the dataset where anyof the columns holds a NaN values\n", "df.dropna(how='any')\n", "\n", "# (how='all') - drop only if all the values in the row are missing" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(2877, 5)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop a row only if any of the City, Colors Reported columns have NaN value\n", "df.dropna(subset=['City', 'Colors Reported'], how='any').shape" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Seattle 187\n", "New York City 161\n", "Name: City, dtype: int64\n", "\n", "25\n", "\n", "Seattle 212\n", "New York City 161\n", "Name: City, dtype: int64\n", "\n", "\n", "187+25 = 212\n" ] } ], "source": [ "print df.City.value_counts().head(2)\n", "print \n", "print df.City.isnull().sum()\n", "print \n", "print df.City.fillna(value='Seattle', inplace=False).value_counts().head(2)\n", "\n", "print \"\\n\\n187+25 = 212\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. For any value that is missing in the dataset. Pandas makes it NaN bydefault\n", "2. Missing values are excluded while making a count using value_counts()\n", "\n", "# -----------------------\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }