{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas Data Type Mismatch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook explains how to identify data in columns with the wrong data type with `pandas`." ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### Packages" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This tutorial uses:\n", "* [pandas](https://pandas.pydata.org/docs/)\n", "* [datetime](https://docs.python.org/3/library/datetime.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import datetime\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating the data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will create a dataframe that contains multiple occurances of duplication for this example." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame({'A': ['text']*20,\n", " 'B': [1, 2.2]*10,\n", " 'C': [True, False]*10,\n", " 'D': pd.to_datetime('2020-01-01')\n", " })" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, add some mistyped data to the dataframe." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
011.0True2020-01-01 00:00:00
1-22.2False2020-01-01 00:00:00
2text1.0True12
3text2.2False2020-01-01 00:00:00
4text1.012020-01-01 00:00:00
5text2.2False2020-01-01 00:00:00
6text1.0True2020-01-01 00:00:00
7textA+BFalse2020-01-01 00:00:00
8text1.0True2020-01-01 00:00:00
9text2.2-12.62020-01-01 00:00:00
102021-01-01 00:00:001.0True2020-01-01 00:00:00
11text2.2False2020-01-01 00:00:00
12text1.0text2020-01-01
13text2.2False2020-01-01 00:00:00
14text1.0True2020-01-01 00:00:00
15text2.2False2020-01-01 00:00:00
16text1.0True2020-01-01 00:00:00
17text2.2False2020-01-01 00:00:00
18text1.0True2020-01-01 00:00:00
19text2.2False2020-01-01 00:00:00
\n", "
" ], "text/plain": [ " A B C D\n", "0 1 1.0 True 2020-01-01 00:00:00\n", "1 -2 2.2 False 2020-01-01 00:00:00\n", "2 text 1.0 True 12\n", "3 text 2.2 False 2020-01-01 00:00:00\n", "4 text 1.0 1 2020-01-01 00:00:00\n", "5 text 2.2 False 2020-01-01 00:00:00\n", "6 text 1.0 True 2020-01-01 00:00:00\n", "7 text A+B False 2020-01-01 00:00:00\n", "8 text 1.0 True 2020-01-01 00:00:00\n", "9 text 2.2 -12.6 2020-01-01 00:00:00\n", "10 2021-01-01 00:00:00 1.0 True 2020-01-01 00:00:00\n", "11 text 2.2 False 2020-01-01 00:00:00\n", "12 text 1.0 text 2020-01-01\n", "13 text 2.2 False 2020-01-01 00:00:00\n", "14 text 1.0 True 2020-01-01 00:00:00\n", "15 text 2.2 False 2020-01-01 00:00:00\n", "16 text 1.0 True 2020-01-01 00:00:00\n", "17 text 2.2 False 2020-01-01 00:00:00\n", "18 text 1.0 True 2020-01-01 00:00:00\n", "19 text 2.2 False 2020-01-01 00:00:00" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[0,0] = 1\n", "df.iloc[1,0] = -2\n", "df.iloc[10,0] = pd.to_datetime('2021-01-01')\n", "df.iloc[5,1] = '2.2'\n", "df.iloc[7,1] = 'A+B'\n", "df.iloc[4,2] = 1\n", "df.iloc[5,2] = 'False'\n", "df.iloc[9,2] = -12.6\n", "df.iloc[12,2] = 'text'\n", "df.iloc[2,3] = 12\n", "df.iloc[12,3] = '2020-01-01'\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Identify mistyped data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The function `applymap` and `isinstance` will return a Boolean dataframe with **True** when the data type matches and **False** when the data type does not match." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check numeric" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
0TrueTrueTrueFalse
1TrueTrueTrueFalse
2FalseTrueTrueTrue
3FalseTrueTrueFalse
4FalseTrueTrueFalse
5FalseFalseFalseFalse
6FalseTrueTrueFalse
7FalseFalseTrueFalse
8FalseTrueTrueFalse
9FalseTrueTrueFalse
10FalseTrueTrueFalse
11FalseTrueTrueFalse
12FalseTrueFalseFalse
13FalseTrueTrueFalse
14FalseTrueTrueFalse
15FalseTrueTrueFalse
16FalseTrueTrueFalse
17FalseTrueTrueFalse
18FalseTrueTrueFalse
19FalseTrueTrueFalse
\n", "
" ], "text/plain": [ " A B C D\n", "0 True True True False\n", "1 True True True False\n", "2 False True True True\n", "3 False True True False\n", "4 False True True False\n", "5 False False False False\n", "6 False True True False\n", "7 False False True False\n", "8 False True True False\n", "9 False True True False\n", "10 False True True False\n", "11 False True True False\n", "12 False True False False\n", "13 False True True False\n", "14 False True True False\n", "15 False True True False\n", "16 False True True False\n", "17 False True True False\n", "18 False True True False\n", "19 False True True False" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numeric = df.applymap(lambda x: isinstance(x, (int, float)))\n", "numeric" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since only column **B** is supposed to be numeric, this can be made more specific by running `applymap` only on column **B**." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 True\n", "3 True\n", "4 True\n", "5 False\n", "6 True\n", "7 False\n", "8 True\n", "9 True\n", "10 True\n", "11 True\n", "12 True\n", "13 True\n", "14 True\n", "15 True\n", "16 True\n", "17 True\n", "18 True\n", "19 True\n", "Name: B, dtype: bool" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numeric = df.applymap(lambda x: isinstance(x, (int, float)))['B']\n", "numeric" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using this Boolean series to return the non-numeric data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
5text2.2False2020-01-01 00:00:00
7textA+BFalse2020-01-01 00:00:00
\n", "
" ], "text/plain": [ " A B C D\n", "5 text 2.2 False 2020-01-01 00:00:00\n", "7 text A+B False 2020-01-01 00:00:00" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[~numeric]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check datetime" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 False\n", "3 True\n", "4 True\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "9 True\n", "10 True\n", "11 True\n", "12 False\n", "13 True\n", "14 True\n", "15 True\n", "16 True\n", "17 True\n", "18 True\n", "19 True\n", "Name: D, dtype: bool" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dt = df.applymap(lambda x: isinstance(x, (datetime.datetime)))['D']\n", "dt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using this Boolean series to return the non-numeric data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
2text1.0True12
12text1.0text2020-01-01
\n", "
" ], "text/plain": [ " A B C D\n", "2 text 1.0 True 12\n", "12 text 1.0 text 2020-01-01" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[~dt]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check string" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 True\n", "3 True\n", "4 True\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "9 True\n", "10 False\n", "11 True\n", "12 True\n", "13 True\n", "14 True\n", "15 True\n", "16 True\n", "17 True\n", "18 True\n", "19 True\n", "Name: A, dtype: bool" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "strings = df.applymap(lambda x: isinstance(x, (str)))['A']\n", "strings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using this Boolean series to return the non-numeric data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
011.0True2020-01-01 00:00:00
1-22.2False2020-01-01 00:00:00
102021-01-01 00:00:001.0True2020-01-01 00:00:00
\n", "
" ], "text/plain": [ " A B C D\n", "0 1 1.0 True 2020-01-01 00:00:00\n", "1 -2 2.2 False 2020-01-01 00:00:00\n", "10 2021-01-01 00:00:00 1.0 True 2020-01-01 00:00:00" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[~strings]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Check Boolean" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 True\n", "3 True\n", "4 False\n", "5 False\n", "6 True\n", "7 True\n", "8 True\n", "9 False\n", "10 True\n", "11 True\n", "12 False\n", "13 True\n", "14 True\n", "15 True\n", "16 True\n", "17 True\n", "18 True\n", "19 True\n", "Name: C, dtype: bool" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "torf = df.applymap(lambda x: isinstance(x, (bool)))['C']\n", "torf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using this Boolean series to return the non-numeric data" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
4text1.012020-01-01 00:00:00
5text2.2False2020-01-01 00:00:00
9text2.2-12.62020-01-01 00:00:00
12text1.0text2020-01-01
\n", "
" ], "text/plain": [ " A B C D\n", "4 text 1.0 1 2020-01-01 00:00:00\n", "5 text 2.2 False 2020-01-01 00:00:00\n", "9 text 2.2 -12.6 2020-01-01 00:00:00\n", "12 text 1.0 text 2020-01-01" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[~torf]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }