{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2021-11-09T03:40:59.657176Z", "iopub.status.busy": "2021-11-09T03:40:59.656913Z", "iopub.status.idle": "2021-11-09T03:40:59.660859Z", "shell.execute_reply": "2021-11-09T03:40:59.660154Z", "shell.execute_reply.started": "2021-11-09T03:40:59.657147Z" }, "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2021-11-09T03:41:00.023002Z", "iopub.status.busy": "2021-11-09T03:41:00.022740Z", "iopub.status.idle": "2021-11-09T03:41:00.033547Z", "shell.execute_reply": "2021-11-09T03:41:00.032810Z", "shell.execute_reply.started": "2021-11-09T03:41:00.022977Z" } }, "outputs": [ { "data": { "text/plain": [ "('1.3.2', '1.21.2')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.__version__, np.__version__" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2021-11-09T03:41:00.537187Z", "iopub.status.busy": "2021-11-09T03:41:00.535145Z", "iopub.status.idle": "2021-11-09T03:41:00.580259Z", "shell.execute_reply": "2021-11-09T03:41:00.579517Z", "shell.execute_reply.started": "2021-11-09T03:41:00.537094Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
0johnUSA
1davidUK
2annaNaN
\n", "
" ], "text/plain": [ " name country\n", "0 john USA\n", "1 david UK\n", "2 anna NaN" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# needs two steps\n", "# one to assign the dataframe to a variable\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK',np.nan]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2021-11-09T03:41:01.322087Z", "iopub.status.busy": "2021-11-09T03:41:01.321370Z", "iopub.status.idle": "2021-11-09T03:41:01.347716Z", "shell.execute_reply": "2021-11-09T03:41:01.346825Z", "shell.execute_reply.started": "2021-11-09T03:41:01.322012Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
0johnUSA
\n", "
" ], "text/plain": [ " name country\n", "0 john USA" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# another one to perform the filter\n", "df[df['country']=='USA']" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
0johnUSA
\n", "
" ], "text/plain": [ " name country\n", "0 john USA" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK',np.nan]\n", "}).query(\"country == 'USA'\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## query where is null" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
2annaNaN
\n", "
" ], "text/plain": [ " name country\n", "2 anna NaN" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK',np.nan]\n", "})\n", "\n", "df.query('country.isnull()')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## query with python variable" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
1davidUK45
2annaNaN45
\n", "
" ], "text/plain": [ " name country age\n", "1 david UK 45\n", "2 anna NaN 45" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK',np.nan],\n", " 'age':[23,45,45]\n", "})\n", "\n", "target_age = 45\n", "\n", "df.query('age == @target_age')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## OR operator" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
1davidUK45
2annaUSA45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "1 david UK 45\n", "2 anna USA 45" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK', 'USA'],\n", " 'age':[23,45,45]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
1davidUK45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "1 david UK 45" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query(\"(name=='john') or (country=='UK')\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## AND operator" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
1davidUK45
2annaUSA45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "1 david UK 45\n", "2 anna USA 45" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK', 'USA'],\n", " 'age':[23,45,45]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query(\"(country=='USA') and (age==23)\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Column is in array" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
1davidUK45
2annaUSA45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "1 david UK 45\n", "2 anna USA 45" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK', 'USA'],\n", " 'age':[23,45,45]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
2annaUSA45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "2 anna USA 45" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names_array = ['john','anna']\n", "\n", "df.query('name in @names_array')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Column is not in array" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
1davidUK45
2annaUSA45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "1 david UK 45\n", "2 anna USA 45" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK', 'USA'],\n", " 'age':[23,45,45]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountryage
0johnUSA23
1davidUK45
\n", "
" ], "text/plain": [ " name country age\n", "0 john USA 23\n", "1 david UK 45" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invalid_array = ['anna']\n", "\n", "df.query('name not in @invalid_array')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Column name with spaces" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry of birthage
0johnUSA23
1davidUK45
2annaUSA45
\n", "
" ], "text/plain": [ " name country of birth age\n", "0 john USA 23\n", "1 david UK 45\n", "2 anna USA 45" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country of birth':['USA','UK', 'USA'],\n", " 'age':[23,45,45]\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry of birthage
1davidUK45
\n", "
" ], "text/plain": [ " name country of birth age\n", "1 david UK 45" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query('`country of birth` == \"UK\"')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Where column is null" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
0johnUSA
1davidUK
2annaNaN
\n", "
" ], "text/plain": [ " name country\n", "0 john USA\n", "1 david UK\n", "2 anna NaN" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK',np.nan]\n", "})\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
2annaNaN
\n", "
" ], "text/plain": [ " name country\n", "2 anna NaN" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query('country.isnull()')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Where column is not null" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
0johnUSA
1davidUK
2annaNaN
\n", "
" ], "text/plain": [ " name country\n", "0 john USA\n", "1 david UK\n", "2 anna NaN" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "df = pd.DataFrame({\n", " 'name':['john','david','anna'],\n", " 'country':['USA','UK',np.nan]\n", "})\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountry
0johnUSA
1davidUK
\n", "
" ], "text/plain": [ " name country\n", "0 john USA\n", "1 david UK" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query('country.notnull()')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## like" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1
0foo
1bar
2baz
3quux
\n", "
" ], "text/plain": [ " col1\n", "0 foo\n", "1 bar\n", "2 baz\n", "3 quux" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.DataFrame({\n", " 'col1':['foo','bar','baz','quux']\n", "})\n", "\n", "df" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1
1bar
2baz
\n", "
" ], "text/plain": [ " col1\n", "1 bar\n", "2 baz" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.query('col1.str.contains(\"ba\")')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "toc-autonumbering": false, "toc-showcode": true }, "nbformat": 4, "nbformat_minor": 4 }