{
"cells": [
{
"cell_type": "markdown",
"id": "8537d349-6446-473e-bd9e-fd6ec2042f28",
"metadata": {},
"source": [
"# Lesson 10 demo: Pandas\n",
"\n",
"## 1. Set up\n",
"\n",
"Install with:\n",
"\n",
"```text\n",
"pip install pandas\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "30dcf970-f9f9-4230-afc5-a31426b16e49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pandas version 1.5.3\n"
]
}
],
"source": [
"import pandas as pd\n",
"print(f'Pandas version {pd.__version__}')"
]
},
{
"cell_type": "markdown",
"id": "4d5a78cf-a186-4310-a80d-ed7e3ca14e73",
"metadata": {
"toc-hr-collapsed": true
},
"source": [
"## 1. Series: one-dimensional data\n",
"\n",
"### 1.1. Create a series"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "73015824-e26c-45b0-8659-8a63bb14f637",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 20\n",
"1 30\n",
"2 40\n",
"3 50\n",
"dtype: int64\n"
]
}
],
"source": [
"my_series = pd.Series([20, 30, 40, 50])\n",
"print(my_series)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e33fe749-feb2-48ff-9930-01b060c6ee76",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pandas.core.series.Series"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(my_series)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "58930871-dea4-4276-908e-8191c2b80906",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Mon', 'Tues', 'Wed', 'Thurs']\n"
]
}
],
"source": [
"day = ['Mon', 'Tues', 'Wed', 'Thurs']\n",
"print(day)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8e420bc9-e8a4-423f-a3a1-1a3ba4be0c21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mon 20\n",
"Tues 30\n",
"Wed 40\n",
"Thurs 50\n",
"dtype: int64\n"
]
}
],
"source": [
"my_indexed_series = pd.Series([20, 30, 40, 50], index=day)\n",
"print(my_indexed_series)"
]
},
{
"cell_type": "markdown",
"id": "fa28881d-1e50-4b75-a351-a26b383f56a2",
"metadata": {},
"source": [
"### 1.2. Indexing series"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4560f31c-16bf-4e36-8e60-412b4f8bb258",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"40\n"
]
}
],
"source": [
"print(my_indexed_series['Wed'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c2f253e5-a9ca-4200-aa71-d26de5e2efaf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30\n"
]
}
],
"source": [
"print(my_series.iloc[1])"
]
},
{
"cell_type": "markdown",
"id": "5e7bd797-89bd-4bc0-96d3-56a43a529442",
"metadata": {},
"source": [
"### 2.3. Series operations"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e3c0ec65-b978-4d7f-9c08-29d77d40150b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n"
]
}
],
"source": [
"print(my_series.max())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f6510c0a-55a2-4a35-8bc6-88af0c94c080",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"35.0\n"
]
}
],
"source": [
"print(my_series.mean())"
]
},
{
"cell_type": "markdown",
"id": "c111179b-fb73-46a0-a3e7-cc45280e990d",
"metadata": {
"tags": []
},
"source": [
"## 2. DataFrame: two-deimensional data\n",
"\n",
"### 2.1. Creating dataframes"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ad5862cc-42d6-4b32-8189-ac8d0da02ccc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = {\n",
" 'name': ['Alice', 'Bob', 'Laura', 'George'],\n",
" 'age': [25, 30, 35, 40],\n",
" 'city': ['New York', 'Paris', 'London', 'Chicago'],\n",
" 'salary': [50000, 60000, 70000, 80000]\n",
"}\n",
"\n",
"type(data)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bf087fe4-1720-4574-90e9-6449cf43809d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'name': ['Alice', 'Bob', 'Laura', 'George'], 'age': [25, 30, 35, 40], 'city': ['New York', 'Paris', 'London', 'Chicago'], 'salary': [50000, 60000, 70000, 80000]}\n"
]
}
],
"source": [
"print(data)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6def2b20-846f-4775-80bd-0e1e47ccb2c0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | 2 | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | 3 | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"0 Alice 25 New York 50000\n",
"1 Bob 30 Paris 60000\n",
"2 Laura 35 London 70000\n",
"3 George 40 Chicago 80000"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(data)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e4a0d969-79c8-4560-9ce4-c69a33e55af6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Day | \n",
" Temperature | \n",
" Precipitation | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 23.232203 | \n",
" 2.645556 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 25.727840 | \n",
" 7.742337 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 24.041451 | \n",
" 4.561503 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" 23.173248 | \n",
" 5.684339 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" 21.354822 | \n",
" 0.187898 | \n",
"
\n",
" \n",
" | 5 | \n",
" 6 | \n",
" 24.688412 | \n",
" 6.176355 | \n",
"
\n",
" \n",
" | 6 | \n",
" 7 | \n",
" 21.563808 | \n",
" 6.120957 | \n",
"
\n",
" \n",
" | 7 | \n",
" 8 | \n",
" 28.376595 | \n",
" 6.169340 | \n",
"
\n",
" \n",
" | 8 | \n",
" 9 | \n",
" 29.454941 | \n",
" 9.437481 | \n",
"
\n",
" \n",
" | 9 | \n",
" 10 | \n",
" 20.751623 | \n",
" 6.818203 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Day Temperature Precipitation\n",
"0 1 23.232203 2.645556\n",
"1 2 25.727840 7.742337\n",
"2 3 24.041451 4.561503\n",
"3 4 23.173248 5.684339\n",
"4 5 21.354822 0.187898\n",
"5 6 24.688412 6.176355\n",
"6 7 21.563808 6.120957\n",
"7 8 28.376595 6.169340\n",
"8 9 29.454941 9.437481\n",
"9 10 20.751623 6.818203"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weather_df = pd.read_csv('./Weather_data.csv')\n",
"weather_df.head(10)"
]
},
{
"cell_type": "markdown",
"id": "578b6b08-c431-4896-b8cb-ab77b31a102e",
"metadata": {},
"source": [
"### 2.2. Setting the index"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "5c491fbf-1f96-40d1-adb8-bf60542f80df",
"metadata": {},
"outputs": [],
"source": [
"new_index = ['a', 'b', 'c', 'd']\n",
"df.set_index(pd.Index(new_index), inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "4d9b8158-14ec-4cde-87c9-febda56272ae",
"metadata": {},
"outputs": [],
"source": [
"df['index_col'] = new_index"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "edbb2450-1553-4eda-bbd8-3b74216107a5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
" index_col | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
" a | \n",
"
\n",
" \n",
" | b | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
" b | \n",
"
\n",
" \n",
" | c | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
" c | \n",
"
\n",
" \n",
" | d | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
" d | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary index_col\n",
"a Alice 25 New York 50000 a\n",
"b Bob 30 Paris 60000 b\n",
"c Laura 35 London 70000 c\n",
"d George 40 Chicago 80000 d"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "efd96440-2361-4e8b-aa18-36d2ac736044",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | b | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | c | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | d | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"a Alice 25 New York 50000\n",
"b Bob 30 Paris 60000\n",
"c Laura 35 London 70000\n",
"d George 40 Chicago 80000"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.drop('index_col', axis=1, inplace=True)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "cef670a3-a85a-4da8-b1f0-fc01f27222f8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | 2 | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | 3 | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"0 Alice 25 New York 50000\n",
"1 Bob 30 Paris 60000\n",
"2 Laura 35 London 70000\n",
"3 George 40 Chicago 80000"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.reset_index(inplace=True, drop=True)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "ac555d48-8979-410c-a68e-f9a62cb120f7",
"metadata": {},
"source": [
"### 2.3. Getting and setting column names"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "911eda97-ee4c-4f32-80c5-037838f02a75",
"metadata": {},
"outputs": [],
"source": [
"col_names = df.columns"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "7497e7bc-2e06-4e94-9826-a3b3fc9c33c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Index(['name', 'age', 'city', 'salary'], dtype='object')\n"
]
}
],
"source": [
"print(col_names)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4b1dc8a4-256a-410b-b68f-1c2f2bdc29de",
"metadata": {},
"outputs": [],
"source": [
"rename_dict = {'city': 'location'}"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "3d812b3b-a43d-43e3-a5e5-19ad2e8ab6cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" location | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | 2 | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | 3 | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age location salary\n",
"0 Alice 25 New York 50000\n",
"1 Bob 30 Paris 60000\n",
"2 Laura 35 London 70000\n",
"3 George 40 Chicago 80000"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(columns=rename_dict, inplace=True)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "371a9b4e-a227-43c3-9d0f-53eb87049c99",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" location | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | 2 | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | 3 | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age location salary\n",
"0 Alice 25 New York 50000\n",
"1 Bob 30 Paris 60000\n",
"2 Laura 35 London 70000\n",
"3 George 40 Chicago 80000"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.column = pd.Index(['Name', 'Age', 'City', 'Salary'])\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "9ea6f95c-4875-43fc-b03b-28cbcbd183c9",
"metadata": {},
"source": [
"### 2.4. Getting information about dataframe"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "f04579d9-64c2-43ca-aade-dea4b538718a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" location | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age location salary\n",
"0 Alice 25 New York 50000\n",
"1 Bob 30 Paris 60000"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "e169cfde-a0eb-49e5-a17c-0c9e9cd8bacb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" location | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | 2 | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | 3 | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age location salary\n",
"1 Bob 30 Paris 60000\n",
"2 Laura 35 London 70000\n",
"3 George 40 Chicago 80000"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tail(3)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "b2828047-da85-4e4d-a631-aa70aaebabbe",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 4 entries, 0 to 3\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 name 4 non-null object\n",
" 1 age 4 non-null int64 \n",
" 2 location 4 non-null object\n",
" 3 salary 4 non-null int64 \n",
"dtypes: int64(2), object(2)\n",
"memory usage: 256.0+ bytes\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "1169a714-4f97-4a2f-9eee-9b0236e13909",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 4.000000 | \n",
" 4.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 32.500000 | \n",
" 65000.000000 | \n",
"
\n",
" \n",
" | std | \n",
" 6.454972 | \n",
" 12909.944487 | \n",
"
\n",
" \n",
" | min | \n",
" 25.000000 | \n",
" 50000.000000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 28.750000 | \n",
" 57500.000000 | \n",
"
\n",
" \n",
" | 50% | \n",
" 32.500000 | \n",
" 65000.000000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 36.250000 | \n",
" 72500.000000 | \n",
"
\n",
" \n",
" | max | \n",
" 40.000000 | \n",
" 80000.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age salary\n",
"count 4.000000 4.000000\n",
"mean 32.500000 65000.000000\n",
"std 6.454972 12909.944487\n",
"min 25.000000 50000.000000\n",
"25% 28.750000 57500.000000\n",
"50% 32.500000 65000.000000\n",
"75% 36.250000 72500.000000\n",
"max 40.000000 80000.000000"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "markdown",
"id": "10de24a4-16d1-4f3e-9889-b5afc0697240",
"metadata": {},
"source": [
"### 2.5. Selecting data"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "facf9f2e-b27a-40c2-a887-1ea1bb70b66d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 Alice\n",
"1 Bob\n",
"2 Laura\n",
"3 George\n",
"Name: name, dtype: object"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"name_col = df['name']\n",
"name_col"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1d582720-2dfa-491b-aa1e-93ef6780ac14",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Alice | \n",
" 25 | \n",
"
\n",
" \n",
" | 1 | \n",
" Bob | \n",
" 30 | \n",
"
\n",
" \n",
" | 2 | \n",
" Laura | \n",
" 35 | \n",
"
\n",
" \n",
" | 3 | \n",
" George | \n",
" 40 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age\n",
"0 Alice 25\n",
"1 Bob 30\n",
"2 Laura 35\n",
"3 George 40"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cols = ['name', 'age']\n",
"\n",
"multiple_col = df[cols]\n",
"multiple_col"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "8e7ce30d-405f-42da-a589-fec8cd968964",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"first Alice 25 New York 50000\n",
"second Bob 30 Paris 60000\n",
"third Laura 35 London 70000\n",
"fourth George 40 Chicago 80000"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"index = pd.Index(['first', 'second', 'third', 'fourth'])\n",
"\n",
"df.set_index(index, inplace=True)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "52323183-35fe-4378-afef-e3b57d1329d9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"name Alice\n",
"age 25\n",
"city New York\n",
"salary 50000\n",
"Name: first, dtype: object"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['first']"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "055d7849-278b-4fad-97de-816efaba36b6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"name Alice\n",
"age 25\n",
"city New York\n",
"salary 50000\n",
"Name: first, dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "79bd0c20-dea5-4fbc-b162-92cabd52d858",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"first Alice 25 New York 50000\n",
"second Bob 30 Paris 60000"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[['first', 'second']]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "d0794d2b-11a7-4e1a-a04a-5f938afa72e3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" city | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" New York | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" Paris | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name city\n",
"first Alice New York\n",
"second Bob Paris"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[['first', 'second'], ['name', 'city']]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "ba463bd6-71f9-4ec4-b336-7790828dde42",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" age | \n",
" city | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" 25 | \n",
" New York | \n",
"
\n",
" \n",
" | second | \n",
" 30 | \n",
" Paris | \n",
"
\n",
" \n",
" | third | \n",
" 35 | \n",
" London | \n",
"
\n",
" \n",
" | fourth | \n",
" 40 | \n",
" Chicago | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" age city\n",
"first 25 New York\n",
"second 30 Paris\n",
"third 35 London\n",
"fourth 40 Chicago"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.iloc[:, 1:3]"
]
},
{
"cell_type": "markdown",
"id": "d4c5c16d-9e07-46d5-99f7-0438e67ac1e5",
"metadata": {},
"source": [
"### 2.6. Filtering"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "8d082df3-3bcd-4452-8b52-85fdcc1e61f9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"first False\n",
"second False\n",
"third True\n",
"fourth True\n",
"Name: salary, dtype: bool"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['salary'] > 60000"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "2d5b51d1-9cee-4b6a-919e-95b4a0af95cd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"third Laura 35 London 70000\n",
"fourth George 40 Chicago 80000"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['salary'] > 60000]"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "13062732-66ad-43fb-b1db-1493a2e771a2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"third Laura 35 London 70000"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['city'] == 'London']"
]
},
{
"cell_type": "markdown",
"id": "90124921-8a17-44ee-8351-ff8242613651",
"metadata": {},
"source": [
"## 3. Working with dates\n",
"\n",
"### 3.1. Creating datetime data"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "08162f21-ab61-4596-ad82-bb40e37e3335",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatetimeIndex(['2026-01-01', '2026-02-01', '2026-03-01', '2026-02-01'], dtype='datetime64[ns]', freq=None)"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dates = pd.to_datetime(['2026-01-01', '2026-02-01', '2026-03-01', '2026-02-01'])\n",
"dates"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "c4c3d667-ad54-4cfa-a176-74fad848ec11",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event | \n",
" date | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" New Year | \n",
" 2026-01-01 | \n",
"
\n",
" \n",
" | 1 | \n",
" Valentine | \n",
" 2026-02-15 | \n",
"
\n",
" \n",
" | 2 | \n",
" Spring | \n",
" 2026-03-20 | \n",
"
\n",
" \n",
" | 3 | \n",
" July 4th | \n",
" 2026-07-04 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event date\n",
"0 New Year 2026-01-01\n",
"1 Valentine 2026-02-15\n",
"2 Spring 2026-03-20\n",
"3 July 4th 2026-07-04"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dates = pd.DataFrame({\n",
" 'event': ['New Year', 'Valentine', 'Spring', 'July 4th'],\n",
" 'date': pd.to_datetime(['2026-01-01', '2026-02-15', '2026-03-20', '2026-07-04'])\n",
"})\n",
"df_dates"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "4ed067c2-04f1-421a-9ae9-9805afb6aaab",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 4 entries, 0 to 3\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 event 4 non-null object \n",
" 1 date 4 non-null datetime64[ns]\n",
"dtypes: datetime64[ns](1), object(1)\n",
"memory usage: 192.0+ bytes\n"
]
}
],
"source": [
"df_dates.info()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "db7bd28c-20c1-4297-af36-b9e20e5eb9c0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event | \n",
" date | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" New Year | \n",
" 2026-01-01 | \n",
" 2026 | \n",
"
\n",
" \n",
" | 1 | \n",
" Valentine | \n",
" 2026-02-15 | \n",
" 2026 | \n",
"
\n",
" \n",
" | 2 | \n",
" Spring | \n",
" 2026-03-20 | \n",
" 2026 | \n",
"
\n",
" \n",
" | 3 | \n",
" July 4th | \n",
" 2026-07-04 | \n",
" 2026 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event date year\n",
"0 New Year 2026-01-01 2026\n",
"1 Valentine 2026-02-15 2026\n",
"2 Spring 2026-03-20 2026\n",
"3 July 4th 2026-07-04 2026"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dates['year'] = df_dates['date'].dt.year\n",
"df_dates"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "617899cf-0f4a-47c1-a12e-07ec780a5b2b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event | \n",
" date | \n",
" year | \n",
" month | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" New Year | \n",
" 2026-01-01 | \n",
" 2026 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Valentine | \n",
" 2026-02-15 | \n",
" 2026 | \n",
" 2 | \n",
"
\n",
" \n",
" | 2 | \n",
" Spring | \n",
" 2026-03-20 | \n",
" 2026 | \n",
" 3 | \n",
"
\n",
" \n",
" | 3 | \n",
" July 4th | \n",
" 2026-07-04 | \n",
" 2026 | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event date year month\n",
"0 New Year 2026-01-01 2026 1\n",
"1 Valentine 2026-02-15 2026 2\n",
"2 Spring 2026-03-20 2026 3\n",
"3 July 4th 2026-07-04 2026 7"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dates['month'] = df_dates['date'].dt.month\n",
"df_dates"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "c6e69b73-2c60-4f58-908a-12f78ec791d4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event | \n",
" date | \n",
" year | \n",
" month | \n",
" day | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" New Year | \n",
" 2026-01-01 | \n",
" 2026 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" Valentine | \n",
" 2026-02-15 | \n",
" 2026 | \n",
" 2 | \n",
" 15 | \n",
"
\n",
" \n",
" | 2 | \n",
" Spring | \n",
" 2026-03-20 | \n",
" 2026 | \n",
" 3 | \n",
" 20 | \n",
"
\n",
" \n",
" | 3 | \n",
" July 4th | \n",
" 2026-07-04 | \n",
" 2026 | \n",
" 7 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event date year month day\n",
"0 New Year 2026-01-01 2026 1 1\n",
"1 Valentine 2026-02-15 2026 2 15\n",
"2 Spring 2026-03-20 2026 3 20\n",
"3 July 4th 2026-07-04 2026 7 4"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_dates['day'] = df_dates['date'].dt.day\n",
"df_dates"
]
},
{
"cell_type": "markdown",
"id": "25d5a116-9f27-4357-9ce9-037800685213",
"metadata": {},
"source": [
"### 3.3. Date arithmetic"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "f2521fd5-0043-4e5d-b350-d40736f83313",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" event | \n",
" date | \n",
" year | \n",
" month | \n",
" day | \n",
" days_since_newyear | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" New Year | \n",
" 2026-01-01 | \n",
" 2026 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" Valentine | \n",
" 2026-02-15 | \n",
" 2026 | \n",
" 2 | \n",
" 15 | \n",
" 45 | \n",
"
\n",
" \n",
" | 2 | \n",
" Spring | \n",
" 2026-03-20 | \n",
" 2026 | \n",
" 3 | \n",
" 20 | \n",
" 78 | \n",
"
\n",
" \n",
" | 3 | \n",
" July 4th | \n",
" 2026-07-04 | \n",
" 2026 | \n",
" 7 | \n",
" 4 | \n",
" 184 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" event date year month day days_since_newyear\n",
"0 New Year 2026-01-01 2026 1 1 0\n",
"1 Valentine 2026-02-15 2026 2 15 45\n",
"2 Spring 2026-03-20 2026 3 20 78\n",
"3 July 4th 2026-07-04 2026 7 4 184"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"days_since_newyear = (df_dates['date'] - df_dates['date'].min()).dt.days\n",
"df_dates['days_since_newyear'] = days_since_newyear\n",
"df_dates"
]
},
{
"cell_type": "markdown",
"id": "a7a8ba21-9d74-4a4d-8174-936a520d261e",
"metadata": {},
"source": [
"## 4. Working with data types"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "9003c281-e3ae-4808-94a8-c8b586e12409",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" numbers_as_text | \n",
" prices | \n",
" categories | \n",
" is_active | \n",
" dates_as_text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 10.50 | \n",
" A | \n",
" True | \n",
" 2024-01-01 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 20.75 | \n",
" B | \n",
" False | \n",
" 2024-01-02 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 15.25 | \n",
" A | \n",
" True | \n",
" 2024-01-03 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" 30.00 | \n",
" C | \n",
" True | \n",
" 2024-01-04 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" 25.50 | \n",
" B | \n",
" False | \n",
" 2024-01-05 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" numbers_as_text prices categories is_active dates_as_text\n",
"0 1 10.50 A True 2024-01-01\n",
"1 2 20.75 B False 2024-01-02\n",
"2 3 15.25 A True 2024-01-03\n",
"3 4 30.00 C True 2024-01-04\n",
"4 5 25.50 B False 2024-01-05"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mixed_data = pd.DataFrame({\n",
" 'numbers_as_text': ['1', '2', '3', '4', '5'],\n",
" 'prices': ['10.50', '20.75', '15.25', '30.00', '25.50'],\n",
" 'categories': ['A', 'B', 'A', 'C', 'B'],\n",
" 'is_active': ['True', 'False', 'True', 'True', 'False'],\n",
" 'dates_as_text': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']\n",
"})\n",
"mixed_data"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "0dc3da58-a059-4500-bb40-6796e4a0fad8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5 entries, 0 to 4\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 numbers_as_text 5 non-null object\n",
" 1 prices 5 non-null object\n",
" 2 categories 5 non-null object\n",
" 3 is_active 5 non-null object\n",
" 4 dates_as_text 5 non-null object\n",
"dtypes: object(5)\n",
"memory usage: 328.0+ bytes\n"
]
}
],
"source": [
"mixed_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "1060fb70-a1e5-4dd8-83de-3d3dce550742",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5 entries, 0 to 4\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 numbers_as_text 5 non-null object \n",
" 1 prices 5 non-null float64\n",
" 2 categories 5 non-null object \n",
" 3 is_active 5 non-null object \n",
" 4 dates_as_text 5 non-null object \n",
"dtypes: float64(1), object(4)\n",
"memory usage: 328.0+ bytes\n"
]
}
],
"source": [
"mixed_data['prices'] = mixed_data['prices'].astype(float)\n",
"mixed_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "34acdb4e-d705-486d-81e2-7b8aa63be884",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" numbers_as_text | \n",
" prices | \n",
" categories | \n",
" is_active | \n",
" dates_as_text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 10.50 | \n",
" A | \n",
" True | \n",
" 2024-01-01 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 20.75 | \n",
" B | \n",
" False | \n",
" 2024-01-02 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 15.25 | \n",
" A | \n",
" True | \n",
" 2024-01-03 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" 30.00 | \n",
" C | \n",
" True | \n",
" 2024-01-04 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" 25.50 | \n",
" B | \n",
" False | \n",
" 2024-01-05 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" numbers_as_text prices categories is_active dates_as_text\n",
"0 1 10.50 A True 2024-01-01\n",
"1 2 20.75 B False 2024-01-02\n",
"2 3 15.25 A True 2024-01-03\n",
"3 4 30.00 C True 2024-01-04\n",
"4 5 25.50 B False 2024-01-05"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mixed_data['categories'] = mixed_data['categories'].astype('category')\n",
"mixed_data"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "682b76ab-9a8f-42f6-9628-8a8a247ec0cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5 entries, 0 to 4\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 numbers_as_text 5 non-null object \n",
" 1 prices 5 non-null float64 \n",
" 2 categories 5 non-null category\n",
" 3 is_active 5 non-null object \n",
" 4 dates_as_text 5 non-null object \n",
"dtypes: category(1), float64(1), object(3)\n",
"memory usage: 425.0+ bytes\n"
]
}
],
"source": [
"mixed_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "3e9951f8-93d2-4ece-8d62-75d6aa27ea57",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['A', 'B', 'C'], dtype='object')"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mixed_data['categories'].cat.categories"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "3a0f8788-72b4-411a-802c-0367789d14de",
"metadata": {},
"outputs": [],
"source": [
"mixed_data['dates_as_text'] = mixed_data['dates_as_text'].astype('string')"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "d85ea388-79f6-4843-93be-534f1de2109a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5 entries, 0 to 4\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 numbers_as_text 5 non-null object \n",
" 1 prices 5 non-null float64 \n",
" 2 categories 5 non-null category\n",
" 3 is_active 5 non-null object \n",
" 4 dates_as_text 5 non-null string \n",
"dtypes: category(1), float64(1), object(2), string(1)\n",
"memory usage: 425.0+ bytes\n"
]
}
],
"source": [
"mixed_data.info()"
]
},
{
"cell_type": "markdown",
"id": "d94ef96e-e6d8-4cb2-b879-54275b91c975",
"metadata": {},
"source": [
"## 5. Iteration"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "6c5bce80-48f8-4aba-a49d-202269dfbb32",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"first row, name: Alice\n",
"second row, name: Bob\n",
"third row, name: Laura\n",
"fourth row, name: George\n"
]
}
],
"source": [
"for index, row in df.iterrows():\n",
" print(f'{index} row, name: {row[\"name\"]}')"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "7a3160c6-2948-4df4-bced-cf30a56e5894",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"name Alice\n",
"age 25\n",
"city New York\n",
"salary 50000\n",
"Name: first, dtype: object\n",
"name Bob\n",
"age 30\n",
"city Paris\n",
"salary 60000\n",
"Name: second, dtype: object\n",
"name Laura\n",
"age 35\n",
"city London\n",
"salary 70000\n",
"Name: third, dtype: object\n",
"name George\n",
"age 40\n",
"city Chicago\n",
"salary 80000\n",
"Name: fourth, dtype: object\n"
]
}
],
"source": [
"count = 0\n",
"\n",
"while count < len(df):\n",
" print(f'{df.iloc[count]}')\n",
" count+=1"
]
},
{
"cell_type": "markdown",
"id": "4b6c5b7a-39d3-4c94-8f75-139cbab32374",
"metadata": {},
"source": [
"## 6. Sorting"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "71ad75a5-5e83-482c-b235-ed0c1f950bb1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"first Alice 25 New York 50000\n",
"second Bob 30 Paris 60000\n",
"third Laura 35 London 70000\n",
"fourth George 40 Chicago 80000"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values('age')"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "563bac5f-7264-4e8a-9963-b38725036248",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"fourth George 40 Chicago 80000\n",
"third Laura 35 London 70000\n",
"second Bob 30 Paris 60000\n",
"first Alice 25 New York 50000"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values('age', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "110448ea-df28-4ed5-983f-e2c8b27feb08",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 50000 | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 60000 | \n",
"
\n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 70000 | \n",
"
\n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 80000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"first Alice 25 New York 50000\n",
"second Bob 30 Paris 60000\n",
"third Laura 35 London 70000\n",
"fourth George 40 Chicago 80000"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(['age', 'city'])"
]
},
{
"cell_type": "markdown",
"id": "e81e6d72-2b95-4555-81c0-2badc5ce8cdc",
"metadata": {},
"source": [
"## 7. `apply()` and `groupby()`\n",
"\n",
"### 7.1 `apply()`"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "40ec2d2c-ffec-40ff-99c6-114c99f21d65",
"metadata": {},
"outputs": [],
"source": [
"def double(x):\n",
" print(f'Doubling {x}')\n",
" return x * 2"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "a2481c29-20bb-4770-8e5a-d470a5d80ab8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Doubling 5\n",
"10\n"
]
}
],
"source": [
"print(double(5))"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "47900c1c-462d-4354-97dc-eddc85c4a082",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10\n"
]
}
],
"source": [
"double_lambda = lambda x: x * 2\n",
"print(double_lambda(5))"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "a2a9eb53-3cf4-42d4-9f90-cf0dd50bd74d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 60500.0 | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 72600.0 | \n",
"
\n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 84700.0 | \n",
"
\n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 96800.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary\n",
"first Alice 25 New York 60500.0\n",
"second Bob 30 Paris 72600.0\n",
"third Laura 35 London 84700.0\n",
"fourth George 40 Chicago 96800.0"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['salary'] = df['salary'].apply(lambda x: x * 1.1)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "16af2dce-e9f6-4c7b-a8ed-5614217e2617",
"metadata": {},
"outputs": [],
"source": [
"def salary_bracket(salary):\n",
" \n",
" if salary >= 80000:\n",
" return 'high'\n",
" \n",
" elif salary >= 50000:\n",
" return 'mid'\n",
" \n",
" else:\n",
" return 'standard'"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "c5ec69dc-34d5-419f-87cd-c1d72143b22c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" name | \n",
" age | \n",
" city | \n",
" salary | \n",
" bracket | \n",
"
\n",
" \n",
" \n",
" \n",
" | first | \n",
" Alice | \n",
" 25 | \n",
" New York | \n",
" 60500.0 | \n",
" mid | \n",
"
\n",
" \n",
" | second | \n",
" Bob | \n",
" 30 | \n",
" Paris | \n",
" 72600.0 | \n",
" mid | \n",
"
\n",
" \n",
" | third | \n",
" Laura | \n",
" 35 | \n",
" London | \n",
" 84700.0 | \n",
" high | \n",
"
\n",
" \n",
" | fourth | \n",
" George | \n",
" 40 | \n",
" Chicago | \n",
" 96800.0 | \n",
" high | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" name age city salary bracket\n",
"first Alice 25 New York 60500.0 mid\n",
"second Bob 30 Paris 72600.0 mid\n",
"third Laura 35 London 84700.0 high\n",
"fourth George 40 Chicago 96800.0 high"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['bracket'] = df['salary'].apply(salary_bracket)\n",
"df"
]
},
{
"cell_type": "markdown",
"id": "c120c229-6e03-4814-b386-aeb71aa87b6a",
"metadata": {},
"source": [
"### 7.2. `groupby()`"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "e49ddcfc-cb4f-459e-b9f2-32afffe0dbcf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" region | \n",
" product | \n",
" revenue | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" North | \n",
" A | \n",
" 100 | \n",
"
\n",
" \n",
" | 1 | \n",
" South | \n",
" A | \n",
" 150 | \n",
"
\n",
" \n",
" | 2 | \n",
" North | \n",
" B | \n",
" 200 | \n",
"
\n",
" \n",
" | 3 | \n",
" South | \n",
" B | \n",
" 130 | \n",
"
\n",
" \n",
" | 4 | \n",
" East | \n",
" A | \n",
" 110 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" region product revenue\n",
"0 North A 100\n",
"1 South A 150\n",
"2 North B 200\n",
"3 South B 130\n",
"4 East A 110"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales = pd.DataFrame({\n",
" 'region': ['North', 'South', 'North', 'South', 'East'],\n",
" 'product': ['A', 'A', 'B', 'B', 'A'],\n",
" 'revenue': [100, 150, 200, 130, 110]\n",
"})\n",
"\n",
"sales"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "85e7dd27-f25c-4e53-89ad-7ebe93775f48",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"region\n",
"East 110\n",
"North 300\n",
"South 280\n",
"Name: revenue, dtype: int64"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales.groupby('region')['revenue'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "5135c548-0480-4977-bcb0-8b0b4d50fe29",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"region product\n",
"East A 110\n",
"North A 100\n",
" B 200\n",
"South A 150\n",
" B 130\n",
"Name: revenue, dtype: int64"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales.groupby(['region', 'product'])['revenue'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "95047e7b-93e9-4e7d-bb3e-ec3391c94ad9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" min | \n",
" max | \n",
" sum | \n",
" mean | \n",
"
\n",
" \n",
" | region | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | East | \n",
" 110 | \n",
" 110 | \n",
" 110 | \n",
" 110.0 | \n",
"
\n",
" \n",
" | North | \n",
" 100 | \n",
" 200 | \n",
" 300 | \n",
" 150.0 | \n",
"
\n",
" \n",
" | South | \n",
" 130 | \n",
" 150 | \n",
" 280 | \n",
" 140.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" min max sum mean\n",
"region \n",
"East 110 110 110 110.0\n",
"North 100 200 300 150.0\n",
"South 130 150 280 140.0"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales.groupby('region')['revenue'].agg(['min', 'max', 'sum', 'mean'])"
]
},
{
"cell_type": "markdown",
"id": "7a75f0cc-8445-4b00-bd05-b527b8ccbe93",
"metadata": {},
"source": [
"## 8. Writing to a file"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "ae62b086-8185-42eb-9a86-2a020c7192eb",
"metadata": {},
"outputs": [],
"source": [
"sales.to_json('./sales_data.json')"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "15e4fdd5-f720-42ed-82ff-bd0b50dd7b4d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" region | \n",
" product | \n",
" revenue | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" North | \n",
" A | \n",
" 100 | \n",
"
\n",
" \n",
" | 1 | \n",
" South | \n",
" A | \n",
" 150 | \n",
"
\n",
" \n",
" | 2 | \n",
" North | \n",
" B | \n",
" 200 | \n",
"
\n",
" \n",
" | 3 | \n",
" South | \n",
" B | \n",
" 130 | \n",
"
\n",
" \n",
" | 4 | \n",
" East | \n",
" A | \n",
" 110 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" region product revenue\n",
"0 North A 100\n",
"1 South A 150\n",
"2 North B 200\n",
"3 South B 130\n",
"4 East A 110"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_json = pd.read_json('./sales_data.json')\n",
"sales_json"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "f3b34e14-5d28-4039-b64f-98b777bcd19a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5 entries, 0 to 4\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 region 5 non-null object\n",
" 1 product 5 non-null object\n",
" 2 revenue 5 non-null int64 \n",
"dtypes: int64(1), object(2)\n",
"memory usage: 248.0+ bytes\n"
]
}
],
"source": [
"sales.info()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "9f950f05-7c6e-46c1-803a-f34df64eb23d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Int64Index: 5 entries, 0 to 4\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 region 5 non-null object\n",
" 1 product 5 non-null object\n",
" 2 revenue 5 non-null int64 \n",
"dtypes: int64(1), object(2)\n",
"memory usage: 160.0+ bytes\n"
]
}
],
"source": [
"sales_json.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bed11687-7d21-4093-9622-1d17641b88d5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 [3.10]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}