{ "cells": [ { "cell_type": "markdown", "id": "8537d349-6446-473e-bd9e-fd6ec2042f28", "metadata": {}, "source": [ "# Lesson 10 demo: Pandas\n", "\n", "## 1. Set up\n", "\n", "Install with:\n", "\n", "```text\n", "pip install pandas\n", "```" ] }, { "cell_type": "code", "execution_count": 1, "id": "30dcf970-f9f9-4230-afc5-a31426b16e49", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pandas version 1.5.3\n" ] } ], "source": [ "import pandas as pd\n", "print(f'Pandas version {pd.__version__}')" ] }, { "cell_type": "markdown", "id": "4d5a78cf-a186-4310-a80d-ed7e3ca14e73", "metadata": { "toc-hr-collapsed": true }, "source": [ "## 1. Series: one-dimensional data\n", "\n", "### 1.1. Create a series" ] }, { "cell_type": "code", "execution_count": 2, "id": "73015824-e26c-45b0-8659-8a63bb14f637", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 20\n", "1 30\n", "2 40\n", "3 50\n", "dtype: int64\n" ] } ], "source": [ "my_series = pd.Series([20, 30, 40, 50])\n", "print(my_series)" ] }, { "cell_type": "code", "execution_count": 3, "id": "e33fe749-feb2-48ff-9930-01b060c6ee76", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(my_series)" ] }, { "cell_type": "code", "execution_count": 4, "id": "58930871-dea4-4276-908e-8191c2b80906", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Mon', 'Tues', 'Wed', 'Thurs']\n" ] } ], "source": [ "day = ['Mon', 'Tues', 'Wed', 'Thurs']\n", "print(day)" ] }, { "cell_type": "code", "execution_count": 5, "id": "8e420bc9-e8a4-423f-a3a1-1a3ba4be0c21", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mon 20\n", "Tues 30\n", "Wed 40\n", "Thurs 50\n", "dtype: int64\n" ] } ], "source": [ "my_indexed_series = pd.Series([20, 30, 40, 50], index=day)\n", "print(my_indexed_series)" ] }, { "cell_type": "markdown", "id": "fa28881d-1e50-4b75-a351-a26b383f56a2", "metadata": {}, "source": [ "### 1.2. Indexing series" ] }, { "cell_type": "code", "execution_count": 6, "id": "4560f31c-16bf-4e36-8e60-412b4f8bb258", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "40\n" ] } ], "source": [ "print(my_indexed_series['Wed'])" ] }, { "cell_type": "code", "execution_count": 7, "id": "c2f253e5-a9ca-4200-aa71-d26de5e2efaf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "30\n" ] } ], "source": [ "print(my_series.iloc[1])" ] }, { "cell_type": "markdown", "id": "5e7bd797-89bd-4bc0-96d3-56a43a529442", "metadata": {}, "source": [ "### 2.3. Series operations" ] }, { "cell_type": "code", "execution_count": 8, "id": "e3c0ec65-b978-4d7f-9c08-29d77d40150b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "50\n" ] } ], "source": [ "print(my_series.max())" ] }, { "cell_type": "code", "execution_count": 9, "id": "f6510c0a-55a2-4a35-8bc6-88af0c94c080", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "35.0\n" ] } ], "source": [ "print(my_series.mean())" ] }, { "cell_type": "markdown", "id": "c111179b-fb73-46a0-a3e7-cc45280e990d", "metadata": { "tags": [] }, "source": [ "## 2. DataFrame: two-deimensional data\n", "\n", "### 2.1. Creating dataframes" ] }, { "cell_type": "code", "execution_count": 10, "id": "ad5862cc-42d6-4b32-8189-ac8d0da02ccc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = {\n", " 'name': ['Alice', 'Bob', 'Laura', 'George'],\n", " 'age': [25, 30, 35, 40],\n", " 'city': ['New York', 'Paris', 'London', 'Chicago'],\n", " 'salary': [50000, 60000, 70000, 80000]\n", "}\n", "\n", "type(data)" ] }, { "cell_type": "code", "execution_count": 11, "id": "bf087fe4-1720-4574-90e9-6449cf43809d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'name': ['Alice', 'Bob', 'Laura', 'George'], 'age': [25, 30, 35, 40], 'city': ['New York', 'Paris', 'London', 'Chicago'], 'salary': [50000, 60000, 70000, 80000]}\n" ] } ], "source": [ "print(data)" ] }, { "cell_type": "code", "execution_count": 12, "id": "6def2b20-846f-4775-80bd-0e1e47ccb2c0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
0Alice25New York50000
1Bob30Paris60000
2Laura35London70000
3George40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "0 Alice 25 New York 50000\n", "1 Bob 30 Paris 60000\n", "2 Laura 35 London 70000\n", "3 George 40 Chicago 80000" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(data)\n", "df" ] }, { "cell_type": "code", "execution_count": 15, "id": "e4a0d969-79c8-4560-9ce4-c69a33e55af6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DayTemperaturePrecipitation
0123.2322032.645556
1225.7278407.742337
2324.0414514.561503
3423.1732485.684339
4521.3548220.187898
5624.6884126.176355
6721.5638086.120957
7828.3765956.169340
8929.4549419.437481
91020.7516236.818203
\n", "
" ], "text/plain": [ " Day Temperature Precipitation\n", "0 1 23.232203 2.645556\n", "1 2 25.727840 7.742337\n", "2 3 24.041451 4.561503\n", "3 4 23.173248 5.684339\n", "4 5 21.354822 0.187898\n", "5 6 24.688412 6.176355\n", "6 7 21.563808 6.120957\n", "7 8 28.376595 6.169340\n", "8 9 29.454941 9.437481\n", "9 10 20.751623 6.818203" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "weather_df = pd.read_csv('./Weather_data.csv')\n", "weather_df.head(10)" ] }, { "cell_type": "markdown", "id": "578b6b08-c431-4896-b8cb-ab77b31a102e", "metadata": {}, "source": [ "### 2.2. Setting the index" ] }, { "cell_type": "code", "execution_count": 46, "id": "5c491fbf-1f96-40d1-adb8-bf60542f80df", "metadata": {}, "outputs": [], "source": [ "new_index = ['a', 'b', 'c', 'd']\n", "df.set_index(pd.Index(new_index), inplace=True)" ] }, { "cell_type": "code", "execution_count": 47, "id": "4d9b8158-14ec-4cde-87c9-febda56272ae", "metadata": {}, "outputs": [], "source": [ "df['index_col'] = new_index" ] }, { "cell_type": "code", "execution_count": 48, "id": "edbb2450-1553-4eda-bbd8-3b74216107a5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalaryindex_col
aAlice25New York50000a
bBob30Paris60000b
cLaura35London70000c
dGeorge40Chicago80000d
\n", "
" ], "text/plain": [ " name age city salary index_col\n", "a Alice 25 New York 50000 a\n", "b Bob 30 Paris 60000 b\n", "c Laura 35 London 70000 c\n", "d George 40 Chicago 80000 d" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 50, "id": "efd96440-2361-4e8b-aa18-36d2ac736044", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
aAlice25New York50000
bBob30Paris60000
cLaura35London70000
dGeorge40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "a Alice 25 New York 50000\n", "b Bob 30 Paris 60000\n", "c Laura 35 London 70000\n", "d George 40 Chicago 80000" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop('index_col', axis=1, inplace=True)\n", "df" ] }, { "cell_type": "code", "execution_count": 51, "id": "cef670a3-a85a-4da8-b1f0-fc01f27222f8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
0Alice25New York50000
1Bob30Paris60000
2Laura35London70000
3George40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "0 Alice 25 New York 50000\n", "1 Bob 30 Paris 60000\n", "2 Laura 35 London 70000\n", "3 George 40 Chicago 80000" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.reset_index(inplace=True, drop=True)\n", "df" ] }, { "cell_type": "markdown", "id": "ac555d48-8979-410c-a68e-f9a62cb120f7", "metadata": {}, "source": [ "### 2.3. Getting and setting column names" ] }, { "cell_type": "code", "execution_count": 52, "id": "911eda97-ee4c-4f32-80c5-037838f02a75", "metadata": {}, "outputs": [], "source": [ "col_names = df.columns" ] }, { "cell_type": "code", "execution_count": 53, "id": "7497e7bc-2e06-4e94-9826-a3b3fc9c33c4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['name', 'age', 'city', 'salary'], dtype='object')\n" ] } ], "source": [ "print(col_names)" ] }, { "cell_type": "code", "execution_count": 54, "id": "4b1dc8a4-256a-410b-b68f-1c2f2bdc29de", "metadata": {}, "outputs": [], "source": [ "rename_dict = {'city': 'location'}" ] }, { "cell_type": "code", "execution_count": 55, "id": "3d812b3b-a43d-43e3-a5e5-19ad2e8ab6cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagelocationsalary
0Alice25New York50000
1Bob30Paris60000
2Laura35London70000
3George40Chicago80000
\n", "
" ], "text/plain": [ " name age location salary\n", "0 Alice 25 New York 50000\n", "1 Bob 30 Paris 60000\n", "2 Laura 35 London 70000\n", "3 George 40 Chicago 80000" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.rename(columns=rename_dict, inplace=True)\n", "df" ] }, { "cell_type": "code", "execution_count": 61, "id": "371a9b4e-a227-43c3-9d0f-53eb87049c99", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagelocationsalary
0Alice25New York50000
1Bob30Paris60000
2Laura35London70000
3George40Chicago80000
\n", "
" ], "text/plain": [ " name age location salary\n", "0 Alice 25 New York 50000\n", "1 Bob 30 Paris 60000\n", "2 Laura 35 London 70000\n", "3 George 40 Chicago 80000" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.column = pd.Index(['Name', 'Age', 'City', 'Salary'])\n", "df" ] }, { "cell_type": "markdown", "id": "9ea6f95c-4875-43fc-b03b-28cbcbd183c9", "metadata": {}, "source": [ "### 2.4. Getting information about dataframe" ] }, { "cell_type": "code", "execution_count": 62, "id": "f04579d9-64c2-43ca-aade-dea4b538718a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagelocationsalary
0Alice25New York50000
1Bob30Paris60000
\n", "
" ], "text/plain": [ " name age location salary\n", "0 Alice 25 New York 50000\n", "1 Bob 30 Paris 60000" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(2)" ] }, { "cell_type": "code", "execution_count": 63, "id": "e169cfde-a0eb-49e5-a17c-0c9e9cd8bacb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagelocationsalary
1Bob30Paris60000
2Laura35London70000
3George40Chicago80000
\n", "
" ], "text/plain": [ " name age location salary\n", "1 Bob 30 Paris 60000\n", "2 Laura 35 London 70000\n", "3 George 40 Chicago 80000" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail(3)" ] }, { "cell_type": "code", "execution_count": 64, "id": "b2828047-da85-4e4d-a631-aa70aaebabbe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4 entries, 0 to 3\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 name 4 non-null object\n", " 1 age 4 non-null int64 \n", " 2 location 4 non-null object\n", " 3 salary 4 non-null int64 \n", "dtypes: int64(2), object(2)\n", "memory usage: 256.0+ bytes\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 65, "id": "1169a714-4f97-4a2f-9eee-9b0236e13909", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesalary
count4.0000004.000000
mean32.50000065000.000000
std6.45497212909.944487
min25.00000050000.000000
25%28.75000057500.000000
50%32.50000065000.000000
75%36.25000072500.000000
max40.00000080000.000000
\n", "
" ], "text/plain": [ " age salary\n", "count 4.000000 4.000000\n", "mean 32.500000 65000.000000\n", "std 6.454972 12909.944487\n", "min 25.000000 50000.000000\n", "25% 28.750000 57500.000000\n", "50% 32.500000 65000.000000\n", "75% 36.250000 72500.000000\n", "max 40.000000 80000.000000" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "id": "10de24a4-16d1-4f3e-9889-b5afc0697240", "metadata": {}, "source": [ "### 2.5. Selecting data" ] }, { "cell_type": "code", "execution_count": 19, "id": "facf9f2e-b27a-40c2-a887-1ea1bb70b66d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Alice\n", "1 Bob\n", "2 Laura\n", "3 George\n", "Name: name, dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "name_col = df['name']\n", "name_col" ] }, { "cell_type": "code", "execution_count": 23, "id": "1d582720-2dfa-491b-aa1e-93ef6780ac14", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0Alice25
1Bob30
2Laura35
3George40
\n", "
" ], "text/plain": [ " name age\n", "0 Alice 25\n", "1 Bob 30\n", "2 Laura 35\n", "3 George 40" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = ['name', 'age']\n", "\n", "multiple_col = df[cols]\n", "multiple_col" ] }, { "cell_type": "code", "execution_count": 27, "id": "8e7ce30d-405f-42da-a589-fec8cd968964", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
firstAlice25New York50000
secondBob30Paris60000
thirdLaura35London70000
fourthGeorge40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "first Alice 25 New York 50000\n", "second Bob 30 Paris 60000\n", "third Laura 35 London 70000\n", "fourth George 40 Chicago 80000" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index = pd.Index(['first', 'second', 'third', 'fourth'])\n", "\n", "df.set_index(index, inplace=True)\n", "df" ] }, { "cell_type": "code", "execution_count": 28, "id": "52323183-35fe-4378-afef-e3b57d1329d9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name Alice\n", "age 25\n", "city New York\n", "salary 50000\n", "Name: first, dtype: object" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc['first']" ] }, { "cell_type": "code", "execution_count": 31, "id": "055d7849-278b-4fad-97de-816efaba36b6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name Alice\n", "age 25\n", "city New York\n", "salary 50000\n", "Name: first, dtype: object" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[0]" ] }, { "cell_type": "code", "execution_count": 32, "id": "79bd0c20-dea5-4fbc-b162-92cabd52d858", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
firstAlice25New York50000
secondBob30Paris60000
\n", "
" ], "text/plain": [ " name age city salary\n", "first Alice 25 New York 50000\n", "second Bob 30 Paris 60000" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[['first', 'second']]" ] }, { "cell_type": "code", "execution_count": 33, "id": "d0794d2b-11a7-4e1a-a04a-5f938afa72e3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecity
firstAliceNew York
secondBobParis
\n", "
" ], "text/plain": [ " name city\n", "first Alice New York\n", "second Bob Paris" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[['first', 'second'], ['name', 'city']]" ] }, { "cell_type": "code", "execution_count": 34, "id": "ba463bd6-71f9-4ec4-b336-7790828dde42", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agecity
first25New York
second30Paris
third35London
fourth40Chicago
\n", "
" ], "text/plain": [ " age city\n", "first 25 New York\n", "second 30 Paris\n", "third 35 London\n", "fourth 40 Chicago" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:, 1:3]" ] }, { "cell_type": "markdown", "id": "d4c5c16d-9e07-46d5-99f7-0438e67ac1e5", "metadata": {}, "source": [ "### 2.6. Filtering" ] }, { "cell_type": "code", "execution_count": 35, "id": "8d082df3-3bcd-4452-8b52-85fdcc1e61f9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "first False\n", "second False\n", "third True\n", "fourth True\n", "Name: salary, dtype: bool" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['salary'] > 60000" ] }, { "cell_type": "code", "execution_count": 36, "id": "2d5b51d1-9cee-4b6a-919e-95b4a0af95cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
thirdLaura35London70000
fourthGeorge40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "third Laura 35 London 70000\n", "fourth George 40 Chicago 80000" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['salary'] > 60000]" ] }, { "cell_type": "code", "execution_count": 37, "id": "13062732-66ad-43fb-b1db-1493a2e771a2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
thirdLaura35London70000
\n", "
" ], "text/plain": [ " name age city salary\n", "third Laura 35 London 70000" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['city'] == 'London']" ] }, { "cell_type": "markdown", "id": "90124921-8a17-44ee-8351-ff8242613651", "metadata": {}, "source": [ "## 3. Working with dates\n", "\n", "### 3.1. Creating datetime data" ] }, { "cell_type": "code", "execution_count": 38, "id": "08162f21-ab61-4596-ad82-bb40e37e3335", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2026-01-01', '2026-02-01', '2026-03-01', '2026-02-01'], dtype='datetime64[ns]', freq=None)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dates = pd.to_datetime(['2026-01-01', '2026-02-01', '2026-03-01', '2026-02-01'])\n", "dates" ] }, { "cell_type": "code", "execution_count": 39, "id": "c4c3d667-ad54-4cfa-a176-74fad848ec11", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eventdate
0New Year2026-01-01
1Valentine2026-02-15
2Spring2026-03-20
3July 4th2026-07-04
\n", "
" ], "text/plain": [ " event date\n", "0 New Year 2026-01-01\n", "1 Valentine 2026-02-15\n", "2 Spring 2026-03-20\n", "3 July 4th 2026-07-04" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dates = pd.DataFrame({\n", " 'event': ['New Year', 'Valentine', 'Spring', 'July 4th'],\n", " 'date': pd.to_datetime(['2026-01-01', '2026-02-15', '2026-03-20', '2026-07-04'])\n", "})\n", "df_dates" ] }, { "cell_type": "code", "execution_count": 40, "id": "4ed067c2-04f1-421a-9ae9-9805afb6aaab", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4 entries, 0 to 3\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 event 4 non-null object \n", " 1 date 4 non-null datetime64[ns]\n", "dtypes: datetime64[ns](1), object(1)\n", "memory usage: 192.0+ bytes\n" ] } ], "source": [ "df_dates.info()" ] }, { "cell_type": "code", "execution_count": 41, "id": "db7bd28c-20c1-4297-af36-b9e20e5eb9c0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eventdateyear
0New Year2026-01-012026
1Valentine2026-02-152026
2Spring2026-03-202026
3July 4th2026-07-042026
\n", "
" ], "text/plain": [ " event date year\n", "0 New Year 2026-01-01 2026\n", "1 Valentine 2026-02-15 2026\n", "2 Spring 2026-03-20 2026\n", "3 July 4th 2026-07-04 2026" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dates['year'] = df_dates['date'].dt.year\n", "df_dates" ] }, { "cell_type": "code", "execution_count": 42, "id": "617899cf-0f4a-47c1-a12e-07ec780a5b2b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eventdateyearmonth
0New Year2026-01-0120261
1Valentine2026-02-1520262
2Spring2026-03-2020263
3July 4th2026-07-0420267
\n", "
" ], "text/plain": [ " event date year month\n", "0 New Year 2026-01-01 2026 1\n", "1 Valentine 2026-02-15 2026 2\n", "2 Spring 2026-03-20 2026 3\n", "3 July 4th 2026-07-04 2026 7" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dates['month'] = df_dates['date'].dt.month\n", "df_dates" ] }, { "cell_type": "code", "execution_count": 43, "id": "c6e69b73-2c60-4f58-908a-12f78ec791d4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eventdateyearmonthday
0New Year2026-01-01202611
1Valentine2026-02-152026215
2Spring2026-03-202026320
3July 4th2026-07-04202674
\n", "
" ], "text/plain": [ " event date year month day\n", "0 New Year 2026-01-01 2026 1 1\n", "1 Valentine 2026-02-15 2026 2 15\n", "2 Spring 2026-03-20 2026 3 20\n", "3 July 4th 2026-07-04 2026 7 4" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_dates['day'] = df_dates['date'].dt.day\n", "df_dates" ] }, { "cell_type": "markdown", "id": "25d5a116-9f27-4357-9ce9-037800685213", "metadata": {}, "source": [ "### 3.3. Date arithmetic" ] }, { "cell_type": "code", "execution_count": 44, "id": "f2521fd5-0043-4e5d-b350-d40736f83313", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eventdateyearmonthdaydays_since_newyear
0New Year2026-01-012026110
1Valentine2026-02-15202621545
2Spring2026-03-20202632078
3July 4th2026-07-04202674184
\n", "
" ], "text/plain": [ " event date year month day days_since_newyear\n", "0 New Year 2026-01-01 2026 1 1 0\n", "1 Valentine 2026-02-15 2026 2 15 45\n", "2 Spring 2026-03-20 2026 3 20 78\n", "3 July 4th 2026-07-04 2026 7 4 184" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "days_since_newyear = (df_dates['date'] - df_dates['date'].min()).dt.days\n", "df_dates['days_since_newyear'] = days_since_newyear\n", "df_dates" ] }, { "cell_type": "markdown", "id": "a7a8ba21-9d74-4a4d-8174-936a520d261e", "metadata": {}, "source": [ "## 4. Working with data types" ] }, { "cell_type": "code", "execution_count": 45, "id": "9003c281-e3ae-4808-94a8-c8b586e12409", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numbers_as_textpricescategoriesis_activedates_as_text
0110.50ATrue2024-01-01
1220.75BFalse2024-01-02
2315.25ATrue2024-01-03
3430.00CTrue2024-01-04
4525.50BFalse2024-01-05
\n", "
" ], "text/plain": [ " numbers_as_text prices categories is_active dates_as_text\n", "0 1 10.50 A True 2024-01-01\n", "1 2 20.75 B False 2024-01-02\n", "2 3 15.25 A True 2024-01-03\n", "3 4 30.00 C True 2024-01-04\n", "4 5 25.50 B False 2024-01-05" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mixed_data = pd.DataFrame({\n", " 'numbers_as_text': ['1', '2', '3', '4', '5'],\n", " 'prices': ['10.50', '20.75', '15.25', '30.00', '25.50'],\n", " 'categories': ['A', 'B', 'A', 'C', 'B'],\n", " 'is_active': ['True', 'False', 'True', 'True', 'False'],\n", " 'dates_as_text': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05']\n", "})\n", "mixed_data" ] }, { "cell_type": "code", "execution_count": 46, "id": "0dc3da58-a059-4500-bb40-6796e4a0fad8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 numbers_as_text 5 non-null object\n", " 1 prices 5 non-null object\n", " 2 categories 5 non-null object\n", " 3 is_active 5 non-null object\n", " 4 dates_as_text 5 non-null object\n", "dtypes: object(5)\n", "memory usage: 328.0+ bytes\n" ] } ], "source": [ "mixed_data.info()" ] }, { "cell_type": "code", "execution_count": 48, "id": "1060fb70-a1e5-4dd8-83de-3d3dce550742", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 numbers_as_text 5 non-null object \n", " 1 prices 5 non-null float64\n", " 2 categories 5 non-null object \n", " 3 is_active 5 non-null object \n", " 4 dates_as_text 5 non-null object \n", "dtypes: float64(1), object(4)\n", "memory usage: 328.0+ bytes\n" ] } ], "source": [ "mixed_data['prices'] = mixed_data['prices'].astype(float)\n", "mixed_data.info()" ] }, { "cell_type": "code", "execution_count": 49, "id": "34acdb4e-d705-486d-81e2-7b8aa63be884", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numbers_as_textpricescategoriesis_activedates_as_text
0110.50ATrue2024-01-01
1220.75BFalse2024-01-02
2315.25ATrue2024-01-03
3430.00CTrue2024-01-04
4525.50BFalse2024-01-05
\n", "
" ], "text/plain": [ " numbers_as_text prices categories is_active dates_as_text\n", "0 1 10.50 A True 2024-01-01\n", "1 2 20.75 B False 2024-01-02\n", "2 3 15.25 A True 2024-01-03\n", "3 4 30.00 C True 2024-01-04\n", "4 5 25.50 B False 2024-01-05" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mixed_data['categories'] = mixed_data['categories'].astype('category')\n", "mixed_data" ] }, { "cell_type": "code", "execution_count": 50, "id": "682b76ab-9a8f-42f6-9628-8a8a247ec0cc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 numbers_as_text 5 non-null object \n", " 1 prices 5 non-null float64 \n", " 2 categories 5 non-null category\n", " 3 is_active 5 non-null object \n", " 4 dates_as_text 5 non-null object \n", "dtypes: category(1), float64(1), object(3)\n", "memory usage: 425.0+ bytes\n" ] } ], "source": [ "mixed_data.info()" ] }, { "cell_type": "code", "execution_count": 51, "id": "3e9951f8-93d2-4ece-8d62-75d6aa27ea57", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['A', 'B', 'C'], dtype='object')" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mixed_data['categories'].cat.categories" ] }, { "cell_type": "code", "execution_count": 52, "id": "3a0f8788-72b4-411a-802c-0367789d14de", "metadata": {}, "outputs": [], "source": [ "mixed_data['dates_as_text'] = mixed_data['dates_as_text'].astype('string')" ] }, { "cell_type": "code", "execution_count": 53, "id": "d85ea388-79f6-4843-93be-534f1de2109a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 numbers_as_text 5 non-null object \n", " 1 prices 5 non-null float64 \n", " 2 categories 5 non-null category\n", " 3 is_active 5 non-null object \n", " 4 dates_as_text 5 non-null string \n", "dtypes: category(1), float64(1), object(2), string(1)\n", "memory usage: 425.0+ bytes\n" ] } ], "source": [ "mixed_data.info()" ] }, { "cell_type": "markdown", "id": "d94ef96e-e6d8-4cb2-b879-54275b91c975", "metadata": {}, "source": [ "## 5. Iteration" ] }, { "cell_type": "code", "execution_count": 56, "id": "6c5bce80-48f8-4aba-a49d-202269dfbb32", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "first row, name: Alice\n", "second row, name: Bob\n", "third row, name: Laura\n", "fourth row, name: George\n" ] } ], "source": [ "for index, row in df.iterrows():\n", " print(f'{index} row, name: {row[\"name\"]}')" ] }, { "cell_type": "code", "execution_count": 58, "id": "7a3160c6-2948-4df4-bced-cf30a56e5894", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name Alice\n", "age 25\n", "city New York\n", "salary 50000\n", "Name: first, dtype: object\n", "name Bob\n", "age 30\n", "city Paris\n", "salary 60000\n", "Name: second, dtype: object\n", "name Laura\n", "age 35\n", "city London\n", "salary 70000\n", "Name: third, dtype: object\n", "name George\n", "age 40\n", "city Chicago\n", "salary 80000\n", "Name: fourth, dtype: object\n" ] } ], "source": [ "count = 0\n", "\n", "while count < len(df):\n", " print(f'{df.iloc[count]}')\n", " count+=1" ] }, { "cell_type": "markdown", "id": "4b6c5b7a-39d3-4c94-8f75-139cbab32374", "metadata": {}, "source": [ "## 6. Sorting" ] }, { "cell_type": "code", "execution_count": 61, "id": "71ad75a5-5e83-482c-b235-ed0c1f950bb1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
firstAlice25New York50000
secondBob30Paris60000
thirdLaura35London70000
fourthGeorge40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "first Alice 25 New York 50000\n", "second Bob 30 Paris 60000\n", "third Laura 35 London 70000\n", "fourth George 40 Chicago 80000" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values('age')" ] }, { "cell_type": "code", "execution_count": 62, "id": "563bac5f-7264-4e8a-9963-b38725036248", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
fourthGeorge40Chicago80000
thirdLaura35London70000
secondBob30Paris60000
firstAlice25New York50000
\n", "
" ], "text/plain": [ " name age city salary\n", "fourth George 40 Chicago 80000\n", "third Laura 35 London 70000\n", "second Bob 30 Paris 60000\n", "first Alice 25 New York 50000" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values('age', ascending=False)" ] }, { "cell_type": "code", "execution_count": 64, "id": "110448ea-df28-4ed5-983f-e2c8b27feb08", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
firstAlice25New York50000
secondBob30Paris60000
thirdLaura35London70000
fourthGeorge40Chicago80000
\n", "
" ], "text/plain": [ " name age city salary\n", "first Alice 25 New York 50000\n", "second Bob 30 Paris 60000\n", "third Laura 35 London 70000\n", "fourth George 40 Chicago 80000" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_values(['age', 'city'])" ] }, { "cell_type": "markdown", "id": "e81e6d72-2b95-4555-81c0-2badc5ce8cdc", "metadata": {}, "source": [ "## 7. `apply()` and `groupby()`\n", "\n", "### 7.1 `apply()`" ] }, { "cell_type": "code", "execution_count": 66, "id": "40ec2d2c-ffec-40ff-99c6-114c99f21d65", "metadata": {}, "outputs": [], "source": [ "def double(x):\n", " print(f'Doubling {x}')\n", " return x * 2" ] }, { "cell_type": "code", "execution_count": 67, "id": "a2481c29-20bb-4770-8e5a-d470a5d80ab8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Doubling 5\n", "10\n" ] } ], "source": [ "print(double(5))" ] }, { "cell_type": "code", "execution_count": 68, "id": "47900c1c-462d-4354-97dc-eddc85c4a082", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10\n" ] } ], "source": [ "double_lambda = lambda x: x * 2\n", "print(double_lambda(5))" ] }, { "cell_type": "code", "execution_count": 72, "id": "a2a9eb53-3cf4-42d4-9f90-cf0dd50bd74d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalary
firstAlice25New York60500.0
secondBob30Paris72600.0
thirdLaura35London84700.0
fourthGeorge40Chicago96800.0
\n", "
" ], "text/plain": [ " name age city salary\n", "first Alice 25 New York 60500.0\n", "second Bob 30 Paris 72600.0\n", "third Laura 35 London 84700.0\n", "fourth George 40 Chicago 96800.0" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['salary'] = df['salary'].apply(lambda x: x * 1.1)\n", "df" ] }, { "cell_type": "code", "execution_count": 77, "id": "16af2dce-e9f6-4c7b-a8ed-5614217e2617", "metadata": {}, "outputs": [], "source": [ "def salary_bracket(salary):\n", " \n", " if salary >= 80000:\n", " return 'high'\n", " \n", " elif salary >= 50000:\n", " return 'mid'\n", " \n", " else:\n", " return 'standard'" ] }, { "cell_type": "code", "execution_count": 78, "id": "c5ec69dc-34d5-419f-87cd-c1d72143b22c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagecitysalarybracket
firstAlice25New York60500.0mid
secondBob30Paris72600.0mid
thirdLaura35London84700.0high
fourthGeorge40Chicago96800.0high
\n", "
" ], "text/plain": [ " name age city salary bracket\n", "first Alice 25 New York 60500.0 mid\n", "second Bob 30 Paris 72600.0 mid\n", "third Laura 35 London 84700.0 high\n", "fourth George 40 Chicago 96800.0 high" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['bracket'] = df['salary'].apply(salary_bracket)\n", "df" ] }, { "cell_type": "markdown", "id": "c120c229-6e03-4814-b386-aeb71aa87b6a", "metadata": {}, "source": [ "### 7.2. `groupby()`" ] }, { "cell_type": "code", "execution_count": 82, "id": "e49ddcfc-cb4f-459e-b9f2-32afffe0dbcf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
regionproductrevenue
0NorthA100
1SouthA150
2NorthB200
3SouthB130
4EastA110
\n", "
" ], "text/plain": [ " region product revenue\n", "0 North A 100\n", "1 South A 150\n", "2 North B 200\n", "3 South B 130\n", "4 East A 110" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales = pd.DataFrame({\n", " 'region': ['North', 'South', 'North', 'South', 'East'],\n", " 'product': ['A', 'A', 'B', 'B', 'A'],\n", " 'revenue': [100, 150, 200, 130, 110]\n", "})\n", "\n", "sales" ] }, { "cell_type": "code", "execution_count": 83, "id": "85e7dd27-f25c-4e53-89ad-7ebe93775f48", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "region\n", "East 110\n", "North 300\n", "South 280\n", "Name: revenue, dtype: int64" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales.groupby('region')['revenue'].sum()" ] }, { "cell_type": "code", "execution_count": 84, "id": "5135c548-0480-4977-bcb0-8b0b4d50fe29", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "region product\n", "East A 110\n", "North A 100\n", " B 200\n", "South A 150\n", " B 130\n", "Name: revenue, dtype: int64" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales.groupby(['region', 'product'])['revenue'].sum()" ] }, { "cell_type": "code", "execution_count": 85, "id": "95047e7b-93e9-4e7d-bb3e-ec3391c94ad9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
minmaxsummean
region
East110110110110.0
North100200300150.0
South130150280140.0
\n", "
" ], "text/plain": [ " min max sum mean\n", "region \n", "East 110 110 110 110.0\n", "North 100 200 300 150.0\n", "South 130 150 280 140.0" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales.groupby('region')['revenue'].agg(['min', 'max', 'sum', 'mean'])" ] }, { "cell_type": "markdown", "id": "7a75f0cc-8445-4b00-bd05-b527b8ccbe93", "metadata": {}, "source": [ "## 8. Writing to a file" ] }, { "cell_type": "code", "execution_count": 87, "id": "ae62b086-8185-42eb-9a86-2a020c7192eb", "metadata": {}, "outputs": [], "source": [ "sales.to_json('./sales_data.json')" ] }, { "cell_type": "code", "execution_count": 88, "id": "15e4fdd5-f720-42ed-82ff-bd0b50dd7b4d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
regionproductrevenue
0NorthA100
1SouthA150
2NorthB200
3SouthB130
4EastA110
\n", "
" ], "text/plain": [ " region product revenue\n", "0 North A 100\n", "1 South A 150\n", "2 North B 200\n", "3 South B 130\n", "4 East A 110" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales_json = pd.read_json('./sales_data.json')\n", "sales_json" ] }, { "cell_type": "code", "execution_count": 89, "id": "f3b34e14-5d28-4039-b64f-98b777bcd19a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5 entries, 0 to 4\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 region 5 non-null object\n", " 1 product 5 non-null object\n", " 2 revenue 5 non-null int64 \n", "dtypes: int64(1), object(2)\n", "memory usage: 248.0+ bytes\n" ] } ], "source": [ "sales.info()" ] }, { "cell_type": "code", "execution_count": 90, "id": "9f950f05-7c6e-46c1-803a-f34df64eb23d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 5 entries, 0 to 4\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 region 5 non-null object\n", " 1 product 5 non-null object\n", " 2 revenue 5 non-null int64 \n", "dtypes: int64(1), object(2)\n", "memory usage: 160.0+ bytes\n" ] } ], "source": [ "sales_json.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "bed11687-7d21-4093-9622-1d17641b88d5", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 [3.10]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.2" } }, "nbformat": 4, "nbformat_minor": 5 }