{
"cells": [
{
"cell_type": "markdown",
"id": "2faa102c",
"metadata": {},
"source": [
"\n",
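"The `city` column is nominal (its categories have no inherent order), so it is one-hot encoded.\n",
"The `size` column is ordinal (Small < Medium < Large), so it is ordinal encoded."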
]
},
{
"cell_type": "markdown",
"id": "46c96522",
"metadata": {},
"source": [
"### Nominal OneHotEncoding "
]
},
{
"cell_type": "code",
"execution_count": 144,
"id": "845be9a6",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 186,
"id": "f40b286b",
"metadata": {},
"outputs": [],
"source": [
"# Toy dataset of ten rows: a numeric target (sales), a nominal feature (city) and an ordinal feature (size).\n",
"d = {\n",
"    'sales': [100000, 222000, 10000000, 525000, 111111, 200000, 75000, 9000, 109000, 10000],\n",
"    'city': ['Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami', 'Miami', 'Orlando', 'Jacksonville', 'Jacksonville', 'Orlando'],\n",
"    'size': ['Small', 'Medium', 'Large', 'Medium', 'Medium', 'Large', 'Small', 'Small', 'Medium', 'Small'],\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 187,
"id": "19c6caf4",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(d)"
]
},
{
"cell_type": "code",
"execution_count": 188,
"id": "9108471a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" city | \n",
" size | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Tampa | \n",
" Small | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Tampa | \n",
" Medium | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Orlando | \n",
" Large | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Jacksonville | \n",
" Medium | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Miami | \n",
" Medium | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales city size\n",
"0 100000 Tampa Small\n",
"1 222000 Tampa Medium\n",
"2 10000000 Orlando Large\n",
"3 525000 Jacksonville Medium\n",
"4 111111 Miami Medium"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 189,
"id": "bb9c6a80",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Tampa', 'Orlando', 'Jacksonville', 'Miami'], dtype=object)"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['city'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 190,
"id": "d54b3754",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OneHotEncoder"
]
},
{
"cell_type": "code",
"execution_count": 191,
"id": "b27a339f",
"metadata": {},
"outputs": [],
"source": [
"# Bound to `ohe_city` because the following cells call ohe_city.fit_transform(...) and\n",
"# ohe_city.get_feature_names_out(...); the previous name `ohe` left them failing with a NameError on a fresh kernel.\n",
"ohe_city = OneHotEncoder(handle_unknown='ignore', sparse=False)"
]
},
{
"cell_type": "markdown",
"id": "d9a9dfb5",
"metadata": {},
"source": [
"**In this code:**\n",
"\n",
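"- handle_unknown='ignore' specifies that a category not seen during fit is encoded as all zeros during transform instead of raising an error.\n",
"- sparse=False specifies that the output should be a dense array rather than a sparse matrix. (From scikit-learn 1.2 this parameter is named `sparse_output`; `sparse` was removed in 1.4.)"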
]
},
{
"cell_type": "code",
"execution_count": 192,
"id": "8a673202",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0., 0., 0., 1.],\n",
" [0., 0., 0., 1.],\n",
" [0., 0., 1., 0.],\n",
" [1., 0., 0., 0.],\n",
" [0., 1., 0., 0.],\n",
" [0., 1., 0., 0.],\n",
" [0., 0., 1., 0.],\n",
" [1., 0., 0., 0.],\n",
" [1., 0., 0., 0.],\n",
" [0., 0., 1., 0.]])"
]
},
"execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit the encoder on the `city` column (selected as a one-column DataFrame) and\n",
"# encode it in the same step; the result is the dense 0/1 indicator matrix.\n",
"ohe_transform_city = ohe_city.fit_transform(X=df[['city']])\n",
"ohe_transform_city"
]
},
{
"cell_type": "code",
"execution_count": 193,
"id": "17c4f8d1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['city_Jacksonville', 'city_Miami', 'city_Orlando', 'city_Tampa'],\n",
" dtype=object)"
]
},
"execution_count": 193,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ask the fitted encoder for its output column labels; passing the input name\n",
"# prefixes each category with it (city_<category>).\n",
"feature_names_city = ohe_city.get_feature_names_out(['city'])\n",
"feature_names_city"
]
},
{
"cell_type": "code",
"execution_count": 194,
"id": "c1ad36cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
" city_Tampa | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 9 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" city_Jacksonville city_Miami city_Orlando city_Tampa\n",
"0 0.0 0.0 0.0 1.0\n",
"1 0.0 0.0 0.0 1.0\n",
"2 0.0 0.0 1.0 0.0\n",
"3 1.0 0.0 0.0 0.0\n",
"4 0.0 1.0 0.0 0.0\n",
"5 0.0 1.0 0.0 0.0\n",
"6 0.0 0.0 1.0 0.0\n",
"7 1.0 0.0 0.0 0.0\n",
"8 1.0 0.0 0.0 0.0\n",
"9 0.0 0.0 1.0 0.0"
]
},
"execution_count": 194,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the encoded array in a DataFrame, labelling its columns with the encoder's feature names.\n",
"ohe_df_city = pd.DataFrame(data=ohe_transform_city, columns=feature_names_city)\n",
"ohe_df_city"
]
},
{
"cell_type": "code",
"execution_count": 195,
"id": "e080cff1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" size | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
" city_Tampa | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Small | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Medium | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Large | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Medium | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Medium | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" Large | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" Small | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" Small | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" Medium | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" Small | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales size city_Jacksonville city_Miami city_Orlando city_Tampa\n",
"0 100000 Small 0.0 0.0 0.0 1.0\n",
"1 222000 Medium 0.0 0.0 0.0 1.0\n",
"2 10000000 Large 0.0 0.0 1.0 0.0\n",
"3 525000 Medium 1.0 0.0 0.0 0.0\n",
"4 111111 Medium 0.0 1.0 0.0 0.0\n",
"5 200000 Large 0.0 1.0 0.0 0.0\n",
"6 75000 Small 0.0 0.0 1.0 0.0\n",
"7 9000 Small 1.0 0.0 0.0 0.0\n",
"8 109000 Medium 1.0 0.0 0.0 0.0\n",
"9 10000 Small 0.0 0.0 1.0 0.0"
]
},
"execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace the raw `city` column with its one-hot columns: drop it from df and\n",
"# place the encoded frame alongside the remaining columns.\n",
"df_encoded = pd.concat(\n",
"    [df.drop(columns=['city']), ohe_df_city],\n",
"    axis=1,\n",
")\n",
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 196,
"id": "7f6733a0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" size | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Small | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Medium | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Large | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Medium | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Medium | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" Large | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" Small | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" Small | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" Medium | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" Small | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales size city_Jacksonville city_Miami city_Orlando\n",
"0 100000 Small 0.0 0.0 0.0\n",
"1 222000 Medium 0.0 0.0 0.0\n",
"2 10000000 Large 0.0 0.0 1.0\n",
"3 525000 Medium 1.0 0.0 0.0\n",
"4 111111 Medium 0.0 1.0 0.0\n",
"5 200000 Large 0.0 1.0 0.0\n",
"6 75000 Small 0.0 0.0 1.0\n",
"7 9000 Small 1.0 0.0 0.0\n",
"8 109000 Medium 1.0 0.0 0.0\n",
"9 10000 Small 0.0 0.0 1.0"
]
},
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the frame without one of the dummy columns (result is displayed only, df_encoded is unchanged).\n",
"# NOTE(review): presumably city_Tampa is dropped as the reference level to avoid redundant dummies - confirm.\n",
"df_encoded.drop(columns=['city_Tampa'])"
]
},
{
"cell_type": "markdown",
"id": "eb4d721e",
"metadata": {},
"source": [
"### Alternative: `pd.get_dummies`"
]
},
{
"cell_type": "code",
"execution_count": 197,
"id": "b5f571d7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" size | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
" city_Tampa | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Small | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Medium | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Large | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Medium | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Medium | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" Large | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" Small | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" Small | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" Medium | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" Small | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales size city_Jacksonville city_Miami city_Orlando city_Tampa\n",
"0 100000 Small 0 0 0 1\n",
"1 222000 Medium 0 0 0 1\n",
"2 10000000 Large 0 0 1 0\n",
"3 525000 Medium 1 0 0 0\n",
"4 111111 Medium 0 1 0 0\n",
"5 200000 Large 0 1 0 0\n",
"6 75000 Small 0 0 1 0\n",
"7 9000 Small 1 0 0 0\n",
"8 109000 Medium 1 0 0 0\n",
"9 10000 Small 0 0 1 0"
]
},
"execution_count": 197,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-step alternative: pandas expands `city` into city_<category> indicator\n",
"# columns and leaves the other columns as they are.\n",
"df_encoded = pd.get_dummies(data=df, columns=['city'])\n",
"df_encoded"
]
},
{
"cell_type": "markdown",
"id": "d90b7716",
"metadata": {},
"source": [
"### Ordinal Encoding"
]
},
{
"cell_type": "code",
"execution_count": 211,
"id": "2036e719",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" city | \n",
" size | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Tampa | \n",
" Small | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Tampa | \n",
" Medium | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Orlando | \n",
" Large | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Jacksonville | \n",
" Medium | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Miami | \n",
" Medium | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" Miami | \n",
" Large | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" Orlando | \n",
" Small | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" Jacksonville | \n",
" Small | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" Jacksonville | \n",
" Medium | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" Orlando | \n",
" Small | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales city size\n",
"0 100000 Tampa Small\n",
"1 222000 Tampa Medium\n",
"2 10000000 Orlando Large\n",
"3 525000 Jacksonville Medium\n",
"4 111111 Miami Medium\n",
"5 200000 Miami Large\n",
"6 75000 Orlando Small\n",
"7 9000 Jacksonville Small\n",
"8 109000 Jacksonville Medium\n",
"9 10000 Orlando Small"
]
},
"execution_count": 211,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 212,
"id": "3f3be309",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Small', 'Medium', 'Large'], dtype=object)"
]
},
"execution_count": 212,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['size'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 213,
"id": "97911261",
"metadata": {},
"outputs": [],
"source": [
"sizes = ['Small', 'Medium', 'Large']"
]
},
{
"cell_type": "code",
"execution_count": 214,
"id": "5e523950",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import OrdinalEncoder"
]
},
{
"cell_type": "code",
"execution_count": 219,
"id": "16c1fbd2",
"metadata": {},
"outputs": [],
"source": [
"enc = OrdinalEncoder(categories = [sizes])"
]
},
{
"cell_type": "code",
"execution_count": 221,
"id": "9a79444b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0.],\n",
" [1.],\n",
" [2.],\n",
" [1.],\n",
" [1.],\n",
" [2.],\n",
" [0.],\n",
" [0.],\n",
" [1.],\n",
" [0.]])"
]
},
"execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"enc.fit_transform(df[['size']])"
]
},
{
"cell_type": "code",
"execution_count": 217,
"id": "f8005068",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" city | \n",
" size | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Tampa | \n",
" Small | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Tampa | \n",
" Medium | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Orlando | \n",
" Large | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Jacksonville | \n",
" Medium | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Miami | \n",
" Medium | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales city size\n",
"0 100000 Tampa Small\n",
"1 222000 Tampa Medium\n",
"2 10000000 Orlando Large\n",
"3 525000 Jacksonville Medium\n",
"4 111111 Miami Medium"
]
},
"execution_count": 217,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 222,
"id": "6974c92e",
"metadata": {},
"outputs": [],
"source": [
"# Store the ordinal codes (Small=0, Medium=1, Large=2, following the order of `sizes`) under an\n",
"# unambiguous name. The previous label 'size ' (with a trailing space) was visually identical to\n",
"# the raw `size` column and easy to mistype. The raw strings stay in place for comparison.\n",
"df['size_encoded'] = enc.fit_transform(df[['size']])"
]
},
{
"cell_type": "code",
"execution_count": 224,
"id": "e8b3424f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" city | \n",
" size | \n",
" size | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Tampa | \n",
" Small | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Tampa | \n",
" Medium | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Orlando | \n",
" Large | \n",
" 2.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Jacksonville | \n",
" Medium | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Miami | \n",
" Medium | \n",
" 1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" Miami | \n",
" Large | \n",
" 2.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" Orlando | \n",
" Small | \n",
" 0.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" Jacksonville | \n",
" Small | \n",
" 0.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" Jacksonville | \n",
" Medium | \n",
" 1.0 | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" Orlando | \n",
" Small | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales city size size \n",
"0 100000 Tampa Small 0.0\n",
"1 222000 Tampa Medium 1.0\n",
"2 10000000 Orlando Large 2.0\n",
"3 525000 Jacksonville Medium 1.0\n",
"4 111111 Miami Medium 1.0\n",
"5 200000 Miami Large 2.0\n",
"6 75000 Orlando Small 0.0\n",
"7 9000 Jacksonville Small 0.0\n",
"8 109000 Jacksonville Medium 1.0\n",
"9 10000 Orlando Small 0.0"
]
},
"execution_count": 224,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(10)"
]
},
{
"cell_type": "markdown",
"id": "d62227bf",
"metadata": {},
"source": [
"##### Now convert the nominal data: `city`"
]
},
{
"cell_type": "code",
"execution_count": 206,
"id": "b92b0bbe",
"metadata": {},
"outputs": [],
"source": [
"df_encoded = pd.get_dummies(df[['city']])"
]
},
{
"cell_type": "code",
"execution_count": 225,
"id": "ac8359f6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
" city_Tampa | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" city_Jacksonville city_Miami city_Orlando city_Tampa\n",
"0 0 0 0 1\n",
"1 0 0 0 1\n",
"2 0 0 1 0\n",
"3 1 0 0 0\n",
"4 0 1 0 0\n",
"5 0 1 0 0\n",
"6 0 0 1 0\n",
"7 1 0 0 0\n",
"8 1 0 0 0\n",
"9 0 0 1 0"
]
},
"execution_count": 225,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_encoded"
]
},
{
"cell_type": "code",
"execution_count": 226,
"id": "cf7ab28f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" city | \n",
" size | \n",
" size | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
" city_Tampa | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" Tampa | \n",
" Small | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" Tampa | \n",
" Medium | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" Orlando | \n",
" Large | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" Jacksonville | \n",
" Medium | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" Miami | \n",
" Medium | \n",
" 1.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" Miami | \n",
" Large | \n",
" 2.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" Orlando | \n",
" Small | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" Jacksonville | \n",
" Small | \n",
" 0.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" Jacksonville | \n",
" Medium | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" Orlando | \n",
" Small | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales city size size city_Jacksonville city_Miami \\\n",
"0 100000 Tampa Small 0.0 0 0 \n",
"1 222000 Tampa Medium 1.0 0 0 \n",
"2 10000000 Orlando Large 2.0 0 0 \n",
"3 525000 Jacksonville Medium 1.0 1 0 \n",
"4 111111 Miami Medium 1.0 0 1 \n",
"5 200000 Miami Large 2.0 0 1 \n",
"6 75000 Orlando Small 0.0 0 0 \n",
"7 9000 Jacksonville Small 0.0 1 0 \n",
"8 109000 Jacksonville Medium 1.0 1 0 \n",
"9 10000 Orlando Small 0.0 0 0 \n",
"\n",
" city_Orlando city_Tampa \n",
"0 0 1 \n",
"1 0 1 \n",
"2 1 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"5 0 0 \n",
"6 1 0 \n",
"7 0 0 \n",
"8 0 0 \n",
"9 1 0 "
]
},
"execution_count": 226,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df = pd.concat([df, df_encoded], axis=1)\n",
"final_df"
]
},
{
"cell_type": "code",
"execution_count": 232,
"id": "d1c30211",
"metadata": {},
"outputs": [],
"source": [
"# Raw string columns (dtype 'object') are now represented by their encoded\n",
"# counterparts, so collect them for removal.\n",
"object_columns = final_df.select_dtypes(include=['object']).columns\n",
"\n",
"# Drop the string columns plus one dummy (city_Tampa) as the reference level, as the\n",
"# OneHotEncoder section above does. errors='ignore' keeps this cell safe to re-run\n",
"# once the columns are already gone.\n",
"final_df = final_df.drop(columns=[*object_columns, 'city_Tampa'], errors='ignore')\n"
]
},
{
"cell_type": "code",
"execution_count": 233,
"id": "a0713ee4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sales | \n",
" size | \n",
" city_Jacksonville | \n",
" city_Miami | \n",
" city_Orlando | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 100000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 222000 | \n",
" 1.0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 10000000 | \n",
" 2.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 525000 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 111111 | \n",
" 1.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 200000 | \n",
" 2.0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 75000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 9000 | \n",
" 0.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 109000 | \n",
" 1.0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" 10000 | \n",
" 0.0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" sales size city_Jacksonville city_Miami city_Orlando\n",
"0 100000 0.0 0 0 0\n",
"1 222000 1.0 0 0 0\n",
"2 10000000 2.0 0 0 1\n",
"3 525000 1.0 1 0 0\n",
"4 111111 1.0 0 1 0\n",
"5 200000 2.0 0 1 0\n",
"6 75000 0.0 0 0 1\n",
"7 9000 0.0 1 0 0\n",
"8 109000 1.0 1 0 0\n",
"9 10000 0.0 0 0 1"
]
},
"execution_count": 233,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "233ca643",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b7aa33b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a8b5324",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b0e9551",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "fd22a693",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "133dd589",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "5af006ca",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "4881a786",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bd66716",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}