{ "cells": [ { "cell_type": "markdown", "id": "2faa102c", "metadata": {}, "source": [ "\n", "The `city` column is used for nominal one-hot encoding.\n", "The `size` column is used for ordinal encoding." ] }, { "cell_type": "markdown", "id": "46c96522", "metadata": {}, "source": [ "### Nominal OneHotEncoding " ] }, { "cell_type": "code", "execution_count": 144, "id": "845be9a6", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 186, "id": "f40b286b", "metadata": {}, "outputs": [], "source": [ "d = {'sales': [100000, 222000, 10000000, 525000, 111111, 200000, 75000, 9000, 109000, 10000],\n", " 'city': ['Tampa', 'Tampa', 'Orlando', 'Jacksonville', 'Miami', 'Miami', 'Orlando', 'Jacksonville', 'Jacksonville', 'Orlando' ],\n", " 'size':['Small', 'Medium', 'Large', 'Medium', 'Medium', 'Large', 'Small', 'Small','Medium', 'Small'],\n", " }" ] }, { "cell_type": "code", "execution_count": 187, "id": "19c6caf4", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(d)" ] }, { "cell_type": "code", "execution_count": 188, "id": "9108471a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salescitysize
0100000TampaSmall
1222000TampaMedium
210000000OrlandoLarge
3525000JacksonvilleMedium
4111111MiamiMedium
\n", "
" ], "text/plain": [ " sales city size\n", "0 100000 Tampa Small\n", "1 222000 Tampa Medium\n", "2 10000000 Orlando Large\n", "3 525000 Jacksonville Medium\n", "4 111111 Miami Medium" ] }, "execution_count": 188, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 189, "id": "bb9c6a80", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Tampa', 'Orlando', 'Jacksonville', 'Miami'], dtype=object)" ] }, "execution_count": 189, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['city'].unique()" ] }, { "cell_type": "code", "execution_count": 190, "id": "d54b3754", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder" ] }, { "cell_type": "code", "execution_count": 191, "id": "b27a339f", "metadata": {}, "outputs": [], "source": [ "# Encoder for the nominal 'city' column. It must be bound to `ohe_city`,\n", "# because the following cells call ohe_city.fit_transform(...) and\n", "# ohe_city.get_feature_names_out(...).\n", "ohe_city = OneHotEncoder(handle_unknown='ignore', sparse=False)" ] }, { "cell_type": "markdown", "id": "d9a9dfb5", "metadata": {}, "source": [ "**In this code:**\n", "\n", "- handle_unknown='ignore' specifies that if unknown categories are encountered during transform, they are encoded as all zeros instead of raising an error.\n", "- sparse=False specifies that the output should be a dense array rather than a sparse matrix.\n", "- Note: scikit-learn 1.2 renamed this parameter to `sparse_output` and `sparse` was removed in 1.4; on those versions use `sparse_output=False`."
] }, { "cell_type": "code", "execution_count": 192, "id": "8a673202", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0., 0., 0., 1.],\n", " [0., 0., 0., 1.],\n", " [0., 0., 1., 0.],\n", " [1., 0., 0., 0.],\n", " [0., 1., 0., 0.],\n", " [0., 1., 0., 0.],\n", " [0., 0., 1., 0.],\n", " [1., 0., 0., 0.],\n", " [1., 0., 0., 0.],\n", " [0., 0., 1., 0.]])" ] }, "execution_count": 192, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ohe_transform_city = ohe_city.fit_transform(df[['city']])\n", "# This fits the OneHotEncoder to the 'city' column of the DataFrame df and transforms it into a one-hot encoded representation.\n", "ohe_transform_city" ] }, { "cell_type": "code", "execution_count": 193, "id": "17c4f8d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['city_Jacksonville', 'city_Miami', 'city_Orlando', 'city_Tampa'],\n", " dtype=object)" ] }, "execution_count": 193, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_names_city = ohe_city.get_feature_names_out(input_features=['city'])\n", "# This retrieves the feature names for the one-hot encoded 'city' column. It ensures that the column name is included in the feature names.\n", "feature_names_city" ] }, { "cell_type": "code", "execution_count": 194, "id": "c1ad36cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
city_Jacksonvillecity_Miamicity_Orlandocity_Tampa
00.00.00.01.0
10.00.00.01.0
20.00.01.00.0
31.00.00.00.0
40.01.00.00.0
50.01.00.00.0
60.00.01.00.0
71.00.00.00.0
81.00.00.00.0
90.00.01.00.0
\n", "
" ], "text/plain": [ " city_Jacksonville city_Miami city_Orlando city_Tampa\n", "0 0.0 0.0 0.0 1.0\n", "1 0.0 0.0 0.0 1.0\n", "2 0.0 0.0 1.0 0.0\n", "3 1.0 0.0 0.0 0.0\n", "4 0.0 1.0 0.0 0.0\n", "5 0.0 1.0 0.0 0.0\n", "6 0.0 0.0 1.0 0.0\n", "7 1.0 0.0 0.0 0.0\n", "8 1.0 0.0 0.0 0.0\n", "9 0.0 0.0 1.0 0.0" ] }, "execution_count": 194, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ohe_df_city = pd.DataFrame(ohe_transform_city, columns=feature_names_city)\n", "# This converts the transformed array of the one-hot encoded 'city' column into a pandas DataFrame using the feature names obtained earlier.\n", "ohe_df_city" ] }, { "cell_type": "code", "execution_count": 195, "id": "e080cff1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salessizecity_Jacksonvillecity_Miamicity_Orlandocity_Tampa
0100000Small0.00.00.01.0
1222000Medium0.00.00.01.0
210000000Large0.00.01.00.0
3525000Medium1.00.00.00.0
4111111Medium0.01.00.00.0
5200000Large0.01.00.00.0
675000Small0.00.01.00.0
79000Small1.00.00.00.0
8109000Medium1.00.00.00.0
910000Small0.00.01.00.0
\n", "
" ], "text/plain": [ " sales size city_Jacksonville city_Miami city_Orlando city_Tampa\n", "0 100000 Small 0.0 0.0 0.0 1.0\n", "1 222000 Medium 0.0 0.0 0.0 1.0\n", "2 10000000 Large 0.0 0.0 1.0 0.0\n", "3 525000 Medium 1.0 0.0 0.0 0.0\n", "4 111111 Medium 0.0 1.0 0.0 0.0\n", "5 200000 Large 0.0 1.0 0.0 0.0\n", "6 75000 Small 0.0 0.0 1.0 0.0\n", "7 9000 Small 1.0 0.0 0.0 0.0\n", "8 109000 Medium 1.0 0.0 0.0 0.0\n", "9 10000 Small 0.0 0.0 1.0 0.0" ] }, "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_encoded = pd.concat([df.drop(columns=['city']), ohe_df_city], axis=1)\n", "# This concatenates the original DataFrame df after dropping the 'city' column with the one-hot encoded 'city' DataFrame ohe_df_city, resulting in the final DataFrame df_encoded.\n", "df_encoded" ] }, { "cell_type": "code", "execution_count": 196, "id": "7f6733a0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salessizecity_Jacksonvillecity_Miamicity_Orlando
0100000Small0.00.00.0
1222000Medium0.00.00.0
210000000Large0.00.01.0
3525000Medium1.00.00.0
4111111Medium0.01.00.0
5200000Large0.01.00.0
675000Small0.00.01.0
79000Small1.00.00.0
8109000Medium1.00.00.0
910000Small0.00.01.0
\n", "
" ], "text/plain": [ " sales size city_Jacksonville city_Miami city_Orlando\n", "0 100000 Small 0.0 0.0 0.0\n", "1 222000 Medium 0.0 0.0 0.0\n", "2 10000000 Large 0.0 0.0 1.0\n", "3 525000 Medium 1.0 0.0 0.0\n", "4 111111 Medium 0.0 1.0 0.0\n", "5 200000 Large 0.0 1.0 0.0\n", "6 75000 Small 0.0 0.0 1.0\n", "7 9000 Small 1.0 0.0 0.0\n", "8 109000 Medium 1.0 0.0 0.0\n", "9 10000 Small 0.0 0.0 1.0" ] }, "execution_count": 196, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_encoded.drop('city_Tampa', axis=1)" ] }, { "cell_type": "markdown", "id": "eb4d721e", "metadata": {}, "source": [ "### OR" ] }, { "cell_type": "code", "execution_count": 197, "id": "b5f571d7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salessizecity_Jacksonvillecity_Miamicity_Orlandocity_Tampa
0100000Small0001
1222000Medium0001
210000000Large0010
3525000Medium1000
4111111Medium0100
5200000Large0100
675000Small0010
79000Small1000
8109000Medium1000
910000Small0010
\n", "
" ], "text/plain": [ " sales size city_Jacksonville city_Miami city_Orlando city_Tampa\n", "0 100000 Small 0 0 0 1\n", "1 222000 Medium 0 0 0 1\n", "2 10000000 Large 0 0 1 0\n", "3 525000 Medium 1 0 0 0\n", "4 111111 Medium 0 1 0 0\n", "5 200000 Large 0 1 0 0\n", "6 75000 Small 0 0 1 0\n", "7 9000 Small 1 0 0 0\n", "8 109000 Medium 1 0 0 0\n", "9 10000 Small 0 0 1 0" ] }, "execution_count": 197, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_encoded = pd.get_dummies(df, columns=['city'])\n", "df_encoded\n", "\n", "# This is straightforward" ] }, { "cell_type": "markdown", "id": "d90b7716", "metadata": {}, "source": [ "### Ordinal Encoding" ] }, { "cell_type": "code", "execution_count": 211, "id": "2036e719", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salescitysize
0100000TampaSmall
1222000TampaMedium
210000000OrlandoLarge
3525000JacksonvilleMedium
4111111MiamiMedium
5200000MiamiLarge
675000OrlandoSmall
79000JacksonvilleSmall
8109000JacksonvilleMedium
910000OrlandoSmall
\n", "
" ], "text/plain": [ " sales city size\n", "0 100000 Tampa Small\n", "1 222000 Tampa Medium\n", "2 10000000 Orlando Large\n", "3 525000 Jacksonville Medium\n", "4 111111 Miami Medium\n", "5 200000 Miami Large\n", "6 75000 Orlando Small\n", "7 9000 Jacksonville Small\n", "8 109000 Jacksonville Medium\n", "9 10000 Orlando Small" ] }, "execution_count": 211, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 212, "id": "3f3be309", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Small', 'Medium', 'Large'], dtype=object)" ] }, "execution_count": 212, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['size'].unique()" ] }, { "cell_type": "code", "execution_count": 213, "id": "97911261", "metadata": {}, "outputs": [], "source": [ "sizes = ['Small', 'Medium', 'Large']" ] }, { "cell_type": "code", "execution_count": 214, "id": "5e523950", "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OrdinalEncoder" ] }, { "cell_type": "code", "execution_count": 219, "id": "16c1fbd2", "metadata": {}, "outputs": [], "source": [ "enc = OrdinalEncoder(categories = [sizes])" ] }, { "cell_type": "code", "execution_count": 221, "id": "9a79444b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.],\n", " [1.],\n", " [2.],\n", " [1.],\n", " [1.],\n", " [2.],\n", " [0.],\n", " [0.],\n", " [1.],\n", " [0.]])" ] }, "execution_count": 221, "metadata": {}, "output_type": "execute_result" } ], "source": [ "enc.fit_transform(df[['size']])" ] }, { "cell_type": "code", "execution_count": 217, "id": "f8005068", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salescitysize
0100000TampaSmall
1222000TampaMedium
210000000OrlandoLarge
3525000JacksonvilleMedium
4111111MiamiMedium
\n", "
" ], "text/plain": [ " sales city size\n", "0 100000 Tampa Small\n", "1 222000 Tampa Medium\n", "2 10000000 Orlando Large\n", "3 525000 Jacksonville Medium\n", "4 111111 Miami Medium" ] }, "execution_count": 217, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 222, "id": "6974c92e", "metadata": {}, "outputs": [], "source": [ "df['size '] = enc.fit_transform(df[['size']])" ] }, { "cell_type": "code", "execution_count": 224, "id": "e8b3424f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salescitysizesize
0100000TampaSmall0.0
1222000TampaMedium1.0
210000000OrlandoLarge2.0
3525000JacksonvilleMedium1.0
4111111MiamiMedium1.0
5200000MiamiLarge2.0
675000OrlandoSmall0.0
79000JacksonvilleSmall0.0
8109000JacksonvilleMedium1.0
910000OrlandoSmall0.0
\n", "
" ], "text/plain": [ " sales city size size \n", "0 100000 Tampa Small 0.0\n", "1 222000 Tampa Medium 1.0\n", "2 10000000 Orlando Large 2.0\n", "3 525000 Jacksonville Medium 1.0\n", "4 111111 Miami Medium 1.0\n", "5 200000 Miami Large 2.0\n", "6 75000 Orlando Small 0.0\n", "7 9000 Jacksonville Small 0.0\n", "8 109000 Jacksonville Medium 1.0\n", "9 10000 Orlando Small 0.0" ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(10)" ] }, { "cell_type": "markdown", "id": "d62227bf", "metadata": {}, "source": [ "##### To now convert the Nominal Data- City" ] }, { "cell_type": "code", "execution_count": 206, "id": "b92b0bbe", "metadata": {}, "outputs": [], "source": [ "df_encoded = pd.get_dummies(df[['city']])" ] }, { "cell_type": "code", "execution_count": 225, "id": "ac8359f6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
city_Jacksonvillecity_Miamicity_Orlandocity_Tampa
00001
10001
20010
31000
40100
50100
60010
71000
81000
90010
\n", "
" ], "text/plain": [ " city_Jacksonville city_Miami city_Orlando city_Tampa\n", "0 0 0 0 1\n", "1 0 0 0 1\n", "2 0 0 1 0\n", "3 1 0 0 0\n", "4 0 1 0 0\n", "5 0 1 0 0\n", "6 0 0 1 0\n", "7 1 0 0 0\n", "8 1 0 0 0\n", "9 0 0 1 0" ] }, "execution_count": 225, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_encoded" ] }, { "cell_type": "code", "execution_count": 226, "id": "cf7ab28f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salescitysizesizecity_Jacksonvillecity_Miamicity_Orlandocity_Tampa
0100000TampaSmall0.00001
1222000TampaMedium1.00001
210000000OrlandoLarge2.00010
3525000JacksonvilleMedium1.01000
4111111MiamiMedium1.00100
5200000MiamiLarge2.00100
675000OrlandoSmall0.00010
79000JacksonvilleSmall0.01000
8109000JacksonvilleMedium1.01000
910000OrlandoSmall0.00010
\n", "
" ], "text/plain": [ " sales city size size city_Jacksonville city_Miami \\\n", "0 100000 Tampa Small 0.0 0 0 \n", "1 222000 Tampa Medium 1.0 0 0 \n", "2 10000000 Orlando Large 2.0 0 0 \n", "3 525000 Jacksonville Medium 1.0 1 0 \n", "4 111111 Miami Medium 1.0 0 1 \n", "5 200000 Miami Large 2.0 0 1 \n", "6 75000 Orlando Small 0.0 0 0 \n", "7 9000 Jacksonville Small 0.0 1 0 \n", "8 109000 Jacksonville Medium 1.0 1 0 \n", "9 10000 Orlando Small 0.0 0 0 \n", "\n", " city_Orlando city_Tampa \n", "0 0 1 \n", "1 0 1 \n", "2 1 0 \n", "3 0 0 \n", "4 0 0 \n", "5 0 0 \n", "6 1 0 \n", "7 0 0 \n", "8 0 0 \n", "9 1 0 " ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_df = pd.concat([df, df_encoded], axis=1)\n", "final_df" ] }, { "cell_type": "code", "execution_count": 232, "id": "d1c30211", "metadata": {}, "outputs": [], "source": [ "# Select columns of type 'object' (string)\n", "string_size_columns = final_df.select_dtypes(include=['object']).columns\n", "\n", "# Drop the column containing string values, all string values\n", "final_df = final_df.drop(columns=string_size_columns)\n" ] }, { "cell_type": "code", "execution_count": 233, "id": "a0713ee4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salessizecity_Jacksonvillecity_Miamicity_Orlando
01000000.0000
12220001.0000
2100000002.0001
35250001.0100
41111111.0010
52000002.0010
6750000.0001
790000.0100
81090001.0100
9100000.0001
\n", "
" ], "text/plain": [ " sales size city_Jacksonville city_Miami city_Orlando\n", "0 100000 0.0 0 0 0\n", "1 222000 1.0 0 0 0\n", "2 10000000 2.0 0 0 1\n", "3 525000 1.0 1 0 0\n", "4 111111 1.0 0 1 0\n", "5 200000 2.0 0 1 0\n", "6 75000 0.0 0 0 1\n", "7 9000 0.0 1 0 0\n", "8 109000 1.0 1 0 0\n", "9 10000 0.0 0 0 1" ] }, "execution_count": 233, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_df" ] }, { "cell_type": "code", "execution_count": null, "id": "233ca643", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6b7aa33b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "2a8b5324", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6b0e9551", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "fd22a693", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "133dd589", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "5af006ca", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "4881a786", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "6bd66716", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }