{
"cells": [
{
"cell_type": "code",
"execution_count": 315,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 316,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"Train.csv\")\n",
"test = pd.read_csv(\"Test.csv\")\n"
]
},
{
"cell_type": "code",
"execution_count": 317,
"metadata": {},
"outputs": [],
"source": [
"dataset = [df,test]"
]
},
{
"cell_type": "code",
"execution_count": 318,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
" netgain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19717 | \n",
" Married-spouse-absent | \n",
" Auto | \n",
" Comedy | \n",
" Male | \n",
" 45 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" High | \n",
" No | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" 31593 | \n",
" Married-civ-spouse | \n",
" Pharma | \n",
" Comedy | \n",
" Male | \n",
" 45 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" Low | \n",
" No | \n",
" False | \n",
"
\n",
" \n",
" 2 | \n",
" 5681 | \n",
" Divorced | \n",
" Entertainment | \n",
" Comedy | \n",
" Female | \n",
" 45 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" High | \n",
" Yes | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" 15491 | \n",
" Separated | \n",
" Political | \n",
" Infomercial | \n",
" Female | \n",
" 40 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" Low | \n",
" No | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" 23587 | \n",
" Married-civ-spouse | \n",
" Pharma | \n",
" Comedy | \n",
" Male | \n",
" 48 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" High | \n",
" No | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 19717 Married-spouse-absent Auto Comedy Male \n",
"1 31593 Married-civ-spouse Pharma Comedy Male \n",
"2 5681 Divorced Entertainment Comedy Female \n",
"3 15491 Separated Political Infomercial Female \n",
"4 23587 Married-civ-spouse Pharma Comedy Male \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 45 Primetime United-States 0.027465 \n",
"1 45 Primetime United-States 0.027465 \n",
"2 45 Primetime United-States 0.027465 \n",
"3 40 Primetime United-States 0.027465 \n",
"4 48 Primetime United-States 0.027465 \n",
"\n",
" expensive money_back_guarantee netgain \n",
"0 High No False \n",
"1 Low No False \n",
"2 High Yes False \n",
"3 Low No False \n",
"4 High No True "
]
},
"execution_count": 318,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 319,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Widowed | \n",
" Auto | \n",
" Comedy | \n",
" Female | \n",
" 10 | \n",
" Daytime | \n",
" United-States | \n",
" 0.027465 | \n",
" Low | \n",
" No | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" Married-civ-spouse | \n",
" Pharma | \n",
" Comedy | \n",
" Male | \n",
" 40 | \n",
" Morning | \n",
" United-States | \n",
" 0.056262 | \n",
" High | \n",
" Yes | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" Divorced | \n",
" Entertainment | \n",
" Comedy | \n",
" Female | \n",
" 50 | \n",
" Morning | \n",
" United-States | \n",
" 0.027465 | \n",
" Low | \n",
" No | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" Married-civ-spouse | \n",
" Pharma | \n",
" Infomercial | \n",
" Male | \n",
" 40 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" Low | \n",
" No | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" Married-civ-spouse | \n",
" Pharma | \n",
" Comedy | \n",
" Male | \n",
" 40 | \n",
" Primetime | \n",
" United-States | \n",
" 0.027465 | \n",
" Low | \n",
" Yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 1 Widowed Auto Comedy Female \n",
"1 4 Married-civ-spouse Pharma Comedy Male \n",
"2 5 Divorced Entertainment Comedy Female \n",
"3 9 Married-civ-spouse Pharma Infomercial Male \n",
"4 10 Married-civ-spouse Pharma Comedy Male \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 10 Daytime United-States 0.027465 \n",
"1 40 Morning United-States 0.056262 \n",
"2 50 Morning United-States 0.027465 \n",
"3 40 Primetime United-States 0.027465 \n",
"4 40 Primetime United-States 0.027465 \n",
"\n",
" expensive money_back_guarantee \n",
"0 Low No \n",
"1 High Yes \n",
"2 Low No \n",
"3 Low No \n",
"4 Low Yes "
]
},
"execution_count": 319,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 320,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 26048 entries, 0 to 26047\n",
"Data columns (total 12 columns):\n",
"id 26048 non-null int64\n",
"realtionship_status 26048 non-null object\n",
"industry 26048 non-null object\n",
"genre 26048 non-null object\n",
"targeted_sex 26048 non-null object\n",
"average_runtime(minutes_per_week) 26048 non-null int64\n",
"airtime 26048 non-null object\n",
"airlocation 26048 non-null object\n",
"ratings 26048 non-null float64\n",
"expensive 26048 non-null object\n",
"money_back_guarantee 26048 non-null object\n",
"netgain 26048 non-null bool\n",
"dtypes: bool(1), float64(1), int64(2), object(8)\n",
"memory usage: 2.2+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 321,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pharma 10339\n",
"Auto 6801\n",
"Political 4014\n",
"Entertainment 2765\n",
"Other 1333\n",
"ClassAction 796\n",
"Name: industry, dtype: int64"
]
},
"execution_count": 321,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.industry.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 322,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Comedy 22258\n",
"Infomercial 2516\n",
"Drama 803\n",
"Direct 247\n",
"Other 224\n",
"Name: genre, dtype: int64"
]
},
"execution_count": 322,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.genre.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 323,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Low 15693\n",
"High 7279\n",
"Medium 3076\n",
"Name: expensive, dtype: int64"
]
},
"execution_count": 323,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.expensive.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 324,
"metadata": {},
"outputs": [],
"source": [
"for x in dataset:\n",
" x['industry'] = x['industry'].map({\"Pharma\":5,\"Auto\":4,\"Political\":3,\"Entertainment\":2,\"Other\":1,\"ClassAction\":0})\n",
" x['genre'] = x['genre'].map({\"Comedy\":4,\"Infomercial\":3,\"Drama\":2,\"Direct\":1,\"Other\":0})\n",
" x['targeted_sex']=x['targeted_sex'].map({\"Male\":1,\"Female\":0})\n",
" x['airtime']=x.airtime.map({\"Primetime\":2,\"Morning\":1,\"Daytime\":0})\n",
" x['expensive']=x.expensive.map({\"High\":2,\"Medium\":1,\"Low\":0})\n",
" x['money_back_guarantee']=x.money_back_guarantee.map({\"Yes\":1,\"No\":0})\n",
" x['realtionship_status']=x.realtionship_status.map({\"Married-civ-spouse\":6,\"Never-married\":5,\"Divorced\":4,\"Widowed\":3,\"Separated\":2,\"Married-spouse-absent\":1,\"Married-AF-spouse\":0})"
]
},
{
"cell_type": "code",
"execution_count": 325,
"metadata": {},
"outputs": [],
"source": [
"df['ratings']=(df['ratings']*100000).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 368,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 3 | \n",
" 4 | \n",
" 4 | \n",
" 0 | \n",
" 10 | \n",
" 0 | \n",
" 38 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 40 | \n",
" 1 | \n",
" 38 | \n",
" 5626 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 4 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 50 | \n",
" 1 | \n",
" 38 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 6 | \n",
" 5 | \n",
" 3 | \n",
" 1 | \n",
" 40 | \n",
" 2 | \n",
" 38 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 40 | \n",
" 2 | \n",
" 38 | \n",
" 2746 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 1 3 4 4 0 \n",
"1 4 6 5 4 1 \n",
"2 5 4 2 4 0 \n",
"3 9 6 5 3 1 \n",
"4 10 6 5 4 1 \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 10 0 38 2746 \n",
"1 40 1 38 5626 \n",
"2 50 1 38 2746 \n",
"3 40 2 38 2746 \n",
"4 40 2 38 2746 \n",
"\n",
" expensive money_back_guarantee \n",
"0 0 0 \n",
"1 2 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 1 "
]
},
"execution_count": 368,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 326,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 3 | \n",
" 4 | \n",
" 4 | \n",
" 0 | \n",
" 10 | \n",
" 0 | \n",
" United-States | \n",
" 0.027465 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 40 | \n",
" 1 | \n",
" United-States | \n",
" 0.056262 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 4 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 50 | \n",
" 1 | \n",
" United-States | \n",
" 0.027465 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 6 | \n",
" 5 | \n",
" 3 | \n",
" 1 | \n",
" 40 | \n",
" 2 | \n",
" United-States | \n",
" 0.027465 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 40 | \n",
" 2 | \n",
" United-States | \n",
" 0.027465 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 1 3 4 4 0 \n",
"1 4 6 5 4 1 \n",
"2 5 4 2 4 0 \n",
"3 9 6 5 3 1 \n",
"4 10 6 5 4 1 \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 10 0 United-States 0.027465 \n",
"1 40 1 United-States 0.056262 \n",
"2 50 1 United-States 0.027465 \n",
"3 40 2 United-States 0.027465 \n",
"4 40 2 United-States 0.027465 \n",
"\n",
" expensive money_back_guarantee \n",
"0 0 0 \n",
"1 2 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 1 "
]
},
"execution_count": 326,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 327,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"le = LabelEncoder()\n",
"for x in dataset:\n",
" x['airlocation'] = le.fit_transform(x['airlocation'])\n"
]
},
{
"cell_type": "code",
"execution_count": 328,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
" netgain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19717 | \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" 1 | \n",
" 45 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 2 | \n",
" 0 | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" 31593 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 45 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
"
\n",
" \n",
" 2 | \n",
" 5681 | \n",
" 4 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 45 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 2 | \n",
" 1 | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" 15491 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 0 | \n",
" 40 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" 23587 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 48 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 2 | \n",
" 0 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 19717 1 4 4 1 \n",
"1 31593 6 5 4 1 \n",
"2 5681 4 2 4 0 \n",
"3 15491 2 3 3 0 \n",
"4 23587 6 5 4 1 \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 45 2 39 2746 \n",
"1 45 2 39 2746 \n",
"2 45 2 39 2746 \n",
"3 40 2 39 2746 \n",
"4 48 2 39 2746 \n",
"\n",
" expensive money_back_guarantee netgain \n",
"0 2 0 False \n",
"1 0 0 False \n",
"2 2 1 False \n",
"3 0 0 False \n",
"4 2 0 True "
]
},
"execution_count": 328,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 329,
"metadata": {},
"outputs": [],
"source": [
"# def gain(x):\n",
"# if x==\"True\":\n",
"# return 1\n",
"# if x==\"False\":\n",
"# return 0"
]
},
{
"cell_type": "code",
"execution_count": 330,
"metadata": {},
"outputs": [],
"source": [
"# df['netgain_e'] = df['netgain'].apply(gain)"
]
},
{
"cell_type": "code",
"execution_count": 331,
"metadata": {},
"outputs": [],
"source": [
"df['netgain'] = le.fit_transform(df.netgain)"
]
},
{
"cell_type": "code",
"execution_count": 332,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6 11844\n",
"5 8547\n",
"4 3649\n",
"3 818\n",
"2 793\n",
"1 378\n",
"0 19\n",
"Name: realtionship_status, dtype: int64"
]
},
"execution_count": 332,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.realtionship_status.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 333,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
" netgain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19717 | \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" 1 | \n",
" 45 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 31593 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 45 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 5681 | \n",
" 4 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 45 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 15491 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 0 | \n",
" 40 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 23587 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 48 | \n",
" 2 | \n",
" 39 | \n",
" 2746 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 19717 1 4 4 1 \n",
"1 31593 6 5 4 1 \n",
"2 5681 4 2 4 0 \n",
"3 15491 2 3 3 0 \n",
"4 23587 6 5 4 1 \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 45 2 39 2746 \n",
"1 45 2 39 2746 \n",
"2 45 2 39 2746 \n",
"3 40 2 39 2746 \n",
"4 48 2 39 2746 \n",
"\n",
" expensive money_back_guarantee netgain \n",
"0 2 0 0 \n",
"1 0 0 0 \n",
"2 2 1 0 \n",
"3 0 0 0 \n",
"4 2 0 1 "
]
},
"execution_count": 333,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 334,
"metadata": {},
"outputs": [],
"source": [
"y = df['netgain'].values"
]
},
{
"cell_type": "code",
"execution_count": 335,
"metadata": {},
"outputs": [],
"source": [
"x = df.drop(['netgain'],axis=1).values"
]
},
{
"cell_type": "code",
"execution_count": 336,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)"
]
},
{
"cell_type": "code",
"execution_count": 337,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
" n_jobs=None, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 337,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"clf=RandomForestClassifier(n_estimators=100)\n",
"clf.fit(x_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 338,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 26048 entries, 0 to 26047\n",
"Data columns (total 12 columns):\n",
"id 26048 non-null int64\n",
"realtionship_status 26048 non-null int64\n",
"industry 26048 non-null int64\n",
"genre 26048 non-null int64\n",
"targeted_sex 26048 non-null int64\n",
"average_runtime(minutes_per_week) 26048 non-null int64\n",
"airtime 26048 non-null int64\n",
"airlocation 26048 non-null int64\n",
"ratings 26048 non-null int64\n",
"expensive 26048 non-null int64\n",
"money_back_guarantee 26048 non-null int64\n",
"netgain 26048 non-null int64\n",
"dtypes: int64(12)\n",
"memory usage: 2.4 MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 339,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 339,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.expensive.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 340,
"metadata": {},
"outputs": [],
"source": [
"y_pred = clf.predict(x_test)"
]
},
{
"cell_type": "code",
"execution_count": 341,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 0, ..., 0, 1, 0])"
]
},
"execution_count": 341,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test"
]
},
{
"cell_type": "code",
"execution_count": 342,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, ..., 0, 1, 0])"
]
},
"execution_count": 342,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred"
]
},
{
"cell_type": "code",
"execution_count": 343,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 344,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.7869481765834933\n"
]
}
],
"source": [
"print(\"Accuracy:\",metrics.accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 345,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" realtionship_status | \n",
" industry | \n",
" genre | \n",
" targeted_sex | \n",
" average_runtime(minutes_per_week) | \n",
" airtime | \n",
" airlocation | \n",
" ratings | \n",
" expensive | \n",
" money_back_guarantee | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 3 | \n",
" 4 | \n",
" 4 | \n",
" 0 | \n",
" 10 | \n",
" 0 | \n",
" 38 | \n",
" 0.027465 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 40 | \n",
" 1 | \n",
" 38 | \n",
" 0.056262 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 4 | \n",
" 2 | \n",
" 4 | \n",
" 0 | \n",
" 50 | \n",
" 1 | \n",
" 38 | \n",
" 0.027465 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 6 | \n",
" 5 | \n",
" 3 | \n",
" 1 | \n",
" 40 | \n",
" 2 | \n",
" 38 | \n",
" 0.027465 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" 6 | \n",
" 5 | \n",
" 4 | \n",
" 1 | \n",
" 40 | \n",
" 2 | \n",
" 38 | \n",
" 0.027465 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id realtionship_status industry genre targeted_sex \\\n",
"0 1 3 4 4 0 \n",
"1 4 6 5 4 1 \n",
"2 5 4 2 4 0 \n",
"3 9 6 5 3 1 \n",
"4 10 6 5 4 1 \n",
"\n",
" average_runtime(minutes_per_week) airtime airlocation ratings \\\n",
"0 10 0 38 0.027465 \n",
"1 40 1 38 0.056262 \n",
"2 50 1 38 0.027465 \n",
"3 40 2 38 0.027465 \n",
"4 40 2 38 0.027465 \n",
"\n",
" expensive money_back_guarantee \n",
"0 0 0 \n",
"1 2 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 1 "
]
},
"execution_count": 345,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "code",
"execution_count": 346,
"metadata": {},
"outputs": [],
"source": [
"test['ratings']=(test['ratings']*100000).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 347,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 6513 entries, 0 to 6512\n",
"Data columns (total 11 columns):\n",
"id 6513 non-null int64\n",
"realtionship_status 6513 non-null int64\n",
"industry 6513 non-null int64\n",
"genre 6513 non-null int64\n",
"targeted_sex 6513 non-null int64\n",
"average_runtime(minutes_per_week) 6513 non-null int64\n",
"airtime 6513 non-null int64\n",
"airlocation 6513 non-null int64\n",
"ratings 6513 non-null int64\n",
"expensive 6513 non-null int64\n",
"money_back_guarantee 6513 non-null int64\n",
"dtypes: int64(11)\n",
"memory usage: 559.8 KB\n"
]
}
],
"source": [
"test.info()"
]
},
{
"cell_type": "code",
"execution_count": 348,
"metadata": {},
"outputs": [],
"source": [
" x = test.values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 349,
"metadata": {},
"outputs": [],
"source": [
"Y_pred = clf.predict(x)"
]
},
{
"cell_type": "code",
"execution_count": 350,
"metadata": {},
"outputs": [],
"source": [
"dft = test['id']"
]
},
{
"cell_type": "code",
"execution_count": 351,
"metadata": {},
"outputs": [],
"source": [
"dft = pd.DataFrame(dft)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 354,
"metadata": {},
"outputs": [],
"source": [
"dft['netgain'] = Y_pred"
]
},
{
"cell_type": "code",
"execution_count": 355,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" netgain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 20 | \n",
" 0 | \n",
"
\n",
" \n",
" 6 | \n",
" 28 | \n",
" 0 | \n",
"
\n",
" \n",
" 7 | \n",
" 31 | \n",
" 0 | \n",
"
\n",
" \n",
" 8 | \n",
" 32 | \n",
" 0 | \n",
"
\n",
" \n",
" 9 | \n",
" 34 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id netgain\n",
"0 1 0\n",
"1 4 1\n",
"2 5 0\n",
"3 9 1\n",
"4 10 1\n",
"5 20 0\n",
"6 28 0\n",
"7 31 0\n",
"8 32 0\n",
"9 34 0"
]
},
"execution_count": 355,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dft.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 356,
"metadata": {},
"outputs": [],
"source": [
"# dft['netgain'] = df['netgain'].apply(bool)"
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" netgain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id netgain\n",
"0 1 0\n",
"1 4 1\n",
"2 5 0\n",
"3 9 1\n",
"4 10 1"
]
},
"execution_count": 357,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dft.head()"
]
},
{
"cell_type": "code",
"execution_count": 358,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 5096\n",
"1 1417\n",
"Name: netgain, dtype: int64"
]
},
"execution_count": 358,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dft.netgain.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 359,
"metadata": {},
"outputs": [],
"source": [
"# for x in range(len(dft['netgain'])):\n",
"# if x==0:\n",
"# dft['netgain'][x] = 'false'\n",
"# elif x==1:\n",
"# dft['netgain'][x] = 'true'"
]
},
{
"cell_type": "code",
"execution_count": 363,
"metadata": {},
"outputs": [],
"source": [
"dft.loc[dft['netgain']==0,'netgain']='false'\n",
"dft.loc[dft['netgain']==1,'netgain']='true'"
]
},
{
"cell_type": "code",
"execution_count": 364,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" netgain | \n",
" netgain_e | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" false | \n",
" false | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" true | \n",
" true | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" false | \n",
" false | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" true | \n",
" true | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" true | \n",
" true | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id netgain netgain_e\n",
"0 1 false false\n",
"1 4 true true\n",
"2 5 false false\n",
"3 9 true true\n",
"4 10 true true"
]
},
"execution_count": 364,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dft.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 365,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"false 5096\n",
"true 1417\n",
"Name: netgain_e, dtype: int64"
]
},
"execution_count": 365,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dft.netgain_e.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 366,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" netgain | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" false | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" true | \n",
"
\n",
" \n",
" 2 | \n",
" 5 | \n",
" false | \n",
"
\n",
" \n",
" 3 | \n",
" 9 | \n",
" true | \n",
"
\n",
" \n",
" 4 | \n",
" 10 | \n",
" true | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 6508 | \n",
" 32538 | \n",
" false | \n",
"
\n",
" \n",
" 6509 | \n",
" 32542 | \n",
" false | \n",
"
\n",
" \n",
" 6510 | \n",
" 32549 | \n",
" false | \n",
"
\n",
" \n",
" 6511 | \n",
" 32558 | \n",
" true | \n",
"
\n",
" \n",
" 6512 | \n",
" 32560 | \n",
" false | \n",
"
\n",
" \n",
"
\n",
"
6513 rows × 2 columns
\n",
"
"
],
"text/plain": [
" id netgain\n",
"0 1 false\n",
"1 4 true\n",
"2 5 false\n",
"3 9 true\n",
"4 10 true\n",
"... ... ...\n",
"6508 32538 false\n",
"6509 32542 false\n",
"6510 32549 false\n",
"6511 32558 true\n",
"6512 32560 false\n",
"\n",
"[6513 rows x 2 columns]"
]
},
"execution_count": 366,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dft.drop(['netgain_e'],axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 367,
"metadata": {},
"outputs": [],
"source": [
"dft.to_csv(\"First_submission.csv\",index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}