{ "cells": [ { "cell_type": "markdown", "id": "4c157315", "metadata": {}, "source": [ "# Improve Product Recommendation using Sentiment Analysis\n", "\n", "- Watch [Other Interesting Data Science Topics](https://www.youtube.com/channel/UC4yh4xPxRP0-bLG_ldnLCHA/videos)\n", "- Subscribe on [YouTube](https://www.youtube.com/channel/UC4yh4xPxRP0-bLG_ldnLCHA?sub_confirmation=1)\n", "- Created on: 26-MAY-2022\n", "- Last Updated on: 26-MAY-2022" ] }, { "cell_type": "markdown", "id": "f4d231b9", "metadata": {}, "source": [ "## Recommendation" ] }, { "cell_type": "code", "execution_count": 1, "id": "301c0740", "metadata": { "scrolled": true }, "outputs": [], "source": [ "################################\n", "## STEP 01: Import Libraries ##\n", "################################\n", "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "from sklearn.model_selection import train_test_split \n", "from sklearn.metrics.pairwise import pairwise_distances\n", "from sklearn.preprocessing import MinMaxScaler\n", "import matplotlib.pyplot as plt\n", "from IPython.display import display" ] }, { "cell_type": "code", "execution_count": 2, "id": "4fa09dfc", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdratingprod_name
28266daitaliana235Storkcraft Tuscany Glider and Ottoman, Beige C...
15603beverly5Lysol Concentrate Deodorizing Cleaner, Origina...
7839amy775Clorox Disinfecting Wipes Value Pack Scented 1...
4850dmann101015The Resident Evil Collection 5 Discs (blu-Ray)
4699morenito0215825The Resident Evil Collection 5 Discs (blu-Ray)
\n", "
" ], "text/plain": [ " userId rating \\\n", "28266 daitaliana23 5 \n", "15603 beverly 5 \n", "7839 amy77 5 \n", "4850 dmann10101 5 \n", "4699 morenito021582 5 \n", "\n", " prod_name \n", "28266 Storkcraft Tuscany Glider and Ottoman, Beige C... \n", "15603 Lysol Concentrate Deodorizing Cleaner, Origina... \n", "7839 Clorox Disinfecting Wipes Value Pack Scented 1... \n", "4850 The Resident Evil Collection 5 Discs (blu-Ray) \n", "4699 The Resident Evil Collection 5 Discs (blu-Ray) " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#############################\n", "## STEP 02: Read Data ####\n", "#############################\n", "# Reading ratings file\n", "ratings = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_ratings_final.csv',\\\n", " encoding='latin-1')\n", "# ratings.reset_index(drop=True, inplace=True)\n", "display(ratings.sample(n=5, random_state=42))" ] }, { "cell_type": "code", "execution_count": 3, "id": "74d8ecb2", "metadata": { "scrolled": false }, "outputs": [], "source": [ "#################################\n", "## STEP 03: Data Preparation ####\n", "#################################\n", "\n", "def apply_pivot(df,fillby = None):\n", " if fillby is not None:\n", " return df.pivot_table(index='userId', columns='prod_name',values='rating').fillna(fillby)\n", " return df.pivot_table(index='userId', columns='prod_name',values='rating')\n", "\n", "\n", "#3.1 Dividing the dataset into train and test\n", "train, test = train_test_split(ratings, test_size=0.30, random_state=42)\n", "test = test[test.userId.isin(train.userId)]\n", "#3.2 Apply pivot operation and fillna used to replace NaN values with 0 i.e. 
where user didn't make any rating\n", "df_train_pivot = apply_pivot(df = train, fillby = 0)\n", "df_test_pivot = apply_pivot(df = test, fillby = 0)\n", "#3.3 dummy dataset (train and test)\n", "## Train\n", "dummy_train = train.copy()\n", "dummy_train['rating'] = dummy_train['rating'].apply(lambda x: 0 if x>=1 else 1)\n", "dummy_train = apply_pivot(df = dummy_train, fillby = 1)\n", "## Test\n", "dummy_test = test.copy()\n", "dummy_test['rating'] = dummy_test['rating'].apply(lambda x: 1 if x>=1 else 0)\n", "dummy_test = apply_pivot(df = dummy_test, fillby = 0)" ] }, { "cell_type": "code", "execution_count": 4, "id": "6c193726", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prod_name0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest100:Complete First Season (blu-Ray)2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black2x Ultra Era with Oxi Booster, 50fl oz42 Dual Drop Leaf Table with 2 Madrid Chairs\"4C Grated Parmesan Cheese 100% Natural 8oz ShakerAfrica's Best No-Lye Dual Conditioning Relaxer System SuperAlberto VO5 Salon Series Smooth Plus Sleek ShampooAll,bran Complete Wheat Flakes, 18 Oz.Ambi Complexion Cleansing Bar...Vicks Vaporub, Regular, 3.53ozVoortman Sugar Free Fudge Chocolate Chip CookiesWagan Smartac 80watt Inverter With UsbWallmount Server Cabinet (450mm, 9 RU)Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime GuaranteeWedding Wishes Wedding Guest BookWeleda Everon Lip BalmWindex Original Glass Cleaner Refill 67.6oz (2 Liter)Yes To Carrots Nourishing Body WashYes To Grapefruit Rejuvenating Body Wash
userId
brewno3.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
deelee0.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
embum5.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
erinn0.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
rmtarboro0.00.00.00.00.05.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
smokey bear3.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
spicesea5.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

7 rows × 231 columns

\n", "
" ], "text/plain": [ "prod_name 0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest \\\n", "userId \n", "brewno 3.0 \n", "deelee 0.0 \n", "embum 5.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 3.0 \n", "spicesea 5.0 \n", "\n", "prod_name 100:Complete First Season (blu-Ray) \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name 2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name 2x Ultra Era with Oxi Booster, 50fl oz \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name 42 Dual Drop Leaf Table with 2 Madrid Chairs\" \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name 4C Grated Parmesan Cheese 100% Natural 8oz Shaker \\\n", "userId \n", "brewno 0.0 \n", "deelee 5.0 \n", "embum 0.0 \n", "erinn 5.0 \n", "rmtarboro 5.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Africa's Best No-Lye Dual Conditioning Relaxer System Super \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Alberto VO5 Salon Series Smooth Plus Sleek Shampoo \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name All,bran Complete Wheat Flakes, 18 Oz. \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Ambi Complexion Cleansing Bar ... 
\\\n", "userId ... \n", "brewno 0.0 ... \n", "deelee 0.0 ... \n", "embum 0.0 ... \n", "erinn 0.0 ... \n", "rmtarboro 0.0 ... \n", "smokey bear 0.0 ... \n", "spicesea 0.0 ... \n", "\n", "prod_name Vicks Vaporub, Regular, 3.53oz \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Voortman Sugar Free Fudge Chocolate Chip Cookies \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Wagan Smartac 80watt Inverter With Usb \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Wallmount Server Cabinet (450mm, 9 RU) \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Wedding Wishes Wedding Guest Book Weleda Everon Lip Balm \\\n", "userId \n", "brewno 0.0 0.0 \n", "deelee 0.0 0.0 \n", "embum 0.0 0.0 \n", "erinn 0.0 0.0 \n", "rmtarboro 0.0 0.0 \n", "smokey bear 0.0 0.0 \n", "spicesea 0.0 0.0 \n", "\n", "prod_name Windex Original Glass Cleaner Refill 67.6oz (2 Liter) \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Yes To Carrots Nourishing Body Wash \\\n", "userId \n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "prod_name Yes To Grapefruit Rejuvenating Body Wash \n", "userId 
\n", "brewno 0.0 \n", "deelee 0.0 \n", "embum 0.0 \n", "erinn 0.0 \n", "rmtarboro 0.0 \n", "smokey bear 0.0 \n", "spicesea 0.0 \n", "\n", "[7 rows x 231 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train_pivot[(df_train_pivot['0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest']!=0) | \\\n", " (df_train_pivot['4C Grated Parmesan Cheese 100% Natural 8oz Shaker']!=0)]" ] }, { "cell_type": "code", "execution_count": 6, "id": "73b81faa", "metadata": { "scrolled": true }, "outputs": [], "source": [ "#####################################\n", "## STEP 04: User-User Similarity ####\n", "#####################################\n", "\n", "# to calculate mean, use only ratings given by user instead of fillna by 0 as it increase denominator in mean\n", "mean = np.nanmean(apply_pivot(df = train), axis = 1)\n", "df_train_subtracted = (apply_pivot(df = train).T-mean).T\n", "# Make rating=0 where user hasn't given any rating\n", "df_train_subtracted.fillna(0, inplace = True)\n", "# Creating the User Similarity Matrix using pairwise_distance function. shape of user_correlation is userXuser i.e. 
18025X18025\n", "user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')\n", "user_correlation[np.isnan(user_correlation)] = 0\n", "# user_correlation[user_correlation<0] = 0\n", "# Convert the user_correlation matrix into dataframe\n", "user_correlation_df = pd.DataFrame(user_correlation)\n", "user_correlation_df['userId'] = df_train_subtracted.index\n", "user_correlation_df.set_index('userId',inplace=True)\n", "user_correlation_df.columns = df_train_subtracted.index.tolist()" ] }, { "cell_type": "code", "execution_count": 8, "id": "b1748140", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((18025, 18025), (18025, 231))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "user_correlation.shape,df_train_pivot.shape" ] }, { "cell_type": "code", "execution_count": 9, "id": "f033747d", "metadata": { "scrolled": true }, "outputs": [], "source": [ "###########################################\n", "## STEP 05: Predict Rating (User-User) ####\n", "###########################################\n", "# Rating predicted by the user (for rated & non rated product both) is the weighted sum of correlation with the product rating (as present in the rating dataset). \n", "user_predicted_ratings = np.dot(user_correlation, df_train_pivot)\n", "\n", "# To find only product not rated by the user, ignore the product rated by the user by making it zero. 
\n", "user_final_rating = np.multiply(user_predicted_ratings,dummy_train)\n", "\n", "# scaler = MinMaxScaler(feature_range=(1, 5))\n", "# scaler.fit(user_final_rating)\n", "# user_final_rating = scaler.transform(user_final_rating)" ] }, { "cell_type": "code", "execution_count": 10, "id": "ebca7115", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Enter your user idjoshua\n" ] } ], "source": [ "################################################################\n", "## STEP 06: Find Top N recommendation for User (User-User) #####\n", "################################################################\n", "\n", "def find_top_recommendations(pred_rating_df, userid, topn):\n", " recommendation = pred_rating_df.loc[userid].sort_values(ascending=False)[0:topn]\n", " recommendation = pd.DataFrame(recommendation).reset_index().rename(columns={userid:'predicted_ratings'})\n", " return recommendation\n", "\n", "user_input = str(input(\"Enter your user id\"))\n", "recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 5)\n", "recommendation_user_user['userId'] = user_input" ] }, { "cell_type": "code", "execution_count": 11, "id": "537a2db6", "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recommended products for user id:joshua as below\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prod_namepredicted_ratingsuserId
0Clorox Disinfecting Wipes Value Pack Scented 1...5.226926joshua
1Lysol Concentrate Deodorizing Cleaner, Origina...3.750000joshua
2Head & Shoulders Dandruff Shampoo Ocean Lift 2...3.535534joshua
3Bounce Dryer Sheets, Fresh Linen, 160 sheets3.535534joshua
4The Resident Evil Collection 5 Discs (blu-Ray)3.345348joshua
\n", "
" ], "text/plain": [ " prod_name predicted_ratings \\\n", "0 Clorox Disinfecting Wipes Value Pack Scented 1... 5.226926 \n", "1 Lysol Concentrate Deodorizing Cleaner, Origina... 3.750000 \n", "2 Head & Shoulders Dandruff Shampoo Ocean Lift 2... 3.535534 \n", "3 Bounce Dryer Sheets, Fresh Linen, 160 sheets 3.535534 \n", "4 The Resident Evil Collection 5 Discs (blu-Ray) 3.345348 \n", "\n", " userId \n", "0 joshua \n", "1 joshua \n", "2 joshua \n", "3 joshua \n", "4 joshua " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Earlier rated products by user id:joshua as below\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdratingprod_name
0joshua5Pink Friday: Roman Reloaded Re-Up (w/dvd)
17718joshua5Smead174 Recycled Letter Size Manila File Back...
22379joshua5Cheetos Crunchy Flamin' Hot Cheese Flavored Sn...
1541joshua3Dark Shadows (includes Digital Copy) (ultravio...
\n", "
" ], "text/plain": [ " userId rating prod_name\n", "0 joshua 5 Pink Friday: Roman Reloaded Re-Up (w/dvd)\n", "17718 joshua 5 Smead174 Recycled Letter Size Manila File Back...\n", "22379 joshua 5 Cheetos Crunchy Flamin' Hot Cheese Flavored Sn...\n", "1541 joshua 3 Dark Shadows (includes Digital Copy) (ultravio..." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "print(\"Recommended products for user id:{} as below\".format(user_input))\n", "display(recommendation_user_user)\n", "print(\"Earlier rated products by user id:{} as below\".format(user_input))\n", "display(train[train['userId']==user_input].sort_values(['rating'],ascending=False))" ] }, { "cell_type": "code", "execution_count": 12, "id": "991ffd6c", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.506663023687151\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\aakashgoel\\Anaconda3\\envs\\forecast_anaconda\\lib\\site-packages\\sklearn\\preprocessing\\_data.py:464: RuntimeWarning: All-NaN slice encountered\n", " data_min = np.nanmin(X, axis=0)\n", "C:\\Users\\aakashgoel\\Anaconda3\\envs\\forecast_anaconda\\lib\\site-packages\\sklearn\\preprocessing\\_data.py:465: RuntimeWarning: All-NaN slice encountered\n", " data_max = np.nanmax(X, axis=0)\n" ] } ], "source": [ "################################################\n", "## STEP 07: Evaluation (User-User) on test #####\n", "################################################s\n", "\n", "#Filter user correlation only for user which is in test, test is subset/equal of train in terms of userId\n", "\n", "user_correlation_test_df = user_correlation_df[user_correlation_df.index.isin(test.userId)]\n", "user_correlation_test_df = user_correlation_test_df[list(set(test.userId))]\n", "# user_correlation_test_df[user_correlation_test_df<0]=0\n", "\n", "#Get test user predicted rating\n", "test_user_predicted_ratings = np.dot(user_correlation_test_df, df_test_pivot)\n", 
"test_user_predicted_ratings = np.multiply(test_user_predicted_ratings,dummy_test)\n", "#Get NaN where user never rated as it shouldn't contribute in calculating RMSE\n", "test_user_predicted_ratings = test_user_predicted_ratings[test_user_predicted_ratings>0]\n", "scaler = MinMaxScaler(feature_range=(1, 5))\n", "scaler.fit(test_user_predicted_ratings)\n", "test_user_predicted_ratings = scaler.transform(test_user_predicted_ratings)\n", "\n", "total_non_nan = np.count_nonzero(~np.isnan(test_user_predicted_ratings))\n", "rmse = (np.sum(np.sum((apply_pivot(df = test) - test_user_predicted_ratings)**2))/total_non_nan)**0.5\n", "print(rmse)" ] }, { "cell_type": "code", "execution_count": 13, "id": "7a217ded", "metadata": {}, "outputs": [], "source": [ "############################\n", "## STEP 08: Save Model ####\n", "############################\n", "pickle.dump(user_final_rating,open('./model/user_final_rating.pkl','wb'))" ] }, { "cell_type": "markdown", "id": "245a1358", "metadata": {}, "source": [ "## Sentiment" ] }, { "cell_type": "code", "execution_count": 1, "id": "92250611", "metadata": {}, "outputs": [], "source": [ "################################\n", "## STEP 01: Import Libraries ##\n", "################################\n", "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix, classification_report\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", "from sklearn.preprocessing import MinMaxScaler\n", "from imblearn import over_sampling\n", "from IPython.display import display" ] }, { "cell_type": "code", "execution_count": 2, "id": "4caafb8c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Reviewuser_sentiment
9329fresh clean smell everything need quick clean ...1
4160great vacuum love lightweight vacuum easy carr...1
18500smell great wipe easy use work smell great1
8840product count use clorox wipe everything trave...1
5098great movie excellent movie add blu ray collec...1
\n", "
" ], "text/plain": [ " Review user_sentiment\n", "9329 fresh clean smell everything need quick clean ... 1\n", "4160 great vacuum love lightweight vacuum easy carr... 1\n", "18500 smell great wipe easy use work smell great 1\n", "8840 product count use clorox wipe everything trave... 1\n", "5098 great movie excellent movie add blu ray collec... 1" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#############################\n", "## STEP 02: Read Data ####\n", "#############################\n", "# Reading product review sentiment file\n", "df_prod_review = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review_sentiment.csv',\\\n", " encoding='latin-1')\n", "display(df_prod_review.sample(n=5, random_state=42))" ] }, { "cell_type": "code", "execution_count": 3, "id": "e82554eb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking distribution of +ve and -ve review sentiment: \n", "1 0.888401\n", "0 0.111599\n", "Name: user_sentiment, dtype: float64\n", "Checking distribution of +ve and -ve review sentiment after oversampling: \n", "1 0.5\n", "0 0.5\n", "Name: user_sentiment, dtype: float64\n" ] } ], "source": [ "#################################\n", "## STEP 03: Data Preparation ####\n", "#################################\n", "x=df_prod_review['Review']\n", "y=df_prod_review['user_sentiment']\n", "print(\"Checking distribution of +ve and -ve review sentiment: \\n{}\".format(y.value_counts(normalize=True)))\n", "# Split the dataset into test and train\n", "X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=50)\n", "\n", "#As we saw above that data is imbalanced, balance training data using over sampling\n", "\n", "ros = over_sampling.RandomOverSampler(random_state=0)\n", "X_train, y_train = ros.fit_resample(pd.DataFrame(X_train), pd.Series(y_train))\n", "print(\"Checking distribution of +ve and -ve review sentiment after 
oversampling: \\n{}\".format(y_train.value_counts(normalize=True)))\n", "#convert into list of string\n", "X_train = X_train['Review'].tolist()" ] }, { "cell_type": "code", "execution_count": 4, "id": "fa6f1d9d", "metadata": {}, "outputs": [], "source": [ "################################################################\n", "## STEP 04: Feature Engineering (Convert text into numbers) ####\n", "################################################################\n", "word_vectorizer = TfidfVectorizer(strip_accents='unicode', token_pattern=r'\\w{1,}',\\\n", " ngram_range=(1, 3), stop_words='english', sublinear_tf=True, max_df = 0.80, min_df = 0.01)\n", "\n", "# Fiting it on Train\n", "word_vectorizer.fit(X_train)\n", "# transforming the train and test datasets\n", "X_train_transformed = word_vectorizer.transform(X_train)\n", "X_test_transformed = word_vectorizer.transform(X_test.tolist())" ] }, { "cell_type": "code", "execution_count": 5, "id": "7c61bde3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((33468, 263), (8062, 263))" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train_transformed.shape, X_test_transformed.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "50db52e2", "metadata": {}, "outputs": [], "source": [ "# print(list(word_vectorizer.get_feature_names()))" ] }, { "cell_type": "code", "execution_count": 6, "id": "936aaf31", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Evaluation on Train dataset ..\n", " precision recall f1-score support\n", "\n", " 0 0.82 0.83 0.82 16734\n", " 1 0.83 0.81 0.82 16734\n", "\n", " accuracy 0.82 33468\n", " macro avg 0.82 0.82 0.82 33468\n", "weighted avg 0.82 0.82 0.82 33468\n", "\n", "sensitivity: 0.81\n", "specificity: 0.83\n", "Evaluation on Test dataset ..\n", " precision recall f1-score support\n", "\n", " 0 0.35 0.80 0.49 922\n", " 1 0.97 0.81 0.88 7140\n", "\n", " accuracy 0.81 8062\n", " macro avg 
0.66 0.81 0.69 8062\n", "weighted avg 0.90 0.81 0.84 8062\n", "\n", "sensitivity: 0.81\n", "specificity: 0.8\n" ] } ], "source": [ "###############################################\n", "## STEP 05: ML Model (Logistic Regression) ####\n", "###############################################\n", "\n", "def evaluate_model(y_pred,y_actual):\n", " print(classification_report(y_true = y_actual, y_pred = y_pred))\n", " #confusion matrix\n", " cm = confusion_matrix(y_true = y_actual, y_pred = y_pred)\n", " TN = cm[0, 0] \n", " FP = cm[0, 1]\n", " FN = cm[1, 0]\n", " TP = cm[1, 1]\n", " #Calculating the Sensitivity\n", " sensitivity = round(TP/float(FN + TP),2)\n", " print(\"sensitivity: {}\".format(sensitivity))\n", " #Calculating the Specificity\n", " specificity = round(TN / float(TN + FP),2)\n", " print(\"specificity: {}\".format(specificity))\n", "\n", "#4.1 Model Training\n", "logit = LogisticRegression()\n", "logit.fit(X_train_transformed,y_train)\n", "#4.2 Prediction on Train Data\n", "y_pred_train= logit.predict(X_train_transformed)\n", "#4.3 Prediction on Test Data\n", "y_pred_test = logit.predict(X_test_transformed)\n", "#4.4 Evaluation on Train\n", "print(\"Evaluation on Train dataset ..\")\n", "evaluate_model(y_pred = y_pred_train, y_actual = y_train)\n", "print(\"Evaluation on Test dataset ..\")\n", "#4.5 Evaluation on Test\n", "evaluate_model(y_pred = y_pred_test, y_actual = y_test)" ] }, { "cell_type": "code", "execution_count": 7, "id": "fef2c721", "metadata": {}, "outputs": [], "source": [ "############################\n", "## STEP 06: Save Model ####\n", "############################\n", "pickle.dump(logit,open('./model/logit_model.pkl', 'wb'))\n", "pickle.dump(word_vectorizer,open('./model/word_vectorizer.pkl','wb'))" ] }, { "cell_type": "markdown", "id": "131bd76b", "metadata": {}, "source": [ "## Connecting dot -- Use Sentiment in Improving Recommendation" ] }, { "cell_type": "code", "execution_count": 7, "id": "74222c66", "metadata": {}, "outputs": [], 
"source": [ "################################\n", "## STEP 01: Import Libraries ##\n", "################################\n", "import pandas as pd\n", "import numpy as np\n", "import pickle\n", "from sklearn.preprocessing import MinMaxScaler" ] }, { "cell_type": "code", "execution_count": 2, "id": "b9e11dfa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prod_nameReview
2501Hawaiian Punch Berry Limeade Blast Juicepretty good stuff much sugar kid like
21252Godzilla 3d Includes Digital Copy Ultraviolet ...enteraining great interesting version classic ...
23503Godzilla 3d Includes Digital Copy Ultraviolet ...best godzilla date like previous godzilla film...
26827Storkcraft Tuscany Glider and Ottoman, Beige C...comfy good put baby sleep calming sister mom n...
18210Clorox Disinfecting Bathroom Cleanerproduct easy use product easy use open use har...
\n", "
" ], "text/plain": [ " prod_name \\\n", "2501 Hawaiian Punch Berry Limeade Blast Juice \n", "21252 Godzilla 3d Includes Digital Copy Ultraviolet ... \n", "23503 Godzilla 3d Includes Digital Copy Ultraviolet ... \n", "26827 Storkcraft Tuscany Glider and Ottoman, Beige C... \n", "18210 Clorox Disinfecting Bathroom Cleaner \n", "\n", " Review \n", "2501 pretty good stuff much sugar kid like \n", "21252 enteraining great interesting version classic ... \n", "23503 best godzilla date like previous godzilla film... \n", "26827 comfy good put baby sleep calming sister mom n... \n", "18210 product easy use product easy use open use har... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#############################\n", "## STEP 02: Read Data ####\n", "#############################\n", "# Reading product review data\n", "df_prod_review = pd.read_csv('https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review.csv',\\\n", " encoding='latin-1')\n", "display(df_prod_review.sample(n=5, random_state=42))" ] }, { "cell_type": "code", "execution_count": 3, "id": "238b4adc", "metadata": {}, "outputs": [], "source": [ "###########################\n", "## STEP 03: Load Model ####\n", "###########################\n", "\n", "model = pickle.load(open('./model/logit_model.pkl', 'rb'))\n", "word_vectorizer = pickle.load(open('./model/word_vectorizer.pkl','rb'))\n", "user_final_rating = pickle.load(open('./model/user_final_rating.pkl','rb'))" ] }, { "cell_type": "code", "execution_count": 15, "id": "902c72a3", "metadata": {}, "outputs": [], "source": [ "##########################################################################\n", "## STEP 04: Get positive review Recommendation only for given user id ####\n", "##########################################################################\n", "\n", "def find_top_recommendations(pred_rating_df, userid, topn):\n", " recommendation = pred_rating_df.loc[userid].sort_values(ascending=False)[0:topn]\n", " 
recommendation = pd.DataFrame(recommendation).reset_index().rename(columns={userid:'predicted_ratings'})\n", " return recommendation\n", "\n", "def get_sentiment_product(x):\n", " ## Get review list for given product\n", " product_name_review_list = df_prod_review[df_prod_review['prod_name']== x]['Review'].tolist()\n", " ## Transform review list into DTM (Document/review Term Matrix)\n", " features= word_vectorizer.transform(product_name_review_list)\n", " ## Predict sentiment\n", " return model.predict(features).mean()\n", "\n", "def find_top_pos_recommendation(user_final_rating, user_input, df_prod_review, word_vectorizer,\\\n", " model, no_recommendation):\n", " ## 10 is manually coded, need to change \n", " ## Generate top recommendations using user-user based recommendation system w/o using sentiment analysis \n", " recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 10)\n", " recommendation_user_user['userId'] = user_input\n", " ## filter out recommendations where predicted rating is zero\n", " recommendation_user_user = recommendation_user_user[recommendation_user_user['predicted_ratings']!=0]\n", " print(\"Recommended products for user id:{} without using sentiment\".format(user_input))\n", " display(recommendation_user_user)\n", " ## Get overall sentiment score for each recommended product\n", " recommendation_user_user['sentiment_score'] = recommendation_user_user['prod_name'].apply(get_sentiment_product)\n", " ## Transform scale of sentiment so that it can be manipulated with predicted rating score\n", " scaler = MinMaxScaler(feature_range=(1, 5))\n", " scaler.fit(recommendation_user_user[['sentiment_score']])\n", " recommendation_user_user['sentiment_score'] = scaler.transform(recommendation_user_user[['sentiment_score']])\n", " ## Get final product ranking score using 1*Predicted rating of recommended product + 2*normalized sentiment score on scale of 1–5 of recommended product \n", " 
recommendation_user_user['product_ranking_score'] = 1*recommendation_user_user['predicted_ratings'] + \\\n", " 2*recommendation_user_user['sentiment_score']\n", " print(\"Recommended products for user id:{} after using sentiment\".format(user_input))\n", " ## Sort product ranking score in descending order and show only top `no_recommendation`\n", " display(recommendation_user_user.sort_values(by = ['product_ranking_score'],ascending = False).head(no_recommendation))" ] }, { "cell_type": "code", "execution_count": 16, "id": "4cfb8a8d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Enter your user idjoshua\n", "Recommended products for user id:joshua without using sentiment\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prod_namepredicted_ratingsuserId
0Clorox Disinfecting Wipes Value Pack Scented 1...5.226926joshua
1Lysol Concentrate Deodorizing Cleaner, Origina...3.750000joshua
2Head & Shoulders Dandruff Shampoo Ocean Lift 2...3.535534joshua
3Bounce Dryer Sheets, Fresh Linen, 160 sheets3.535534joshua
4The Resident Evil Collection 5 Discs (blu-Ray)3.345348joshua
5Hormel Chili, No Beans3.286511joshua
6Chester's Cheese Flavored Puffcorn Snacks2.204404joshua
7Mike Dave Need Wedding Dates (dvd + Digital)0.720898joshua
8Storkcraft Tuscany Glider and Ottoman, Beige C...0.708318joshua
9Ceiling Fan With Light White 14.2 X 29.9 X 9.2...0.708318joshua
\n", "
" ], "text/plain": [ " prod_name predicted_ratings \\\n", "0 Clorox Disinfecting Wipes Value Pack Scented 1... 5.226926 \n", "1 Lysol Concentrate Deodorizing Cleaner, Origina... 3.750000 \n", "2 Head & Shoulders Dandruff Shampoo Ocean Lift 2... 3.535534 \n", "3 Bounce Dryer Sheets, Fresh Linen, 160 sheets 3.535534 \n", "4 The Resident Evil Collection 5 Discs (blu-Ray) 3.345348 \n", "5 Hormel Chili, No Beans 3.286511 \n", "6 Chester's Cheese Flavored Puffcorn Snacks 2.204404 \n", "7 Mike Dave Need Wedding Dates (dvd + Digital) 0.720898 \n", "8 Storkcraft Tuscany Glider and Ottoman, Beige C... 0.708318 \n", "9 Ceiling Fan With Light White 14.2 X 29.9 X 9.2... 0.708318 \n", "\n", " userId \n", "0 joshua \n", "1 joshua \n", "2 joshua \n", "3 joshua \n", "4 joshua \n", "5 joshua \n", "6 joshua \n", "7 joshua \n", "8 joshua \n", "9 joshua " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Recommended products for user id:joshua after using sentiment\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
prod_namepredicted_ratingsuserIdsentiment_scoreproduct_ranking_score
0Clorox Disinfecting Wipes Value Pack Scented 1...5.226926joshua5.00000015.226926
3Bounce Dryer Sheets, Fresh Linen, 160 sheets3.535534joshua4.39032912.316191
8Storkcraft Tuscany Glider and Ottoman, Beige C...0.708318joshua4.97856210.665442
5Hormel Chili, No Beans3.286511joshua3.2022799.691070
6Chester's Cheese Flavored Puffcorn Snacks2.204404joshua3.6419069.488215
\n", "
" ], "text/plain": [ " prod_name predicted_ratings \\\n", "0 Clorox Disinfecting Wipes Value Pack Scented 1... 5.226926 \n", "3 Bounce Dryer Sheets, Fresh Linen, 160 sheets 3.535534 \n", "8 Storkcraft Tuscany Glider and Ottoman, Beige C... 0.708318 \n", "5 Hormel Chili, No Beans 3.286511 \n", "6 Chester's Cheese Flavored Puffcorn Snacks 2.204404 \n", "\n", " userId sentiment_score product_ranking_score \n", "0 joshua 5.000000 15.226926 \n", "3 joshua 4.390329 12.316191 \n", "8 joshua 4.978562 10.665442 \n", "5 joshua 3.202279 9.691070 \n", "6 joshua 3.641906 9.488215 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "user_input = str(input(\"Enter your user id\"))\n", "find_top_pos_recommendation(user_final_rating, user_input, df_prod_review, word_vectorizer,\\\n", " model, no_recommendation = 5)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }