{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "2021-07-28-audible-audiobook-recommender.ipynb", "provenance": [], "authorship_tag": "ABX9TyMrZO9INiS0MfLYdaPheRkx" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "9kwmww3PqNY7" }, "source": [ "# Audible Book Recommender\n", "> Finding similar books using simple text countvectorizer model on audible dataset\n", "\n", "- toc: false\n", "- badges: true\n", "- comments: true\n", "- categories: [Books, CountVectorizer]\n", "- image:" ] }, { "cell_type": "code", "metadata": { "id": "3wQ4xg6-mJqQ" }, "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity" ], "execution_count": 1, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 661 }, "id": "YU2R9RnOmLc8", "outputId": "15d4c7ba-57b8-446a-8d07-1a5f4c69b0cd" }, "source": [
"# Raw Audible CSV hosted in the reco-data GitHub repository.\n",
"DATA_URL = 'https://github.com/sparsh-ai/reco-data/raw/audible/audible/audible.csv'\n",
"\n",
"# Decode as UTF-8 first. Reading UTF-8 text as latin1 never raises; it silently\n",
"# turns every non-ASCII title into mojibake. latin1 is kept only as a fallback\n",
"# for a file that is not valid UTF-8 (the CSV is then fetched a second time).\n",
"try:\n",
"    audible_data = pd.read_csv(DATA_URL, encoding='utf-8')\n",
"except UnicodeDecodeError:\n",
"    audible_data = pd.read_csv(DATA_URL, encoding='latin1')\n",
"\n",
"# Preview the first rows.\n",
"audible_data.head()"
], "execution_count": 2, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Book TitleBook SubtitleBook AuthorBook NarratorAudio RuntimeAudiobook_TypeCategoriesRatingTotal No. of RatingsPriceReview 1Review 2Review 3Review 4Review 5Review 6Review 7Review 8Review 9Review 10Review 11Review 12Review 13Review 14Review 15Review 16Review 17Review 18Review 19Review 20Review 21Review 22Review 23Review 24Review 25Review 26Review 27Review 28Review 29Review 30...Review 61Review 62Review 63Review 64Review 65Review 66Review 67Review 68Review 69Review 70Review 71Review 72Review 73Review 74Review 75Review 76Review 77Review 78Review 79Review 80Review 81Review 82Review 83Review 84Review 85Review 86Review 87Review 88Review 89Review 90Review 91Review 92Review 93Review 94Review 95Review 96Review 97Review 98Review 99Review100
0Bamboozled by JesusHow God Tricked Me into the Life of My DreamsYvonne OrjiYvonne Orji6 hrs and 31 minsUnabridged AudiobookBiographies & Memoirs547.0$29.65Thank you for being obedient and sharing your ...This book was amazing. What made it amazing wa...The narration of the book by the author was a ...I'm sending Yvonne a tilth because this was th...Yvonne is truly amazing at blending scripture ...I enjoyed this book immensely. Thank you for m...This book really blessed my life. I pray that ...I have enjoyed Yvonnes work on Insecure and he...to quote my wife \"I feel so seen!\" Yvonne must...This content was amazing and being a fan of Yv...Already surrendered my life to Jesus but this!...I loved this book. I finished it in 2 days. I ...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1Building BridgesNaNMarie DunlopDiane Books, Natalie Moore Williams, John Scou...1 hr and 41 minsUnabridged AudiobookLiterature & Fiction, Genre Fiction51.0$0.00Recent old times brought to lifeNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2King of ScotlandModern PlaysIain HeggieLiam Brennan52 minsUnabridged AudiobookLiterature & Fiction, Drama & PlaysNot rated yetNaN$0.00NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3Mrs GNaNMike TibbettsSarah Rose Graber, Brett Whitted34 minsUnabridged AudiobookLiterature & Fiction51.0$0.00great story in 30 mins. you wont know who's si...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4SignatureNaNBob DavidsonSakshi Sharma, Lucy Goldie36 minsUnabridged AudiobookMystery, Thriller & Suspense, MysteryNot rated yetNaN$0.00NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 110 columns

\n", "
" ], "text/plain": [ " Book Title ... Review100\n", "0 Bamboozled by Jesus ... NaN\n", "1 Building Bridges ... NaN\n", "2 King of Scotland ... NaN\n", "3 Mrs G ... NaN\n", "4 Signature ... NaN\n", "\n", "[5 rows x 110 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 2 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dh_0Gvc9megx", "outputId": "7a6c2b8e-0213-46e6-983b-ff40bc284493" }, "source": [ "audible_data.info()" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "\n", "RangeIndex: 2275 entries, 0 to 2274\n", "Columns: 110 entries, Book Title to Review100\n", "dtypes: float64(1), object(109)\n", "memory usage: 1.9+ MB\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "MlU2jIzlmnTs" }, "source": [
"# Keep only the four text columns used by the recommender:\n",
"# title, author, narrator and categories (genre).\n",
"feature_columns = ['Book Title', 'Book Author', 'Book Narrator', 'Categories']\n",
"\n",
"# Drop rows with a missing author, narrator or category; the string methods\n",
"# used below are not defined on NaN.\n",
"audible_data = audible_data[feature_columns].dropna(\n",
"    subset=['Book Author', 'Book Narrator', 'Categories'])\n",
"\n",
"# Categories: lower-case, turn ' &' into a comma, remove the word 'genre',\n",
"# then split on commas into a list of category strings.\n",
"audible_data['Categories'] = audible_data['Categories'].map(\n",
"    lambda x: x.lower().replace(' &', ',').replace('genre', '').split(','))\n",
"\n",
"# Author: lower-cased with all spaces removed, stored as a plain string.\n",
"# Computed column-wise: rows yielded by iterrows() may be copies, so edits made\n",
"# inside such a loop are not guaranteed to reach the frame.\n",
"audible_data['Book Author'] = audible_data['Book Author'].map(\n",
"    lambda x: x.lower().replace(' ', ''))\n",
"\n",
"# Narrator: same normalisation, kept as a one-element list so the column is\n",
"# list-valued like Categories.\n",
"audible_data['Book Narrator'] = audible_data['Book Narrator'].map(\n",
"    lambda x: [x.lower().replace(' ', '')])"
], "execution_count": 4, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "H1U617hkmnRm" }, "source": [ "# make 
'Book Title' as an index\n",
"audible_data = audible_data.set_index('Book Title')\n",
"\n",
"\n",
"def _as_text(value):\n",
"    \"\"\"Return a string unchanged; join a list of strings with single spaces.\"\"\"\n",
"    return value if isinstance(value, str) else ' '.join(value)\n",
"\n",
"\n",
"# One text document per book: every remaining column joined with spaces.\n",
"# The documents are collected in a list and placed in a new frame, because rows\n",
"# yielded by iterrows() may be copies and values written to them can be lost.\n",
"documents = [\n",
"    ' '.join(_as_text(value) for value in values)\n",
"    for values in audible_data.itertuples(index=False, name=None)\n",
"]\n",
"\n",
"# Only the combined text is needed from here on; the title index is kept.\n",
"audible_data = pd.DataFrame({'bag_of_words': documents}, index=audible_data.index)"
], "execution_count": 5, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "iEFzDIAumnKp" }, "source": [
"# Turn each book's bag of words into token counts (English stop words removed).\n",
"count = CountVectorizer(stop_words='english')\n",
"count_matrix = count.fit_transform(audible_data['bag_of_words'])\n",
"\n",
"# Pairwise cosine similarity: cosine_sim2[i, j] compares book i with book j.\n",
"cosine_sim2 = cosine_similarity(count_matrix, count_matrix)"
], "execution_count": 8, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "GWWiWb8Wpj8D" }, "source": [
"def recommend(k=5, title=None, random_state=None):\n",
"    \"\"\"Print the k audiobooks most similar to one query audiobook.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    k : int, default 5\n",
"        How many similar titles to list.\n",
"    title : str, optional\n",
"        Query title, looked up in audible_data.index (first match wins).\n",
"        When omitted, a random audiobook is sampled.\n",
"    random_state : int, optional\n",
"        Seed passed to Series.sample so a random pick can be repeated.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If title is given but no audiobook has that title.\n",
"    \"\"\"\n",
"    titles = pd.Series(audible_data.index)\n",
"\n",
"    # position = row number of the query book in the similarity matrix\n",
"    if title is None:\n",
"        position = titles.sample(1, random_state=random_state).index[0]\n",
"    else:\n",
"        matches = titles[titles == title].index\n",
"        if len(matches) == 0:\n",
"            raise KeyError('Unknown audiobook title: {!r}'.format(title))\n",
"        position = matches[0]\n",
"    query_title = titles[position]\n",
"\n",
"    # Remove the query row, and any other listing with the same title, before\n",
"    # ranking. Skipping the first sorted result is not enough: books with an\n",
"    # identical bag of words tie at the top score, so the query is not\n",
"    # guaranteed to sort first and could be recommended for itself.\n",
"    is_candidate = (titles != query_title) & (titles.index != position)\n",
"    scores = pd.Series(cosine_sim2[position])[is_candidate]\n",
"\n",
"    # A stable sort keeps the ranking of tied scores deterministic.\n",
"    top_k_positions = scores.sort_values(ascending=False, kind='mergesort').index[:k]\n",
"    topk = titles[top_k_positions].tolist()\n",
"\n",
"    print(\"For '{}', Top {} similar audiobooks are {}\".format(query_title, k, topk))"
], "execution_count": 37, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "65NE6oyence8", "outputId": "fb3f5ed6-0f68-4146-caff-608c70a90fd5" }, "source": [ "recommend()" ], "execution_count": 38, "outputs": [ { "output_type": "stream", "text": [ "For 
'The Hobbit', Top 5 similar audiobooks are ['A Wizard of Earthsea', 'Harold & the Purple Crayon', 'The Green Ember', 'Harry Potter and the Chamber of Secrets, Book 2', 'Harry Potter and the Prisoner of Azkaban, Book 3']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "z9xBOtlznaZn", "outputId": "69182b9a-6b8d-4ac4-e0c3-e89f407c686e" }, "source": [ "recommend()" ], "execution_count": 39, "outputs": [ { "output_type": "stream", "text": [ "For 'How to Win Friends and Influence People in the Digital Age', Top 5 similar audiobooks are ['How to Remember Names and Faces', '10ä¸\\x87å\\x86\\x86ã\\x81\\x8bã\\x82\\x89å§\\x8bã\\x82\\x81ã\\x82\\x8b! å°\\x8få\\x9e\\x8bæ\\xa0ªé\\x9b\\x86ä¸\\xadæ\\x8a\\x95è³\\x87ã\\x81§1å\\x84\\x84å\\x86\\x86 å®\\x9fè·µã\\x83\\x90ã\\x82¤ã\\x83\\x96ã\\x83«', '1,001 Ways to Engage Employees', 'Getting to Yes', '#1 Best Seller']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "W-PNtqJJp7Y8", "outputId": "46e74752-8696-49b4-aeb1-88386c8b9d80" }, "source": [ "recommend()" ], "execution_count": 40, "outputs": [ { "output_type": "stream", "text": [ "For 'The Power Of: M.I.N.D', Top 5 similar audiobooks are ['The Power Of: M.I.N.D', 'Breaking the Habit of Being Yourself', 'Law of Attraction, Get Your Ex Back', '10 Things Every Woman Needs to Know About Men', 'The Battlefield of the Mind']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yTNWmARoqA2j", "outputId": "5248fd90-eeae-4fd2-ae18-1a2730f9bf76" }, "source": [ "recommend()" ], "execution_count": 41, "outputs": [ { "output_type": "stream", "text": [ "For 'Acts of Omission', Top 5 similar audiobooks are ['21st Birthday', 'Black Ice', 'Sycamore Row', 'A Lady Compromised', 'Deadly Cross']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" }, "id": "a01LJFT5qst2", "outputId": "de33b1ec-ce3c-444d-ae08-62de500a2a95" }, "source": [ "recommend(10)" ], "execution_count": 42, "outputs": [ { "output_type": "stream", "text": [ "For 'Broken (in the Best Possible Way)', Top 10 similar audiobooks are ['Andrew Cunanan: Short Spree Killer and Versace Nemesis', 'Steve Jobs', 'Say Nothing', 'Billion Dollar Loser', 'Unbroken', 'Nothing Personal', 'The Immortal Life of Henrietta Lacks', 'The Splendid and the Vile', 'Red Notice', 'We Few']\n" ], "name": "stdout" } ] } ] }