{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "04-Recommand.ipynb", "version": "0.3.2", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "cells": [ { "metadata": { "id": "b3vIWjxfy4uB", "colab_type": "text" }, "cell_type": "markdown", "source": [ "\n", "
\n", "# **Chapter 4 | 추천시스템**\n", "참고사이트 : https://www.machinelearningplus.com/nlp/cosine-similarity/\n", "## **1 데이터 불러오기**" ] }, { "metadata": { "id": "_-fO8fCqy4uD", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "# ! apt-get update\n", "# ! apt-get install g++ openjdk-8-jdk \n", "# ! pip3 install nltk konlpy matplotlib gensim \n", "\n", "# ! apt-get install fonts-nanum-eco\n", "# ! apt-get install fontconfig\n", "# ! fc-cache -fv\n", "# ! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/\n", "# ! rm -rf /content/.cache/matplotlib/*" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "nBagMLpZzEyD", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 72 }, "outputId": "57224cf2-c10e-4148-e7d7-3f5bf539d896" }, "cell_type": "code", "source": [ " import nltk\n", " nltk.download('wordnet')\n", "\n", "import pandas as pd\n", "import io, requests\n", "url = \"https://raw.githubusercontent.com/YongBeomKim/nltk_basic/master/data/movies_metadata.csv\"\n", "response = requests.get(url).content\n", "movies = pd.read_csv(io.StringIO(response.decode('utf-8')),\n", " usecols=['original_title', 'overview', 'title'], low_memory=False)\n", "movies = movies.dropna(axis=0)\n", "movies.shape" ], "execution_count": 2, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "(44506, 3)" ] }, "metadata": { "tags": [] }, "execution_count": 2 } ] }, { "metadata": { "id": "anzxC1Kxy4uH", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 138 }, "outputId": "c9d4aee2-a4eb-4e43-bae1-7abe04e80f8f" }, "cell_type": "code", "source": [ "movie_plot_li = movies['overview']\n", "movie_info_li = movies['title']\n", "movies.head(3)" ], "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
original_titleoverviewtitle
0Toy StoryLed by Woody, Andy's toys live happily in his ...Toy Story
1JumanjiWhen siblings Judy and Peter discover an encha...Jumanji
2Grumpier Old MenA family wedding reignites the ancient feud be...Grumpier Old Men
\n", "
" ], "text/plain": [ " original_title overview \\\n", "0 Toy Story Led by Woody, Andy's toys live happily in his ... \n", "1 Jumanji When siblings Judy and Peter discover an encha... \n", "2 Grumpier Old Men A family wedding reignites the ancient feud be... \n", "\n", " title \n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men " ] }, "metadata": { "tags": [] }, "execution_count": 3 } ] }, { "metadata": { "id": "ehOhWwIcy4uM", "colab_type": "text" }, "cell_type": "markdown", "source": [ "## **2 텍스트 전처리, 모델 만들기**" ] }, { "metadata": { "id": "k293bsziy4uN", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "from nltk.stem import WordNetLemmatizer\n", "from nltk.tokenize import RegexpTokenizer\n", "\n", "class LemmaTokenizer(object):\n", " def __init__(self):\n", " self.wnl = WordNetLemmatizer()\n", " self.tokenizer = RegexpTokenizer('(?u)[A-z]+')\n", " \n", " def __call__(self, doc): # 클래스 호출시 마다 실행(Tf-idf Vector 호출)\n", " return([self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)])" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "AGhilQ72y4uP", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 74 }, "outputId": "1cc6a351-7787-458f-cf75-8392fbfe2833" }, "cell_type": "code", "source": [ "# 사이킷런에 위에서 정의한 토크나이저를 입력으로 넣습니다.\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "vectorizer = TfidfVectorizer(min_df=3, tokenizer=LemmaTokenizer(), stop_words='english')\n", "X = vectorizer.fit_transform(movie_plot_li[:25000]) # 메모리 오류로 갯수를 제한\n", "vocabluary = vectorizer.get_feature_names()" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/sklearn/feature_extraction/text.py:301: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.\n", " 'stop_words.' % sorted(inconsistent))\n" ], "name": "stderr" } ] }, { "metadata": { "id": "vsIAD3WHy4uS", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 272 }, "outputId": "541d08d6-93fc-4108-eace-bd37f4f5c0d3" }, "cell_type": "code", "source": [ "# 비슷한 영화 추천하는 Cosin 유사모델 만들기\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "movie_sim = cosine_similarity(X)\n", "print(movie_sim.shape)\n", "movie_sim" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "(25000, 25000)\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "array([[1. , 0.02707166, 0.00539808, ..., 0. , 0.00976661,\n", " 0.01429638],\n", " [0.02707166, 1. , 0.0466465 , ..., 0. , 0.00801626,\n", " 0.02456379],\n", " [0.00539808, 0.0466465 , 1. , ..., 0. , 0.02812631,\n", " 0.01745318],\n", " ...,\n", " [0. , 0. , 0. , ..., 1. , 0. ,\n", " 0.01639453],\n", " [0.00976661, 0.00801626, 0.02812631, ..., 0. , 1. ,\n", " 0.01056217],\n", " [0.01429638, 0.02456379, 0.01745318, ..., 0.01639453, 0.01056217,\n", " 1. ]])" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "metadata": { "id": "GGT1n5Eyy4uV", "colab_type": "text" }, "cell_type": "markdown", "source": [ "## **3 코싸인 유사도 테이블 활용**" ] }, { "metadata": { "id": "CSRqhQuIy4uW", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "# 특정 영화와 유사한 영화목록 출력하기\n", "def similar_recommend_by_movie_id(movielens_id, rank=8):\n", " movie_index = movielens_id - 1\n", " similar_movies = sorted(list(enumerate(movie_sim[movie_index])), key=lambda x:x[1], reverse=True)\n", " print(\"----- {} : 관람객 추천영화 -------\".format(movie_info_li[similar_movies[0][0]]))\n", " for no, movie_idx in enumerate(similar_movies[1:rank]):\n", " print('추천영화 {}순위 : {}'.format(no, movie_info_li[movie_idx[0]]))" ], "execution_count": 0, "outputs": [] }, { "metadata": { "id": "55V6Wyxxy4uY", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 381 }, "outputId": "231e1e6d-d7ca-41c1-bba5-83cbfa3751b3" }, "cell_type": "code", "source": [ "similar_recommend_by_movie_id(20, rank=20)" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "----- Money Train : 관람객 추천영화 -------\n", "추천영화 0순위 : The One Percent\n", "추천영화 1순위 : The Long Good Friday\n", "추천영화 2순위 : The Milagro Beanfield War\n", "추천영화 3순위 : Gone in Sixty Seconds\n", "추천영화 4순위 : The January Man\n", "추천영화 5순위 : Armored Car Robbery\n", "추천영화 6순위 : You Can Count on Me\n", "추천영화 7순위 : Funny People\n", "추천영화 8순위 : Uranus\n", "추천영화 9순위 : OMG: Oh My God!\n", "추천영화 10순위 : Hush!\n", "추천영화 11순위 : Rich and Famous\n", "추천영화 12순위 : RoboCop 3\n", "추천영화 13순위 : Afterschool\n", "추천영화 14순위 : The Searchers\n", "추천영화 15순위 : Anatomy of Hell\n", "추천영화 16순위 : Lagaan: Once Upon a Time in India\n", "추천영화 17순위 : The Krays\n", "추천영화 18순위 : A Face in the Crowd\n" ], "name": "stdout" } ] }, { "metadata": { "id": "i4OEQ88f0cWn", "colab_type": "code", "colab": {} }, "cell_type": "code", "source": [ "" ], "execution_count": 0, "outputs": [] } ] }