{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "

\n", "# **Chapter 4 | 추천시스템**\n", "참고사이트 : https://www.machinelearningplus.com/nlp/cosine-similarity/\n", "## **1 데이터 불러오기**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ! apt-get update\n", "# ! apt-get install g++ openjdk-8-jdk \n", "# ! pip3 install nltk konlpy matplotlib gensim \n", "\n", "# ! apt-get install fonts-nanum-eco\n", "# ! apt-get install fontconfig\n", "# ! fc-cache -fv\n", "# ! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/\n", "# ! rm -rf /content/.cache/matplotlib/*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "movies = pd.read_csv('../data/movies_metadata.csv', usecols=['original_title', 'overview', 'title'], low_memory=False)\n", "movies = movies.dropna(axis=0)\n", "print(movies.shape)\n", "\n", "movie_plot_li = movies['overview']\n", "movie_info_li = movies['title']\n", "movies.head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **2 텍스트 전처리, 모델 만들기**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from nltk.stem import WordNetLemmatizer\n", "from nltk.tokenize import RegexpTokenizer\n", "\n", "class LemmaTokenizer(object):\n", " def __init__(self):\n", " self.wnl = WordNetLemmatizer()\n", " self.tokenizer = RegexpTokenizer('(?u)[A-z]+')\n", " \n", " def __call__(self, doc): # 클래스 호출시 마다 실행(Tf-idf Vector 호출)\n", " return([self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 사이킷런에 위에서 정의한 토크나이저를 입력으로 넣습니다.\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "vectorizer = TfidfVectorizer(min_df=3, tokenizer=LemmaTokenizer(), \n", " stop_words='english')\n", "X = vectorizer.fit_transform(movie_plot_li[:10000]) # 메모리 오류로 갯수를 제한\n", "vocabluary = vectorizer.get_feature_names()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 비슷한 영화 추천하는 Cosin 유사모델 만들기\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "movie_sim = cosine_similarity(X)\n", "print(movie_sim.shape)\n", "movie_sim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## **3 코싸인 유사도 테이블 활용**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 특정 영화와 유사한 영화목록 출력하기\n", "def similar_recommend_by_movie_id(movielens_id, rank=8):\n", " movie_index = movielens_id - 1\n", " similar_movies = sorted(list(enumerate(movie_sim[movie_index])), key=lambda x:x[1], reverse=True)\n", " print(\"----- {} : 관람객 추천영화 -------\".format(movie_info_li[similar_movies[0][0]]))\n", " for no, movie_idx in enumerate(similar_movies[1:rank]):\n", " print('추천영화 {}순위 : {}'.format(no, movie_info_li[movie_idx[0]]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "similar_recommend_by_movie_id(1, rank=20)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 4 }