{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"
\n",
"# **Chapter 4 | 추천시스템**\n",
"참고사이트 : https://www.machinelearningplus.com/nlp/cosine-similarity/\n",
"## **1 데이터 불러오기**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ! apt-get update\n",
"# ! apt-get install g++ openjdk-8-jdk \n",
"# ! pip3 install nltk konlpy matplotlib gensim \n",
"\n",
"# ! apt-get install fonts-nanum-eco\n",
"# ! apt-get install fontconfig\n",
"# ! fc-cache -fv\n",
"# ! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/\n",
"# ! rm -rf /content/.cache/matplotlib/*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"movies = pd.read_csv('../data/movies_metadata.csv', usecols=['original_title', 'overview', 'title'], low_memory=False)\n",
"movies = movies.dropna(axis=0)\n",
"print(movies.shape)\n",
"\n",
"movie_plot_li = movies['overview']\n",
"movie_info_li = movies['title']\n",
"movies.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **2 텍스트 전처리, 모델 만들기**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.tokenize import RegexpTokenizer\n",
"\n",
"class LemmaTokenizer(object):\n",
" def __init__(self):\n",
" self.wnl = WordNetLemmatizer()\n",
" self.tokenizer = RegexpTokenizer('(?u)[A-z]+')\n",
" \n",
" def __call__(self, doc): # 클래스 호출시 마다 실행(Tf-idf Vector 호출)\n",
" return([self.wnl.lemmatize(t) for t in self.tokenizer.tokenize(doc)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 사이킷런에 위에서 정의한 토크나이저를 입력으로 넣습니다.\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"vectorizer = TfidfVectorizer(min_df=3, tokenizer=LemmaTokenizer(), \n",
" stop_words='english')\n",
"X = vectorizer.fit_transform(movie_plot_li[:10000]) # 메모리 오류로 갯수를 제한\n",
"vocabluary = vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 비슷한 영화 추천하는 Cosin 유사모델 만들기\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"movie_sim = cosine_similarity(X)\n",
"print(movie_sim.shape)\n",
"movie_sim"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **3 코싸인 유사도 테이블 활용**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 특정 영화와 유사한 영화목록 출력하기\n",
"def similar_recommend_by_movie_id(movielens_id, rank=8):\n",
" movie_index = movielens_id - 1\n",
" similar_movies = sorted(list(enumerate(movie_sim[movie_index])), key=lambda x:x[1], reverse=True)\n",
" print(\"----- {} : 관람객 추천영화 -------\".format(movie_info_li[similar_movies[0][0]]))\n",
" for no, movie_idx in enumerate(similar_movies[1:rank]):\n",
" print('추천영화 {}순위 : {}'.format(no, movie_info_li[movie_idx[0]]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"similar_recommend_by_movie_id(1, rank=20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}