{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MovieLens Data Processing\n",
    "\n",
    "Reads the `::`-delimited `users.dat`, `movies.dat` and `ratings.dat` files from the `dat` directory, adds a few derived columns, and writes each table out as a tab-separated CSV file in the working directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Import packages\n",
    "# print_function makes the print(...) calls in later cells behave the same on Python 2 and Python 3\n",
    "from __future__ import print_function\n",
    "\n",
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Input directory and the raw MovieLens file names inside it\n",
    "MOVIELENS_DIR = 'dat'\n",
    "USER_DATA_FILE = 'users.dat'\n",
    "MOVIE_DATA_FILE = 'movies.dat'\n",
    "RATING_DATA_FILE = 'ratings.dat'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Lookup tables used below to translate the numeric 'age' and 'occupation'\n",
    "# codes of the users table into readable labels\n",
    "AGES = {\n",
    "    1: \"Under 18\",\n",
    "    18: \"18-24\",\n",
    "    25: \"25-34\",\n",
    "    35: \"35-44\",\n",
    "    45: \"45-49\",\n",
    "    50: \"50-55\",\n",
    "    56: \"56+\",\n",
    "}\n",
    "OCCUPATIONS = {\n",
    "    0: \"other or not specified\",\n",
    "    1: \"academic/educator\",\n",
    "    2: \"artist\",\n",
    "    3: \"clerical/admin\",\n",
    "    4: \"college/grad student\",\n",
    "    5: \"customer service\",\n",
    "    6: \"doctor/health care\",\n",
    "    7: \"executive/managerial\",\n",
    "    8: \"farmer\",\n",
    "    9: \"homemaker\",\n",
    "    10: \"K-12 student\",\n",
    "    11: \"lawyer\",\n",
    "    12: \"programmer\",\n",
    "    13: \"retired\",\n",
    "    14: \"sales/marketing\",\n",
    "    15: \"scientist\",\n",
    "    16: \"self-employed\",\n",
    "    17: \"technician/engineer\",\n",
    "    18: \"tradesman/craftsman\",\n",
    "    19: \"unemployed\",\n",
    "    20: \"writer\",\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Output file names (written relative to the current working directory)\n",
    "USERS_CSV_FILE = 'users.csv'\n",
    "MOVIES_CSV_FILE = 'movies.csv'\n",
    "RATINGS_CSV_FILE = 'ratings.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000209 ratings loaded\n"
     ]
    }
   ],
   "source": [
    "# Read the ratings file; column names are supplied here via `names`\n",
    "ratings = pd.read_csv(os.path.join(MOVIELENS_DIR, RATING_DATA_FILE),\n",
    "                      sep='::',\n",
    "                      engine='python',\n",
    "                      encoding='latin-1',\n",
    "                      names=['user_id', 'movie_id', 'rating', 'timestamp'])\n",
    "\n",
    "# Largest user_id / movie_id that appears in the ratings.\n",
    "# These are reused by the users and movies cells below, so run this cell first.\n",
    "max_userid = ratings['user_id'].max()\n",
    "max_movieid = ratings['movie_id'].max()\n",
    "\n",
    "# Process ratings dataframe for Keras Deep Learning model:\n",
    "# add *_emb_id columns whose values are the original ids shifted down by one\n",
    "ratings['user_emb_id'] = ratings['user_id'] - 1\n",
    "ratings['movie_emb_id'] = ratings['movie_id'] - 1\n",
    "\n",
    "print(len(ratings), 'ratings loaded')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved to ratings.csv\n"
     ]
    }
   ],
   "source": [
    "# Save the ratings (original columns plus the two *_emb_id columns) as tab-separated text\n",
    "ratings.to_csv(RATINGS_CSV_FILE,\n",
    "               sep='\\t',\n",
    "               header=True,\n",
    "               encoding='latin-1',\n",
    "               columns=['user_id', 'movie_id', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id'])\n",
    "print('Saved to', RATINGS_CSV_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6040 descriptions of 6040 users loaded.\n"
     ]
    }
   ],
   "source": [
    "# Read the users file\n",
    "users = pd.read_csv(os.path.join(MOVIELENS_DIR, USER_DATA_FILE),\n",
    "                    sep='::',\n",
    "                    engine='python',\n",
    "                    encoding='latin-1',\n",
    "                    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'])\n",
    "\n",
    "# Attach readable labels; a code missing from AGES / OCCUPATIONS raises KeyError\n",
    "users['age_desc'] = users['age'].apply(lambda code: AGES[code])\n",
    "users['occ_desc'] = users['occupation'].apply(lambda code: OCCUPATIONS[code])\n",
    "\n",
    "# max_userid comes from the ratings cell: it is the largest user_id seen in the ratings\n",
    "print(len(users), 'descriptions of', max_userid, 'users loaded.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved to users.csv\n"
     ]
    }
   ],
   "source": [
    "# Save the users (original columns plus the two description columns) as tab-separated text\n",
    "users.to_csv(USERS_CSV_FILE,\n",
    "             sep='\\t',\n",
    "             header=True,\n",
    "             encoding='latin-1',\n",
    "             columns=['user_id', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'])\n",
    "print('Saved to', USERS_CSV_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3883 descriptions of 3952 movies loaded.\n"
     ]
    }
   ],
   "source": [
    "# Read the movies file\n",
    "movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE),\n",
    "                     sep='::',\n",
    "                     engine='python',\n",
    "                     encoding='latin-1',\n",
    "                     names=['movie_id', 'title', 'genres'])\n",
    "\n",
    "# max_movieid comes from the ratings cell: it is the largest movie_id seen in the\n",
    "# ratings, not a count of distinct movies, so the two printed numbers can differ\n",
    "print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved to movies.csv\n"
     ]
    }
   ],
   "source": [
    "# Save the movies as tab-separated text\n",
    "# NOTE(review): unlike the ratings and users exports, no explicit encoding is passed\n",
    "# here, so pandas' default is used - confirm this difference is intentional\n",
    "movies.to_csv(MOVIES_CSV_FILE,\n",
    "              sep='\\t',\n",
    "              header=True,\n",
    "              columns=['movie_id', 'title', 'genres'])\n",
    "print('Saved to', MOVIES_CSV_FILE)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}