{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collaborative filtering with Spark ALS\n",
    "\n",
    "Trains an ALS (alternating least squares) recommender on a MovieLens ratings CSV, measures RMSE on a held-out 20% split, and then scores the held-out movies of one example user.\n",
    "\n",
    "**Data sources**\n",
    "\n",
    "- https://github.com/yennanliu/movie_recommendation\n",
    "- https://grouplens.org/datasets/movielens/\n",
    "- https://github.com/khanhnamle1994/movielens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "from pyspark.ml.evaluation import RegressionEvaluator\n",
    "from pyspark.ml.recommendation import ALS\n",
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "DATA_PATH = 'movie_ratings.csv'\n",
    "SEED = 42     # seeds both the train/test split and the ALS initialisation\n",
    "USER_ID = 11  # example user scored at the end of the notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder.appName('CF').getOrCreate()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and inspect the ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = spark.read.csv(DATA_PATH, inferSchema=True, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- movieId: integer (nullable = true)\n",
      " |-- rating: double (nullable = true)\n",
      " |-- timestamp: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+\n",
      "|userId|movieId|rating| timestamp|\n",
      "+------+-------+------+----------+\n",
      "|     1|     31|   2.5|1260759144|\n",
      "|     1|   1029|   3.0|1260759179|\n",
      "|     1|   1061|   3.0|1260759182|\n",
      "|     1|   1129|   2.0|1260759185|\n",
      "|     1|   1172|   4.0|1260759205|\n",
      "|     1|   1263|   2.0|1260759151|\n",
      "|     1|   1287|   2.0|1260759187|\n",
      "|     1|   1293|   2.0|1260759148|\n",
      "|     1|   1339|   3.5|1260759125|\n",
      "|     1|   1343|   2.0|1260759131|\n",
      "|     1|   1371|   2.5|1260759135|\n",
      "|     1|   1405|   1.0|1260759203|\n",
      "|     1|   1953|   4.0|1260759191|\n",
      "|     1|   2105|   4.0|1260759139|\n",
      "|     1|   2150|   3.0|1260759194|\n",
      "|     1|   2193|   2.0|1260759198|\n",
      "|     1|   2294|   2.0|1260759108|\n",
      "|     1|   2455|   2.5|1260759113|\n",
      "|     1|   2968|   1.0|1260759200|\n",
      "|     1|   3671|   3.0|1260759117|\n",
      "+------+-------+------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+------------------+------------------+--------------------+\n",
      "|summary|            userId|           movieId|            rating|           timestamp|\n",
      "+-------+------------------+------------------+------------------+--------------------+\n",
      "|  count|            100004|            100004|            100004|              100004|\n",
      "|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|1.1296390869392424E9|\n",
      "| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|1.9168582602710962E8|\n",
      "|    min|                 1|                 1|               0.5|           789652009|\n",
      "|    max|               671|            163949|               5.0|          1476640644|\n",
      "+-------+------------------+------------------+------------------+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.describe().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 80/20 train/test split; the seed keeps the split repeatable across runs\n",
    "training, test = data.randomSplit([0.8, 0.2], seed=SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up the model and its hyperparameters\n",
    "#\n",
    "# coldStartStrategy='drop': by default ALS predicts NaN for any user or movie\n",
    "# that does not appear in the training split, and a single NaN prediction turns\n",
    "# the RMSE into NaN. Dropping those rows keeps the metric well defined.\n",
    "als = ALS(\n",
    "    maxIter=5,\n",
    "    regParam=0.01,\n",
    "    userCol='userId',\n",
    "    itemCol='movieId',\n",
    "    ratingCol='rating',\n",
    "    coldStartStrategy='drop',\n",
    "    seed=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train the model\n",
    "model = als.fit(training)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate on the held-out split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict a rating for every (userId, movieId) pair in the test split\n",
    "predictions = model.transform(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# show the predictions next to the true ratings\n",
    "predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the model (RegressionEvaluator: rmse)\n",
    "evaluator = RegressionEvaluator(\n",
    "    metricName='rmse',\n",
    "    labelCol='rating',\n",
    "    predictionCol='prediction'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rmse = evaluator.evaluate(predictions)\n",
    "# A NaN here would mean NaN predictions reached the evaluator.\n",
    "assert not math.isnan(rmse), 'RMSE is NaN - check coldStartStrategy'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('RMSE')\n",
    "print(rmse)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Score one user's held-out movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict a single user's taste: keep only that user's test-split rows\n",
    "single_user = test.filter(test['userId'] == USER_ID).select(['userId', 'movieId'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# movies this user rated that ended up in the test split\n",
    "single_user.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "recommendations = model.transform(single_user)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# recommendations output\n",
    "recommendations.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# highest predicted rating first\n",
    "recommendations.orderBy('prediction', ascending=False).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Course notes\n",
    "\n",
    "- end of course (CF) : 15.56\n",
    "- next : 16 : NLP"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Environment (conda_pyspark_)",
   "language": "python",
   "name": "conda_pyspark_"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}