{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder.appName('CF').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.recommendation import ALS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.evaluation import RegressionEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data source:\n",
    "# https://github.com/yennanliu/movie_recommendation\n",
    "# https://grouplens.org/datasets/movielens/\n",
    "# https://github.com/khanhnamle1994/movielens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = spark.read.csv(\"movie_ratings.csv\", inferSchema=True, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- movieId: integer (nullable = true)\n",
      " |-- rating: double (nullable = true)\n",
      " |-- timestamp: integer (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+\n",
      "|userId|movieId|rating| timestamp|\n",
      "+------+-------+------+----------+\n",
      "|     1|     31|   2.5|1260759144|\n",
      "|     1|   1029|   3.0|1260759179|\n",
      "|     1|   1061|   3.0|1260759182|\n",
      "|     1|   1129|   2.0|1260759185|\n",
      "|     1|   1172|   4.0|1260759205|\n",
      "|     1|   1263|   2.0|1260759151|\n",
      "|     1|   1287|   2.0|1260759187|\n",
      "|     1|   1293|   2.0|1260759148|\n",
      "|     1|   1339|   3.5|1260759125|\n",
      "|     1|   1343|   2.0|1260759131|\n",
      "|     1|   1371|   2.5|1260759135|\n",
      "|     1|   1405|   1.0|1260759203|\n",
      "|     1|   1953|   4.0|1260759191|\n",
      "|     1|   2105|   4.0|1260759139|\n",
      "|     1|   2150|   3.0|1260759194|\n",
      "|     1|   2193|   2.0|1260759198|\n",
      "|     1|   2294|   2.0|1260759108|\n",
      "|     1|   2455|   2.5|1260759113|\n",
      "|     1|   2968|   1.0|1260759200|\n",
      "|     1|   3671|   3.0|1260759117|\n",
      "+------+-------+------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+------------------+------------------+--------------------+\n",
      "|summary|            userId|           movieId|            rating|           timestamp|\n",
      "+-------+------------------+------------------+------------------+--------------------+\n",
      "|  count|            100004|            100004|            100004|              100004|\n",
      "|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|1.1296390869392424E9|\n",
      "| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|1.9168582602710962E8|\n",
      "|    min|                 1|                 1|               0.5|           789652009|\n",
      "|    max|               671|            163949|               5.0|          1476640644|\n",
      "+-------+------------------+------------------+------------------+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train, test split \n",
    "training, test = data.randomSplit([0.8, 0.2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up the model and super-parameters \n",
    "als = ALS(maxIter=5, \n",
    "          regParam=0.01,\n",
    "          userCol='userId',\n",
    "          itemCol='movieId',\n",
    "          ratingCol= 'rating')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train the model \n",
    "model  = als.fit(training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# do the prediction \n",
    "predicitons = model.transform(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+----------+\n",
      "|userId|movieId|rating| timestamp|prediction|\n",
      "+------+-------+------+----------+----------+\n",
      "|   452|    463|   2.0| 976424451| 2.4552588|\n",
      "|    85|    471|   3.0| 837512312| 3.9172719|\n",
      "|   588|    471|   3.0| 842298526| 4.7625732|\n",
      "|   460|    471|   5.0|1072836030| 3.8125675|\n",
      "|   274|    471|   5.0|1074104142| 3.6691563|\n",
      "|   292|    471|   3.5|1140049920| 4.0752306|\n",
      "|    15|    471|   3.0|1166586067|  2.311449|\n",
      "|    73|    471|   4.0|1296460183| 3.3499885|\n",
      "|   354|    471|   5.0| 846062674|  4.579715|\n",
      "|   529|    471|   4.0| 965497394| 3.1544423|\n",
      "|   184|    471|   5.0| 833525100| 4.5493975|\n",
      "|   311|    471|   0.5|1062015819| 2.6232295|\n",
      "|   521|    471|   3.5|1370072127|  4.019308|\n",
      "|   547|    496|   3.0| 974778561| 2.5938766|\n",
      "|   463|   1088|   3.0|1050499697| 3.0081568|\n",
      "|    52|   1088|   4.0|1231766626|  4.288722|\n",
      "|   500|   1088|   4.0|1229098924| 2.4964237|\n",
      "|   387|   1088|   4.0| 974790964| 2.1745355|\n",
      "|   514|   1088|   3.0| 853896732| 3.0606182|\n",
      "|   160|   1088|   4.0| 974258881| 4.6870093|\n",
      "+------+-------+------+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# show the prediciton\n",
    "predicitons.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# evaluate the model  (RegressionEvaluator : rmse)\n",
    "evaluator = RegressionEvaluator(\n",
    "            metricName = 'rmse',\n",
    "            labelCol= 'rating',\n",
    "            predictionCol = 'prediction')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "rmse = evaluator.evaluate(predicitons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE\n",
      "nan\n"
     ]
    }
   ],
   "source": [
    "print ('RMSE')\n",
    "print (rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+\n",
      "|userId|movieId|rating| timestamp|\n",
      "+------+-------+------+----------+\n",
      "|     1|   1129|   2.0|1260759185|\n",
      "|     1|   1287|   2.0|1260759187|\n",
      "|     1|   1339|   3.5|1260759125|\n",
      "|     1|   1343|   2.0|1260759131|\n",
      "|     1|   2294|   2.0|1260759108|\n",
      "|     2|     10|   4.0| 835355493|\n",
      "|     2|    161|   3.0| 835355493|\n",
      "|     2|    186|   3.0| 835355664|\n",
      "|     2|    208|   3.0| 835355511|\n",
      "|     2|    292|   3.0| 835355492|\n",
      "|     2|    300|   3.0| 835355532|\n",
      "|     2|    339|   3.0| 835355492|\n",
      "|     2|    367|   3.0| 835355619|\n",
      "|     2|    457|   3.0| 835355511|\n",
      "|     2|    468|   4.0| 835355790|\n",
      "|     2|    474|   2.0| 835355828|\n",
      "|     2|    515|   4.0| 835355817|\n",
      "|     2|    550|   3.0| 835356109|\n",
      "|     2|    587|   3.0| 835355779|\n",
      "|     3|    736|   3.5|1298932787|\n",
      "+------+-------+------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict single user's taste\n",
    "single_user = test.filter(test['userId']== 11).select(['userId', 'movieId'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+\n",
      "|userId|movieId|\n",
      "+------+-------+\n",
      "|    11|     70|\n",
      "|    11|   1027|\n",
      "|    11|   1201|\n",
      "|    11|   1408|\n",
      "|    11|   2042|\n",
      "|    11|   3424|\n",
      "|    11|  71211|\n",
      "|    11|  77455|\n",
      "|    11|  81158|\n",
      "|    11|  81562|\n",
      "|    11|  96079|\n",
      "|    11|  96861|\n",
      "+------+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# movies a single users has watched\n",
    "single_user.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "recommendations = model.transform(single_user)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+----------+\n",
      "|userId|movieId|prediction|\n",
      "+------+-------+----------+\n",
      "|    11|   1201|  5.064004|\n",
      "|    11|  71211|  2.056238|\n",
      "|    11|   2042| 2.8747272|\n",
      "|    11|  96079|  5.072752|\n",
      "|    11|  81562| 5.3010736|\n",
      "|    11|  81158| 1.9231318|\n",
      "|    11|  96861| 1.9171791|\n",
      "|    11|     70|  3.744793|\n",
      "|    11|   1027| 4.1614656|\n",
      "|    11|   1408| 4.0286865|\n",
      "|    11|   3424| 3.1237607|\n",
      "|    11|  77455| 4.8629804|\n",
      "+------+-------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# recommendations output \n",
    "recommendations.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+----------+\n",
      "|userId|movieId|prediction|\n",
      "+------+-------+----------+\n",
      "|    11|  81562| 5.3010736|\n",
      "|    11|  96079|  5.072752|\n",
      "|    11|   1201|  5.064004|\n",
      "|    11|  77455| 4.8629804|\n",
      "|    11|   1027| 4.1614656|\n",
      "|    11|   1408| 4.0286865|\n",
      "|    11|     70|  3.744793|\n",
      "|    11|   3424| 3.1237607|\n",
      "|    11|   2042| 2.8747272|\n",
      "|    11|  71211|  2.056238|\n",
      "|    11|  81158| 1.9231318|\n",
      "|    11|  96861| 1.9171791|\n",
      "+------+-------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "recommendations.orderBy('prediction', ascending=False).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# end of course (CF) :  15.56 \n",
    "# next : 16 : NLP "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Environment (conda_pyspark_)",
   "language": "python",
   "name": "conda_pyspark_"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}