{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Section 11 : Linear Regression "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# course 11.35"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark.ml.regression import LinearRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the active Spark session if there is one; otherwise create a session\n",
    "# registered under the application name 'linear_regression'.\n",
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .appName('linear_regression')\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the sample data set, stored in LIBSVM text format\n",
    "# (each line: `label index:value index:value ...`).\n",
    "# Source: https://github.com/yennanliu/analysis/blob/master/SPARK_/sample_linear_regression_data.txt\n",
    "\n",
    "training = spark.read.format('libsvm').load('sample_linear_regression_data.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-9.490009878824548 1:0.4551273600657362 2:0.36644694351969087 3:-0.38256108933468047 4:-0.4458430198517267 5:0.33109790358914726 6:0.8067445293443565 7:-0.2624341731773887 8:-0.44850386111659524 9:-0.07269284838169332 10:0.5658035575800715\r\n",
      "0.2577820163584905 1:0.8386555657374337 2:-0.1270180511534269 3:0.499812362510895 4:-0.22686625128130267 5:-0.6452430441812433 6:0.18869982177936828 7:-0.5804648622673358 8:0.651931743775642 9:-0.6555641246242951 10:0.17485476357259122\r\n",
      "-4.438869807456516 1:0.5025608135349202 2:0.14208069682973434 3:0.16004976900412138 4:0.505019897181302 5:-0.9371635223468384 6:-0.2841601610457427 7:0.6355938616712786 8:-0.1646249064941625 9:0.9480713629917628 10:0.42681251564645817\r\n",
      "-19.782762789614537 1:-0.0388509668871313 2:-0.4166870051763918 3:0.8997202693189332 4:0.6409836467726933 5:0.273289095712564 6:-0.26175701211620517 7:-0.2794902492677298 8:-0.1306778297187794 9:-0.08536581111046115 10:-0.05462315824828923\r\n",
      "-7.966593841555266 1:-0.06195495876886281 2:0.6546448480299902 3:-0.6979368909424835 4:0.6677324708883314 5:-0.07938725467767771 6:-0.43885601665437957 7:-0.608071585153688 8:-0.6414531182501653 9:0.7313735926547045 10:-0.026818676347611925\r\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first five raw lines of the file (shell escape; requires `head`).\n",
    "\n",
    "!head -5 sample_linear_regression_data.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[label: double, features: vector]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A bare DataFrame expression echoes its column names and types.\n",
    "training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------------+--------------------+\n",
      "|              label|            features|\n",
      "+-------------------+--------------------+\n",
      "| -9.490009878824548|(10,[0,1,2,3,4,5,...|\n",
      "| 0.2577820163584905|(10,[0,1,2,3,4,5,...|\n",
      "| -4.438869807456516|(10,[0,1,2,3,4,5,...|\n",
      "|-19.782762789614537|(10,[0,1,2,3,4,5,...|\n",
      "| -7.966593841555266|(10,[0,1,2,3,4,5,...|\n",
      "| -7.896274316726144|(10,[0,1,2,3,4,5,...|\n",
      "| -8.464803554195287|(10,[0,1,2,3,4,5,...|\n",
      "| 2.1214592666251364|(10,[0,1,2,3,4,5,...|\n",
      "| 1.0720117616524107|(10,[0,1,2,3,4,5,...|\n",
      "|-13.772441561702871|(10,[0,1,2,3,4,5,...|\n",
      "| -5.082010756207233|(10,[0,1,2,3,4,5,...|\n",
      "|  7.887786536531237|(10,[0,1,2,3,4,5,...|\n",
      "| 14.323146365332388|(10,[0,1,2,3,4,5,...|\n",
      "|-20.057482615789212|(10,[0,1,2,3,4,5,...|\n",
      "|-0.8995693247765151|(10,[0,1,2,3,4,5,...|\n",
      "| -19.16829262296376|(10,[0,1,2,3,4,5,...|\n",
      "|  5.601801561245534|(10,[0,1,2,3,4,5,...|\n",
      "|-3.2256352187273354|(10,[0,1,2,3,4,5,...|\n",
      "| 1.5299675726687754|(10,[0,1,2,3,4,5,...|\n",
      "| -0.250102447941961|(10,[0,1,2,3,4,5,...|\n",
      "+-------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# The loaded DataFrame has two columns: `label` (double) and `features` (vector).\n",
    "# These are the column names Spark ML estimators look for by default, so the\n",
    "# data can be passed to a model without renaming anything.\n",
    "\n",
    "training.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 0) simple training "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the linear-regression estimator. The input, target and output\n",
    "# column names are spelled out explicitly for clarity.\n",
    "\n",
    "lr = LinearRegression(featuresCol='features',\n",
    "                      labelCol='label',\n",
    "                      predictionCol='prediction')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the estimator on the complete data set (no hold-out split yet);\n",
    "# `fit` returns the trained model.\n",
    "lrModel = lr.fit(training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "coefficients :  [0.0073350710225801715,0.8313757584337543,-0.8095307954684084,2.441191686884721,0.5191713795290003,1.1534591903547016,-0.2989124112808717,-0.5128514186201779,-0.619712827067017,0.6956151804322931]\n",
      "intercept :  0.14228558260358093\n"
     ]
    }
   ],
   "source": [
    "# Learned parameters: one coefficient per feature, plus the intercept.\n",
    "\n",
    "print('coefficients : ', lrModel.coefficients)\n",
    "print('intercept : ', lrModel.intercept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "r2 :  0.027839179518600154\n",
      "rootMeanSquaredError :  10.16309157133015\n"
     ]
    }
   ],
   "source": [
    "# Training summary: fit metrics computed on the data the model was trained on.\n",
    "\n",
    "training_summary = lrModel.summary\n",
    "print('r2 : ', training_summary.r2)\n",
    "print('rootMeanSquaredError : ', training_summary.rootMeanSquaredError)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1) Train/test split and prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the same LIBSVM file again, under a new name, for the train/test experiment.\n",
    "all_data = spark.read.format('libsvm').load('sample_linear_regression_data.txt')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random ~70/30 train/test split. Rows are sampled independently, so the two\n",
    "# sizes are only approximately 70% and 30%. The fixed seed keeps the split\n",
    "# repeatable when the notebook is re-run on the same data.\n",
    "\n",
    "train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "all data count :  501\n",
      "train data count :  365\n",
      "test data count :  136\n"
     ]
    }
   ],
   "source": [
    "# Row counts of the full data set and of each split.\n",
    "print('all data count : ', all_data.count())\n",
    "print('train data count : ', train_data.count())\n",
    "print('test data count : ', test_data.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit on the training split only, so the test split stays unseen during\n",
    "# training and the test metrics are not inflated by data leakage.\n",
    "correct_model = lr.fit(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the fitted model on the test split; `evaluate` returns a summary\n",
    "# object holding the metrics printed in the next cell.\n",
    "\n",
    "test_results = correct_model.evaluate(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "r2 :  0.08455715143231024\n",
      "rootMeanSquaredError :  9.691326744797616\n"
     ]
    }
   ],
   "source": [
    "# Metrics of `correct_model` measured on the test split.\n",
    "\n",
    "print('r2 : ', test_results.r2)\n",
    "print('rootMeanSquaredError : ', test_results.rootMeanSquaredError)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the feature vectors, mimicking new data that arrives without labels.\n",
    "unlabeled_data = test_data.select('features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|            features|\n",
      "+--------------------+\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "|(10,[0,1,2,3,4,5,...|\n",
      "+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the label-free test rows that the prediction step below consumes.\n",
    "unlabeled_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make predictions.\n",
    "# Note: in Spark ML a fitted model predicts through `transform`, which returns\n",
    "# the input rows with the prediction column appended.\n",
    "\n",
    "predictions = correct_model.transform(unlabeled_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+\n",
      "|            features|          prediciton|\n",
      "+--------------------+--------------------+\n",
      "|(10,[0,1,2,3,4,5,...| -3.5124943764463135|\n",
      "|(10,[0,1,2,3,4,5,...|  -3.147868811718382|\n",
      "|(10,[0,1,2,3,4,5,...|  -2.499423280435292|\n",
      "|(10,[0,1,2,3,4,5,...|  1.7010353768556734|\n",
      "|(10,[0,1,2,3,4,5,...| -0.5388564818088987|\n",
      "|(10,[0,1,2,3,4,5,...|  -1.475284763550391|\n",
      "|(10,[0,1,2,3,4,5,...| -0.7489108841213971|\n",
      "|(10,[0,1,2,3,4,5,...|  -2.508322852836744|\n",
      "|(10,[0,1,2,3,4,5,...|  -0.976510689078842|\n",
      "|(10,[0,1,2,3,4,5,...| -0.9566138722165072|\n",
      "|(10,[0,1,2,3,4,5,...|  3.7236186142728274|\n",
      "|(10,[0,1,2,3,4,5,...|  1.2421598960943985|\n",
      "|(10,[0,1,2,3,4,5,...| -0.7195663865895121|\n",
      "|(10,[0,1,2,3,4,5,...|  -1.780965034607929|\n",
      "|(10,[0,1,2,3,4,5,...|-0.06740884917840151|\n",
      "|(10,[0,1,2,3,4,5,...|   2.746996971787099|\n",
      "|(10,[0,1,2,3,4,5,...|  0.5789191740943999|\n",
      "|(10,[0,1,2,3,4,5,...| -1.2048075065353916|\n",
      "|(10,[0,1,2,3,4,5,...| -0.6964026254414395|\n",
      "|(10,[0,1,2,3,4,5,...| -3.0756131143558623|\n",
      "+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Display the first 20 predictions next to their feature vectors.\n",
    "predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# end of 11.35 \n",
    "# next :  11.36"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2) Load a CSV and transform it into data that Spark MLlib can train on"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.feature import VectorAssembler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the housing data; header=True uses the first row as column names and\n",
    "# inferSchema=True lets Spark infer each column's type from the values.\n",
    "data = spark.read.csv('boston.csv', inferSchema=True, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CRIM',\n",
       " 'ZN',\n",
       " 'INDUS',\n",
       " 'CHAS',\n",
       " 'NOX',\n",
       " 'RM',\n",
       " 'AGE',\n",
       " 'DIS',\n",
       " 'RAD',\n",
       " 'TAX',\n",
       " 'PTRATIO',\n",
       " 'B',\n",
       " 'LSTAT',\n",
       " 'price']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column names taken from the CSV header row.\n",
    "data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- CRIM: double (nullable = true)\n",
      " |-- ZN: double (nullable = true)\n",
      " |-- INDUS: double (nullable = true)\n",
      " |-- CHAS: double (nullable = true)\n",
      " |-- NOX: double (nullable = true)\n",
      " |-- RM: double (nullable = true)\n",
      " |-- AGE: double (nullable = true)\n",
      " |-- DIS: double (nullable = true)\n",
      " |-- RAD: double (nullable = true)\n",
      " |-- TAX: double (nullable = true)\n",
      " |-- PTRATIO: double (nullable = true)\n",
      " |-- B: double (nullable = true)\n",
      " |-- LSTAT: double (nullable = true)\n",
      " |-- price: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Check the inferred column types.\n",
    "data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-----+\n",
      "|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|   DIS|RAD|  TAX|PTRATIO|    B|LSTAT|price|\n",
      "+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-----+\n",
      "|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2|  4.09|1.0|296.0|   15.3|396.9| 4.98| 24.0|\n",
      "|0.02731| 0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0|   17.8|396.9| 9.14| 21.6|\n",
      "+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-----+\n",
      "only showing top 2 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the first two rows.\n",
    "data.show(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the CSV columns into the single vector column that Spark ML expects.\n",
    "# Every numeric column except `price` is a feature; `price` is the target.\n",
    "\n",
    "input_cols_ = [\n",
    "    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',\n",
    "    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',\n",
    "]\n",
    "\n",
    "# VectorAssembler packs the listed columns, in order, into one `features` vector.\n",
    "assembler = VectorAssembler(inputCols=input_cols_, outputCol='features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the assembler: the result keeps the original columns and gains\n",
    "# the assembled `features` vector column.\n",
    "output = assembler.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|            features|\n",
      "+--------------------+\n",
      "|[0.00632,18.0,2.3...|\n",
      "|[0.02731,0.0,7.07...|\n",
      "|[0.02729,0.0,7.07...|\n",
      "|[0.03237,0.0,2.18...|\n",
      "|[0.06905,0.0,2.18...|\n",
      "|[0.02985,0.0,2.18...|\n",
      "|[0.08829,12.5,7.8...|\n",
      "|[0.14455,12.5,7.8...|\n",
      "|[0.21124,12.5,7.8...|\n",
      "|[0.17004,12.5,7.8...|\n",
      "|[0.22489,12.5,7.8...|\n",
      "|[0.11747,12.5,7.8...|\n",
      "|[0.09378,12.5,7.8...|\n",
      "|[0.62976,0.0,8.14...|\n",
      "|[0.63796,0.0,8.14...|\n",
      "|[0.62739,0.0,8.14...|\n",
      "|[1.05393,0.0,8.14...|\n",
      "|[0.7842,0.0,8.14,...|\n",
      "|[0.80271,0.0,8.14...|\n",
      "|[0.7258,0.0,8.14,...|\n",
      "+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Inspect the assembled feature vectors.\n",
    "output.select('features').show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only what the model needs: the assembled `features` vector (input)\n",
    "# and `price` (the target to predict).\n",
    "final_data = output.select('features', 'price')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----+\n",
      "|            features|price|\n",
      "+--------------------+-----+\n",
      "|[0.00632,18.0,2.3...| 24.0|\n",
      "|[0.02731,0.0,7.07...| 21.6|\n",
      "|[0.02729,0.0,7.07...| 34.7|\n",
      "|[0.03237,0.0,2.18...| 33.4|\n",
      "|[0.06905,0.0,2.18...| 36.2|\n",
      "|[0.02985,0.0,2.18...| 28.7|\n",
      "|[0.08829,12.5,7.8...| 22.9|\n",
      "|[0.14455,12.5,7.8...| 27.1|\n",
      "|[0.21124,12.5,7.8...| 16.5|\n",
      "|[0.17004,12.5,7.8...| 18.9|\n",
      "|[0.22489,12.5,7.8...| 15.0|\n",
      "|[0.11747,12.5,7.8...| 18.9|\n",
      "|[0.09378,12.5,7.8...| 21.7|\n",
      "|[0.62976,0.0,8.14...| 20.4|\n",
      "|[0.63796,0.0,8.14...| 18.2|\n",
      "|[0.62739,0.0,8.14...| 19.9|\n",
      "|[1.05393,0.0,8.14...| 23.1|\n",
      "|[0.7842,0.0,8.14,...| 17.5|\n",
      "|[0.80271,0.0,8.14...| 20.2|\n",
      "|[0.7258,0.0,8.14,...| 18.2|\n",
      "+--------------------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the model-ready data: one feature vector and one target per row.\n",
    "final_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random ~70/30 train/test split. Rows are sampled independently, so the two\n",
    "# sizes are only approximately 70% and 30%. The fixed seed keeps the split\n",
    "# repeatable when the notebook is re-run on the same data.\n",
    "\n",
    "train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "all data count :  506\n",
      "train data count :  345\n",
      "test data count :  161\n"
     ]
    }
   ],
   "source": [
    "# Row counts of the full data set and of each split.\n",
    "print('all data count : ', final_data.count())\n",
    "print('train data count : ', train_data.count())\n",
    "print('test data count : ', test_data.count())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+\n",
      "|summary|             price|\n",
      "+-------+------------------+\n",
      "|  count|               345|\n",
      "|   mean|22.602898550724642|\n",
      "| stddev| 9.311339220842957|\n",
      "|    min|               5.0|\n",
      "|    max|              50.0|\n",
      "+-------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Summary statistics of the training split; only the numeric `price` column\n",
    "# is summarised (the vector-typed `features` column does not appear).\n",
    "train_data.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# New estimator for the housing data: the target column is `price`, while\n",
    "# featuresCol and predictionCol keep their defaults ('features', 'prediction').\n",
    "# Note: this rebinds `lr`, replacing the estimator configured in section 0.\n",
    "\n",
    "lr = LinearRegression(labelCol='price')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit on the training split only.\n",
    "lr_model = lr.fit(train_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate on the test split; the returned summary exposes the residuals,\n",
    "# r2 and rootMeanSquaredError used in the following cells.\n",
    "test_result = lr_model.evaluate(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|           residuals|\n",
      "+--------------------+\n",
      "|   2.820847914698863|\n",
      "|  10.092829367610015|\n",
      "|   7.383702839871042|\n",
      "|    5.77379335027215|\n",
      "|   1.439135362533321|\n",
      "|   6.360009488793956|\n",
      "|  -5.894733583181768|\n",
      "|  -3.771279882265347|\n",
      "|-0.10009484396961454|\n",
      "|    5.50501225227282|\n",
      "|  1.1319632754860187|\n",
      "|  -5.434888522994932|\n",
      "|   7.662804510616631|\n",
      "|-0.03649746518004804|\n",
      "| -6.7716019362626625|\n",
      "|  -3.257293842284902|\n",
      "| -0.5461996288341915|\n",
      "| -1.0521216424280162|\n",
      "|  -3.452964004804734|\n",
      "| -11.084224152768945|\n",
      "+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Residuals on the test split (label minus prediction), one row per test record.\n",
    "\n",
    "test_result.residuals.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "* r2 :  0.7225741051946737\n",
      "* rootMeanSquaredError :  4.712121102879971\n",
      "* coefficients :  [0.0073350710225801715,0.8313757584337543,-0.8095307954684084,2.441191686884721,0.5191713795290003,1.1534591903547016,-0.2989124112808717,-0.5128514186201779,-0.619712827067017,0.6956151804322931]\n",
      "* intercept :  0.14228558260358093\n"
     ]
    }
   ],
   "source": [
    "# Report the test-set metrics together with the parameters of the model that\n",
    "# produced them: `lr_model`, the regression fitted on the boston.csv training split.\n",
    "\n",
    "print('* r2 : ', test_result.r2)\n",
    "print('* rootMeanSquaredError : ', test_result.rootMeanSquaredError)\n",
    "\n",
    "print('* coefficients : ', lr_model.coefficients)\n",
    "print('* intercept : ', lr_model.intercept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+\n",
      "|summary|             price|\n",
      "+-------+------------------+\n",
      "|  count|               506|\n",
      "|   mean|22.532806324110698|\n",
      "| stddev| 9.197104087379815|\n",
      "|    min|               5.0|\n",
      "|    max|              50.0|\n",
      "+-------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Put the test metrics in context using the target's overall distribution:\n",
    "# 1) compare rootMeanSquaredError with the mean and stddev of `price` shown below\n",
    "#    (an RMSE well under the stddev means the model beats always predicting the mean);\n",
    "# 2) r2 is the share of the variance in `price` that the model explains\n",
    "#    (roughly 0.7 in the recorded runs, leaving about 30% unexplained).\n",
    "# Exact figures change from run to run unless the random split is seeded.\n",
    "\n",
    "final_data.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the target so only the feature vectors of the test rows remain\n",
    "# (rows that were not part of the split the model was fitted on).\n",
    "unlabeled_data = test_data.select('features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|            features|\n",
      "+--------------------+\n",
      "|[0.01301,35.0,1.5...|\n",
      "|[0.01381,80.0,0.4...|\n",
      "|[0.01538,90.0,3.7...|\n",
      "|[0.01709,90.0,2.0...|\n",
      "|[0.01965,80.0,1.7...|\n",
      "|[0.02177,82.5,2.0...|\n",
      "|[0.02498,0.0,1.89...|\n",
      "|[0.02731,0.0,7.07...|\n",
      "|[0.02763,75.0,2.9...|\n",
      "|[0.02899,40.0,1.2...|\n",
      "|[0.03113,0.0,4.39...|\n",
      "|[0.03445,82.5,2.0...|\n",
      "|[0.0351,95.0,2.68...|\n",
      "|[0.03548,80.0,3.6...|\n",
      "|[0.03584,80.0,3.3...|\n",
      "|[0.03615,80.0,4.9...|\n",
      "|[0.03738,0.0,5.19...|\n",
      "|[0.04297,52.5,5.3...|\n",
      "|[0.0456,0.0,13.89...|\n",
      "|[0.04741,0.0,11.9...|\n",
      "+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the feature-only test rows.\n",
    "unlabeled_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+------------------+\n",
      "|            features|        prediction|\n",
      "+--------------------+------------------+\n",
      "|[0.01301,35.0,1.5...| 29.87915208530114|\n",
      "|[0.01381,80.0,0.4...|39.907170632389985|\n",
      "|[0.01538,90.0,3.7...| 36.61629716012896|\n",
      "|[0.01709,90.0,2.0...| 24.32620664972785|\n",
      "|[0.01965,80.0,1.7...| 18.66086463746668|\n",
      "|[0.02177,82.5,2.0...| 35.93999051120604|\n",
      "|[0.02498,0.0,1.89...|22.394733583181768|\n",
      "|[0.02731,0.0,7.07...| 25.37127988226535|\n",
      "|[0.02763,75.0,2.9...|30.900094843969615|\n",
      "|[0.02899,40.0,1.2...| 21.09498774772718|\n",
      "|[0.03113,0.0,4.39...| 16.36803672451398|\n",
      "|[0.03445,82.5,2.0...|29.534888522994933|\n",
      "|[0.0351,95.0,2.68...| 40.83719548938337|\n",
      "|[0.03548,80.0,3.6...|20.936497465180047|\n",
      "|[0.03584,80.0,3.3...|30.271601936262662|\n",
      "|[0.03615,80.0,4.9...|  31.1572938422849|\n",
      "|[0.03738,0.0,5.19...| 21.24619962883419|\n",
      "|[0.04297,52.5,5.3...|25.852121642428017|\n",
      "|[0.0456,0.0,13.89...|26.752964004804735|\n",
      "|[0.04741,0.0,11.9...|22.984224152768945|\n",
      "+--------------------+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Predict prices for the test feature vectors; `transform` appends a\n",
    "# `prediction` column to the input rows.\n",
    "predictions = lr_model.transform(unlabeled_data)\n",
    "\n",
    "predictions.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# end of 11.37 \n",
    "# next :  11.38 "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Environment (conda_pyspark_)",
   "language": "python",
   "name": "conda_pyspark_"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}