{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Section 11 : Linear Regression " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# course 11.35" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from pyspark.sql import SparkSession\n", "from pyspark.ml.regression import LinearRegression" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "spark = SparkSession.builder.appName('linear_regression').getOrCreate()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# load the data\n", "# https://github.com/yennanliu/analysis/blob/master/SPARK_/sample_linear_regression_data.txt\n", "\n", "training = spark.read.format('libsvm').load('sample_linear_regression_data.txt')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-9.490009878824548 1:0.4551273600657362 2:0.36644694351969087 3:-0.38256108933468047 4:-0.4458430198517267 5:0.33109790358914726 6:0.8067445293443565 7:-0.2624341731773887 8:-0.44850386111659524 9:-0.07269284838169332 10:0.5658035575800715\r\n", "0.2577820163584905 1:0.8386555657374337 2:-0.1270180511534269 3:0.499812362510895 4:-0.22686625128130267 5:-0.6452430441812433 6:0.18869982177936828 7:-0.5804648622673358 8:0.651931743775642 9:-0.6555641246242951 10:0.17485476357259122\r\n", "-4.438869807456516 1:0.5025608135349202 2:0.14208069682973434 3:0.16004976900412138 4:0.505019897181302 5:-0.9371635223468384 6:-0.2841601610457427 7:0.6355938616712786 8:-0.1646249064941625 9:0.9480713629917628 10:0.42681251564645817\r\n", "-19.782762789614537 1:-0.0388509668871313 2:-0.4166870051763918 3:0.8997202693189332 4:0.6409836467726933 5:0.273289095712564 6:-0.26175701211620517 7:-0.2794902492677298 8:-0.1306778297187794 9:-0.08536581111046115 
10:-0.05462315824828923\r\n", "-7.966593841555266 1:-0.06195495876886281 2:0.6546448480299902 3:-0.6979368909424835 4:0.6677324708883314 5:-0.07938725467767771 6:-0.43885601665437957 7:-0.608071585153688 8:-0.6414531182501653 9:0.7313735926547045 10:-0.026818676347611925\r\n" ] } ], "source": [ "# take a look on the original data\n", "\n", "!head -5 sample_linear_regression_data.txt" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataFrame[label: double, features: vector]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------------+--------------------+\n", "| label| features|\n", "+-------------------+--------------------+\n", "| -9.490009878824548|(10,[0,1,2,3,4,5,...|\n", "| 0.2577820163584905|(10,[0,1,2,3,4,5,...|\n", "| -4.438869807456516|(10,[0,1,2,3,4,5,...|\n", "|-19.782762789614537|(10,[0,1,2,3,4,5,...|\n", "| -7.966593841555266|(10,[0,1,2,3,4,5,...|\n", "| -7.896274316726144|(10,[0,1,2,3,4,5,...|\n", "| -8.464803554195287|(10,[0,1,2,3,4,5,...|\n", "| 2.1214592666251364|(10,[0,1,2,3,4,5,...|\n", "| 1.0720117616524107|(10,[0,1,2,3,4,5,...|\n", "|-13.772441561702871|(10,[0,1,2,3,4,5,...|\n", "| -5.082010756207233|(10,[0,1,2,3,4,5,...|\n", "| 7.887786536531237|(10,[0,1,2,3,4,5,...|\n", "| 14.323146365332388|(10,[0,1,2,3,4,5,...|\n", "|-20.057482615789212|(10,[0,1,2,3,4,5,...|\n", "|-0.8995693247765151|(10,[0,1,2,3,4,5,...|\n", "| -19.16829262296376|(10,[0,1,2,3,4,5,...|\n", "| 5.601801561245534|(10,[0,1,2,3,4,5,...|\n", "|-3.2256352187273354|(10,[0,1,2,3,4,5,...|\n", "| 1.5299675726687754|(10,[0,1,2,3,4,5,...|\n", "| -0.250102447941961|(10,[0,1,2,3,4,5,...|\n", "+-------------------+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# HERE YOU CAN SEE THE DATASET IS WITH 
\"LABEL\" AND \"FEATURES\" COLUMNS\n", "# WHICH IS THE DEFAULT SCHEMA THAT SPARK ML-LIB CAN RUN MODELING WITH \n", "##### label: double, features: vector #####\n", "\n", "training.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 0) simple training " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# create the model; the output column is named 'prediction' (correctly spelled),\n", "# which is also the name Spark ML uses when predictionCol is not given\n", "\n", "lr = LinearRegression(featuresCol='features',\n", "                      labelCol='label',\n", "                      predictionCol='prediction')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# train the model \n", "lrModel = lr.fit(training)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "coefficients : [0.0073350710225801715,0.8313757584337543,-0.8095307954684084,2.441191686884721,0.5191713795290003,1.1534591903547016,-0.2989124112808717,-0.5128514186201779,-0.619712827067017,0.6956151804322931]\n", "intercept : 0.14228558260358093\n" ] } ], "source": [ "# print output \n", "\n", "print ('coefficients : ', lrModel.coefficients)\n", "print ('intercept : ', lrModel.intercept)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "r2 : 0.027839179518600154\n", "rootMeanSquaredError : 10.16309157133015\n" ] } ], "source": [ "# use the training summary (metrics computed on the data the model was fitted on)\n", "\n", "training_summary = lrModel.summary\n", "print ('r2 : ', training_summary.r2)\n", "print ('rootMeanSquaredError : ', training_summary.rootMeanSquaredError)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1) train - test data split and prediction " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "all_data = spark.read.format('libsvm').load('sample_linear_regression_data.txt')\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# random 
split \n", "# a fixed seed makes the 70/30 split repeatable when the notebook is re-run\n", "\n", "train_data, test_data = all_data.randomSplit([.7,.3], seed=42)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "all data count : 501\n", "train data count : 365\n", "test data count : 136\n" ] } ], "source": [ "print ( 'all data count : ', all_data.count())\n", "print ( 'train data count : ', train_data.count())\n", "print ( 'test data count : ', test_data.count())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# train again only on train data, so the test split stays unseen by the model\n", "# (fitting on `training`, the full dataset, would leak the test rows into the fit)\n", "correct_model = lr.fit(train_data)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# test on test data \n", "\n", "test_results = correct_model.evaluate(test_data)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "r2 : 0.08455715143231024\n", "rootMeanSquaredError : 9.691326744797616\n" ] } ], "source": [ "# print the results (test data)\n", "\n", "# these metrics come from evaluating the model on the test split, not from the training summary\n", "\n", "print ('r2 : ', test_results.r2)\n", "print ('rootMeanSquaredError : ', test_results.rootMeanSquaredError)\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "unlabeled_data = test_data.select('features')" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| features|\n", "+--------------------+\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", 
"|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "|(10,[0,1,2,3,4,5,...|\n", "+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# the test features without their labels, used as input for the following prediction \n", "unlabeled_data.show()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# MAKE PREDICTION \n", "# NOTE : IN SPARK ML-LIB, PREDICT IS RUN BY \"transform\" COMMAND \n", "\n", "predictions = correct_model.transform(unlabeled_data)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+--------------------+\n", "| features| prediciton|\n", "+--------------------+--------------------+\n", "|(10,[0,1,2,3,4,5,...| -3.5124943764463135|\n", "|(10,[0,1,2,3,4,5,...| -3.147868811718382|\n", "|(10,[0,1,2,3,4,5,...| -2.499423280435292|\n", "|(10,[0,1,2,3,4,5,...| 1.7010353768556734|\n", "|(10,[0,1,2,3,4,5,...| -0.5388564818088987|\n", "|(10,[0,1,2,3,4,5,...| -1.475284763550391|\n", "|(10,[0,1,2,3,4,5,...| -0.7489108841213971|\n", "|(10,[0,1,2,3,4,5,...| -2.508322852836744|\n", "|(10,[0,1,2,3,4,5,...| -0.976510689078842|\n", "|(10,[0,1,2,3,4,5,...| -0.9566138722165072|\n", "|(10,[0,1,2,3,4,5,...| 3.7236186142728274|\n", "|(10,[0,1,2,3,4,5,...| 1.2421598960943985|\n", "|(10,[0,1,2,3,4,5,...| -0.7195663865895121|\n", "|(10,[0,1,2,3,4,5,...| -1.780965034607929|\n", "|(10,[0,1,2,3,4,5,...|-0.06740884917840151|\n", "|(10,[0,1,2,3,4,5,...| 2.746996971787099|\n", "|(10,[0,1,2,3,4,5,...| 0.5789191740943999|\n", "|(10,[0,1,2,3,4,5,...| -1.2048075065353916|\n", "|(10,[0,1,2,3,4,5,...| -0.6964026254414395|\n", "|(10,[0,1,2,3,4,5,...| -3.0756131143558623|\n", "+--------------------+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# show prediction \n", "predictions.show()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": 
{}, "outputs": [], "source": [ "# end of 11.35 \n", "# next : 11.36" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2) Load a CSV and transform it into data that can be trained by Spark MLlib" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from pyspark.ml.linalg import Vectors\n", "from pyspark.ml.feature import VectorAssembler" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "data = spark.read.csv('boston.csv', inferSchema=True, header=True)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['CRIM',\n", " 'ZN',\n", " 'INDUS',\n", " 'CHAS',\n", " 'NOX',\n", " 'RM',\n", " 'AGE',\n", " 'DIS',\n", " 'RAD',\n", " 'TAX',\n", " 'PTRATIO',\n", " 'B',\n", " 'LSTAT',\n", " 'price']" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.columns" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- CRIM: double (nullable = true)\n", " |-- ZN: double (nullable = true)\n", " |-- INDUS: double (nullable = true)\n", " |-- CHAS: double (nullable = true)\n", " |-- NOX: double (nullable = true)\n", " |-- RM: double (nullable = true)\n", " |-- AGE: double (nullable = true)\n", " |-- DIS: double (nullable = true)\n", " |-- RAD: double (nullable = true)\n", " |-- TAX: double (nullable = true)\n", " |-- PTRATIO: double (nullable = true)\n", " |-- B: double (nullable = true)\n", " |-- LSTAT: double (nullable = true)\n", " |-- price: double (nullable = true)\n", "\n" ] } ], "source": [ "data.printSchema()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-----+\n", "| CRIM| ZN|INDUS|CHAS| NOX| RM| AGE| DIS|RAD| 
TAX|PTRATIO| B|LSTAT|price|\n", "+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-----+\n", "|0.00632|18.0| 2.31| 0.0|0.538|6.575|65.2| 4.09|1.0|296.0| 15.3|396.9| 4.98| 24.0|\n", "|0.02731| 0.0| 7.07| 0.0|0.469|6.421|78.9|4.9671|2.0|242.0| 17.8|396.9| 9.14| 21.6|\n", "+-------+----+-----+----+-----+-----+----+------+---+-----+-------+-----+-----+-----+\n", "only showing top 2 rows\n", "\n" ] } ], "source": [ "data.show(2)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# -- transform the csv columns into a feature vector that Spark MLlib can consume --\n", "# all numerical cols are used as features (except price); price is the target to predict\n", "\n", "input_cols_ = ['CRIM',\n", "               'ZN',\n", "               'INDUS',\n", "               'CHAS',\n", "               'NOX',\n", "               'RM',\n", "               'AGE',\n", "               'DIS',\n", "               'RAD',\n", "               'TAX',\n", "               'PTRATIO',\n", "               'B',\n", "               'LSTAT']\n", "\n", "# VectorAssembler merges input_cols_ into one vector column named 'features' (used by the ML steps below)\n", "assembler = VectorAssembler(inputCols=input_cols_, outputCol='features')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# run the transformation \n", "output = assembler.transform(data)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| features|\n", "+--------------------+\n", "|[0.00632,18.0,2.3...|\n", "|[0.02731,0.0,7.07...|\n", "|[0.02729,0.0,7.07...|\n", "|[0.03237,0.0,2.18...|\n", "|[0.06905,0.0,2.18...|\n", "|[0.02985,0.0,2.18...|\n", 
"|[0.08829,12.5,7.8...|\n", "|[0.14455,12.5,7.8...|\n", "|[0.21124,12.5,7.8...|\n", "|[0.17004,12.5,7.8...|\n", "|[0.22489,12.5,7.8...|\n", "|[0.11747,12.5,7.8...|\n", "|[0.09378,12.5,7.8...|\n", "|[0.62976,0.0,8.14...|\n", "|[0.63796,0.0,8.14...|\n", "|[0.62739,0.0,8.14...|\n", "|[1.05393,0.0,8.14...|\n", "|[0.7842,0.0,8.14,...|\n", "|[0.80271,0.0,8.14...|\n", "|[0.7258,0.0,8.14,...|\n", "+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# show the transformed feature col \n", "output.select('features').show()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# create the final data for training (features as input, price as the value to predict)\n", "final_data = output.select('features', 'price')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+-----+\n", "| features|price|\n", "+--------------------+-----+\n", "|[0.00632,18.0,2.3...| 24.0|\n", "|[0.02731,0.0,7.07...| 21.6|\n", "|[0.02729,0.0,7.07...| 34.7|\n", "|[0.03237,0.0,2.18...| 33.4|\n", "|[0.06905,0.0,2.18...| 36.2|\n", "|[0.02985,0.0,2.18...| 28.7|\n", "|[0.08829,12.5,7.8...| 22.9|\n", "|[0.14455,12.5,7.8...| 27.1|\n", "|[0.21124,12.5,7.8...| 16.5|\n", "|[0.17004,12.5,7.8...| 18.9|\n", "|[0.22489,12.5,7.8...| 15.0|\n", "|[0.11747,12.5,7.8...| 18.9|\n", "|[0.09378,12.5,7.8...| 21.7|\n", "|[0.62976,0.0,8.14...| 20.4|\n", "|[0.63796,0.0,8.14...| 18.2|\n", "|[0.62739,0.0,8.14...| 19.9|\n", "|[1.05393,0.0,8.14...| 23.1|\n", "|[0.7842,0.0,8.14,...| 17.5|\n", "|[0.80271,0.0,8.14...| 20.2|\n", "|[0.7258,0.0,8.14,...| 18.2|\n", "+--------------------+-----+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "final_data.show()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# train, test split; a fixed seed makes the 70/30 split repeatable when the notebook is re-run\n", "\n", "train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)" ] }, { 
"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "all data count : 506\n", "train data count : 345\n", "test data count : 161\n" ] } ], "source": [ "print ( 'all data count : ', final_data.count())\n", "print ( 'train data count : ', train_data.count())\n", "print ( 'test data count : ', test_data.count())" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+\n", "|summary| price|\n", "+-------+------------------+\n", "| count| 345|\n", "| mean|22.602898550724642|\n", "| stddev| 9.311339220842957|\n", "| min| 5.0|\n", "| max| 50.0|\n", "+-------+------------------+\n", "\n" ] } ], "source": [ "train_data.describe().show()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# now, training with linear regression again \n", "\n", "lr = LinearRegression(labelCol='price')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "lr_model = lr.fit(train_data)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "test_result = lr_model.evaluate(test_data)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+\n", "| residuals|\n", "+--------------------+\n", "| 2.820847914698863|\n", "| 10.092829367610015|\n", "| 7.383702839871042|\n", "| 5.77379335027215|\n", "| 1.439135362533321|\n", "| 6.360009488793956|\n", "| -5.894733583181768|\n", "| -3.771279882265347|\n", "|-0.10009484396961454|\n", "| 5.50501225227282|\n", "| 1.1319632754860187|\n", "| -5.434888522994932|\n", "| 7.662804510616631|\n", "|-0.03649746518004804|\n", "| -6.7716019362626625|\n", "| -3.257293842284902|\n", "| -0.5461996288341915|\n", "| -1.0521216424280162|\n", "| -3.452964004804734|\n", 
"| -11.084224152768945|\n", "+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# print the test data fitting results \n", "\n", "test_result.residuals.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# print the test data fitting results, then the parameters of the model fitted on this data.\n", "# NOTE: use lr_model (fitted above with labelCol='price'); lrModel is the earlier model from\n", "# section 0, trained on the 10-feature libsvm sample, so its coefficients do not belong here.\n", "\n", "print ('* r2 : ', test_result.r2)\n", "print ('* rootMeanSquaredError : ', test_result.rootMeanSquaredError)\n", "\n", "\n", "print ('* coefficients : ', lr_model.coefficients)\n", "print ('* intercept : ', lr_model.intercept)\n" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+------------------+\n", "|summary| price|\n", "+-------+------------------+\n", "| count| 506|\n", "| mean|22.532806324110698|\n", "| stddev| 9.197104087379815|\n", "| min| 5.0|\n", "| max| 50.0|\n", "+-------+------------------+\n", "\n" ] } ], "source": [ "# compare with the input data \n", "\n", "# 1) rootMeanSquaredError on the test split (about 4.5 - 4.7 in recorded runs) VS mean price : about 22.53\n", "# 2) r2 of about 0.72 - 0.75 --> means the model only explains ~ 72-75% of the variance of the data, which is not a very good model \n", "\n", "final_data.describe().show()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# run the trained model with test data ( the data the model hasn't seen before)\n", "unlabeled_data = test_data.select('features')" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "+--------------------+\n", "| features|\n", "+--------------------+\n", "|[0.01301,35.0,1.5...|\n", "|[0.01381,80.0,0.4...|\n", "|[0.01538,90.0,3.7...|\n", "|[0.01709,90.0,2.0...|\n", "|[0.01965,80.0,1.7...|\n", "|[0.02177,82.5,2.0...|\n", "|[0.02498,0.0,1.89...|\n", "|[0.02731,0.0,7.07...|\n", "|[0.02763,75.0,2.9...|\n", "|[0.02899,40.0,1.2...|\n", "|[0.03113,0.0,4.39...|\n", "|[0.03445,82.5,2.0...|\n", "|[0.0351,95.0,2.68...|\n", "|[0.03548,80.0,3.6...|\n", "|[0.03584,80.0,3.3...|\n", "|[0.03615,80.0,4.9...|\n", "|[0.03738,0.0,5.19...|\n", "|[0.04297,52.5,5.3...|\n", "|[0.0456,0.0,13.89...|\n", "|[0.04741,0.0,11.9...|\n", "+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "unlabeled_data.show()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------------------+------------------+\n", "| features| prediction|\n", "+--------------------+------------------+\n", "|[0.01301,35.0,1.5...| 29.87915208530114|\n", "|[0.01381,80.0,0.4...|39.907170632389985|\n", "|[0.01538,90.0,3.7...| 36.61629716012896|\n", "|[0.01709,90.0,2.0...| 24.32620664972785|\n", "|[0.01965,80.0,1.7...| 18.66086463746668|\n", "|[0.02177,82.5,2.0...| 35.93999051120604|\n", "|[0.02498,0.0,1.89...|22.394733583181768|\n", "|[0.02731,0.0,7.07...| 25.37127988226535|\n", "|[0.02763,75.0,2.9...|30.900094843969615|\n", "|[0.02899,40.0,1.2...| 21.09498774772718|\n", "|[0.03113,0.0,4.39...| 16.36803672451398|\n", "|[0.03445,82.5,2.0...|29.534888522994933|\n", "|[0.0351,95.0,2.68...| 40.83719548938337|\n", "|[0.03548,80.0,3.6...|20.936497465180047|\n", "|[0.03584,80.0,3.3...|30.271601936262662|\n", "|[0.03615,80.0,4.9...| 31.1572938422849|\n", "|[0.03738,0.0,5.19...| 21.24619962883419|\n", "|[0.04297,52.5,5.3...|25.852121642428017|\n", "|[0.0456,0.0,13.89...|26.752964004804735|\n", "|[0.04741,0.0,11.9...|22.984224152768945|\n", 
"+--------------------+------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# run the prediction \n", "predictions = lr_model.transform(unlabeled_data)\n", "\n", "predictions.show()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "# end of 11.37 \n", "# next : 11.38 " ] } ], "metadata": { "kernelspec": { "display_name": "Environment (conda_pyspark_)", "language": "python", "name": "conda_pyspark_" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.5" } }, "nbformat": 4, "nbformat_minor": 2 }