{"cells":[{"cell_type":"code","source":["from pyspark.sql import SparkSession\nspark = SparkSession.builder.appName('treecode').getOrCreate()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"758fc56e-691c-407b-a8f7-621387500ff2"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data = spark.read.csv(\"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/College.csv\",inferSchema=True,header=True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a7a241d9-203e-4e42-880c-75f57aff3298"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b5485f9c-39b4-48fc-be18-b44ca3482fc6"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- School: string (nullable = true)\n |-- Private: string (nullable = true)\n |-- Apps: integer (nullable = true)\n |-- Accept: integer (nullable = true)\n |-- Enroll: integer (nullable = true)\n |-- Top10perc: integer (nullable = true)\n |-- Top25perc: integer (nullable = true)\n |-- F_Undergrad: integer (nullable = true)\n |-- P_Undergrad: integer (nullable = true)\n |-- Outstate: integer (nullable = true)\n |-- Room_Board: integer (nullable = true)\n |-- Books: integer (nullable = true)\n |-- Personal: integer (nullable = true)\n |-- PhD: integer (nullable = true)\n |-- Terminal: integer (nullable = true)\n |-- S_F_Ratio: double (nullable = true)\n |-- perc_alumni: integer (nullable = true)\n |-- Expend: integer (nullable = true)\n |-- Grad_Rate: integer (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- School: string (nullable = true)\n |-- Private: string (nullable = true)\n |-- 
Apps: integer (nullable = true)\n |-- Accept: integer (nullable = true)\n |-- Enroll: integer (nullable = true)\n |-- Top10perc: integer (nullable = true)\n |-- Top25perc: integer (nullable = true)\n |-- F_Undergrad: integer (nullable = true)\n |-- P_Undergrad: integer (nullable = true)\n |-- Outstate: integer (nullable = true)\n |-- Room_Board: integer (nullable = true)\n |-- Books: integer (nullable = true)\n |-- Personal: integer (nullable = true)\n |-- PhD: integer (nullable = true)\n |-- Terminal: integer (nullable = true)\n |-- S_F_Ratio: double (nullable = true)\n |-- perc_alumni: integer (nullable = true)\n |-- Expend: integer (nullable = true)\n |-- Grad_Rate: integer (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["data.head()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bbb546bd-d353-45be-994a-7bd2a5ecff09"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[4]: Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[4]: Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.linalg import Vectors\nfrom pyspark.ml.feature import 
VectorAssembler"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"accd66f1-4d05-487c-b113-ac1cef307daf"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3b351c91-0ca8-4920-bfc1-3461ef98eaf5"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[6]: ['School',\n 'Private',\n 'Apps',\n 'Accept',\n 'Enroll',\n 'Top10perc',\n 'Top25perc',\n 'F_Undergrad',\n 'P_Undergrad',\n 'Outstate',\n 'Room_Board',\n 'Books',\n 'Personal',\n 'PhD',\n 'Terminal',\n 'S_F_Ratio',\n 'perc_alumni',\n 'Expend',\n 'Grad_Rate']","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[6]: ['School',\n 'Private',\n 'Apps',\n 'Accept',\n 'Enroll',\n 'Top10perc',\n 'Top25perc',\n 'F_Undergrad',\n 'P_Undergrad',\n 'Outstate',\n 'Room_Board',\n 'Books',\n 'Personal',\n 'PhD',\n 'Terminal',\n 'S_F_Ratio',\n 'perc_alumni',\n 'Expend',\n 'Grad_Rate']"]}}],"execution_count":0},{"cell_type":"code","source":["assembler = VectorAssembler(\n inputCols=['Apps',\n 'Accept',\n 'Enroll',\n 'Top10perc',\n 'Top25perc',\n 'F_Undergrad',\n 'P_Undergrad',\n 'Outstate',\n 'Room_Board',\n 'Books',\n 'Personal',\n 'PhD',\n 'Terminal',\n 'S_F_Ratio',\n 'perc_alumni',\n 'Expend',\n 'Grad_Rate'],\n outputCol=\"features\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"fc52f2b2-1fd7-46de-9b4b-a371438e518a"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["output = assembler.transform(data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1104187e-95dd-4385-9830-33c06bf3022b"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["from 
pyspark.ml.feature import StringIndexer"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6149a827-fa57-4766-86ba-49aab52526a9"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["indexer = StringIndexer(inputCol=\"Private\", outputCol=\"PrivateIndex\")\noutput_fixed = indexer.fit(output).transform(output)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"00fbc916-1ff8-43dd-a356-7334b4f38025"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["final_data = output_fixed.select(\"features\",'PrivateIndex')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"4d42505a-5143-4bf6-b50a-997ad8a29f47"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Fixed seed so the 70/30 train/test split (and the accuracies computed from it) can be repeated on the same input data.\ntrain_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3b5c4248-61de-406d-b399-7ec061b78d40"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier\nfrom pyspark.ml import Pipeline"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f6d38a46-12ce-470a-bfd5-d8fd0146cd28"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dtc = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='features')\nrfc = RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features')\ngbt = GBTClassifier(labelCol='PrivateIndex',featuresCol='features')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5d955c3f-763d-4377-8839-0332ec020a63"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dtc_model = dtc.fit(train_data)\nrfc_model = rfc.fit(train_data)\ngbt_model = 
gbt.fit(train_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"12bd699d-aaa8-4f42-832e-33980e0674f3"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dtc_predictions = dtc_model.transform(test_data)\nrfc_predictions = rfc_model.transform(test_data)\ngbt_predictions = gbt_model.transform(test_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"43271b34-56d3-4495-b070-951502fdfd2f"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.evaluation import MulticlassClassificationEvaluator"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b229509e-98de-465f-8562-9b596830c3df"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["acc_evaluator = MulticlassClassificationEvaluator(labelCol=\"PrivateIndex\", predictionCol=\"prediction\", metricName=\"accuracy\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"345d8160-aa9d-4941-b3ed-d50281956e8a"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dtc_acc = acc_evaluator.evaluate(dtc_predictions)\nrfc_acc = acc_evaluator.evaluate(rfc_predictions)\ngbt_acc = acc_evaluator.evaluate(gbt_predictions)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"87cf17ce-bdae-43ad-aad3-11c33010c915"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["print(\"Results:\")\nprint('A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))\nprint('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))\nprint('A ensemble using GBT had an accuracy of: 
{0:2.2f}%'.format(gbt_acc*100))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3b79923b-e1f2-4495-9d2f-c89ba6587fae"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Results:\nA single decision tree had an accuracy of: 91.39%\nA random forest ensemble had an accuracy of: 95.69%\nA ensemble using GBT had an accuracy of: 94.74%\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Results:\nA single decision tree had an accuracy of: 91.39%\nA random forest ensemble had an accuracy of: 95.69%\nA ensemble using GBT had an accuracy of: 94.74%\n"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"ml-tree-model","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":3598965581908934}},"nbformat":4,"nbformat_minor":0}