{"cells":[{"cell_type":"markdown","source":["In this project, we aim to predict whether a dog food batch was spoiled. The data description is as follows (in the CSV, the preservative columns are named `A`, `B`, `C` and `D`):\n\n* Pres_A : Percentage of preservative A in the mix\n* Pres_B : Percentage of preservative B in the mix\n* Pres_C : Percentage of preservative C in the mix\n* Pres_D : Percentage of preservative D in the mix\n* Spoiled: Label indicating whether or not the dog food batch was spoiled."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0c2f889d-caf0-4e98-899a-c184a7a25cfa"}}},{"cell_type":"code","source":["from pyspark.sql import SparkSession\nfrom pyspark.ml.linalg import Vectors\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.classification import RandomForestClassifier,DecisionTreeClassifier"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"449f36c8-2120-43ac-9332-dd13a6fb2544"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["spark = SparkSession.builder.appName('dogfood').getOrCreate()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"226f5f4a-19d1-4d7c-a025-c95491acdc20"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data = spark.read.csv(\"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/dog_food.csv\",inferSchema=True,header=True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"70bd7e9a-45e4-4fd3-9f4e-3d1cca5732a6"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["data.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"794a884d-5e64-45ed-b606-cdc7a504367c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- A: integer (nullable = true)\n |-- B: integer (nullable = 
true)\n |-- C: double (nullable = true)\n |-- D: integer (nullable = true)\n |-- Spoiled: double (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- A: integer (nullable = true)\n |-- B: integer (nullable = true)\n |-- C: double (nullable = true)\n |-- D: integer (nullable = true)\n |-- Spoiled: double (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["data.head()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2ac18e97-a2f2-41b8-bd83-ef963ac6c643"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[5]: Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[5]: Row(A=4, B=2, C=12.0, D=3, Spoiled=1.0)"]}}],"execution_count":0},{"cell_type":"code","source":["data.describe().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a435aa05-7818-4be1-bea7-90897484310f"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+------------------+------------------+------------------+------------------+-------------------+\n|summary| A| B| C| D| Spoiled|\n+-------+------------------+------------------+------------------+------------------+-------------------+\n| count| 490| 490| 490| 490| 490|\n| mean| 5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|\n| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|\n| min| 1| 1| 5.0| 1| 0.0|\n| max| 10| 10| 14.0| 10| 
1.0|\n+-------+------------------+------------------+------------------+------------------+-------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+------------------+------------------+------------------+------------------+-------------------+\n|summary| A| B| C| D| Spoiled|\n+-------+------------------+------------------+------------------+------------------+-------------------+\n| count| 490| 490| 490| 490| 490|\n| mean| 5.53469387755102| 5.504081632653061| 9.126530612244897| 5.579591836734694| 0.2857142857142857|\n| stddev|2.9515204234399057|2.8537966089662063|2.0555451971054275|2.8548369309982857|0.45221563164613465|\n| min| 1| 1| 5.0| 1| 0.0|\n| max| 10| 10| 14.0| 10| 1.0|\n+-------+------------------+------------------+------------------+------------------+-------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["data.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b719519e-4596-46c5-bd0f-d4ab1fc47b86"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[8]: ['A', 'B', 'C', 'D', 'Spoiled']","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[8]: ['A', 'B', 'C', 'D', 'Spoiled']"]}}],"execution_count":0},{"cell_type":"code","source":["assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol=\"features\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2eef836a-1606-47ad-8b45-72f9a5e6c1a1"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["output = 
assembler.transform(data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5c36bff7-06d6-46a2-91ef-8814e4f7d2bb"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["rfc = DecisionTreeClassifier(labelCol='Spoiled',featuresCol='features')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ee06eee0-b6b1-4c52-932d-9de823895ae1"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["output.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"41952c00-3fed-4ee9-a299-1f3359a12c5f"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- A: integer (nullable = true)\n |-- B: integer (nullable = true)\n |-- C: double (nullable = true)\n |-- D: integer (nullable = true)\n |-- Spoiled: double (nullable = true)\n |-- features: vector (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- A: integer (nullable = true)\n |-- B: integer (nullable = true)\n |-- C: double (nullable = true)\n |-- D: integer (nullable = true)\n |-- Spoiled: double (nullable = true)\n |-- features: vector (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["final_data = output.select('features','Spoiled')\nfinal_data.head()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5ec0fa03-1469-4736-875b-862e8b6287c4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[14]: Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), 
Spoiled=1.0)","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[14]: Row(features=DenseVector([4.0, 2.0, 12.0, 3.0]), Spoiled=1.0)"]}}],"execution_count":0},{"cell_type":"code","source":["rfc_model = rfc.fit(final_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"908bc951-75c7-4d45-ad0d-b5d182dbab62"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["rfc_model.featureImportances"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"dc98d107-3369-4d52-b5a3-c0f220edfeed"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[16]: SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[16]: SparseVector(4, {1: 0.0019, 2: 0.9832, 3: 0.0149})"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"ml-tree-model-case","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":3598965581908961}},"nbformat":4,"nbformat_minor":0}