{"cells":[{"cell_type":"markdown","source":["In this project, we aim to identify 3 potential hackers based on the information recorded about the hacks. The data is described as follows:\n\n* 'Session_Connection_Time': How long the session lasted, in minutes\n* 'Bytes Transferred': Number of MB transferred during the session\n* 'Kali_Trace_Used': Indicates whether the hacker was using Kali Linux\n* 'Servers_Corrupted': Number of servers corrupted during the attack\n* 'Pages_Corrupted': Number of pages illegally accessed\n* 'Location': Location the attack came from (probably useless because the hackers used VPNs)\n* 'WPM_Typing_Speed': Their estimated typing speed (words per minute), based on session logs."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bcd41a8c-8b70-46a0-881a-22f8997706b5"}}},{"cell_type":"code","source":["from pyspark.sql import SparkSession\nfrom pyspark.ml.clustering import KMeans\nfrom pyspark.ml.linalg import Vectors\nfrom pyspark.ml.feature import VectorAssembler\nfrom pyspark.ml.feature import StandardScaler\nfrom pyspark.ml.evaluation import ClusteringEvaluator"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"22cc489a-1a8e-4699-a2cd-ed19bece4a5c"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["spark = SparkSession.builder.appName('hack_find').getOrCreate()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"71e28f9e-b987-4c02-b0b1-92fe2ec55722"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dataset = 
spark.read.csv(\"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/hack_data.csv\",header=True,inferSchema=True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"58c23b05-3529-4fcd-8d36-c8e7bf0c676e"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dataset.head()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"79466993-ccfc-4cf7-9c00-e2dc561da4ff"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[3]: Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[3]: Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)"]}}],"execution_count":0},{"cell_type":"code","source":["dataset.describe().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"307808bf-3e34-4cc2-950f-ba8809256597"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+\n|summary|Session_Connection_Time| Bytes Transferred| Kali_Trace_Used|Servers_Corrupted| Pages_Corrupted| Location| WPM_Typing_Speed|\n+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+\n| count| 334| 334| 334| 334| 334| 334| 334|\n| mean| 30.008982035928145| 
607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413| null|57.342395209580864|\n| stddev| 14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697| 3.06352633036022| null| 13.41106336843464|\n| min| 1.0| 10.0| 0| 1.0| 6.0|Afghanistan| 40.0|\n| max| 60.0| 1330.5| 1| 10.0| 15.0| Zimbabwe| 75.0|\n+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+\n|summary|Session_Connection_Time| Bytes Transferred| Kali_Trace_Used|Servers_Corrupted| Pages_Corrupted| Location| WPM_Typing_Speed|\n+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+\n| count| 334| 334| 334| 334| 334| 334| 334|\n| mean| 30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413| null|57.342395209580864|\n| stddev| 14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697| 3.06352633036022| null| 13.41106336843464|\n| min| 1.0| 10.0| 0| 1.0| 6.0|Afghanistan| 40.0|\n| max| 60.0| 1330.5| 1| 10.0| 15.0| Zimbabwe| 75.0|\n+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["dataset.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c56322e7-4f99-4cf9-8e5a-d89875bcc321"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[6]: ['Session_Connection_Time',\n 'Bytes Transferred',\n 
'Kali_Trace_Used',\n 'Servers_Corrupted',\n 'Pages_Corrupted',\n 'Location',\n 'WPM_Typing_Speed']","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["Out[6]: ['Session_Connection_Time',\n 'Bytes Transferred',\n 'Kali_Trace_Used',\n 'Servers_Corrupted',\n 'Pages_Corrupted',\n 'Location',\n 'WPM_Typing_Speed']"]}}],"execution_count":0},{"cell_type":"code","source":["# Columns fed to the clustering model; 'Location' (a string column) is left out.\nfeat_cols = [\n    'Session_Connection_Time',\n    'Bytes Transferred',\n    'Kali_Trace_Used',\n    'Servers_Corrupted',\n    'Pages_Corrupted',\n    'WPM_Typing_Speed',\n]"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0d5d7de0-167e-41be-9232-9014cbb580b2"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Packs the selected columns into a single vector column named 'features'.\nvec_assembler = VectorAssembler(outputCol='features', inputCols=feat_cols)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"9b6afb90-429e-46ac-a048-899897d126a1"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Original columns are kept; the assembled 'features' column is appended.\nfinal_data = vec_assembler.transform(dataset)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"1eb5c7a3-2663-40db-b52a-9b039f9164c9"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Scale each feature to unit standard deviation without centering it.\nscaler = StandardScaler(\n    inputCol='features',\n    outputCol='scaledFeatures',\n    withMean=False,\n    withStd=True,\n)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3f2dd04e-4498-4242-b3a4-7e36abe3da46"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Learn the per-feature scaling statistics from the assembled data.\nscalerModel = scaler.fit(final_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bbc3c8fb-1fa3-4cbb-b084-2f0844ecfb7e"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["cluster_final_data = 
scalerModel.transform(final_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"cd91f3c9-b700-458d-b126-a5c52fe039a1"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Pin the seed so centroid initialisation, and therefore the cluster assignments, are reproducible across runs.\nkmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=42)\nkmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=42)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"96ecf1aa-27f0-439f-b610-fad9bb41f63a"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["model_k3 = kmeans3.fit(cluster_final_data)\nmodel_k2 = kmeans2.fit(cluster_final_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6cd23533-b9de-4762-82b1-87ed4d5293a7"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# NOTE: despite the 'wssse_' prefix these are the input rows with a 'prediction' (cluster id) column appended.\nwssse_k3 = model_k3.transform(cluster_final_data)\nwssse_k2 = model_k2.transform(cluster_final_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5629f60c-ce9a-48d4-a4ec-751f8d96683d"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["# Evaluate on the column KMeans was fitted on. ClusteringEvaluator defaults to featuresCol='features',\n# i.e. the unscaled vectors, which would score the clusters in a different feature space.\nevaluator = ClusteringEvaluator(featuresCol='scaledFeatures')"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f06cb1dd-10bc-4536-af06-5ebedbd1ed23"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["silhouette3 = evaluator.evaluate(wssse_k3)\nsilhouette2 = evaluator.evaluate(wssse_k2)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7ec904b1-a1c2-435f-8a70-4cf84f6060f1"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["print(\"With K=3\")\nprint(\"Silhouette with squared euclidean distance = \" + str(silhouette3))\nprint(\"With K=2\")\nprint(\"Silhouette with squared euclidean distance = \" + 
str(silhouette2))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"63eb81d7-d9e4-4e53-810b-74e94722b5ff"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"With K=3\nSilhouette with squared euclidean distance = 0.3068084951287429\nWith K=2\nSilhouette with squared euclidean distance = 0.6683623593283755\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["With K=3\nSilhouette with squared euclidean distance = 0.3068084951287429\nWith K=2\nSilhouette with squared euclidean distance = 0.6683623593283755\n"]}}],"execution_count":0},{"cell_type":"code","source":["wssse_k3.groupBy('prediction').count().show()\nwssse_k2.groupBy('prediction').count().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"b6f2e123-0a94-4b4f-aab0-a75a131032ce"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+-----+\n|prediction|count|\n+----------+-----+\n| 1| 83|\n| 2| 84|\n| 0| 167|\n+----------+-----+\n\n+----------+-----+\n|prediction|count|\n+----------+-----+\n| 1| 167|\n| 0| 167|\n+----------+-----+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+----------+-----+\n|prediction|count|\n+----------+-----+\n| 1| 83|\n| 2| 84|\n| 0| 167|\n+----------+-----+\n\n+----------+-----+\n|prediction|count|\n+----------+-----+\n| 1| 167|\n| 0| 167|\n+----------+-----+\n\n"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"ml-clustering-case","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":3150420838660164}},"nbformat":4,"nbformat_minor":0}