{"cells":[{"cell_type":"markdown","source":["GroupBy and Aggregate F"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"60b0d1e0-b304-42c9-bf5f-60577d8afa3b"}}},{"cell_type":"code","source":["from pyspark.sql import SparkSession"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"288bf066-fe86-41e7-89e5-b80601294e87"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["spark = SparkSession.builder.appName(\"groupbyagg\").getOrCreate()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c418a2f7-cad7-4b8b-ab81-03155b5cecb1"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["df = spark.read.csv(\"dbfs:/FileStore/shared_uploads/dizhen@hsph.harvard.edu/sales_info.csv\",inferSchema=True,header=True)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f2cacfc2-3efc-45ff-aa39-788fe2fd30d0"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"7a9c49e3-2882-4641-8664-86d00d01bf4e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- Company: string (nullable = true)\n |-- Person: string (nullable = true)\n |-- Sales: double (nullable = true)\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["root\n |-- Company: string (nullable = true)\n |-- Person: string (nullable = true)\n |-- Sales: double (nullable = true)\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"bc3d6a46-8289-42bc-8286-f8dc5c4d45e8"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+-------+-----+\n|Company| Person|Sales|\n+-------+-------+-----+\n| GOOG| Sam|200.0|\n| GOOG|Charlie|120.0|\n| GOOG| Frank|340.0|\n| MSFT| Tina|600.0|\n| MSFT| Amy|124.0|\n| MSFT|Vanessa|243.0|\n| FB| Carl|870.0|\n| FB| Sarah|350.0|\n| APPL| John|250.0|\n| APPL| Linda|130.0|\n| APPL| Mike|750.0|\n| APPL| Chris|350.0|\n+-------+-------+-----+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+-------+-----+\n|Company| Person|Sales|\n+-------+-------+-----+\n| GOOG| Sam|200.0|\n| GOOG|Charlie|120.0|\n| GOOG| Frank|340.0|\n| MSFT| Tina|600.0|\n| MSFT| Amy|124.0|\n| MSFT|Vanessa|243.0|\n| FB| Carl|870.0|\n| FB| Sarah|350.0|\n| APPL| John|250.0|\n| APPL| Linda|130.0|\n| APPL| Mike|750.0|\n| APPL| Chris|350.0|\n+-------+-------+-----+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.groupBy(\"Company\").count().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"963ce2aa-c0e8-4bb0-a095-80ce2c4ba148"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+-----+\n|Company|count|\n+-------+-----+\n| APPL| 4|\n| GOOG| 3|\n| FB| 2|\n| MSFT| 3|\n+-------+-----+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+-----+\n|Company|count|\n+-------+-----+\n| APPL| 4|\n| GOOG| 3|\n| FB| 2|\n| MSFT| 3|\n+-------+-----+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.groupBy(\"Company\").max().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6f22f846-d20c-4dd7-a279-c586b7af2e01"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+----------+\n|Company|max(Sales)|\n+-------+----------+\n| APPL| 750.0|\n| GOOG| 340.0|\n| FB| 870.0|\n| MSFT| 600.0|\n+-------+----------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+----------+\n|Company|max(Sales)|\n+-------+----------+\n| APPL| 750.0|\n| GOOG| 340.0|\n| FB| 870.0|\n| MSFT| 600.0|\n+-------+----------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.groupBy(\"Company\").min().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"08d073c6-0c42-4e42-8dd8-6b4e8d961d4c"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+----------+\n|Company|min(Sales)|\n+-------+----------+\n| APPL| 130.0|\n| GOOG| 120.0|\n| FB| 350.0|\n| MSFT| 124.0|\n+-------+----------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+----------+\n|Company|min(Sales)|\n+-------+----------+\n| APPL| 130.0|\n| GOOG| 120.0|\n| FB| 350.0|\n| MSFT| 124.0|\n+-------+----------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.groupBy(\"Company\").sum().show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"20ee6524-03e1-4132-a974-25d265f2c515"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+----------+\n|Company|sum(Sales)|\n+-------+----------+\n| APPL| 1480.0|\n| GOOG| 660.0|\n| FB| 1220.0|\n| MSFT| 967.0|\n+-------+----------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+----------+\n|Company|sum(Sales)|\n+-------+----------+\n| APPL| 1480.0|\n| GOOG| 660.0|\n| FB| 1220.0|\n| MSFT| 967.0|\n+-------+----------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.agg({'Sales':'max'}).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"dfd9f762-191e-40f3-ac6a-c22103b4157b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+\n|max(Sales)|\n+----------+\n| 870.0|\n+----------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+----------+\n|max(Sales)|\n+----------+\n| 870.0|\n+----------+\n\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["Functions"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c31aea62-d321-4d72-94fe-6565a5df21f8"}}},{"cell_type":"code","source":["from pyspark.sql.functions import countDistinct, avg,stddev"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e06155d7-f574-4231-9ac2-ab648079fd8c"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["df.select(countDistinct(\"Sales\")).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"aee40e4d-e76e-4192-9bdb-b1a85b4b10e2"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+---------------------+\n|count(DISTINCT Sales)|\n+---------------------+\n| 11|\n+---------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+---------------------+\n|count(DISTINCT Sales)|\n+---------------------+\n| 11|\n+---------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.select(countDistinct(\"Sales\").alias(\"Distinct Sales\")).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e51bfe37-8da5-454c-87ca-2c1a261737c1"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------+\n|Distinct Sales|\n+--------------+\n| 11|\n+--------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+--------------+\n|Distinct Sales|\n+--------------+\n| 11|\n+--------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.select(avg('Sales')).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"e0ab24f0-4d4f-412a-a18a-a367f95f0357"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-----------------+\n| avg(Sales)|\n+-----------------+\n|360.5833333333333|\n+-----------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-----------------+\n| avg(Sales)|\n+-----------------+\n|360.5833333333333|\n+-----------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.select(stddev(\"Sales\")).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"98fb4cf5-a695-498a-b65a-428fbe1282f7"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+------------------+\n|stddev_samp(Sales)|\n+------------------+\n|250.08742410799007|\n+------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+------------------+\n|stddev_samp(Sales)|\n+------------------+\n|250.08742410799007|\n+------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.sql.functions import format_number"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5a02194b-cd6b-4483-8f70-8a4357f29a64"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["sales_std = df.select(stddev(\"Sales\").alias('std'))"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"377b4d40-9a60-4082-a5d4-348f4479f4f7"}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["sales_std.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f193fb05-17a7-4531-87c3-7db95e3ef713"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+------------------+\n| std|\n+------------------+\n|250.08742410799007|\n+------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+------------------+\n| std|\n+------------------+\n|250.08742410799007|\n+------------------+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["sales_std.select(format_number('std',2)).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"c7d45cd7-0611-400f-9c31-d020837a813b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+---------------------+\n|format_number(std, 2)|\n+---------------------+\n| 250.09|\n+---------------------+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+---------------------+\n|format_number(std, 2)|\n+---------------------+\n| 250.09|\n+---------------------+\n\n"]}}],"execution_count":0},{"cell_type":"markdown","source":["Order By"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d2bd0498-76de-410c-abb6-34d1e8483f44"}}},{"cell_type":"code","source":["df.orderBy(\"Sales\").show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a44603e8-3f7a-46e4-9e74-cf3163fc5c7a"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+-------+-----+\n|Company| Person|Sales|\n+-------+-------+-----+\n| GOOG|Charlie|120.0|\n| MSFT| Amy|124.0|\n| APPL| Linda|130.0|\n| GOOG| Sam|200.0|\n| MSFT|Vanessa|243.0|\n| APPL| John|250.0|\n| GOOG| Frank|340.0|\n| FB| Sarah|350.0|\n| APPL| Chris|350.0|\n| MSFT| Tina|600.0|\n| APPL| Mike|750.0|\n| FB| Carl|870.0|\n+-------+-------+-----+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+-------+-----+\n|Company| Person|Sales|\n+-------+-------+-----+\n| GOOG|Charlie|120.0|\n| MSFT| Amy|124.0|\n| APPL| Linda|130.0|\n| GOOG| Sam|200.0|\n| MSFT|Vanessa|243.0|\n| APPL| John|250.0|\n| GOOG| Frank|340.0|\n| FB| Sarah|350.0|\n| APPL| Chris|350.0|\n| MSFT| Tina|600.0|\n| APPL| Mike|750.0|\n| FB| Carl|870.0|\n+-------+-------+-----+\n\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.orderBy(df[\"Sales\"].desc()).show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"56afb242-5ad1-45d6-8068-0b8bf1674f53"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+-------+-------+-----+\n|Company| Person|Sales|\n+-------+-------+-----+\n| FB| Carl|870.0|\n| APPL| Mike|750.0|\n| MSFT| Tina|600.0|\n| FB| Sarah|350.0|\n| APPL| Chris|350.0|\n| GOOG| Frank|340.0|\n| APPL| John|250.0|\n| MSFT|Vanessa|243.0|\n| GOOG| Sam|200.0|\n| APPL| Linda|130.0|\n| MSFT| Amy|124.0|\n| GOOG|Charlie|120.0|\n+-------+-------+-----+\n\n","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"ansi","arguments":{}}},"output_type":"display_data","data":{"text/plain":["+-------+-------+-----+\n|Company| Person|Sales|\n+-------+-------+-----+\n| FB| Carl|870.0|\n| APPL| Mike|750.0|\n| MSFT| Tina|600.0|\n| FB| Sarah|350.0|\n| APPL| Chris|350.0|\n| GOOG| Frank|340.0|\n| APPL| John|250.0|\n| MSFT|Vanessa|243.0|\n| GOOG| Sam|200.0|\n| APPL| Linda|130.0|\n| MSFT| Amy|124.0|\n| GOOG|Charlie|120.0|\n+-------+-------+-----+\n\n"]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"sale-info","dashboards":[],"notebookMetadata":{"pythonIndentUnit":4},"language":"python","widgets":{},"notebookOrigID":3598965581908784}},"nbformat":4,"nbformat_minor":0}