Streaming","icon":"img/home/Python_icon.svg"}],"enableClusterStart":false,"enableEBSVolumesUIByTier":false,"singleSignOnComingSoon":false,"upgradeURL":"https://accounts.cloud.databricks.com/registration.html#login","maxAutoterminationMinutes":10000,"autoterminateClustersByDefault":true,"notebookLoadingBackground":"#fff","sshContainerForwardedPort":2200,"enableServerAutoComplete":true,"enableStaticHtmlImport":true,"enableInstanceProfilesByTier":false,"showForgotPasswordLink":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"minAutoterminationMinutes":10,"accounts":true,"useOnDemandClustersByDefault":true,"useFramedStaticNotebooks":false,"enableNewProgressReportUI":true,"enableAutoCreateUserUI":true,"defaultCoresPerContainer":4,"showTerminationReason":true,"enableNewClustersGet":true,"showPricePerDBU":false,"showSqlProxyUI":true,"enableNotebookErrorHighlighting":true};</script> <script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":2831780622018207,"name":"4.1_SparkR-Introduction","language":"r","commands":[{"version":"CommandV1","origId":2395288974719308,"guid":"f83a6d2e-8aa2-4d4f-a509-eb8132a073b5","subtype":"command","commandType":"auto","position":0.5,"command":"%md\n# 4.1 SparkR Introduction\n\nThis notebook demonstrates the basics of using SparkR - the Apache Spark api for R.\n\nSparkR uses the same underlying implementation as pyspark SQL but the API is designed resemble (to some degree)\nthe functions for R data frames.\n","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"2d9e1896-6a13-4525-8c12-2b08882a9a91"},{"version":"CommandV1","origId":2831780622018209,"guid":"2797679d-cf6e-444c-87b2-6ecff8d42093","subtype":"command","commandType":"auto","position":1.0,"command":"# initialise the sparkR session\nsparkR.session()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<pre style=\"font-size:10p\"></pre><pre style = 'font-size:10pt'>Java ref type org.apache.spark.sql.SparkSession id 1 </pre>","arguments":{},"addedWidgets":{},"removedWidgets":[],"datasetInfos":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1499582131374,"submitTime":1499582105339,"finishTime":1499582131424,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"fff755fe-be1e-44da-b374-a773dd31cc9e"},{"version":"CommandV1","origId":2831780622018212,"guid":"ba8457a4-2ccd-403f-9931-0770a36664e4","subtype":"command","commandType":"auto","position":2.0,"command":"# read a Spark DataFrame from a csv file\ndf = read.df('data/winequality-white.csv', 'csv', header=TRUE, sep=';', 
```r
# display the schema of the DataFrame
printSchema(df)
```

```
root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
```

```r
# display the head rows of a Spark DataFrame
head(df)
```

```
  fixed acidity volatile acidity citric acid residual sugar chlorides
1           7.0             0.27        0.36           20.7     0.045
2           6.3             0.30        0.34            1.6     0.049
3           8.1             0.28        0.40            6.9     0.050
4           7.2             0.23        0.32            8.5     0.058
5           7.2             0.23        0.32            8.5     0.058
6           8.1             0.28        0.40            6.9     0.050
  free sulfur dioxide total sulfur dioxide density   pH sulphates alcohol
1                  45                  170  1.0010 3.00      0.45     8.8
2                  14                  132  0.9940 3.30      0.49     9.5
3                  30                   97  0.9951 3.26      0.44    10.1
4                  47                  186  0.9956 3.19      0.40     9.9
5                  47                  186  0.9956 3.19      0.40     9.9
6                  30                   97  0.9951 3.26      0.44    10.1
  quality
1       6
2       6
3       6
4       6
5       6
6       6
```
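Note that `head()` returns a plain, local R data frame rather than a SparkDataFrame, so the result can be handed to ordinary R functions; a quick check:

```r
localHead <- head(df)  # a local R data.frame, not a SparkDataFrame
class(localHead)       # "data.frame"
nrow(localHead)        # 6
```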
```r
# display the Spark DataFrame
showDF(df)
```

```
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|               47.0|               186.0| 0.9956|3.19|      0.4|    9.9|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|               47.0|               186.0| 0.9956|3.19|      0.4|    9.9|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          6.2|            0.32|       0.16|           7.0|    0.045|               30.0|               136.0| 0.9949|3.18|     0.47|    9.6|      6|
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.22|       0.43|           1.5|    0.044|               28.0|               129.0| 0.9938|3.22|     0.45|   11.0|      6|
|          8.1|            0.27|       0.41|          1.45|    0.033|               11.0|                63.0| 0.9908|2.99|     0.56|   12.0|      5|
|          8.6|            0.23|        0.4|           4.2|    0.035|               17.0|               109.0| 0.9947|3.14|     0.53|    9.7|      5|
|          7.9|            0.18|       0.37|           1.2|     0.04|               16.0|                75.0|  0.992|3.18|     0.63|   10.8|      5|
|          6.6|            0.16|        0.4|           1.5|    0.044|               48.0|               143.0| 0.9912|3.54|     0.52|   12.4|      7|
|          8.3|            0.42|       0.62|         19.25|     0.04|               41.0|               172.0| 1.0002|2.98|     0.67|    9.7|      5|
|          6.6|            0.17|       0.38|           1.5|    0.032|               28.0|               112.0| 0.9914|3.25|     0.55|   11.4|      7|
|          6.3|            0.48|       0.04|           1.1|    0.046|               30.0|                99.0| 0.9928|3.24|     0.36|    9.6|      6|
|          6.2|            0.66|       0.48|           1.2|    0.029|               29.0|                75.0| 0.9892|3.33|     0.39|   12.8|      8|
|          7.4|            0.34|       0.42|           1.1|    0.033|               17.0|               171.0| 0.9917|3.12|     0.53|   11.3|      6|
|          6.5|            0.31|       0.14|           7.5|    0.044|               34.0|               133.0| 0.9955|3.22|      0.5|    9.5|      5|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
only showing top 20 rows
```
```r
# produce a summary of a Spark DataFrame
# please note that `summary()` returns another SparkDataFrame,
# which we can convert to an R data frame with `collect()`
collect(summary(df))
```

```
  summary      fixed acidity    volatile acidity         citric acid
1   count               4898                4898                4898
2    mean  6.854787668436075 0.27824111882401087 0.33419150673743736
3  stddev 0.8438682276875127 0.10079454842486532 0.12101980420298254
4     min                3.8                0.08                 0.0
5     max               14.2                 1.1                1.66
     residual sugar            chlorides free sulfur dioxide
1              4898                 4898                4898
2 6.391414863209486   0.0457723560636995   35.30808493262556
3 5.072057784014878 0.021847968093728805   17.00713732523259
4               0.6                0.009                 2.0
5              65.8                0.346               289.0
  total sulfur dioxide              density                  pH
1                 4898                 4898                4898
2   138.36065741118824   0.9940273764801896  3.1882666394446693
3   42.498064554142985 0.002990906916936997 0.15100059961506673
4                  9.0              0.98711                2.72
5                440.0              1.03898                3.82
            sulphates            alcohol            quality
1                4898               4898               4898
2  0.4898468762760325 10.514267047774638   5.87790935075541
3 0.11412583394883222   1.23062056775732 0.8856385749678322
4                0.22                8.0                  3
5                1.08               14.2                  9
```

```r
# DataFrames can be filtered, grouped, aggregated and sorted,
# e.g. average `alcohol` and max log(pH) grouped by `quality`
# for `density` > 1.0, sorted by `quality`
collect(orderBy(summarize(groupBy(filter(df, df$density > 1.0),
                                  df$quality),
                          avg(df$alcohol), max(log(df$pH))),
                df$quality))
```

```
  quality avg(alcohol) max(LOG(pH))
1       3    11.000000     1.111858
2       4     9.000000     1.184790
3       5     8.947368     1.249902
4       6     9.168966     1.255616
5       7     8.880000     1.166271
6       8     8.800000     1.187843
```
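The nested call above has to be read inside-out. Since SparkR's verbs are ordinary R functions, the same pipeline can be written left-to-right with magrittr's pipe; a sketch, assuming the magrittr package is available on the cluster:

```r
library(magrittr)  # assumption: magrittr is installed on the cluster

df %>%
  filter(df$density > 1.0) %>%
  groupBy(df$quality) %>%
  summarize(avg(df$alcohol), max(log(df$pH))) %>%
  orderBy(df$quality) %>%
  collect()
```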
```r
# we can also use a few machine learning models

# split the DataFrame into training and testing DataFrames
trainAndTestDFs <- randomSplit(df, c(0.75, 0.25), 13)
trainDF <- trainAndTestDFs[[1]]
print(count(trainDF))
testDF <- trainAndTestDFs[[2]]
print(count(testDF))
```

```
[1] 3625
[1] 1273
```
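The two splits are scanned more than once in the cells below, so caching them may avoid recomputing the split; an optional sketch using SparkR's `cache()`:

```r
# optional: keep the splits in memory across the following cells
cache(trainDF)
cache(testDF)
```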
```r
# train a linear regression model
glmModel <- spark.glm(trainDF, quality ~ ., family = "gaussian")
summary(glmModel)
```

```
Deviance Residuals: 
(Note: These are approximate quantiles with relative error <= 0.01)
    Min       1Q   Median       3Q      Max  
-3.7028  -0.4953  -0.0483   0.4346   2.8136  

Coefficients:
                       Estimate    Std. Error   t value    Pr(>|t|)  
(Intercept)           141.91       20.667        6.8667    7.7005e-12
fixed acidity           0.054059    0.023535     2.297     0.021678  
volatile acidity       -1.8773      0.13289    -14.127     0         
citric acid             0.010091    0.1155       0.087369  0.93038   
residual sugar          0.080612    0.0084207    9.573     0         
chlorides              -0.13222     0.65353     -0.20231   0.83969   
free sulfur dioxide     0.0025999   0.00097505   2.6665    0.0076993 
total sulfur dioxide    0.0002466   0.00044141   0.55865   0.57643   
density              -141.98       20.967       -6.7718    1.4789e-11
pH                      0.65118     0.12034      5.4112    6.6682e-08
sulphates               0.61715     0.11641      5.3016    1.2166e-07
alcohol                 0.21282     0.026941     7.8995    3.7748e-15

(Dispersion parameter for gaussian family taken to be 0.5669293)

    Null deviance: 2842.3  on 3624  degrees of freedom
Residual deviance: 2048.3  on 3613  degrees of freedom
AIC: 8244

Number of Fisher Scoring iterations: 1
```
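A fitted model can be persisted with `write.ml()` and reloaded with `read.ml()`; a sketch, where the path `/tmp/glm-wine` is purely illustrative:

```r
# save the fitted model and load it back later
write.ml(glmModel, "/tmp/glm-wine")
glmModel2 <- read.ml("/tmp/glm-wine")
```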
```r
# make predictions on the testing DataFrame
predictionsDF <- predict(glmModel, testDF)
head(select(predictionsDF, 'prediction'))
```

```
  prediction
1   6.991938
2   6.778693
3   5.836421
4   7.148434
5   6.232502
6   5.781459
```

```r
# calculate the RMSE (Root Mean Squared Error) on the testing set
resultDF <- collect(agg(predictionsDF,
                        avg((predictionsDF$prediction - predictionsDF$quality) *
                            (predictionsDF$prediction - predictionsDF$quality))))
sqrt(as.numeric(resultDF))
```

```
[1] 0.7480906
```
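The squared-error expression can also be factored out into a named column, which keeps the aggregation readable; a sketch (`err` and the `"mse"` alias are names introduced here, assuming SparkR's `alias()` for columns):

```r
# same RMSE, with the squared-error expression factored out
err <- predictionsDF$prediction - predictionsDF$quality
mseDF <- collect(agg(predictionsDF, alias(avg(err * err), "mse")))
sqrt(mseDF$mse)
```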
```r
# use gapplyCollect to process the Spark DataFrame with a user-defined function.
# the function will be called with the grouping key and an R data frame which contains
# all the rows for this group.
# it should return an R data frame representing the result of processing for a group.
# gapplyCollect will then merge the results from all the groups into a single R data frame
gapplyCollect(df, 'quality', function(key, f) {
  # per-column means for this group, wrapped in a one-row data frame
  # (gapplyCollect requires the function to return an R data frame)
  as.data.frame(t(apply(f, MARGIN = 2, FUN = mean)), check.names = FALSE)
})
```

```
  fixed acidity volatile acidity citric acid residual sugar  chlorides
1      6.837671        0.2605641   0.3380255       6.441606 0.04521747
2      7.600000        0.3332500   0.3360000       6.392500 0.05430000
3      6.933974        0.3020110   0.3376527       7.334969 0.05154633
4      7.420000        0.2980000   0.3860000       4.120000 0.02740000
5      7.129448        0.3812270   0.3042331       4.628221 0.05009816
6      6.657143        0.2774000   0.3265143       5.671429 0.03831429
7      6.734716        0.2627670   0.3256250       5.186477 0.03819091
  free sulfur dioxide total sulfur dioxide   density       pH sulphates
1            35.65059             137.0473 0.9939613 3.188599 0.4911056
2            53.32500             170.6000 0.9948840 3.187500 0.4745000
3            36.43205             150.9046 0.9952626 3.168833 0.4822032
4            33.40000             116.0000 0.9914600 3.308000 0.4660000
5            23.35890             125.2791 0.9942767 3.182883 0.4761350
6            36.72000             126.1657 0.9922359 3.218686 0.4862286
7            34.12557             125.1148 0.9924524 3.213898 0.5031023
   alcohol quality
1 10.57537       6
2 10.34500       3
3  9.80884       5
4 12.18000       9
5 10.15245       4
6 11.63600       8
7 11.36794       7
```
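When the grouped result should stay distributed instead of being collected, `gapply()` is the lazy counterpart: it returns a SparkDataFrame and therefore needs the output schema declared up front. A minimal sketch (the column names and the one-statistic result here are illustrative):

```r
# lazy variant: returns a SparkDataFrame, schema must be given explicitly
meansDF <- gapply(df, 'quality',
                  function(key, f) {
                    data.frame(quality = key[[1]], mean_alcohol = mean(f$alcohol))
                  },
                  structType(structField("quality", "integer"),
                             structField("mean_alcohol", "double")))
head(meansDF)
```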
For more information on using SparkR, please check the documentation:

- SparkR Guide: <https://spark.apache.org/docs/latest/sparkr.html>
- SparkR Documentation: <https://spark.apache.org/docs/latest/api/R/index.html>