<!DOCTYPE html> <html> <head> <meta name="databricks-html-version" content="1"> <title>001_whySpark - Databricks</title> <meta charset="utf-8"> <meta name="google" content="notranslate"> <meta http-equiv="Content-Language" content="en"> <meta http-equiv="Content-Type" content="text/html; charset=UTF8"> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/css/bootstrap.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/jquery-ui-bundle/jquery-ui.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/main.css"> <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/print.css" media="print"> <link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/img/favicon.ico"/> <script>window.settings = {"sparkDocsSearchGoogleCx":"004588677886978090460:_rj0wilqwdm","dbcForumURL":"http://forums.databricks.com/","dbfsS3Host":"https://databricks-prod-storage-sydney.s3.amazonaws.com","enableThirdPartyApplicationsUI":false,"enableClusterAcls":false,"notebookRevisionVisibilityHorizon":0,"enableTableHandler":true,"isAdmin":true,"enableLargeResultDownload":false,"nameAndEmail":"Raazesh Sainudiin (r.sainudiin@math.canterbury.ac.nz)","enablePresentationTimerConfig":true,"enableFullTextSearch":true,"enableElasticSparkUI":true,"clusters":true,"hideOffHeapCache":false,"applications":false,"useStaticGuide":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0","packageLabel":"spark-1.3-jenkins-ip-10-30-9-162-U0c2673ac85-Sa2ee4664b2-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1","packageLabel":"spark-1.4-jenkins-ip-10-30-9-162-U0c2673ac85-S33a1e4b9c6-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.0","packageLabel":"spark-1.6-jenkins-ip-10-30-9-162-U0c2673ac85-Scabba801f3-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"master","displayName":"Spark master (dev)","packageLabel":"","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":false,"enableFeedback":false,"defaultNumWorkers":8,"serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"prefetchSidebarNodes":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.12.3","accountsLimit":-1,"enableNotebookGitBranching":true,"local":false,"displayDefaultContainerMemoryGB":6,"deploymentMode":"production","useSpotForWorkers":false,"enableUserInviteWorkflow":false,"enableStaticNotebooks":true,"dbcGuideURL":"#workspace/databricks_guide/00 Welcome to Databricks","enableCssTransitions":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"orgId":0,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":false,"enableWorkspaceAclsConfig":true,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":false,"driverLog4jFilePrefix":"log4j","enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},"clusterPublisherRootId":5,"enableLatestJobRunResultPermalink":true,"disallowAddingAdmins":false,"enableSparkConfUI":true,"enableOrgSwitcherUI":false,"clustersLimit":-1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":false,"enableClusterDeltaUpdates":true,"csrfToken":"9653253d-3c62-448c-8b14-de4a662aba97","useFixedStaticNotebookVersionForDevelopment":false,"enableBasicReactDialogBoxes":true,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"http://feedback.databricks.com/forums/263785-product-feedback","enableWorkspaceAclService":true,"someName":"Raazesh Sainudiin","enableWorkspaceAcls":true,"gitHash":"0c2673ac858e227cad536fdb45d140aeded238db","userFullname":"Raazesh Sainudiin","enableClusterCreatePage":false,"enableImportFromUrl":true,"enableMiniClusters":false,"enableWebSocketDeltaUpdates":true,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100005,"dbcSupportURL":"","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/","enableSparkPackages":true,"enableHybridClusterType":false,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"databricksGuideStaticUrl":"","enableHybridClusters":true,"notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"enableDashboardExport":true,"user":"r.sainudiin@math.canterbury.ac.nz","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"tablesPublisherRootId":7,"enableNewInputWidgetUI":false,"accounts":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> <script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":5139,"name":"001_whySpark","language":"scala","commands":[{"version":"CommandV1","origId":5141,"guid":"09a3d6cb-52ce-45bc-ac7c-022f15e2fdbc","subtype":"command","commandType":"auto","position":1.0,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"3c5b8231-822b-4907-8b0a-9a1e52d5b07e"},{"version":"CommandV1","origId":128724,"guid":"7b14ae0e-923b-489f-9dc6-06b14b3c575e","subtype":"command","commandType":"auto","position":1.5,"command":"%md\nThe [html source url](https://raw.githubusercontent.com/raazesh-sainudiin/scalable-data-science/master/db/week1/01_introduction/001_whySpark.html) of this databricks notebook and its recorded Uji :\n\n[](https://www.youtube.com/v/O8JbxgPpAU8?rel=0&autoplay=1&modestbranding=1&start=2065&end=3330)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"7ee1070a-372f-420e-9b33-27ff8a978946"},{"version":"CommandV1","origId":5142,"guid":"187b12a6-e6d7-4051-8755-7b2e825ff06c","subtype":"command","commandType":"auto","position":2.0,"command":"%md\n# The Berkeley Data Analytics Stack is BDAS\n## Spark is a sub-stack of BDAS\n**Source:** \n* [Ion Stoica's State of Spark Union AmpCamp 6, Nov 2015](https://www.slideshare.net/secret/9ON8EEAlVKP3Sl)\n* [Machine learning: Trends, perspectives, and prospects, M. I. Jordan, T. M. Mitchell, Science 17 Jul 2015: Vol. 349, Issue 6245, pp. 255-260, DOI: 10.1126/science.aaa8415](http://science.sciencemag.org/content/349/6245/255.full-text.pdf+html)\n\n### BDAS State of The Union Talk by Ion Stoica, AMP Camp 6, Nov 2015 \nThe followign talk outlines the motivation and insights behind BDAS' research approach and how they address the cross-disciplinary nature of Big Data challenges and current work.\n* **watch now (5 mins.):** \n\n[](https://www.youtube.com/v/s7kj9XzRBQk?rel=0&autoplay=1&modestbranding=1&start=91&end=386)\n \n## key points\n* started in 2011 with strong public-private funding\n * Defense Advanced Research Projects Agency\n * Lawrance Berkeley Laboratory\n * National Science Foundation\n * Amazon Web Services\n * Google\n * SAP\n* The Berkeley AMPLab is creating a new approach to data analytics to seamlessly integrate the three main resources available for making sense of data at scale: \n * Algorithms (machine learning and statistical techniques), \n * Machines (in the form of scalable clusters and elastic cloud computing), and \n * People (both individually as analysts and in crowds). \n* The lab is realizing its ideas through the development of a freely-available Open Source software stack called BDAS: the Berkeley Data Analytics Stack. \n* Several components of BDAS have gained significant traction in industry and elsewhere, including: \n * the Mesos cluster resource manager, \n * the Spark in-memory computation framework, a sub-stack of the BDAS stack, \n * and more... \n\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6d2c3216-f573-40f3-9366-2dbef624ab07"},{"version":"CommandV1","origId":5144,"guid":"586692e5-730c-4b4d-951d-50a4ccac46f5","subtype":"command","commandType":"auto","position":2.125,"command":"%md\n### The big data problem, Hardware, distributing work, handling failed and slow machines\n#### by Anthony Joseph in BerkeleyX/CS100.1x\n\n* **(watch now 1:48)**: The Big Data Problem\n * [](https://www.youtube.com/v/0JdJe5iehhw?rel=0&autoplay=1&modestbranding=1&start=1)\n* **(watch now 1:43)**: Hardware for Big Data\n * [](https://www.youtube.com/v/KmIIMdsXGzc?rel=0&autoplay=1&modestbranding=1&start=1)\n* **(watch now 1:17)**: How to distribute work across a cluster of commodity machines?\n * [](https://www.youtube.com/v/Euk1v3VtNcM?rel=0&autoplay=1&modestbranding=1&start=1)\n* **(watch now 0:36)**: How to deal with failures or slow machines?\n * [](https://www.youtube.com/v/NaHNsPEK3KA?rel=0&autoplay=1&modestbranding=1&start=1)\n ","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f518287a-09c3-450e-afdd-b94c69648ee1"},{"version":"CommandV1","origId":5145,"guid":"8e36e7a5-fb9b-438d-a88b-837d137edc81","subtype":"command","commandType":"auto","position":2.1875,"command":"%md\n## MapReduce and Apache Spark.\n#### by Anthony Joseph in BerkeleyX/CS100.1x\n\n* **(watch now 1:48)**: Map Reduce (is bounded by Disk I/O)\n * [](https://www.youtube.com/v/NqG_hYAKjYk?rel=0&autoplay=1&modestbranding=1&start=1)\n* **(watch now 2:49)**: Apache Spark (uses Memory instead of Disk)\n * [](https://www.youtube.com/v/vat5Jki1lbI?rel=0&autoplay=1&modestbranding=1&start=1)\n* **(watch now 3:00)**: Spark Versus MapReduce\n * [](https://www.youtube.com/v/Ddq3Gua2QFg?rel=0&autoplay=1&modestbranding=1&start=1)\n* SUMMARY\n * uses memory instead of disk alone and is thus fater than Hadoop MapReduce\n * resilience abstraction is by RDD (resilient distributed dataset)\n * RDDs can be recovered upon failures from their *lineage graphs*, the recipes to make them starting from raw data\n * Spark supports a lot more than MapReduce, including streaming, interactive in-memory querying, etc.\n * Spark demonstrated an unprecedented sort of 1 petabyte (1,000 terabytes) worth of data in 234 minutes running on 190 Amazon EC2 instances (in 2015).\n * Spark expertise corresponds to the highest Median Salary in the US (~ 150K)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"1369d907-a0af-44c4-b7a3-73f6ffe8fe4b"},{"version":"CommandV1","origId":5146,"guid":"f8d697ed-13b4-4087-970c-b9f0ff4dca47","subtype":"command","commandType":"auto","position":2.25,"command":"%md\n## Key Papers\n\n* Key Historical Milestones\n * 1956-1979: [Stanford, MIT, CMU, and other universities develop set/list operations in LISP, Prolog, and other languages for parallel processing](http://www-formal.stanford.edu/jmc/history/lisp/lisp.html)\n * 2004: **READ**: [Google's MapReduce: Simplified Data Processing on Large Clusters, by Jeffrey Dean and Sanjay Ghemawat](http://research.google.com/archive/mapreduce.html)\n * 2006: [Yahoo!'s Apache Hadoop, originating from the Yahoo!?s Nutch Project, Doug Cutting](http://developer.yahoo.com/hadoop/)\n * 2009: [Cloud computing with Amazon Web Services Elastic MapReduce](http://aws.amazon.com/elasticmapreduce/), a Hadoop version modified for Amazon Elastic Cloud Computing (EC2) and Amazon Simple Storage System (S3), including support for Apache Hive and Pig.\n * 2010: **READ**: [The Hadoop Distributed File System, by Konstantin Shvachko, Hairong Kuang, Sanjay Radia, and Robert Chansler. IEEE MSST](http://dx.doi.org/10.1109/MSST.2010.5496972) \n* Apache Spark Core Papers</h1>\n * 2010: [Spark: Cluster Computing with Working Sets, Matei Zaharia, Mosharaf Chowdhury, Michael J. Franklin, Scott Shenker, Ion Stoica. USENIX HotCloud](http://people.csail.mit.edu/matei/papers/2010/hotcloud_spark.pdf).\n * 2012: **READ**: [Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing, Matei Zaharia, Mosharaf Chowdhury, Tathagata Das, Ankur Dave, Justin Ma, Murphy McCauley, Michael J. Franklin, Scott Shenker and Ion Stoica. NSDI](http://usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf)\n \n ","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"11e5d388-c8e4-45a8-9df0-3dcd7b4fa94e"},{"version":"CommandV1","origId":5147,"guid":"621ac974-e311-4fd9-8fc1-7516c7411ed9","subtype":"command","commandType":"auto","position":2.375,"command":"%md\n***\n***\n55 minutes\n55 out of 90+10 minutes.\n\nWe have come to the end of this section.\n\n**Next let us get everyone to login to databricks** to get our hands dirty with some Spark code! \n\n10-15 minutes.\nThen break for 5.\n***\n***","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"4a8b5565-22a6-4487-9f2e-286e762f99c6"},{"version":"CommandV1","origId":5148,"guid":"4c8799ec-6fff-4c80-adfb-72c4f3f2487f","subtype":"command","commandType":"auto","position":2.5,"command":"%md\n\n## EXTRA: For a historical insight see excerpts from an interview with Ion Stoica\n\n#### Beginnings of Apache Spark and Databricks (academia-industry roots)\n[](https://www.youtube.com/v/bHH8sG-F9tg?rel=0&autoplay=1&modestbranding=1&start=231&end=365)\n\n#### Advantages of Apache Spark: A Unified System for Batch, Stream, Interactive / Ad Hoc or Graph Processing\n[](https://www.youtube.com/v/bHH8sG-F9tg?rel=0&autoplay=1&modestbranding=1&start=458&end=726)\n\n#### Main Goal of Databricks Cloud: To Make Big Data Easy\n[](https://www.youtube.com/v/bHH8sG-F9tg?rel=0&autoplay=1&modestbranding=1&start=890&end=985)\n\n***\n***","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"6c11d7ab-392d-4add-a9f1-537553f2e718"},{"version":"CommandV1","origId":5150,"guid":"83c32fbb-88e2-436c-92bc-9eac21b63db2","subtype":"command","commandType":"auto","position":3.0,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"0c35a1a5-e6c4-40ab-a616-9626b5adc413"}],"dashboards":[],"guid":"0680dfeb-6091-43cc-bdea-66ce6816de1f","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script> <script src="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js" onerror="window.mainJsLoadError = true;"></script> </head> <body> <script> if (window.mainJsLoadError) { var u = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js'; var b = document.getElementsByTagName('body')[0]; var c = document.createElement('div'); c.innerHTML = ('<h1>Network Error</h1>' + '<p><b>Please check your network connection and try again.</b></p>' + '<p>Could not load a required resource: ' + u + '</p>'); c.style.margin = '30px'; c.style.padding = '20px 50px'; c.style.backgroundColor = '#f5f5f5'; c.style.borderRadius = '5px'; b.appendChild(c); } </script> </body> </html>