<!DOCTYPE html> <html> <head> <meta name="databricks-html-version" content="1"> <title>021_SparkStreamingIntro - Databricks</title> <meta charset="utf-8"> <meta name="google" content="notranslate"> <meta http-equiv="Content-Language" content="en"> <meta http-equiv="Content-Type" content="text/html; charset=UTF8"> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/css/bootstrap.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/lib/jquery-ui-bundle/jquery-ui.min.css"> <link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/main.css"> <link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/css/print.css" media="print"> <link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/img/favicon.ico"/> <script>window.settings = {"sparkDocsSearchGoogleCx":"004588677886978090460:_rj0wilqwdm","dbcForumURL":"http://forums.databricks.com/","dbfsS3Host":"https://databricks-prod-storage-sydney.s3.amazonaws.com","enableThirdPartyApplicationsUI":false,"enableClusterAcls":false,"notebookRevisionVisibilityHorizon":0,"enableTableHandler":true,"isAdmin":true,"enableLargeResultDownload":false,"nameAndEmail":"Raazesh Sainudiin (r.sainudiin@math.canterbury.ac.nz)","enablePresentationTimerConfig":true,"enableFullTextSearch":true,"enableElasticSparkUI":true,"clusters":true,"hideOffHeapCache":false,"applications":false,"useStaticGuide":false,"fileStoreBase":"FileStore","configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableResetPassword":true,"enableJobsSparkUpgrade":true,"sparkVersions":[{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0","packageLabel":"spark-1.3-jenkins-ip-10-30-9-162-U0c2673ac85-Sa2ee4664b2-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1","packageLabel":"spark-1.4-jenkins-ip-10-30-9-162-U0c2673ac85-S33a1e4b9c6-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.0","packageLabel":"spark-1.6-jenkins-ip-10-30-9-162-U0c2673ac85-Scabba801f3-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"master","displayName":"Spark master (dev)","packageLabel":"","upgradable":true,"deprecated":false,"customerVisible":false}],"enableRestrictedClusterCreation":false,"enableFeedback":false,"defaultNumWorkers":8,"serverContinuationTimeoutMillis":10000,"driverStderrFilePrefix":"stderr","driverStdoutFilePrefix":"stdout","enableSparkDocsSearch":true,"prefetchSidebarNodes":true,"sparkHistoryServerEnabled":true,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableNotebookHistoryDiffing":true,"branch":"2.12.3","accountsLimit":-1,"enableNotebookGitBranching":true,"local":false,"displayDefaultContainerMemoryGB":6,"deploymentMode":"production","useSpotForWorkers":false,"enableUserInviteWorkflow":false,"enableStaticNotebooks":true,"dbcGuideURL":"#workspace/databricks_guide/00 Welcome to Databricks","enableCssTransitions":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"orgId":0,"enableNotebookGitVersioning":true,"files":"files/","enableDriverLogsUI":true,"disableLegacyDashboards":false,"enableWorkspaceAclsConfig":true,"dropzoneMaxFileSize":4096,"enableNewDashboardViews":false,"driverLog4jFilePrefix":"log4j","enableMavenLibraries":true,"displayRowLimit":1000,"defaultSparkVersion":{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2","packageLabel":"spark-1.5-jenkins-ip-10-30-9-162-U0c2673ac85-S5917a1044d-2016-02-09-02:05:59.455061","upgradable":true,"deprecated":false,"customerVisible":true},"clusterPublisherRootId":5,"enableLatestJobRunResultPermalink":true,"disallowAddingAdmins":false,"enableSparkConfUI":true,"enableOrgSwitcherUI":false,"clustersLimit":-1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":false,"enableClusterDeltaUpdates":true,"csrfToken":"3f4d8617-8d0d-47dd-a072-38dbe25947da","useFixedStaticNotebookVersionForDevelopment":false,"enableBasicReactDialogBoxes":true,"requireEmailUserName":true,"enableDashboardViews":false,"dbcFeedbackURL":"http://feedback.databricks.com/forums/263785-product-feedback","enableWorkspaceAclService":true,"someName":"Raazesh Sainudiin","enableWorkspaceAcls":true,"gitHash":"0c2673ac858e227cad536fdb45d140aeded238db","userFullname":"Raazesh Sainudiin","enableClusterCreatePage":false,"enableImportFromUrl":true,"enableMiniClusters":false,"enableWebSocketDeltaUpdates":true,"enableDebugUI":false,"showHiddenSparkVersions":false,"allowNonAdminUsers":true,"userId":100005,"dbcSupportURL":"","staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/","enableSparkPackages":true,"enableHybridClusterType":false,"enableNotebookHistoryUI":true,"availableWorkspaces":[{"name":"Workspace 0","orgId":0}],"enableFolderHtmlExport":true,"enableSparkVersionsUI":true,"databricksGuideStaticUrl":"","enableHybridClusters":true,"notebookLoadingBackground":"#fff","enableNewJobRunDetailsPage":true,"enableDashboardExport":true,"user":"r.sainudiin@math.canterbury.ac.nz","enableServerAutoComplete":true,"enableStaticHtmlImport":true,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"tablesPublisherRootId":7,"enableNewInputWidgetUI":false,"accounts":true,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4};</script> <script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":89112,"name":"021_SparkStreamingIntro","language":"scala","commands":[{"version":"CommandV1","origId":89114,"guid":"87a29b40-81cf-4670-8071-8bb2911311c4","subtype":"command","commandType":"auto","position":0.5,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"fbe98dc4-28dd-4fb7-847a-653b12595105"},{"version":"CommandV1","origId":129731,"guid":"4becb1b6-5954-4141-8e07-102a8ac5ca4a","subtype":"command","commandType":"auto","position":0.75,"command":"%md\nThe [html source url](https://raw.githubusercontent.com/raazesh-sainudiin/scalable-data-science/master/db/week6/12_SparkStreaming/021_SparkStreamingIntro.html) of this databricks notebook and its recorded Uji :\n\n[](https://www.youtube.com/v/jqLcr2eS-Vs?rel=0&autoplay=1&modestbranding=1&start=0&end=2111)\n","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"b610712f-f9f3-4684-bed7-ac5274f1ae61"},{"version":"CommandV1","origId":89115,"guid":"fbb264c6-28a7-4250-b1c5-87ea5e53f8f5","subtype":"command","commandType":"auto","position":1.0,"command":"%md\n\n# **Spark Streaming**\nSpark Streaming is an extension of the core Spark API that enables scalable, high-throughput, fault-tolerant stream processing of live data streams. \n\nThis is an augmentation of the following resources:\n* the Databricks Guide [Workspace -> Databricks_Guide -> 08 Spark Streaming -> 00 Spark Streaming](/#workspace/databricks_guide/08 Spark Streaming/00 Spark Streaming) and \n* [http://spark.apache.org/docs/latest/streaming-programming-guide.html](http://spark.apache.org/docs/latest/streaming-programming-guide.html)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"e90cd0b2-36da-4b37-aebd-c8a3eb96ffd6"},{"version":"CommandV1","origId":89116,"guid":"b8cadf98-2d64-4536-aeb2-a1195f638c80","subtype":"command","commandType":"auto","position":1.25,"command":"%md\n\nOverview\n========\n\nSpark Streaming is an extension of the core Spark API that enables\nscalable, high-throughput, fault-tolerant stream processing of live data\nstreams. \n\nData can be ingested from many sources like \n\n* [Kafka](http://kafka.apache.org/documentation.html#introduction), \n* [Flume](https://flume.apache.org/),\n* [Twitter](https://twitter.com/) [Streaming](https://dev.twitter.com/streaming/overview) and [REST](https://dev.twitter.com/rest/public) APIs, \n* [ZeroMQ](http://zeromq.org/), \n* [Amazon Kinesis](https://aws.amazon.com/kinesis/streams/), or \n* [TCP sockets](http://www.gnu.org/software/mit-scheme/documentation/mit-scheme-ref/TCP-Sockets.html), \n\nand can be processed using\ncomplex algorithms expressed with high-level functions like `map`,\n`reduce`, `join` and `window`. \n \nFinally, processed data can be pushed out\nto filesystems, databases, and live dashboards. In fact, you can apply Spark's \n* [machine learning](http://spark.apache.org/docs/latest/mllib-guide.html) and\n* [graph processing](http://spark.apache.org/docs/latest/graphx-programming-guide.html) algorithms \non data streams.\n\n\n\n#### Internally, it works as follows: \n* Spark Streaming receives live input data streams and \n* divides the data into batches, \n* which are then processed by the Spark engine \n* to generate the final stream of results in batches.\n\n\n\nSpark Streaming provides a high-level abstraction called **discretized\nstream** or **DStream**, which represents a continuous stream of data.\nDStreams can be created either from input data streams from sources such\nas Kafka, Flume, and Kinesis, or by applying high-level operations on\nother DStreams. Internally, a **DStream is represented as a sequence of\n[RDDs](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.rdd.RDD)**.\n\nThis guide shows you how to start writing Spark Streaming programs with\nDStreams. You can write Spark Streaming programs in Scala, Java or\nPython (introduced in Spark 1.2), all of which are presented in this\n[guide](http://spark.apache.org/docs/latest/streaming-programming-guide.html). \n\nHere, we will focus on Streaming in Scala.\n\n* * * * *\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"413e9280-f075-4c66-8cb1-2d1fabf98471"},{"version":"CommandV1","origId":89117,"guid":"adc9cc48-ee45-420f-98fb-ab4bc0d839d0","subtype":"command","commandType":"auto","position":1.5,"command":"%md\n\n#### Spark Streaming Resources\n* [Spark Streaming Programming Guide](https://spark.apache.org/docs/latest/streaming-programming-guide.html) - The official Apache Spark Streaming programming guide.\n* [Debugging Spark Streaming in Databricks](/#workspace/databricks_guide/08 Spark Streaming/02 Debugging Spark Streaming Application)\n* [Streaming FAQs and Best Practices](/#workspace/databricks_guide/08 Spark Streaming/15 Streaming FAQs)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"9bd69abe-2fac-4e76-989d-a7a0364d75e1"},{"version":"CommandV1","origId":89118,"guid":"0aca9b97-36c8-48b1-a261-613e43cc4a8b","subtype":"command","commandType":"auto","position":1.75,"command":"%md\nThree Quick Examples\n===============\n\nBefore we go into the details of how to write your own Spark Streaming\nprogram, let?s take a quick look at what a simple Spark Streaming\nprogram looks like. \n\nWe will choose the first two examples in Databricks notebooks below.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"4eed1fe8-f362-4222-8ca4-a36daa2335f9"},{"version":"CommandV1","origId":89119,"guid":"189b3a3a-b8ca-4278-9508-ae23140245da","subtype":"command","commandType":"auto","position":2.0,"command":"%md #### Spark Streaming Hello World Examples in Databricks Notebooks\n\n1. [Streaming Word Count (Scala)](/#workspace/databricks_guide/08 Spark Streaming/01 Streaming Word Count - Scala)\n* Tweet Collector for Capturing Live Tweets\n* [Twitter Hashtag Count (Scala)](/#workspace/databricks_guide/08 Spark Streaming/03 Twitter Hashtag Count - Scala)\n\nOther examples we won't try here:\n* [Kinesis Word Count (Scala)](/#workspace/databricks_guide/08 Spark Streaming/04 Kinesis Word Count - Scala)\n* [Kafka Word Count (Scala)](/#workspace/databricks_guide/08 Spark Streaming/05 Kafka Word Count - Scala)\n* [FileStream Word Count (Python)](/#workspace/databricks_guide/08 Spark Streaming/06 FileStream Word Count - Python)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2657c028-1813-4e5c-993d-2f4fd0cc0f4b"},{"version":"CommandV1","origId":89120,"guid":"500cd8f6-5fac-433e-b3de-1122d172f393","subtype":"command","commandType":"auto","position":2.75,"command":"%md ## 1. Streaming Word Count\n\nThis is a *hello world* example of Spark Streaming which counts words on 1 second batches of streaming data. \n\nIt uses an in-memory string generator as a dummy source for streaming data.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"5472fa3e-f9ac-439d-81c6-ff919f2678ab"},{"version":"CommandV1","origId":89121,"guid":"3931d35e-66af-4377-aaca-67c46b117f94","subtype":"command","commandType":"auto","position":2.875,"command":"%md \n## Configurations\n\nConfigurations that control the streaming app in the notebook","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"23e31ebf-8191-4aee-b34d-b6294138798a"},{"version":"CommandV1","origId":89122,"guid":"eb9fd79f-f96a-426f-8f1b-318d3ee88362","subtype":"command","commandType":"auto","position":2.90625,"command":"// === Configuration to control the flow of the application ===\nval stopActiveContext = true\t \n// \"true\" = stop if any existing StreamingContext is running; \n// \"false\" = dont stop, and let it run undisturbed, but your latest code may not be used\n\n// === Configurations for Spark Streaming ===\nval batchIntervalSeconds = 1 \nval eventsPerSecond = 1000 // For the dummy source\n\n// Verify that the attached Spark cluster is 1.4.0+\nrequire(sc.version.replace(\".\", \"\").toInt >= 140, \"Spark 1.4.0+ is required to run this notebook. Please attach it to a Spark 1.4.0+ cluster.\")\n","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">stopActiveContext: Boolean = true\nbatchIntervalSeconds: Int = 1\neventsPerSecond: Int = 1000\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459390568929E12,"submitTime":1.459390570666E12,"finishTime":1.459390574156E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f72a7d6e-6c02-410c-b641-f9a6d10e9bbf"},{"version":"CommandV1","origId":89123,"guid":"3acc4f11-3565-4f1f-862c-e1dac7827e22","subtype":"command","commandType":"auto","position":2.921875,"command":"%md \n### Imports\n\nImport all the necessary libraries. If you see any error here, you have to make sure that you have attached the necessary libraries to the attached cluster. ","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"39d2ac1f-b6c9-4889-8e29-60fc9253f916"},{"version":"CommandV1","origId":89124,"guid":"507f1f40-9183-424d-b887-9f55c87527ae","subtype":"command","commandType":"auto","position":2.9296875,"command":"import org.apache.spark._\nimport org.apache.spark.storage._\nimport org.apache.spark.streaming._","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import org.apache.spark._\nimport org.apache.spark.storage._\nimport org.apache.spark.streaming._\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459391061123E12,"submitTime":1.459391063087E12,"finishTime":1.459391061577E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2c02c3dc-14f7-4af4-ab9c-87ae3891a50f"},{"version":"CommandV1","origId":89125,"guid":"93565f46-e54a-4bd0-a57a-72fc57ef9661","subtype":"command","commandType":"auto","position":2.931640625,"command":"%md\nDiscretized Streams (DStreams)\n------------------------------\n\n**Discretized Stream** or **DStream** is the basic abstraction provided\nby Spark Streaming. It represents a continuous stream of data, either\nthe input data stream received from source, or the processed data stream\ngenerated by transforming the input stream. Internally, a DStream is\nrepresented by a continuous series of RDDs, which is Spark?s abstraction\nof an immutable, distributed dataset (see [Spark Programming\nGuide](http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds)\nfor more details). Each RDD in a DStream contains data from a certain\ninterval, as shown in the following figure.\n\n\n","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"96f462a8-85a2-44c6-8b67-a73e8acb8a1b"},{"version":"CommandV1","origId":89126,"guid":"86d4da17-fabf-4009-92e4-8d3028495cf9","subtype":"command","commandType":"auto","position":2.93212890625,"command":"%md \n### Setup: Define the function that sets up the StreamingContext\n\nIn this we will do two things. \n* Define a custom receiver as the dummy source (no need to understand this)\n * this custom receiver will have lines that end with a random number between 0 and 9 and read: \n ```\n I am a dummy source 2\n I am a dummy source 8\n ...\n ```","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"83d5b947-1246-4f79-819d-e9b3e94f7d69"},{"version":"CommandV1","origId":89127,"guid":"f007c2b7-6e16-4bbd-a17c-4872e5066fc5","subtype":"command","commandType":"auto","position":2.9326171875,"command":"%md\nThis is the dummy source implemented as a custom receiver. **No need to understand this now.**","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"ec8bfb1b-c10a-4cc6-8b62-a251bbdfddd6"},{"version":"CommandV1","origId":89128,"guid":"0bf1cfb3-ce21-4f62-9e60-b8f89a054bf9","subtype":"command","commandType":"auto","position":2.93359375,"command":"// This is the dummy source implemented as a custom receiver. No need to understand this.\n\nimport scala.util.Random\nimport org.apache.spark.streaming.receiver._\n\nclass DummySource(ratePerSec: Int) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) {\n\n def onStart() {\n // Start the thread that receives data over a connection\n new Thread(\"Dummy Source\") {\n override def run() { receive() }\n }.start()\n }\n\n def onStop() {\n // There is nothing much to do as the thread calling receive()\n // is designed to stop by itself isStopped() returns false\n }\n\n /** Create a socket connection and receive data until receiver is stopped */\n private def receive() {\n while(!isStopped()) { \n store(\"I am a dummy source \" + Random.nextInt(10))\n Thread.sleep((1000.toDouble / ratePerSec).toInt)\n }\n }\n}","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">import scala.util.Random\nimport org.apache.spark.streaming.receiver._\ndefined class DummySource\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459391066412E12,"submitTime":1.459391068376E12,"finishTime":1.45939106703E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"cac4cea7-7ff5-4f57-b8a2-baa3ea907357"},{"version":"CommandV1","origId":89129,"guid":"9eb5e6c6-b243-4671-9129-e30a634b50d8","subtype":"command","commandType":"auto","position":2.9345703125,"command":"%md\nLet's try to understand the following `creatingFunc` to create a new StreamingContext and setting it up for word count and registering it as temp table for each batch of 1000 lines per second in the stream.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"2cea65d9-278a-4b2c-99f3-a1be3233247f"},{"version":"CommandV1","origId":89130,"guid":"0f204241-341e-472d-8943-f5030fa59b28","subtype":"command","commandType":"auto","position":2.935546875,"command":"var newContextCreated = false // Flag to detect whether new context was created or not\n\n// Function to create a new StreamingContext and set it up\ndef creatingFunc(): StreamingContext = {\n \n // Create a StreamingContext\n val ssc = new StreamingContext(sc, Seconds(batchIntervalSeconds))\n \n // Create a stream that generates 1000 lines per second\n val stream = ssc.receiverStream(new DummySource(eventsPerSecond)) \n \n // Split the lines into words, and then do word count\n val wordStream = stream.flatMap { _.split(\" \") }\n val wordCountStream = wordStream.map(word => (word, 1)).reduceByKey(_ + _)\n\n // Create temp table at every batch interval\n wordCountStream.foreachRDD { rdd => \n rdd.toDF(\"word\", \"count\").registerTempTable(\"batch_word_count\") \n }\n \n stream.foreachRDD { rdd =>\n System.out.println(\"# events = \" + rdd.count())\n System.out.println(\"\\t \" + rdd.take(10).mkString(\", \") + \", ...\")\n }\n \n ssc.remember(Minutes(1)) // To make sure data is not deleted by the time we query it interactively\n \n println(\"Creating function called to create new StreamingContext\")\n newContextCreated = true \n ssc\n}","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">newContextCreated: Boolean = false\ncreatingFunc: ()org.apache.spark.streaming.StreamingContext\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459391538157E12,"submitTime":1.459391540126E12,"finishTime":1.459391540319E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"a81da964-bbfd-4c1c-9f43-69aaabec8d4a"},{"version":"CommandV1","origId":89131,"guid":"a1c9ae7a-b6b9-4b99-94b4-2a48a6face93","subtype":"command","commandType":"auto","position":2.935791015625,"command":"%md\n## Transforming and Acting on the DStream of lines\n\nAny operation applied on a DStream translates to operations on the\nunderlying RDDs. For converting\na stream of lines to words, the `flatMap` operation is applied on each\nRDD in the `lines` DStream to generate the RDDs of the `wordStream` DStream.\nThis is shown in the following figure.\n\n\n\nThese underlying RDD transformations are computed by the Spark engine.\nThe DStream operations hide most of these details and provide the\ndeveloper with a higher-level API for convenience. \n\nNext `reduceByKey` is used to get `wordCountStream` that counts the words in `wordStream`.\n\nFinally, this is registered as a temporary table for each RDD in the DStream.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"86648a59-0a4f-41d3-8c6f-a6e63a7751ae"},{"version":"CommandV1","origId":89132,"guid":"47b76921-ee66-4a36-b9a8-64827d8ccd18","subtype":"command","commandType":"auto","position":2.93603515625,"command":"%md \n## Start Streaming Job: Stop existing StreamingContext if any and start/restart the new one\n\nHere we are going to use the configurations at the top of the notebook to decide whether to stop any existing StreamingContext, and start a new one, or recover one from existing checkpoints.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"99abae4b-52e5-4470-aa27-c818be0789ad"},{"version":"CommandV1","origId":89133,"guid":"0ad289cd-d16e-4328-9422-1830bdd3e81e","subtype":"command","commandType":"auto","position":2.936279296875,"command":"// Stop any existing StreamingContext \n// The getActive function is proviced by Databricks to access active Streaming Contexts\nif (stopActiveContext) {\t\n StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) }\n} \n\n// Get or create a streaming context\nval ssc = StreamingContext.getActiveOrCreate(creatingFunc)\nif (newContextCreated) {\n println(\"New context created from currently defined creating function\") \n} else {\n println(\"Existing context running or recovered from checkpoint, may not be running currently defined creating function\")\n}\n\n// Start the streaming context in the background.\nssc.start()\n\n// This is to ensure that we wait for some time before the background streaming job starts. This will put this cell on hold for 5 times the batchIntervalSeconds.\nssc.awaitTerminationOrTimeout(batchIntervalSeconds * 5 * 1000)\n","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">Creating function called to create new StreamingContext\nNew context created from currently defined creating function\n# events = 0\n\t , ...\n# events = 340\n\t I am a dummy source 4, I am a dummy source 2, I am a dummy source 8, I am a dummy source 4, I am a dummy source 4, I am a dummy source 5, I am a dummy source 5, I am a dummy source 5, I am a dummy source 8, I am a dummy source 4, ...\n# events = 852\n\t I am a dummy source 0, I am a dummy source 0, I am a dummy source 0, I am a dummy source 7, I am a dummy source 5, I am a dummy source 2, I am a dummy source 8, I am a dummy source 3, I am a dummy source 0, I am a dummy source 9, ...\n# events = 883\n\t I am a dummy source 4, I am a dummy source 3, I am a dummy source 5, I am a dummy source 2, I am a dummy source 4, I am a dummy source 9, I am a dummy source 2, I am a dummy source 4, I am a dummy source 2, I am a dummy source 5, ...\n# events = 860\n\t I am a dummy source 7, I am a dummy source 2, I am a dummy source 9, I am a dummy source 8, I am a dummy source 4, I am a dummy source 8, I am a dummy source 8, I am a dummy source 8, I am a dummy source 7, I am a dummy source 8, ...\nssc: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@689392bd\nres1: Boolean = false\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"startTime":1.459391566871E12,"submitTime":1.459391568839E12,"finishTime":1.45939157401E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"16fb3492-02ec-43a6-9d1e-54fd795c4b0c"},{"version":"CommandV1","origId":89134,"guid":"d35d3c43-746c-40d4-b797-9517055e79f6","subtype":"command","commandType":"auto","position":2.9365234375,"command":"%md \n### Interactive Querying\n\nNow let's try querying the table. You can run this command again and again, you will find the numbers changing.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"97cea130-f9e1-448e-b6a9-2b31cf70b950"},{"version":"CommandV1","origId":89135,"guid":"e95ee6bf-a222-4e4d-ba09-4c44bc3b39c0","subtype":"command","commandType":"auto","position":2.936767578125,"command":"%sql select * from batch_word_count","commandVersion":0,"state":"finished","results":{"type":"table","data":[["4",77.0],["8",91.0],["am",889.0],["0",103.0],["dummy",889.0],["5",95.0],["a",889.0],["I",889.0],["9",74.0],["1",90.0],["6",89.0],["2",90.0],["7",91.0],["source",889.0],["3",89.0]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"word","type":"\"string\""},{"name":"count","type":"\"integer\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":null,"error":null,"startTime":1.459391633919E12,"submitTime":1.45939163588E12,"finishTime":1.459391636815E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"15bb5f83-5aab-4b69-97fb-c35d12040361"},{"version":"CommandV1","origId":89136,"guid":"6fb065d0-4bbb-4f0c-9a50-6e57b9d37abe","subtype":"command","commandType":"auto","position":2.93682861328125,"command":"%md\nTry again for current table.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"82c337b6-d049-4118-8fa0-be29b35f5297"},{"version":"CommandV1","origId":89137,"guid":"de888b4d-a69c-4139-8e8c-a71150e97630","subtype":"command","commandType":"auto","position":2.9368896484375,"command":"%sql select * from batch_word_count ","commandVersion":0,"state":"finished","results":{"type":"table","data":[["4",89.0],["8",93.0],["am",893.0],["0",93.0],["dummy",893.0],["5",92.0],["a",893.0],["I",893.0],["9",77.0],["1",91.0],["6",86.0],["2",84.0],["7",97.0],["source",893.0],["3",91.0]],"arguments":{},"addedWidgets":{},"removedWidgets":[],"schema":[{"name":"word","type":"\"string\""},{"name":"count","type":"\"integer\""}],"overflow":false,"aggData":[],"aggSchema":[],"aggOverflow":false,"aggSeriesLimitReached":false,"aggError":"","aggType":"","plotOptions":null,"isJsonSchema":true,"dbfsResultPath":null},"errorSummary":"Error in SQL statement: AnalysisException: cannot recognize input near 'batch_word_count' '/' '/' in from source; line 1 pos 31","error":"com.databricks.backend.common.rpc.DatabricksExceptions$SQLExecutionException: org.apache.spark.sql.AnalysisException: cannot recognize input near 'batch_word_count' '/' '/' in from source; line 1 pos 31\n\tat org.apache.spark.sql.hive.HiveQl$.createPlan(HiveQl.scala:318)\n\tat org.apache.spark.sql.hive.ExtendedHiveQlParser$$anonfun$hiveQl$1.apply(ExtendedHiveQlParser.scala:41)\n\tat org.apache.spark.sql.hive.ExtendedHiveQlParser$$anonfun$hiveQl$1.apply(ExtendedHiveQlParser.scala:40)\n\tat scala.util.parsing.combinator.Parsers$Success.map(Parsers.scala:136)\n\tat scala.util.parsing.combinator.Parsers$Success.map(Parsers.scala:135)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$map$1.apply(Parsers.scala:242)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$map$1.apply(Parsers.scala:242)\n\tat scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1$$anonfun$apply$2.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1$$anonfun$apply$2.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$Failure.append(Parsers.scala:202)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)\n\tat scala.util.parsing.combinator.Parsers$$anon$2$$anonfun$apply$14.apply(Parsers.scala:891)\n\tat scala.util.parsing.combinator.Parsers$$anon$2$$anonfun$apply$14.apply(Parsers.scala:891)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat scala.util.parsing.combinator.Parsers$$anon$2.apply(Parsers.scala:890)\n\tat scala.util.parsing.combinator.PackratParsers$$anon$1.apply(PackratParsers.scala:110)\n\tat org.apache.spark.sql.catalyst.AbstractSparkSQLParser.parse(AbstractSparkSQLParser.scala:34)\n\tat org.apache.spark.sql.hive.HiveQl$.parseSql(HiveQl.scala:295)\n\tat org.apache.spark.sql.hive.HiveQLDialect$$anonfun$parse$1.apply(HiveContext.scala:66)\n\tat org.apache.spark.sql.hive.HiveQLDialect$$anonfun$parse$1.apply(HiveContext.scala:66)\n\tat org.apache.spark.sql.hive.client.ClientWrapper$$anonfun$withHiveState$1.apply(ClientWrapper.scala:279)\n\tat org.apache.spark.sql.hive.client.ClientWrapper.liftedTree1$1(ClientWrapper.scala:226)\n\tat org.apache.spark.sql.hive.client.ClientWrapper.retryLocked(ClientWrapper.scala:225)\n\tat org.apache.spark.sql.hive.client.ClientWrapper.withHiveState(ClientWrapper.scala:268)\n\tat org.apache.spark.sql.hive.HiveQLDialect.parse(HiveContext.scala:65)\n\tat org.apache.spark.sql.SQLContext$$anonfun$2.apply(SQLContext.scala:210)\n\tat org.apache.spark.sql.SQLContext$$anonfun$2.apply(SQLContext.scala:210)\n\tat org.apache.spark.sql.execution.SparkSQLParser$$anonfun$org$apache$spark$sql$execution$SparkSQLParser$$others$1.apply(SparkSQLParser.scala:114)\n\tat org.apache.spark.sql.execution.SparkSQLParser$$anonfun$org$apache$spark$sql$execution$SparkSQLParser$$others$1.apply(SparkSQLParser.scala:113)\n\tat scala.util.parsing.combinator.Parsers$Success.map(Parsers.scala:136)\n\tat scala.util.parsing.combinator.Parsers$Success.map(Parsers.scala:135)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$map$1.apply(Parsers.scala:242)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$map$1.apply(Parsers.scala:242)\n\tat scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1$$anonfun$apply$2.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1$$anonfun$apply$2.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$Failure.append(Parsers.scala:202)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)\n\tat scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)\n\tat scala.util.parsing.combinator.Parsers$$anon$2$$anonfun$apply$14.apply(Parsers.scala:891)\n\tat scala.util.parsing.combinator.Parsers$$anon$2$$anonfun$apply$14.apply(Parsers.scala:891)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat scala.util.parsing.combinator.Parsers$$anon$2.apply(Parsers.scala:890)\n\tat scala.util.parsing.combinator.PackratParsers$$anon$1.apply(PackratParsers.scala:110)\n\tat org.apache.spark.sql.catalyst.AbstractSparkSQLParser.parse(AbstractSparkSQLParser.scala:34)\n\tat org.apache.spark.sql.SQLContext$$anonfun$1.apply(SQLContext.scala:207)\n\tat org.apache.spark.sql.SQLContext$$anonfun$1.apply(SQLContext.scala:207)\n\tat org.apache.spark.sql.execution.datasources.DDLParser.parse(DDLParser.scala:43)\n\tat org.apache.spark.sql.SQLContext.parseSql(SQLContext.scala:230)\n\tat org.apache.spark.sql.hive.HiveContext.parseSql(HiveContext.scala:332)\n\tat org.apache.spark.sql.SQLContext.sql(SQLContext.scala:816)\n\tat com.databricks.backend.daemon.driver.SQLDriverLocal$$anonfun$1.apply(SQLDriverLocal.scala:71)\n\tat com.databricks.backend.daemon.driver.SQLDriverLocal$$anonfun$1.apply(SQLDriverLocal.scala:29)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)\n\tat scala.collection.immutable.List.foreach(List.scala:318)\n\tat scala.collection.TraversableLike$class.map(TraversableLike.scala:244)\n\tat scala.collection.AbstractTraversable.map(Traversable.scala:105)\n\tat com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:29)\n\tat com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:116)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$2.apply(DriverLocal.scala:157)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$2.apply(DriverLocal.scala:157)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:83)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:78)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:30)\n\tat com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:116)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:30)\n\tat com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:156)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$3.apply(DriverWrapper.scala:483)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$3.apply(DriverWrapper.scala:483)\n\tat scala.util.Try$.apply(Try.scala:161)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:480)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:381)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:212)\n\tat java.lang.Thread.run(Thread.java:745)\n\n\tat com.databricks.backend.daemon.driver.SQLDriverLocal.executeSql(SQLDriverLocal.scala:104)\n\tat com.databricks.backend.daemon.driver.SQLDriverLocal.repl(SQLDriverLocal.scala:116)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$2.apply(DriverLocal.scala:157)\n\tat com.databricks.backend.daemon.driver.DriverLocal$$anonfun$execute$2.apply(DriverLocal.scala:157)\n\tat com.databricks.logging.UsageLogging$$anonfun$withAttributionContext$1.apply(UsageLogging.scala:83)\n\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)\n\tat com.databricks.logging.UsageLogging$class.withAttributionContext(UsageLogging.scala:78)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionContext(DriverLocal.scala:30)\n\tat com.databricks.logging.UsageLogging$class.withAttributionTags(UsageLogging.scala:116)\n\tat com.databricks.backend.daemon.driver.DriverLocal.withAttributionTags(DriverLocal.scala:30)\n\tat com.databricks.backend.daemon.driver.DriverLocal.execute(DriverLocal.scala:156)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$3.apply(DriverWrapper.scala:483)\n\tat com.databricks.backend.daemon.driver.DriverWrapper$$anonfun$3.apply(DriverWrapper.scala:483)\n\tat scala.util.Try$.apply(Try.scala:161)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.executeCommand(DriverWrapper.scala:480)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.runInner(DriverWrapper.scala:381)\n\tat com.databricks.backend.daemon.driver.DriverWrapper.run(DriverWrapper.scala:212)\n\tat java.lang.Thread.run(Thread.java:745)\n","startTime":1.459391748376E12,"submitTime":1.459391750335E12,"finishTime":1.45939174848E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"7ee02709-112b-42e2-aa63-31b52c9a6199"},{"version":"CommandV1","origId":89138,"guid":"5b72c864-0d17-4987-951e-345d2ca16472","subtype":"command","commandType":"auto","position":2.93701171875,"command":"%md ### Finally, if you want stop the StreamingContext, you can uncomment and execute the following\n\n`StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) }`","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"265bd455-72e4-439e-9db9-35db66de7c65"},{"version":"CommandV1","origId":89139,"guid":"0b65686c-69bf-4568-994c-75a61d66f0fa","subtype":"command","commandType":"auto","position":2.937255859375,"command":"StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) }","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<div class=\"ansiout\"><console>:34: error: not found: value StreamingContext\n StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) }\n ^\n</div>","error":null,"startTime":1.459391793712E12,"submitTime":1.459391795685E12,"finishTime":1.459391794232E12,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"raazesh.sainudiin@gmail.com","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"673922ec-6ca2-4aa5-a2e2-451742690f4f"},{"version":"CommandV1","origId":89140,"guid":"2a919ad6-ec42-4ab6-85a3-245337671ea6","subtype":"command","commandType":"auto","position":2.9843716621398926,"command":"%md\n***\n***","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"233ade9b-6d5b-4047-80f2-caace9f94ef7"},{"version":"CommandV1","origId":89141,"guid":"33de6ec2-46f5-4c9c-8fb9-b4ac0e97dc60","subtype":"command","commandType":"auto","position":2.9882787466049194,"command":"%md\n# Let's do two more example applications of streaming involving live tweets.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"83dd7b54-6c2e-4897-add9-5de8a008b280"},{"version":"CommandV1","origId":89142,"guid":"2513957e-68f1-4a2e-ac99-f14c17957af0","subtype":"command","commandType":"auto","position":2.9921858310699463,"command":"%md\n***\n***","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"0971a9d3-750f-4ae6-9818-7c53a60a0ca0"},{"version":"CommandV1","origId":89143,"guid":"d3852191-178e-488f-ad34-ea64fb70754f","subtype":"command","commandType":"auto","position":3.0,"command":"%md\n\n# More Pointers to Spark Streaming","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"89a61973-eb51-4bca-aa4d-e3be7f0be197"},{"version":"CommandV1","origId":89144,"guid":"34ac3eb3-8253-4a77-ac6d-44bf541594ca","subtype":"command","commandType":"auto","position":3.5,"command":"%md \n# Spark Streaming Common Sinks\n\n* [Writing data to Kinesis](/#workspace/databricks_guide/08 Spark Streaming/07 Write Output To Kinesis)\n* [Writing data to S3](/#workspace/databricks_guide/08 Spark Streaming/08 Write Output To S3)\n* [Writing data to Kafka](/#workspace/databricks_guide/08 Spark Streaming/09 Write Output To Kafka)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"662aca42-ed32-432c-a690-667142ed172a"},{"version":"CommandV1","origId":89145,"guid":"088345fa-c23a-407b-9945-f810718cb1e7","subtype":"command","commandType":"auto","position":3.96875,"command":"%md\n## Writing to S3\n\nWe will be storing large amounts of data in s3, [Amazon's simple storage service](https://aws.amazon.com/s3/).","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"731d52f4-22b9-4056-ae2b-a9608caff5d8"},{"version":"CommandV1","origId":89146,"guid":"e4b99673-8d0e-47c8-8f0d-e390c5db5b02","subtype":"command","commandType":"auto","position":4.0,"command":"%md\n\n# Spark Streaming Tutorials\n* [Window Aggregations in Streaming](/#workspace/databricks_guide/08 Spark Streaming/10 Window Aggregations) - Has examples for the different window aggregations available in spark streaming\n* [Global Aggregations using updateStateByKey](/#workspace/databricks_guide/08 Spark Streaming/11 Global Aggregations - updateStateByKey) - Provides an example of how to do global aggregations\n* [Global Aggregations using mapWithState](/#workspace/databricks_guide/08 Spark Streaming/12 Global Aggregations - mapWithState) - From Spark 1.6, you can use the `mapWithState` interface to do global aggregations more efficiently.\n* [Joining DStreams](/#workspace/databricks_guide/08 Spark Streaming/13 Joining DStreams) - Has an example for joining 2 dstreams\n* [Joining DStreams with static datasets](/#workspace/databricks_guide/08 Spark Streaming/14 Joining DStreams With Static Datasets) - Builds on the previous example and shows how to join DStreams with static dataframes or RDDs efficiently","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"946096e1-5ba9-45b8-8168-d4436c7284cd"},{"version":"CommandV1","origId":89147,"guid":"da646a9d-0d99-45de-90a2-9dd207709166","subtype":"command","commandType":"auto","position":5.0,"command":"%md \n# Example Streaming Producers\n* [Kinesis Word Producer](/#workspace/databricks_guide/08 Spark Streaming/Producers/1 Kinesis Word Producer)\n* [Kafka Word Producer](/#workspace/databricks_guide/08 Spark Streaming/Producers/2 Kafka Word Producer)\n* [Kafka Ads Data Producer](/#workspace/databricks_guide/08 Spark Streaming/Producers/3 Kafka Ads Data Producer)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"966546c3-da21-4dec-ae82-4e921cd989f8"},{"version":"CommandV1","origId":89148,"guid":"a0314c0e-12eb-4ada-9b1e-f591959a83e1","subtype":"command","commandType":"auto","position":6.0,"command":"%md \n# Spark Streaming Applications\n\n* [Sessionization - Building Sessions from Streams](/#workspace/databricks_guide/08 Spark Streaming/Applications/01 Sessionization)","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"de6e3fda-3c48-489a-87a2-5b1a50b50960"},{"version":"CommandV1","origId":89149,"guid":"91579305-0378-4d40-8a0c-ba414295e6b3","subtype":"command","commandType":"auto","position":7.0,"command":"%md\n\n# [Scalable Data Science](http://www.math.canterbury.ac.nz/~r.sainudiin/courses/ScalableDataScience/)\n\n\n### prepared by [Raazesh Sainudiin](https://nz.linkedin.com/in/raazesh-sainudiin-45955845) and [Sivanand Sivaram](https://www.linkedin.com/in/sivanand)\n\n*supported by* [](https://databricks.com/)\nand \n[](https://www.awseducate.com/microsite/CommunitiesEngageHome)","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"startTime":0.0,"submitTime":0.0,"finishTime":0.0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"nuid":"f07f0228-97c1-480b-b4af-87d92c200daa"}],"dashboards":[],"guid":"194f63cc-c13d-4572-b717-198f9b0cf9c6","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script> <script src="https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js" onerror="window.mainJsLoadError = true;"></script> </head> <body> <script> if (window.mainJsLoadError) { var u = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/201602081754420800-0c2673ac858e227cad536fdb45d140aeded238db/js/notebook-main.js'; var b = document.getElementsByTagName('body')[0]; var c = document.createElement('div'); c.innerHTML = ('<h1>Network Error</h1>' + '<p><b>Please check your network connection and try again.</b></p>' + '<p>Could not load a required resource: ' + u + '</p>'); c.style.margin = '30px'; c.style.padding = '20px 50px'; c.style.backgroundColor = '#f5f5f5'; c.style.borderRadius = '5px'; b.appendChild(c); } </script> </body> </html>